Commit ab24f6a1 by Trịnh Hoàng Phúc

Merge branch 'feature/fix_review_11_5_2020' into 'master'

Fix review 11/05/2020

See merge request !17
parents e3ce2959 2e086f87
Pipeline #620 failed with stages
in 0 seconds
class City < ApplicationRecord class City < ApplicationRecord
validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Company < ApplicationRecord class Company < ApplicationRecord
validates :title, presence: true
has_many :jobs has_many :jobs
end end
class Industry < ApplicationRecord class Industry < ApplicationRecord
validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Job < ApplicationRecord class Job < ApplicationRecord
# Jobs associated with the given city id. The cities association is loaded
# eagerly and referenced so the string condition can address its table.
scope :by_cities, ->(city_id) { includes(:cities).where("cities.id = ?", city_id).references(:cities) }
# Jobs associated with the given industry id (same approach as by_cities).
scope :by_industries, ->(industry_id) { includes(:industries).where("industries.id = ?", industry_id).references(:industries) }
# Jobs belonging to the given company id. A hash condition is used instead of
# interpolating the value into the SQL string, so the value is bound and
# escaped by Active Record rather than trusted verbatim.
scope :by_companies, ->(company_id) { where(company_id: company_id) }
# Ordered list of job attribute names; presumably the columns written by a
# CSV export elsewhere in the project -- confirm against its caller.
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze
belongs_to :company belongs_to :company
has_many :applies has_many :applies
...@@ -10,9 +16,15 @@ class Job < ApplicationRecord ...@@ -10,9 +16,15 @@ class Job < ApplicationRecord
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
has_and_belongs_to_many :cities has_and_belongs_to_many :cities
scope :by_cities, -> (city_id) {includes(:cities).where("cities.id = ?", city_id).references(:cities)} validate :updated_date_job_cannot_be_greater_than_expiration_date
scope :by_industries, -> (industry_id) {includes(:industries).where("industries.id = ?", industry_id).references(:industries)}
scope :by_companies, -> (company_id) {where("company_id = #{company_id}")}
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze validates :title, length: { minimum: 6 }
validates :title, :updated_date_job, :level, :expiration_date, :salary, :min_salary, :max_salary, presence: true
validates :min_salary, :max_salary, numericality: { only_integer: true }
# Custom validation: adds an error on :updated_date_job when it is later than
# :expiration_date.
#
# Missing values are skipped here (presence is checked by a separate
# validation), and values that cannot be parsed as dates are reported as a
# validation error on :base instead of raising out of the validation run.
def updated_date_job_cannot_be_greater_than_expiration_date
  posted_text = updated_date_job.to_s.strip
  expiry_text = expiration_date.to_s.strip
  # Nothing to compare when either side is absent.
  return if posted_text.empty? || expiry_text.empty?

  # to_s above lets this work whether the attributes hold strings or
  # date-like objects; the column types are not visible here.
  if DateTime.parse(posted_text) > DateTime.parse(expiry_text)
    errors.add(:updated_date_job, "can't be greater than expiration date")
  end
rescue ArgumentError
  # DateTime.parse raises ArgumentError (Date::Error on newer Rubies) for
  # text it cannot interpret as a date.
  errors.add(:base, "updated date or expiration date is not a valid date")
end
end end
# Helpers for the crawler: turns scraped salary text into numeric bounds and
# persists one scraped job together with its company, cities and industries.
class CrawlerService
  # Salary text for which convert_salary returns the fixed open range.
  COMPETITIVE_SALARY = "Cạnh tranh".freeze
  # Every figure found in the salary text is multiplied by this factor.
  SALARY_MULTIPLIER = 1_000_000
  # A run of digits, optionally followed by "," or "." separated digit groups.
  SALARY_FIGURE = /\d+(?:[.,]\d+)*/.freeze

  # Converts scraped salary text into a [min_salary, max_salary] pair.
  #
  # @param salary [String] salary text as scraped from the listing
  # @return [Array] two elements:
  #   - [0, 999_999_999] when the text equals COMPETITIVE_SALARY
  #   - [0, first figure] when the text contains "Dưới"
  #   - [first figure, 0] when the text contains "Trên"
  #   - [first figure, second figure] otherwise
  #   An element is nil when the text holds no figure for that position.
  def self.convert_salary(salary)
    return [0, 999_999_999] if salary == COMPETITIVE_SALARY

    # Extract the figures with a real regular expression. String#tr treats
    # its argument as a character set, not a pattern, so regex syntax there
    # only works by accident and lets stray letters through as tokens.
    amounts = salary.scan(SALARY_FIGURE).map do |figure|
      # A comma is read as the decimal mark; round so that float
      # representation error can never truncate the amount by one.
      (figure.tr(",", ".").to_f * SALARY_MULTIPLIER).round
    end

    return [0, amounts[0]] if salary.include?("Dưới")
    return [amounts[0], 0] if salary.include?("Trên")

    [amounts[0], amounts[1]]
  end

  # Finds or creates the company, the job, and every city and industry, then
  # links the cities and industries to the job, all inside one transaction so
  # a failure leaves nothing half-written.
  #
  # @param job_attributes [Hash] attributes identifying/creating the Job;
  #   the caller's hash is not modified
  # @param company_attributes [Hash] attributes identifying/creating the Company
  # @param cities [Array<String>] city titles
  # @param industries [Array<String>] industry titles
  # @return [Array] the Industry records linked to the job
  # @raise [ArgumentError] when any argument is nil
  # @raise [ActiveRecord::RecordInvalid] when a record fails validation; the
  #   transaction is rolled back
  def self.imports(job_attributes, company_attributes, cities, industries)
    if [job_attributes, company_attributes, cities, industries].any?(&:nil?)
      raise ArgumentError, "Not enough data transferred"
    end

    ActiveRecord::Base.transaction do
      company = Company.find_or_create_by!(company_attributes)
      # find_or_create_by! raises on validation failure, so no separate
      # inspection of job.errors is needed afterwards.
      job = Job.find_or_create_by!(job_attributes.merge(company_id: company.id))

      city_records = cities.uniq.map { |title| City.find_or_create_by!(title: title) }
      industry_records = industries.uniq.map { |title| Industry.find_or_create_by!(title: title) }

      # The job may already exist from an earlier run; only add links that
      # are not present yet so join rows are not duplicated.
      city_records.each do |city|
        job.cities << city unless job.cities.include?(city)
      end
      industry_records.each do |industry|
        job.industries << industry unless job.industries.include?(industry)
      end
    end
  end
end
\ No newline at end of file
class AddForeignToCities < ActiveRecord::Migration[6.0] class AddForeignToCities < ActiveRecord::Migration[6.0]
def change def change
add_column :cities, :foreign, :boolean, :default => false add_column :cities, :foreign, :boolean, default: false
end end
end end
# Migration adding two bigint salary-bound columns (default 0) and three text
# columns to the jobs table.
class AddColumnsToJobs < ActiveRecord::Migration[6.0]
  def change
    # Numeric salary bounds, both defaulting to 0.
    %i[min_salary max_salary].each do |column|
      add_column :jobs, column, :bigint, default: 0
    end

    # Free-text sections of the job posting.
    %i[benefit job_requirements other_information].each do |column|
      add_column :jobs, column, :text
    end
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_04_23_044651) do ActiveRecord::Schema.define(version: 2020_05_11_055632) do
create_table "admins", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t| create_table "admins", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "email", default: "", null: false t.string "email", default: "", null: false
...@@ -89,6 +89,11 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do ...@@ -89,6 +89,11 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t.bigint "company_id" t.bigint "company_id"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.bigint "min_salary", default: 0
t.bigint "max_salary", default: 0
t.text "benefit"
t.text "job_requirements"
t.text "other_information"
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
......
...@@ -5,176 +5,125 @@ namespace :crawler do ...@@ -5,176 +5,125 @@ namespace :crawler do
desc "Crawler Careerbuilder" desc "Crawler Careerbuilder"
task job: :environment do task job: :environment do
# Define exception logger # Define crawler logger
exception_logger = ActiveSupport::Logger.new("log/exception_logger.log") logger = Logger.new("log/crawler_logger.log")
# Define skip logger
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page # Loop page
(10..12).each do |page| (1..2).each do |page|
# Fetch and parse HTML document # Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item # Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item| html_jobs.css(".jobs-side-list .job-item").each do |item|
url = item.css(".figure .figcaption .title .job_link @href").text
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
logger.warn "Another template #{url}"
next
end
# Set salary, min-salary, max-salary
if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
logger.warn "Another template #{url}"
next
end
salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
min_salary, max_salary = CrawlerService.convert_salary(salary)
# Job attributes # Job attributes
job_attributes = { job_attributes = {
title: item.css(".figure .figcaption .title a @title").text, title: item.at_css(".figure .figcaption .title a @title").text,
updated_date_job: item.css(".bottom-right-icon .time time").text, updated_date_job: item.at_css(".bottom-right-icon .time time").text,
level: nil, salary: salary,
years_of_experience: nil, min_salary: min_salary,
salary: item.css(".figure .figcaption .caption .salary").text.gsub("$ ",""), max_salary: max_salary
expiration_date: nil,
job_description: nil,
company_id: nil,
} }
# Defind cities array html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
cities = [] type = ele.at_css("strong").text
item.css(".figure .figcaption .caption .location ul li").each do |city|
city = check_exist_or_create_city(city.text.strip)
cities << city
end
if item.css(".figure .image a @href").text != "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
unless html_company_detail.at_css(".jobsby-company").nil?
company_attributes = {
title: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content .name").text,
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
logo: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .img @src").text,
description: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.strip
}
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
end
end
# Defind industry ids array
industries = []
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text)))
unless html_job_detail.at_css(".search-result-list-detail").nil?
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele|
type = ele.css("strong").text
case type case type
when "Hết hạn nộp" when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("p").text.strip job_attributes[:expiration_date] = ele.at_css("p").text.squish
when "Cấp bậc" when "Cấp bậc"
job_attributes[:level] = ele.css("p").text.strip job_attributes[:level] = ele.at_css("p").text.squish
when "Kinh nghiệm" when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("p").text.strip job_attributes[:years_of_experience] = ele.at_css("p").text.squish
end end
end end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele|
industry = check_exist_or_create_industry(ele.text.gsub(",","").strip)
industries << industry
end
# Get description for job attributes
description = ""
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele| html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
description << ele.inner_html next if ele.at_css(".detail-title").nil?
end type = ele.at_css(".detail-title").text
# Set description for job attributes case type
job_attributes[:job_description] = description.strip when "Phúc lợi "
else job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
skip_url_logger.info "another template #{item.css(".figure .figcaption .title .job_link @href").text}" when "Mô tả Công việc"
end job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
# Create job when "Yêu Cầu Công Việc"
job = check_exist_or_create_job(job_attributes) job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
# Create city_job when "Thông tin khác"
if cities.count > 0 job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
cities.each do |city|
job.cities << city
end end
end end
# Create industry_job next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
if industries.count > 0 # Company attributes
industries.each do |industry| html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
job.industries << industry next if html_company_detail.at_css(".jobsby-company").nil?
company_css = ".jobsby-company .company-introduction .company-info .info "
company_attributes = {
title: html_company_detail.at_css(company_css + ".content .name").text,
address: html_company_detail.css(company_css + ".content p")[1].text,
logo: html_company_detail.at_css(company_css + ".img @src").text,
description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
}
# Defind cities array
cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
city.text.squish
end end
# Defind industries array
industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
industry.text.tr(",","").squish
end end
rescue
exception_logger.info "Error url: #{item.css(".figure .figcaption .title .job_link @href").text}" result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
logger.info "Crawl success url : #{url}"
rescue Exception => e
logger.error e
next next
end end
end end
end end
task city: :environment do task city: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
unless html_cities.at_css(".find-jobsby-categories .main-jobs-by-location").nil?
# Defind cities array
cities = []
# Get city in country # Get city in country
html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").each do |title| cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
city = { {
title: title.text.gsub("Việc làm tại ","").strip, title: title.text.gsub("Việc làm tại ","").squish,
foreign: false foreign: false
} }
cities << city
end end
# Get city foreign # Get city foreign
html_cities.css(".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a").each do |title| cities_foreign = html_cities.css(".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a").map do |title|
city = { {
title: title.text.strip, title: title.text.squish,
foreign: true foreign: true
} }
cities << city
end end
if cities.count > 0 cities = cities_in_country + cities_foreign
if cities.length > 0
City.import cities City.import cities
end end
end end
end
task industry: :environment do task industry: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
unless html_industries.at_css(".find-jobsby-categories .list-of-working-positions").nil?
# Defind industries array
industries = []
# Get industry # Get industry
html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").each do |title| industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
industry = { {
title: title.text.strip title: title.text.squish
} }
industries << industry
end end
if industries.count > 0 if industries.length > 0
Industry.import industries Industry.import industries
end end
end end
end
def check_exist_or_create_company(company_attributes)
find_company = Company.find_or_create_by(company_attributes)
return find_company.id
end
def check_exist_or_create_industry(industry_title)
industries = Industry.where("title LIKE ?", industry_title)
if industries.count == 0
industry = Industry.create(title: industry_title)
else
industry = industries[0]
end
return industry
end
def check_exist_or_create_city(city_title)
cities = City.where("title LIKE ?", city_title)
if cities.count == 0
city = City.create(title: city_title)
else
city = cities[0]
end
return city
end
def check_exist_or_create_job(job_attributes)
job = Job.find_or_create_by(job_attributes)
return job
end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment