Commit db3ba1fb by Trịnh Hoàng Phúc

Fix review 12/05/2020

parent 079c82d0
Pipeline #612 failed with stages
in 0 seconds
class City < ApplicationRecord class City < ApplicationRecord
validates :title, presence: true validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Industry < ApplicationRecord class Industry < ApplicationRecord
validates :title, presence: true validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Job < ApplicationRecord class Job < ApplicationRecord
validates :title, presence: true scope :by_cities, -> (city_id) {includes(:cities).where("cities.id = ?", city_id).references(:cities)}
scope :by_industries, -> (industry_id) {includes(:industries).where("industries.id = ?", industry_id).references(:industries)}
scope :by_companies, -> (company_id) {where("company_id = #{company_id}")}
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze
belongs_to :company belongs_to :company
has_many :applies has_many :applies
...@@ -12,9 +16,15 @@ class Job < ApplicationRecord ...@@ -12,9 +16,15 @@ class Job < ApplicationRecord
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
has_and_belongs_to_many :cities has_and_belongs_to_many :cities
scope :by_cities, -> (city_id) {includes(:cities).where("cities.id = ?", city_id).references(:cities)} validate :updated_date_job_cannot_be_greater_than_expiration_date, on: :create
scope :by_industries, -> (industry_id) {includes(:industries).where("industries.id = ?", industry_id).references(:industries)}
scope :by_companies, -> (company_id) {where("company_id = #{company_id}")} validates :title, length: { minimum: 6 }
validates :title, :updated_date_job, :level, :expiration_date, :salary, :min_salary, :max_salary, presence: true, on: :create
validates :min_salary, :max_salary, numericality: { only_integer: true }
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze def updated_date_job_cannot_be_greater_than_expiration_date
if DateTime.parse(updated_date_job).to_i > DateTime.parse(expiration_date).to_i
errors.add(:updated_date_job, "can't be greater than expiration date")
end
end
end end
# Service wrapper around the City model.
class CityService
  # Hands the given collection straight to City.import.
  #
  # @param cities [Object] collection accepted by City.import
  # @return [Object] whatever City.import returns
  def import(cities)
    City.import(cities)
  end

  # Returns a city whose title matches +city_title+, creating one when no
  # match exists.
  #
  # The lookup uses SQL LIKE with the title bound as a parameter and no
  # wildcards added by this method.
  #
  # @param city_title [String] title to look up or create
  # @return [City] the first matching record, or the result of City.create
  def check_exist_or_create_city(city_title)
    # Ask for the first match only instead of loading every matching row
    # just to index element 0.
    City.where("title LIKE ?", city_title).first || City.create(title: city_title)
  end
end
\ No newline at end of file
# Service wrapper around the Company model.
class CompanyService
  # Finds the company matching +company_attributes+ via
  # Company.find_or_create_by (creating it when absent) and returns its id.
  #
  # @param company_attributes [Hash] attributes passed to find_or_create_by
  # @return [Object] the id of the found or created company
  def check_exist_or_create_company(company_attributes)
    Company.find_or_create_by(company_attributes).id
  end
end
\ No newline at end of file
# Helpers for turning scraped text into values the Job model can store.
class CrawlerService
  # Upper bound used when the salary text gives no maximum.
  UNBOUNDED_MAX_SALARY = 999999999
  # Every numeric amount in the salary text is multiplied by this factor.
  AMOUNT_MULTIPLIER = 1_000_000

  # Converts a scraped salary string into a [min_salary, max_salary] pair.
  #
  # Recognised shapes:
  # * "Cạnh tranh"            -> [0, UNBOUNDED_MAX_SALARY]
  # * text containing "Dưới"  -> [0, amount]
  # * text containing "Trên"  -> [amount, UNBOUNDED_MAX_SALARY]
  # * "<low> - <high>"        -> [low, high]
  #
  # A comma in an amount is treated as the decimal separator in every
  # branch, including the range form.
  #
  # @param salary [String] salary text as scraped from the listing
  # @return [Array<Integer>] two-element array [min_salary, max_salary]
  # @raise [ArgumentError] when the text matches none of the shapes above
  def self.convert_salary(salary)
    text = salary.to_s
    return [0, UNBOUNDED_MAX_SALARY] if text == "Cạnh tranh"
    return [0, scaled_amount(text)] if text.include?("Dưới")
    return [scaled_amount(text), UNBOUNDED_MAX_SALARY] if text.include?("Trên")

    lower, upper = text.split("-")
    # Without both bounds there is nothing sensible to return; fail with a
    # message that names the offending text instead of a NoMethodError on nil.
    if lower.nil? || upper.nil?
      raise ArgumentError, "unrecognised salary format: #{salary.inspect}"
    end

    [scaled_amount(lower), scaled_amount(upper)]
  end

  # Strips the known labels/units from +text+, reads the remaining number
  # (comma as decimal separator) and scales it by AMOUNT_MULTIPLIER.
  def self.scaled_amount(text)
    number = text.gsub(/Dưới|Trên|Tr|VND|\$/, "").delete(" ").tr(",", ".").to_f
    # Round rather than truncate: 4.1 * 1_000_000 is 4099999.9999999995 as a
    # Float, which to_i would turn into 4099999.
    (number * AMOUNT_MULTIPLIER).round
  end
  private_class_method :scaled_amount
end
\ No newline at end of file
# Service wrapper around the Industry model.
class IndustryService
  # Hands the given collection straight to Industry.import.
  #
  # @param industries [Object] collection accepted by Industry.import
  # @return [Object] whatever Industry.import returns
  def import(industries)
    Industry.import(industries)
  end

  # Returns an industry whose title matches +industry_title+, creating one
  # when no match exists.
  #
  # The lookup uses SQL LIKE with the title bound as a parameter and no
  # wildcards added by this method.
  #
  # @param industry_title [String] title to look up or create
  # @return [Industry] the first matching record, or the result of
  #   Industry.create
  def check_exist_or_create_industry(industry_title)
    # Ask for the first match only instead of loading every matching row
    # just to index element 0.
    Industry.where("title LIKE ?", industry_title).first || Industry.create(title: industry_title)
  end
end
\ No newline at end of file
class JobService class JobService
def check_exist_or_create_job job_attributes def self.check_exist_or_create_job job_attributes
job = Job.find_or_create_by(job_attributes) job = Job.find_or_create_by(job_attributes)
return job return job
end end
......
...@@ -5,11 +5,8 @@ namespace :crawler do ...@@ -5,11 +5,8 @@ namespace :crawler do
desc "Crawler Careerbuilder" desc "Crawler Careerbuilder"
task job: :environment do task job: :environment do
# Define exception logger # Define crawler logger
exception_logger = Logger.new("log/exception_logger.log") logger = Logger.new("log/crawler_logger.log")
# Define skip logger
skip_url_logger = Logger.new("log/skip_url_logger.log")
# Loop page # Loop page
(1..2).each do |page| (1..2).each do |page|
...@@ -17,22 +14,14 @@ namespace :crawler do ...@@ -17,22 +14,14 @@ namespace :crawler do
html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item # Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item| html_jobs.css(".jobs-side-list .job-item").each do |item|
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text)))
if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
logger.warn "Another template #{item.css(".figure .figcaption .title .job_link @href").text}"
next
end
# Set salary, min-salary, max-salary # Set salary, min-salary, max-salary
salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","") salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
if salary == "Cạnh tranh" min_salary, max_salary = CrawlerService.convert_salary salary
min_salary = 0
max_salary = 999999999
elsif salary.include? "Dưới"
min_salary = 0
max_salary = (salary.gsub("Dưới ","").gsub(" Tr VND","").gsub(",",".").to_f*1000000).to_i
elsif salary.include? "Trên"
min_salary = (salary.gsub("Trên ","").gsub(" Tr VND","").gsub(",",".").to_f*1000000).to_i
max_salary = 999999999
else
range_salary = salary.split("-")
min_salary = (range_salary[0].gsub("$ ","").gsub(" Tr ","").to_f*1000000).to_i
max_salary = (range_salary[1].gsub(" Tr VND","").gsub(" ","").to_f*1000000).to_i
end
# Job attributes # Job attributes
job_attributes = { job_attributes = {
title: item.at_css(".figure .figcaption .title a @title").text, title: item.at_css(".figure .figcaption .title a @title").text,
...@@ -41,83 +30,75 @@ namespace :crawler do ...@@ -41,83 +30,75 @@ namespace :crawler do
min_salary: min_salary, min_salary: min_salary,
max_salary: max_salary max_salary: max_salary
} }
# Defind industry ids array html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele|
industries = [] type = ele.at_css("strong").text
case type
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text))) when "Hết hạn nộp"
if html_job_detail.at_css(".search-result-list-detail .container .no-gutters").present? job_attributes[:expiration_date] = ele.at_css("p").text.squish
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele| when "Cấp bậc"
type = ele.at_css("strong").text job_attributes[:level] = ele.at_css("p").text.squish
case type when "Kinh nghiệm"
when "Hết hạn nộp" job_attributes[:years_of_experience] = ele.at_css("p").text.squish
job_attributes[:expiration_date] = ele.at_css("p").text.squish
when "Cấp bậc"
job_attributes[:level] = ele.at_css("p").text.squish
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.at_css("p").text.squish
end
end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
if ele.at_css("h3").present?
type = ele.at_css("h3").text
case type
when "Phúc lợi "
job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
when "Mô tả Công việc"
job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
when "Yêu Cầu Công Việc"
job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
when "Thông tin khác"
job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
end
end
end end
if item.at_css(".figure .image a @href").text != "javascript:void(0);" end
# Company attributes html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text))) if ele.at_css("h3").present?
if html_company_detail.at_css(".jobsby-company").present? type = ele.at_css("h3").text
company_attributes = { case type
title: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .content .name").text, when "Phúc lợi "
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text, job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
logo: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .img @src").text, when "Mô tả Công việc"
description: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.squish job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
} when "Yêu Cầu Công Việc"
# Check exist or create company job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
job_attributes[:company_id] = CompanyService.new.check_exist_or_create_company company_attributes when "Thông tin khác"
job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
end end
end end
end
# Create job if item.at_css(".figure .image a @href").text != "javascript:void(0);"
job = JobService.new.check_exist_or_create_job job_attributes # Company attributes
# Defind cities array html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
cities = [] if html_company_detail.at_css(".jobsby-company").present?
item.css(".figure .figcaption .caption .location ul li").each do |city| company_attributes = {
city = city_service.check_exist_or_create_city city.text.squish title: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .content .name").text,
cities << city address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
end logo: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .img @src").text,
# Create city_job description: html_company_detail.at_css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.squish
if cities.length > 0 }
cities.each do |city| # Check exist or create company
job.cities << city job_attributes[:company_id] = Company.find_or_create_by(company_attributes).id
end
end end
# Create industry_job end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele| # Create job
industry = industry_service.check_exist_or_create_industry ele.text.gsub(",","").squish job = JobService.check_exist_or_create_job job_attributes
industries << industry if job.errors.full_messages.present?
logger.error "#{job.errors.full_messages.join(",")}"
next
end
# Defind cities array
cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
city = City.find_or_create_by({title: city.text.squish})
end
# Create city_job
if cities.length > 0
cities.each do |city|
job.cities << city
end end
end
if industries.length > 0 # Defind industries array
industries.each do |industry| industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |ele|
job.industries << industry industry = Industry.find_or_create_by({title: ele.text.gsub(",","").squish})
end end
# Create industry_job
if industries.length > 0
industries.each do |industry|
job.industries << industry
end end
else
skip_url_logger.info "another template #{item.at_css(".figure .figcaption .title .job_link @href").text}"
end end
logger.info "Crawl success url : #{item.css(".figure .figcaption .title .job_link @href").text}"
rescue Exception => e rescue Exception => e
exception_logger.info e logger.error e
skip_url_logger.info "another template #{item.at_css(".figure .figcaption .title .job_link @href").text}"
next next
end end
end end
...@@ -142,7 +123,7 @@ namespace :crawler do ...@@ -142,7 +123,7 @@ namespace :crawler do
end end
cities = cities_in_country + cities_foreign cities = cities_in_country + cities_foreign
if cities.length > 0 if cities.length > 0
city_service.import cities City.import cities
end end
end end
...@@ -156,15 +137,7 @@ namespace :crawler do ...@@ -156,15 +137,7 @@ namespace :crawler do
} }
end end
if industries.length > 0 if industries.length > 0
industry_service.import industries Industry.import industries
end end
end end
def city_service
@city_service ||= CityService.new
end
def industry_service
@industry_service ||= IndustryService.new
end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment