Commit cb600a23 by nnnghia98

refactoring job crawler code

parent 2d0503f4
......@@ -5,89 +5,43 @@ require "openssl"
# FIXME(security): this reassigns the VERIFY_PEER constant to the value of
# VERIFY_NONE, so any code in this process that asks for VERIFY_PEER receives
# VERIFY_NONE instead, i.e. TLS peer certificates are no longer verified.
# Ruby also reports an "already initialized constant" warning for the reassignment.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
class CrawlData
def crawl_web
page = Nokogiri::HTML.parse(open(Settings.crawl.base_url))
page = Nokogiri::HTML.parse(open(Settings.crawl.base_url, ssl_verify_mode: nil))
total_job = page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f
total_page = (total_job / 50).floor
fixed_total_page = 20
total_page = (total_job / Settings.crawl.jobs_per_page).floor
crawl_job_title_logger = ActiveSupport::Logger.new("log/crawl_data.log")
crawl_job_title_logger.info "Crawl at #{Time.current}"
(1..fixed_total_page).each do |each_page|
(1..Settings.crawl.fixed_total_page).each do |each_page|
page = Nokogiri::HTML.parse(open(URI.encode("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{each_page}-vi.html")))
(0..49).each do |j|
job_url = page.css(".jobtitle h3 a @href")[j].text
job_page = Nokogiri::HTML.parse(open(URI.encode(job_url)))
# Job code
job_code = job_url.split("/").last.split(".")[-2]
next if job_page.css(".LeftJobCB").nil?
# Job title
job_title = job_page.css(".top-job-info h1").text.strip
crawl_job_title_logger.info "#{job_title}"
# Job post date
job_post_date = job_page.css(".datepost span").text
job_salary, job_position, job_expiration_date, job_industries, job_level = ""
job_workplace = []
detail_job_new = job_page.css(".DetailJobNew li p")
(0..detail_job_new.count - 1).each do |detail_part|
detail = detail_job_new[detail_part].text
if detail.include?("Nơi làm việc")
job_workplace = detail.gsub("/[\r\n]+/", "").partition(":").last.split(",")
elsif detail.include?("Lương")
job_salary = detail.gsub("/[\r\n]+/", "").partition(":").last.strip
elsif detail.include?("Cấp bậc")
job_level = detail.gsub("/[\r\n]+/", "").partition(":").last.strip
elsif detail.include?("Hết hạn nộp")
job_expiration_date = detail.gsub("/[\r\n]+/", "").partition(":").last.strip
elsif detail.include?("Ngành nghề")
job_industries = detail.gsub("/[\r\n]+/", "").partition(":").last.split(",")
end
end
# Company code
company_code = job_url.split("/").last.split("-").last.split(".")[-2].strip
job_description, job_requirement = ""
job_container_detail = job_page.css("div.MarBot20")
next if job_page.css(".LeftJobCB").nil?
(0..job_container_detail.count - 1).each do |detail_part|
detail = job_container_detail[detail_part].text
if detail.include?("Mô tả Công việc")
job_description = detail.partition("Mô tả Công việc").last
elsif detail.include?("Yêu Cầu Công Việc")
job_requirement = detail.partition("Yêu Cầu Công Việc").last
end
end
job = JobHtml.new(job_page).parse_job
company_name, company_email, company_address, company_desc, company_code = ""
# Company full name
unless job_page.css(".tit_company").nil?
company_name = job_page.css("div.tit_company").text.strip
end
# Company code
company_code = job_url.split("/").last.split("-").last.split(".")[-2].strip
crawl_job_title_logger.info "#{job[:title]}"
# Company address
unless job_page.css(".TitleDetailNew label")[0].nil?
company_address = job_page.css("p.TitleDetailNew label")[0].text.strip
end
# Company description
company_desc = job_page.css("#emp_more p").text.strip
next if job[:workplace].nil?
job_workplace.each do |city_name|
job[:workplace].each do |city_name|
city_id = city_id(city_name)
company_id = company_id(company_code, company_name, company_address, company_desc)
job_id = job_id(job_code, job_title, job_salary,
job_description, job_requirement,
job_level, job_post_date,
job_expiration_date, company_id)
company_id = company_id(company_code, job[:company_name], job[:company_address], job[:company_description])
job_id = job_id(job_code, job[:title], job[:salary],
job[:description], job[:requirement],
job[:level], job[:post_date],
job[:expiration_date], company_id)
CityJob.find_or_create_by!(job_id: job_id, city_id: city_id)
job_industries.each do |job_industry|
job[:industries].each do |job_industry|
industry_id = industry_id(job_industry.strip)
IndustryJob.find_or_create_by!(industry_id: industry_id, job_id: job_id)
end
......
# Extracts job and company fields from one parsed job-detail page.
#
# The wrapped document only has to respond to +css(selector)+ and return a
# collection that supports +each+, +first+ and +text+, whose items respond to
# +text+ (presumably a Nokogiri document -- confirm against the caller).
class JobHtml
  # Runs of CR/LF characters that are stripped from "label: value" rows.
  LINE_BREAKS = /[\r\n]+/.freeze

  # @param html_data [#css] parsed job page; the +{}+ default is kept for
  #   backward compatibility only -- every reader method calls +css+ on it.
  def initialize(html_data = {})
    @html_data = html_data
  end

  # Parses the whole page.
  #
  # @return [Hash] with keys :title, :salary, :level, :post_date,
  #   :description, :requirement, :expiration_date, :workplace, :industries,
  #   :company_name, :company_address and :company_description.
  #   :workplace and :industries are arrays of comma-separated parts
  #   (not stripped). Fields whose row is missing from the page are nil,
  #   except the title, post date and company_* values, which fall back to "".
  def parse_job
    get_job_info
    get_job_detail
    {
      title: get_title,
      salary: @job_salary,
      level: @job_level,
      post_date: get_post_date,
      description: @job_description,
      requirement: @job_requirement,
      expiration_date: @job_expiration_date,
      workplace: @job_workplace,
      industries: @job_industries,
      company_name: get_company_name,
      company_address: get_company_address,
      company_description: get_company_description
    }
  end

  # @return [String] stripped text of the job heading
  def get_title
    @html_data.css(".top-job-info h1").text.strip
  end

  # @return [String] raw text of the post-date element
  def get_post_date
    @html_data.css(".datepost span").text
  end

  # Reads the "label: value" rows of the summary list and stores the value
  # of every recognised label in the matching instance variable.
  def get_job_info
    @html_data.css(".DetailJobNew li p").each do |row|
      info = row.text
      # A Regexp is required here: a String pattern is matched literally by
      # gsub and would never remove the line breaks.
      value = info.gsub(LINE_BREAKS, "").partition(":").last

      if info.include?("Nơi làm việc")
        @job_workplace = value.split(",")
      elsif info.include?("Lương")
        @job_salary = value.strip
      elsif info.include?("Cấp bậc")
        @job_level = value.strip
      elsif info.include?("Hết hạn nộp")
        @job_expiration_date = value.strip
      elsif info.include?("Ngành nghề")
        @job_industries = value.split(",")
      end
    end
  end

  # Stores the text that follows the description / requirement headings.
  def get_job_detail
    @html_data.css("div.MarBot20").each do |section|
      detail = section.text

      if detail.include?("Mô tả Công việc")
        @job_description = detail.partition("Mô tả Công việc").last
      elsif detail.include?("Yêu Cầu Công Việc")
        @job_requirement = detail.partition("Yêu Cầu Công Việc").last
      end
    end
  end

  # @return [String] stripped company name, "" when the element is missing
  def get_company_name
    # An empty result set already yields "" from +text+, so no separate
    # presence check is needed.
    @html_data.css("div.tit_company").text.strip
  end

  # @return [String] stripped company description
  def get_company_description
    @html_data.css("#emp_more p").text.strip
  end

  # @return [String] stripped text of the first address label, "" when none
  def get_company_address
    # Check and read the same selector so a missing label can never be
    # dereferenced as nil.
    label = @html_data.css("p.TitleDetailNew label").first
    label ? label.text.strip : ""
  end
end
......@@ -20,3 +20,5 @@ solr:
crawl:
base_url: "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
jobs_per_page: 50
fixed_total_page: 20
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment