Commit 37105061 by nnnghia98

refactoring code

parent 6e007fe3
...@@ -7,6 +7,7 @@ class CrawlData ...@@ -7,6 +7,7 @@ class CrawlData
def crawl_web def crawl_web
page = Nokogiri::HTML.parse(open(Settings.crawl.base_url, ssl_verify_mode: nil)) page = Nokogiri::HTML.parse(open(Settings.crawl.base_url, ssl_verify_mode: nil))
total_job = page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f total_job = page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f
return if total_job == 0
total_page = (total_job / Settings.crawl.jobs_per_page).floor total_page = (total_job / Settings.crawl.jobs_per_page).floor
crawl_job_title_logger = ActiveSupport::Logger.new("log/crawl_data.log") crawl_job_title_logger = ActiveSupport::Logger.new("log/crawl_data.log")
crawl_job_title_logger.info "Crawl at #{Time.current}" crawl_job_title_logger.info "Crawl at #{Time.current}"
...@@ -18,31 +19,31 @@ class CrawlData ...@@ -18,31 +19,31 @@ class CrawlData
job_page = Nokogiri::HTML.parse(open(URI.encode(job_url))) job_page = Nokogiri::HTML.parse(open(URI.encode(job_url)))
# Job code job = JobHtml.new(job_page).parse_job
job_code = job_url.split("/").last.split(".")[-2]
# Company code next if job_page.css(".LeftJobCB").nil? || job[:workplace].blank?
company_code = job_url.split("/").last.split("-").last.split(".")[-2].strip
next if job_page.css(".LeftJobCB").nil? # Job code
job_code = job_url.split("/").last.split(".")[-2] || ""
job = JobHtml.new(job_page).parse_job # Company code
company_code = job_page.css(".viewmorejob a @href").present? ?
job_page.css(".viewmorejob a @href").text.split("/").last.split("-")[-2].strip : ""
crawl_job_title_logger.info "#{job[:title]}" crawl_job_title_logger.info "#{job[:title]}"
next if job[:workplace].nil?
job[:workplace].each do |city_name| job[:workplace].each do |city_name|
city_id = city_id(city_name) city_id = get_city(city_name).id
company_id = company_id(company_code, job[:company_name], job[:company_address], job[:company_description]) company_id = get_company(company_code, job[:company_name], job[:company_address], job[:company_description]).id
job_id = job_id(job_code, job[:title], job[:salary], job_id = get_job(job_code, job[:title], job[:salary],
job[:description], job[:requirement], job[:description], job[:requirement],
job[:level], job[:post_date], job[:level], job[:post_date],
job[:expiration_date], company_id) job[:expiration_date], company_id).id
CityJob.find_or_create_by!(job_id: job_id, city_id: city_id) CityJob.find_or_create_by!(job_id: job_id, city_id: city_id)
job[:industries].each do |job_industry| job[:industries].each do |job_industry|
industry_id = industry_id(job_industry.strip) job_industry = job_industry.strip
industry_id = get_industry(job_industry).id
IndustryJob.find_or_create_by!(industry_id: industry_id, job_id: job_id) IndustryJob.find_or_create_by!(industry_id: industry_id, job_id: job_id)
end end
end end
...@@ -50,28 +51,25 @@ class CrawlData ...@@ -50,28 +51,25 @@ class CrawlData
end end
end end
# Looks up the Company with the given code (building a new, unsaved one when
# none exists) and writes the crawled name, address and description to it.
#
# code        - company code used as the lookup key
# name        - company name taken from the job page
# address     - company address taken from the job page
# description - company description taken from the job page
#
# Returns the Company record itself so callers can read `.id` (or anything
# else) from it.
# NOTE(review): the result of `update` is not checked, so a record that could
# not be saved would still be returned - confirm this is intended.
def get_company(code, name, address, description)
  Company.find_or_initialize_by(code: code).tap do |company|
    company.update(name: name, address: address, description: description)
  end
end
# Finds the Industry with the given name, creating it when it does not exist.
#
# name - industry name (the visible caller strips it before passing it in)
#
# Returns the Industry record. Uses the bang variant, so a failed create
# raises instead of returning an unsaved record.
def get_industry(name)
  Industry.find_or_create_by!(name: name)
end
# Finds the City with the given name in region "Việt Nam", creating it when
# it does not exist.
#
# name - city name as scraped from the page; surrounding whitespace is
#        stripped before the lookup
#
# Returns the City record.
# NOTE(review): this uses the non-bang `find_or_create_by`, unlike
# get_industry - confirm whether a failed create should raise here too.
def get_city(name)
  City.find_or_create_by(name: name.strip, region: "Việt Nam")
end
def job_id(code = nil, title, salary, description, requirement, level, post_date, expiration_date, company_id) def get_job(code = nil, title, salary, description, requirement, level, post_date, expiration_date, company_id)
if expiration_date.nil? attrs = expiration_date.nil? ? {title: job_title, company_id: company_id} : {code: code}
job = Job.find_or_initialize_by(title: job_title, company_id: company_id) job = Job.find_or_initialize_by attrs
else
job = Job.find_or_initialize_by(code: code)
end
job.update(code: code, job.update(code: code,
title: title, title: title,
...@@ -82,6 +80,6 @@ class CrawlData ...@@ -82,6 +80,6 @@ class CrawlData
expiration_date: expiration_date, expiration_date: expiration_date,
level: level, level: level,
company_id: company_id) company_id: company_id)
job.id job
end end
end end
...@@ -6,61 +6,67 @@ class JobHtml ...@@ -6,61 +6,67 @@ class JobHtml
# Builds a hash describing the job found in the parsed page.
#
# The info list and the detail sections are each parsed exactly once and the
# resulting hashes are reused for every field, instead of re-running the
# parsers for each key.
#
# Returns a Hash with the keys :title, :salary, :level, :post_date,
# :description, :requirement, :expiration_date, :workplace, :industries,
# :company_name, :company_address and :company_description. Fields whose
# label was not present on the page are nil.
def parse_job
  info = get_job_info
  detail = get_job_detail

  { title: get_title,
    salary: info[:salary],
    level: info[:level],
    post_date: get_post_date,
    description: detail[:description],
    requirement: detail[:requirement],
    expiration_date: info[:expiration_date],
    workplace: info[:workplace],
    industries: info[:industries],
    company_name: get_company_name,
    company_address: get_company_address,
    company_description: get_company_description }
end
private
# Returns the job title: the text of the ".top-job-info h1" element(s) with
# surrounding whitespace removed.
def get_title
  @html_data.css(".top-job-info h1").text.strip
end
# Returns the text of the ".datepost span" element(s) exactly as it appears
# in the page (it is not stripped or parsed into a date here).
def get_post_date
  @html_data.css(".datepost span").text
end
# Extracts the labelled entries of the ".DetailJobNew li p" list.
#
# Each entry is expected to look like "<label>: <value>". The value is
# everything after the first ":" once CR/LF characters have been removed.
#
# Returns a Hash that contains only the keys whose label was found:
#   :workplace       - Array of comma-separated pieces (not stripped)
#   :salary          - stripped String
#   :level           - stripped String
#   :expiration_date - stripped String
#   :industries      - Array of comma-separated pieces (not stripped)
def get_job_info
  job_info = {}

  @html_data.css(".DetailJobNew li p").each do |node|
    info = node.text
    # Line breaks must really be removed here. The previous
    # gsub("/[\r\n]+/", "") passed a String, which gsub matches literally,
    # so it never removed anything.
    value = info.delete("\r\n").partition(":").last

    if info.include?("Nơi làm việc")
      job_info[:workplace] = value.split(",")
    elsif info.include?("Lương")
      job_info[:salary] = value.strip
    elsif info.include?("Cấp bậc")
      job_info[:level] = value.strip
    elsif info.include?("Hết hạn nộp")
      job_info[:expiration_date] = value.strip
    elsif info.include?("Ngành nghề")
      job_info[:industries] = value.split(",")
    end
  end

  job_info
end
# Extracts the description and requirement sections from the "div.MarBot20"
# blocks of the page.
#
# A block whose text contains the heading "Mô tả Công việc" supplies
# :description; otherwise a block containing "Yêu Cầu Công Việc" supplies
# :requirement. In both cases the value is the text that follows the first
# occurrence of the heading, unmodified.
#
# Returns a Hash containing only the keys whose heading was found.
def get_job_detail
  job_detail = {}

  # Plain iteration: the loop is run only for its effect on job_detail.
  @html_data.css("div.MarBot20").each do |node|
    detail = node.text
    if detail.include?("Mô tả Công việc")
      job_detail[:description] = detail.partition("Mô tả Công việc").last
    elsif detail.include?("Yêu Cầu Công Việc")
      job_detail[:requirement] = detail.partition("Yêu Cầu Công Việc").last
    end
  end

  job_detail
end
def get_company_name def get_company_name
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment