Commit e16f8904 by Mai Hoang Thai Ha

Change description and requirement from arrays to strings; crawl the job listings on the first page of CareerBuilder

parent 31274088
# Web crawler
require 'open-uri' require 'open-uri'
require 'csv' require 'csv'
require 'zip' require 'zip'
...@@ -6,90 +5,86 @@ require 'zip' ...@@ -6,90 +5,86 @@ require 'zip'
namespace :job do namespace :job do
desc 'importjob' desc 'importjob'
task web_job_import: :environment do task web_job_crawler: :environment do
url = 'https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-thiet-ke-thoi-trang.35B6D3AD.html' parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
unparsed_page = HTTParty.get(url) job_item = parsed_page.css('div.job-item')
parsed_page ||= Nokogiri::HTML(unparsed_page.body) (0..job_item.count - 1).each do |item|
job_desc = parsed_page.css('div.job-desc') job_link = job_item[item].css('div.title a').attribute('href').text
job_detail = parsed_page.css('section.job-detail-content') unparsed_job_link = HTTParty.get(job_link)
# title parsed_job_link ||= Nokogiri::HTML(unparsed_job_link.body)
title = job_desc.css('h1.title').text job_desc = parsed_job_link.css('div.job-desc')
company = job_desc.css('a.job-company-name').text job_detail = parsed_job_link.css('section.job-detail-content')
# info box # title - company
info_box = job_detail.css('div.detail-box') title = job_desc.css('h1.title').text
info_box_item = info_box.css('ul li') company = job_desc.css('a.job-company-name').text
update_at, industry, type, salary, experience, level, expiration_date = '' # info box
city = info_box.first.text.squish.remove('Địa điểm').strip info_box = job_detail.css('div.detail-box')
(0..info_box_item.count - 1).each do |part| info_box_item = info_box.css('ul li')
info = info_box_item[part].text city_box = info_box.css('div.map a')
if info.include?(key = 'Ngày cập nhật') # city, update_at, industry, type, salary, experience, level, expiration_date
update_at = info.squish.remove(key).strip update_at, industry, type, salary, experience, level, expiration_date = ''
elsif info.include?(key = 'Ngành nghề') city = city_box.text
industry = info.squish.remove(key).strip (0..info_box_item.count - 1).each do |part|
elsif info.include?(key = 'Hình thức') info = info_box_item[part].text
type = info.squish.remove(key).strip if info.include?(key = 'Ngày cập nhật')
elsif info.include?(key = 'Lương') update_at = info.squish.remove(key).strip
salary = info.squish.remove(key).strip elsif info.include?(key = 'Ngành nghề')
elsif info.include?(key = 'Kinh nghiệm') industry = info.squish.remove(key).strip
experience = info.squish.remove(key).strip elsif info.include?(key = 'Hình thức')
elsif info.include?(key = 'Cấp bậc') type = info.squish.remove(key).strip
level = info.squish.remove(key).strip elsif info.include?(key = 'Lương')
elsif info.include?(key = 'Hết hạn nộp') salary = info.squish.remove(key).strip
expiration_date = info.squish.remove(key).strip elsif info.include?(key = 'Kinh nghiệm')
end experience = info.squish.remove(key).strip
end elsif info.include?(key = 'Cấp bậc')
# benefit level = info.squish.remove(key).strip
job_detail_row = job_detail.css('div.detail-row') elsif info.include?(key = 'Hết hạn nộp')
benefit_list = [] expiration_date = info.squish.remove(key).strip
description_list = []
requirement_list = []
other_info_list = []
benefits = job_detail.css('ul.welfare-list li')
(0..benefits.count - 1).each do |part|
benefit = benefits[part].text.strip
benefit_list << benefit
end
# description - requirment
(0..job_detail_row.count - 1).each do |part|
job_detail_text = job_detail_row[part].text
if job_detail_text.include?('Mô tả Công việc')
descriptions = job_detail_row.css('p')
(0..descriptions.count - 1).each do |desc|
description = descriptions[desc].text.strip
description_list << description
end end
elsif job_detail_text.include?('Yêu Cầu Công Việc') end
requirements = job_detail_row.css('p') # benefit
(0..requirements.count - 1).each do |req| job_detail_row = job_detail.css('div.detail-row')
requirement = requirements[req].text.strip benefit_list = []
requirement_list << requirement other_info_list = []
benefits = job_detail.css('ul.welfare-list li')
(0..benefits.count - 1).each do |part|
benefit = benefits[part].text.strip
benefit_list << benefit
end
# description, requirement
description, requirement = ''
(0..job_detail_row.count - 1).each do |part|
job_detail_text = job_detail_row[part].text
if job_detail_text.include?('Mô tả Công việc')
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip
elsif job_detail_text.include?('Yêu Cầu Công Việc')
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip
end end
end end
# benefit
other_info = job_detail.css('div.content_fck ul li')
(0..other_info.count - 1).each do |part|
info = other_info[part].text.squish.strip
other_info_list << info
end
job = {
title: title,
company: company,
city: city,
update_at: update_at,
industry: industry,
type: type,
salary: salary,
experences: experience,
level: level,
# position: position,
expiration_date: expiration_date,
benefit: benefit_list,
description: description,
requirement: requirement,
other_info: other_info_list
}
puts job
end end
# other info
other_info = job_detail.css('div.content_fck ul li')
(0..other_info.count - 1).each do |part|
info = other_info[part].text.squish.strip
other_info_list << info
end
job = {
title: title,
company: company,
city: city,
update_at: update_at,
industry: industry,
type: type,
salary: salary,
experences: experience,
level: level,
# position: position,
expiration_date: expiration_date,
benefit: benefit_list,
description: description_list,
requirement: requirement_list,
other_info: other_info_list
}
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment