Commit c657a9de by Mai Hoang Thai Ha

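Fix some bugs: crawl job listing pages 1-3, URL-escape job detail links, and iterate node sets directly instead of by index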

parent ea11d8e6
...@@ -6,25 +6,25 @@ namespace :crawler do ...@@ -6,25 +6,25 @@ namespace :crawler do
desc 'importjob' desc 'importjob'
task jobs: :environment do task jobs: :environment do
parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body) (1..3).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
jobs_item = parsed_page.css('div.job-item .job_link') jobs_item = parsed_page.css('div.job-item .job_link')
(0..jobs_item.length - 1).each do |item|
job_link = jobs_item[item].attribute('href').text jobs_item.each do |item|
job_page = Nokogiri::HTML(HTTParty.get(job_link).body) job_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/vi/tim-viec-lam/' +
job_desc = job_page.css('div.job-desc') CGI.escape(item.attribute('href').text.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))).body)
job_detail = job_page.css('section.job-detail-content') job_detail = job_page.css('section.job-detail-content')
# title - company # title - company
title = job_desc.css('h1.title').text title = job_page.css('div.job-desc h1.title').text
company = job_desc.css('a.job-company-name').text company = job_page.css('div.job-desc a.job-company-name').text
# info box # info box
info_box = job_detail.css('div.detail-box') info_box_item = job_detail.css('.detail-box ul li')
info_box_item = info_box.css('ul li')
city_box = info_box.css('div.map a')
# city, update_at, industry, type, salary, experience, level, expiration_date # city, update_at, industry, type, salary, experience, level, expiration_date
update_at, industry, type, salary, experience, level, expiration_date = '' update_at, industry, type, salary, experience, level, expiration_date = ''
city = city_box.text city = job_detail.css('.detail-box .map a').text
(0..info_box_item.length - 1).each do |part|
info = info_box_item[part].text info_box_item.each do |info_item|
info = info_item.text
if info.include?(key = 'Ngày cập nhật') if info.include?(key = 'Ngày cập nhật')
update_at = info.squish.remove(key).strip update_at = info.squish.remove(key).strip
elsif info.include?(key = 'Ngành nghề') elsif info.include?(key = 'Ngành nghề')
...@@ -41,29 +41,31 @@ namespace :crawler do ...@@ -41,29 +41,31 @@ namespace :crawler do
expiration_date = info.squish.remove(key).strip expiration_date = info.squish.remove(key).strip
end end
end end
# benefit # benefit
job_detail_row = job_detail.css('div.detail-row')
benefit_list = [] benefit_list = []
other_info_list = [] other_info_list = []
benefits = job_detail.css('ul.welfare-list li') benefits = job_detail.css('ul.welfare-list li')
(0..benefits.length - 1).each do |part| benefits.each do |part|
benefit = benefits[part].text.strip benefit = part.text.strip
benefit_list << benefit benefit_list << benefit
end end
# description, requirement # description, requirement
description, requirement = '' description, requirement = ''
(0..job_detail_row.length - 1).each do |part| job_detail_row = job_detail.css('div.detail-row')
job_detail_text = job_detail_row[part].text job_detail_row.each do |part|
job_detail_text = part.text
if job_detail_text.include?('Mô tả Công việc') if job_detail_text.include?('Mô tả Công việc')
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip description = job_detail_text.partition('Mô tả Công việc').last.squish.strip
elsif job_detail_text.include?('Yêu Cầu Công Việc') elsif job_detail_text.include?('Yêu Cầu Công Việc')
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip
end end
end end
# benefit # other info
other_info = job_detail.css('div.content_fck ul li') other_info = job_detail.css('div.content_fck ul li')
(0..other_info.length - 1).each do |part|
info = other_info[part].text.squish.strip other_info.each do |part|
info = part.text.squish.strip
other_info_list << info other_info_list << info
end end
job = { job = {
...@@ -86,14 +88,16 @@ namespace :crawler do ...@@ -86,14 +88,16 @@ namespace :crawler do
puts job puts job
end end
end end
end
desc 'crawler industry form CareerBuilder' desc 'crawler industry form CareerBuilder'
task industries: :environment do task industries: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a') list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')
industry_list = [] industry_list = []
(0..list_job.length - 1).each do |part|
industry = list_job[part].text.squish.strip list_job.each do |part|
industry = part.text.squish.strip
industry_list << industry industry_list << industry
end end
p industry_list p industry_list
...@@ -104,8 +108,9 @@ namespace :crawler do ...@@ -104,8 +108,9 @@ namespace :crawler do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li') list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = [] city_list = []
(0..list_location.length - 1).each do |part|
city_item = list_location[part].text list_location.each do |part|
city_item = part.text
region = 1 region = 1
if city_item.include?(key = 'Việc làm tại') if city_item.include?(key = 'Việc làm tại')
city_item = city_item.remove(key).strip city_item = city_item.remove(key).strip
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment