Commit 9e051c4d by Mai Hoang Thai Ha

fixed logic,...

parent 492fb257
......@@ -45,8 +45,6 @@ group :development do
gem 'listen', '~> 3.3'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
gem 'rubocop-rails', '~> 2.11', '>= 2.11.3'
end
......@@ -61,3 +59,5 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
gem 'slim-rails', '~> 3.2'
# HTML parser and HTTP client; the crawler rake tasks call Nokogiri::HTML and HTTParty.get.
# NOTE(review): these two lines also appear inside the development group in this diff listing;
# presumably they were moved here so they load outside development too - confirm.
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
\ No newline at end of file
require 'open-uri'
namespace :crawler do
# command: rails crawler:jobs TYPE=TEST / ALL
# Crawls CareerBuilder listing pages, opens each job's detail page, and stores
# Company, Job, Industry and City records.
# NOTE(review): this listing looks like a rendered diff whose +/- markers were stripped, so
# removed and added lines appear interleaved (e.g. elsif and when branches for the same key,
# two Company/Job creation calls). Read the comments below as a guide to each fragment;
# the text is not runnable exactly as shown.
desc 'crawler from CareerBuilder'
task jobs: :environment do
# Calls task with a symbol for every command-line argument; the { ; } braces bind to
# a.to_sym rather than to task. Presumably meant to stop rake treating the arguments
# as task names - confirm.
ARGV.each { |a| task a.to_sym { ; } }
total_pages = 0
# Two page-count selections are interleaved here: one driven by ARGV[0], one by ENV['TYPE'].
if ARGV.length == 1 && ARGV[0] == 'TEST'
total_pages = 1
elsif ARGV.length == 1 && ARGV[0] == 'ALL'
# Stops the task with a usage message unless TYPE is exactly ALL or TEST.
unless %w[ALL TEST].include?(ENV['TYPE'])
abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
end
total_pages = 5 # default = TEST
if ENV['TYPE'] == 'ALL'
# For a full crawl, derive the page count from the first listing page:
# jobs shown per page and the total job count printed on the page.
first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
jobs_per_page = first_page.css('div.job-item').count
# total_jobs is assigned twice (two selector variants); the second keeps only the digit
# characters and is a String, which the next line converts with to_f.
total_jobs = first_page.css('.search-result-list .job-found p').text.split(' ').first.gsub(',', '').to_i
total_jobs = first_page.css('.search-result-list .job-found-amout p').text.tr('^0-9', '')
# NOTE(review): round picks the nearest integer, so a final page that is less than half
# full would not be counted - confirm whether ceil was intended.
total_pages = (total_jobs.to_f / jobs_per_page).round
else
exit
end
# Walk every listing page, then every job link found on that page.
(1..total_pages).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
jobs_item = parsed_page.css('div.job-item .job_link')
jobs_item.each do |item|
# Variant that always CGI-escapes the part of the href after the fixed URL prefix.
job_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/vi/tim-viec-lam/' +
CGI.escape(item.attribute('href').text.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))).body)
job_detail = job_page.css('section.job-detail-content')
# title - company
title = job_page.css('div.job-desc h1.title').text
company = job_page.css('div.job-desc a.job-company-name').text
# info box
info_box_item = job_detail.css('.detail-box ul li')
# city, update_at, industry, type, salary, experience, level, expiration_date
job_industries = []
# ||= leaves retries and url untouched when the body is re-run by the retry below, so the
# counter and the re-encoded url set in the rescue clause are kept.
retries ||= 0
url ||= item.attribute('href').text
job_page = Nokogiri::HTML(HTTParty.get(url).body)
# Job
job_title = job_page.css('div.job-desc h1.title').text
# update_at, job_industries, job_type, salary, experience, level, expiration_date
detail_box_items = job_page.css('.job-detail-content .detail-box ul li')
# init
# NOTE(review): with a single right-hand value only update_at receives ''; the remaining
# variables in this multiple assignment are set to nil.
update_at, job_type, salary, experience, level, expiration_date = ''
job_cities = []
job_detail.css('.detail-box .map p a').each do |part|
city = part.text
job_cities << city
end
# Each detail-box item is matched by the label text it contains; the label is removed and
# the remainder is kept as the value. The if/elsif and case/when forms are interleaved.
info_box_item.each do |info_item|
industries = []
detail_box_items.each do |info_item|
info = info_item.text
if info.include?(key = 'Ngày cập nhật')
update_at = info.squish.remove(key).strip
elsif info.include?(key = 'Ngành nghề')
job_industries = info.squish.remove(key).strip.split(' , ')
elsif info.include?(key = 'Hình thức')
# case/when
case
when info.include?(key = 'Ngày cập nhật')
update_at = info.squish.remove(key).strip.to_time
when info.include?(key = 'Ngành nghề')
industries = info.squish.remove(key).strip.split(' , ')
when info.include?(key = 'Hình thức')
job_type = info.squish.remove(key).strip
elsif info.include?(key = 'Lương')
when info.include?(key = 'Lương')
salary = info.squish.remove(key).strip
elsif info.include?(key = 'Kinh nghiệm')
when info.include?(key = 'Kinh nghiệm')
experience = info.squish.remove(key).strip
elsif info.include?(key = 'Cấp bậc')
when info.include?(key = 'Cấp bậc')
level = info.squish.remove(key).strip
elsif info.include?(key = 'Hết hạn nộp')
expiration_date = info.squish.remove(key).strip
end
end
# benefit
benefit_list = []
other_info_list = []
benefits = job_detail.css('ul.welfare-list li')
benefits.each do |part|
benefit = part.text.strip
benefit_list << benefit
end
# description, requirement
# NOTE(review): only description receives ''; requirement is set to nil by this assignment.
description, requirement = ''
job_detail_row = job_detail.css('div.detail-row')
job_detail_row.each do |part|
job_detail_text = part.text
# Keeps the text that follows the section heading inside the row.
if job_detail_text.include?('Mô tả Công việc')
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip
elsif job_detail_text.include?('Yêu Cầu Công Việc')
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip
end
end
# other info
other_info = job_detail.css('div.content_fck ul li')
other_info.each do |part|
info = part.text.squish.strip
other_info_list << info
end
company = Company.find_or_create_by(name: company)
job = Job.find_or_create_by(
title: title,
when info.include?(key = 'Hết hạn nộp')
expiration_date = info.squish.remove(key).strip.to_time
end
end
# benefits, description, requirement, other_info
job_detail_rows = job_page.css('section.job-detail-content div.detail-row')
# NOTE(review): destructuring an empty array sets all four variables to nil.
benefits, description, requirement, other_info = []
# Rows are dispatched on their .detail-title text. Each branch collects the squished text of
# the nodes that are not the h3 title, drops the first collected entry ([1..-1]), discards
# blank entries and joins the rest with '---'.
job_detail_rows.each do |detail_row|
detail_title = detail_row.css('.detail-title').text.strip
case detail_title
when 'Phúc lợi'
benefits = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
when 'Mô tả Công việc'
description = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
when 'Yêu Cầu Công Việc'
requirement = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
when 'Thông tin khác'
other_info = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
end
end
# Company
company_name = job_page.css('div.job-desc a.job-company-name').text
# Cities
cities = job_page.css('.job-detail-content .detail-box .map p a').map(&:text)
# Persist the company, then the job; the Job.create variant sets created_at and updated_at
# from the scraped update_at value. Two argument lists are interleaved below.
company_object = Company.find_or_create_by(name: company_name)
job_object = Job.create({ title: job_title,
job_type: job_type,
salary: salary,
position: level,
experience: experience,
position: level,
expiration_date: expiration_date,
benefit: benefit_list.each { |benefit| },
description: description,
benefit: benefits,
requirement: requirement,
other_info: other_info_list.each { |info| }
)
company.jobs << job
# Industries and cities are looked up or created by name and appended to the job's
# associations; both the each and the map variants of these loops are shown.
job_industries.each do |industry|
industry_id = Industry.find_or_create_by(name: industry)
job.industries << industry_id
other_info: other_info,
company_id: company_object.id,
created_at: update_at,
updated_at: update_at })
industries.map do |industry|
industry_objects = Industry.find_or_create_by(name: industry)
job_object.industries << industry_objects
end
job_cities.each do |city|
city_id = City.find_or_create_by(name: city)
job.cities << city_id
cities.map do |city|
city_objects = City.find_or_create_by(name: city)
job_object.cities << city_objects
end
# On an invalid URI, rebuild the url with the part after the fixed prefix CGI-escaped and
# re-run the body; the counter check allows a single retry per job.
rescue URI::InvalidURIError => e
puts "[Error] #{e.message}"
encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
retry if (retries += 1) < 2
# Any other StandardError is printed with its backtrace and not re-raised.
rescue StandardError => e
puts e.message
puts e.backtrace.inspect
end
end
end
......@@ -113,14 +106,9 @@ namespace :crawler do
# Fetches the CareerBuilder job-search page, reads the text of each link in the
# working-positions list, and stores every name as an Industry record.
# NOTE(review): this hunk looks like a diff rendering without +/- markers, so the older
# collect-then-Industry.create loop and the newer Industry.find_or_create_by call appear
# together; as shown, both calls would run for every name.
task industries: :environment do
parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')
industry_list = []
list_job.each do |part|
# squish collapses runs of whitespace inside the link text before it is used as the name.
industry = part.text.squish.strip
industry_list << industry
end
industry_list.each do |industry|
Industry.create(name: industry)
Industry.find_or_create_by(name: industry)
end
end
......@@ -128,22 +116,17 @@ namespace :crawler do
task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = []
list_location.each do |part|
city_item = part.text
city_name = part.text
region = 1
if city_item.include?(key = 'Việc làm tại')
city_item = city_item.remove(key).strip
if city_name.include?(key = 'Việc làm tại')
city_name = city_name.remove(key).strip
region = 0
end
city = {
name: city_item,
name: city_name,
region: region
}
city_list << city
end
city_list.each do |city|
City.create(
name: city[:name],
region: city[:region]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment