Commit 9e051c4d by Mai Hoang Thai Ha

fixed logic,...

parent 492fb257
...@@ -45,8 +45,6 @@ group :development do ...@@ -45,8 +45,6 @@ group :development do
gem 'listen', '~> 3.3' gem 'listen', '~> 3.3'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring' gem 'spring'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
gem 'rubocop-rails', '~> 2.11', '>= 2.11.3' gem 'rubocop-rails', '~> 2.11', '>= 2.11.3'
end end
...@@ -61,3 +59,5 @@ end ...@@ -61,3 +59,5 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem # Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
gem 'slim-rails', '~> 3.2' gem 'slim-rails', '~> 3.2'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
\ No newline at end of file
require 'open-uri' require 'open-uri'
namespace :crawler do namespace :crawler do
# Usage: rails crawler:jobs TYPE=TEST (crawl the first 5 pages) or TYPE=ALL (crawl every page); any other TYPE aborts
desc 'crawler from CareerBuilder' desc 'crawler from CareerBuilder'
task jobs: :environment do task jobs: :environment do
ARGV.each { |a| task a.to_sym { ; } } unless %w[ALL TEST].include?(ENV['TYPE'])
abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
total_pages = 0 end
if ARGV.length == 1 && ARGV[0] == 'TEST' total_pages = 5 # default = TEST
total_pages = 1 if ENV['TYPE'] == 'ALL'
elsif ARGV.length == 1 && ARGV[0] == 'ALL'
first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body) first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
jobs_per_page = first_page.css('div.job-item').count jobs_per_page = first_page.css('div.job-item').count
total_jobs = first_page.css('.search-result-list .job-found p').text.split(' ').first.gsub(',', '').to_i total_jobs = first_page.css('.search-result-list .job-found-amout p').text.tr('^0-9', '')
total_pages = (total_jobs.to_f / jobs_per_page).round total_pages = (total_jobs.to_f / jobs_per_page).round
else
exit
end end
(1..total_pages).each do |page| (1..total_pages).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body) parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
jobs_item = parsed_page.css('div.job-item .job_link') jobs_item = parsed_page.css('div.job-item .job_link')
jobs_item.each do |item| jobs_item.each do |item|
job_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/vi/tim-viec-lam/' + retries ||= 0
CGI.escape(item.attribute('href').text.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))).body) url ||= item.attribute('href').text
job_detail = job_page.css('section.job-detail-content') job_page = Nokogiri::HTML(HTTParty.get(url).body)
# title - company # Job
title = job_page.css('div.job-desc h1.title').text job_title = job_page.css('div.job-desc h1.title').text
company = job_page.css('div.job-desc a.job-company-name').text # update_at, job_industries, job_type, salary, experience, level, expiration_date
# info box detail_box_items = job_page.css('.job-detail-content .detail-box ul li')
info_box_item = job_detail.css('.detail-box ul li') # init
# city, update_at, industry, type, salary, experience, level, expiration_date
job_industries = []
update_at, job_type, salary, experience, level, expiration_date = '' update_at, job_type, salary, experience, level, expiration_date = ''
job_cities = [] industries = []
job_detail.css('.detail-box .map p a').each do |part| detail_box_items.each do |info_item|
city = part.text
job_cities << city
end
info_box_item.each do |info_item|
info = info_item.text info = info_item.text
if info.include?(key = 'Ngày cập nhật') # case/when
update_at = info.squish.remove(key).strip case
elsif info.include?(key = 'Ngành nghề') when info.include?(key = 'Ngày cập nhật')
job_industries = info.squish.remove(key).strip.split(' , ') update_at = info.squish.remove(key).strip.to_time
elsif info.include?(key = 'Hình thức') when info.include?(key = 'Ngành nghề')
industries = info.squish.remove(key).strip.split(' , ')
when info.include?(key = 'Hình thức')
job_type = info.squish.remove(key).strip job_type = info.squish.remove(key).strip
elsif info.include?(key = 'Lương') when info.include?(key = 'Lương')
salary = info.squish.remove(key).strip salary = info.squish.remove(key).strip
elsif info.include?(key = 'Kinh nghiệm') when info.include?(key = 'Kinh nghiệm')
experience = info.squish.remove(key).strip experience = info.squish.remove(key).strip
elsif info.include?(key = 'Cấp bậc') when info.include?(key = 'Cấp bậc')
level = info.squish.remove(key).strip level = info.squish.remove(key).strip
elsif info.include?(key = 'Hết hạn nộp') when info.include?(key = 'Hết hạn nộp')
expiration_date = info.squish.remove(key).strip expiration_date = info.squish.remove(key).strip.to_time
end end
end end
# benefits, description, requirement, other_info
# benefit job_detail_rows = job_page.css('section.job-detail-content div.detail-row')
benefit_list = [] benefits, description, requirement, other_info = []
other_info_list = [] job_detail_rows.each do |detail_row|
benefits = job_detail.css('ul.welfare-list li') detail_title = detail_row.css('.detail-title').text.strip
benefits.each do |part| case detail_title
benefit = part.text.strip when 'Phúc lợi'
benefit_list << benefit benefits = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
end when 'Mô tả Công việc'
# description, requirement description = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
description, requirement = '' when 'Yêu Cầu Công Việc'
job_detail_row = job_detail.css('div.detail-row') requirement = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
job_detail_row.each do |part| when 'Thông tin khác'
job_detail_text = part.text other_info = detail_row.css(':not(h3.detail-title)').map(&:text).map(&:squish)[1..-1].reject(&:blank?).join('---')
if job_detail_text.include?('Mô tả Công việc') end
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip end
elsif job_detail_text.include?('Yêu Cầu Công Việc') # Company
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip company_name = job_page.css('div.job-desc a.job-company-name').text
end # Cities
end cities = job_page.css('.job-detail-content .detail-box .map p a').map(&:text)
# other info company_object = Company.find_or_create_by(name: company_name)
other_info = job_detail.css('div.content_fck ul li') job_object = Job.create({ title: job_title,
other_info.each do |part|
info = part.text.squish.strip
other_info_list << info
end
company = Company.find_or_create_by(name: company)
job = Job.find_or_create_by(
title: title,
job_type: job_type, job_type: job_type,
salary: salary, salary: salary,
position: level,
experience: experience, experience: experience,
position: level,
expiration_date: expiration_date, expiration_date: expiration_date,
benefit: benefit_list.each { |benefit| },
description: description, description: description,
benefit: benefits,
requirement: requirement, requirement: requirement,
other_info: other_info_list.each { |info| } other_info: other_info,
) company_id: company_object.id,
company.jobs << job created_at: update_at,
job_industries.each do |industry| updated_at: update_at })
industry_id = Industry.find_or_create_by(name: industry) industries.map do |industry|
job.industries << industry_id industry_objects = Industry.find_or_create_by(name: industry)
job_object.industries << industry_objects
end end
job_cities.each do |city| cities.map do |city|
city_id = City.find_or_create_by(name: city) city_objects = City.find_or_create_by(name: city)
job.cities << city_id job_object.cities << city_objects
end end
rescue URI::InvalidURIError => e
puts "[Error] #{e.message}"
encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
retry if (retries += 1) < 2
rescue StandardError => e
puts e.message
puts e.backtrace.inspect
end end
end end
end end
...@@ -113,14 +106,9 @@ namespace :crawler do ...@@ -113,14 +106,9 @@ namespace :crawler do
task industries: :environment do task industries: :environment do
parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a') list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')
industry_list = []
list_job.each do |part| list_job.each do |part|
industry = part.text.squish.strip industry = part.text.squish.strip
industry_list << industry Industry.find_or_create_by(name: industry)
end
industry_list.each do |industry|
Industry.create(name: industry)
end end
end end
...@@ -128,22 +116,17 @@ namespace :crawler do ...@@ -128,22 +116,17 @@ namespace :crawler do
task cities: :environment do task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li') list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = []
list_location.each do |part| list_location.each do |part|
city_item = part.text city_name = part.text
region = 1 region = 1
if city_item.include?(key = 'Việc làm tại') if city_name.include?(key = 'Việc làm tại')
city_item = city_item.remove(key).strip city_name = city_name.remove(key).strip
region = 0 region = 0
end end
city = { city = {
name: city_item, name: city_name,
region: region region: region
} }
city_list << city
end
city_list.each do |city|
City.create( City.create(
name: city[:name], name: city[:name],
region: city[:region] region: city[:region]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment