Commit 4aa82d16 by Mai Hoang Thai Ha

Merge branch 'master' into 'Task/10_create_job_list_page_ID4'

# Conflicts:
#   Gemfile
#   Gemfile.lock
#   app/models/city.rb
#   db/schema.rb
#   lib/tasks/web_crawler.rake
parents 98d6bf79 d71790ab
...@@ -31,8 +31,9 @@ gem 'bootsnap', '>= 1.4.4', require: false ...@@ -31,8 +31,9 @@ gem 'bootsnap', '>= 1.4.4', require: false
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call 'byebug' anywhere in the code to stop execution and get a debugger console
gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
gem 'pry-rails', '~> 0.3.9' gem 'pry', '~> 0.14.1'
# gem 'pry-nav', '~> 0.3.0' # gem 'pry-nav'
gem 'pry-rails'
end end
group :development do group :development do
......
...@@ -111,7 +111,7 @@ GEM ...@@ -111,7 +111,7 @@ GEM
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
jbuilder (2.11.2) jbuilder (2.11.2)
activesupport (>= 5.0.0) activesupport (>= 5.0.0)
listen (3.5.1) listen (3.6.0)
rb-fsevent (~> 0.10, >= 0.10.3) rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10) rb-inotify (~> 0.9, >= 0.9.10)
loofah (2.10.0) loofah (2.10.0)
...@@ -120,7 +120,7 @@ GEM ...@@ -120,7 +120,7 @@ GEM
mail (2.7.1) mail (2.7.1)
mini_mime (>= 0.1.1) mini_mime (>= 0.1.1)
marcel (1.0.1) marcel (1.0.1)
method_source (0.9.2) method_source (1.0.0)
mime-types (3.3.1) mime-types (3.3.1)
mime-types-data (~> 3.2015) mime-types-data (~> 3.2015)
mime-types-data (3.2021.0704) mime-types-data (3.2021.0704)
...@@ -135,9 +135,9 @@ GEM ...@@ -135,9 +135,9 @@ GEM
parallel (1.20.1) parallel (1.20.1)
parser (3.0.2.0) parser (3.0.2.0)
ast (~> 2.4.1) ast (~> 2.4.1)
pry (0.12.2) pry (0.14.1)
coderay (~> 1.1.0) coderay (~> 1.1)
method_source (~> 0.9.0) method_source (~> 1.0)
pry-rails (0.3.9) pry-rails (0.3.9)
pry (>= 0.10.4) pry (>= 0.10.4)
public_suffix (4.0.6) public_suffix (4.0.6)
...@@ -178,7 +178,7 @@ GEM ...@@ -178,7 +178,7 @@ GEM
rake (>= 0.13) rake (>= 0.13)
thor (~> 1.0) thor (~> 1.0)
rainbow (3.0.0) rainbow (3.0.0)
rake (13.0.3) rake (13.0.6)
rb-fsevent (0.11.0) rb-fsevent (0.11.0)
rb-inotify (0.10.1) rb-inotify (0.10.1)
ffi (~> 1.0) ffi (~> 1.0)
...@@ -193,7 +193,7 @@ GEM ...@@ -193,7 +193,7 @@ GEM
rubocop-ast (>= 1.7.0, < 2.0) rubocop-ast (>= 1.7.0, < 2.0)
ruby-progressbar (~> 1.7) ruby-progressbar (~> 1.7)
unicode-display_width (>= 1.4.0, < 3.0) unicode-display_width (>= 1.4.0, < 3.0)
rubocop-ast (1.7.0) rubocop-ast (1.8.0)
parser (>= 3.0.1.1) parser (>= 3.0.1.1)
rubocop-rails (2.11.3) rubocop-rails (2.11.3)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
...@@ -218,7 +218,7 @@ GEM ...@@ -218,7 +218,7 @@ GEM
slim (4.1.0) slim (4.1.0)
temple (>= 0.7.6, < 0.9) temple (>= 0.7.6, < 0.9)
tilt (>= 2.0.6, < 2.1) tilt (>= 2.0.6, < 2.1)
slim-rails (3.2.0) slim-rails (3.3.0)
actionpack (>= 3.1) actionpack (>= 3.1)
railties (>= 3.1) railties (>= 3.1)
slim (>= 3.0, < 5.0) slim (>= 3.0, < 5.0)
...@@ -275,7 +275,8 @@ DEPENDENCIES ...@@ -275,7 +275,8 @@ DEPENDENCIES
listen (~> 3.3) listen (~> 3.3)
mysql2 (~> 0.5) mysql2 (~> 0.5)
nokogiri (~> 1.11, >= 1.11.7) nokogiri (~> 1.11, >= 1.11.7)
pry-rails (~> 0.3.9) pry (~> 0.14.1)
pry-rails
puma (~> 5.0) puma (~> 5.0)
rack-mini-profiler (~> 2.0) rack-mini-profiler (~> 2.0)
rails (~> 6.1.3, >= 6.1.3.2) rails (~> 6.1.3, >= 6.1.3.2)
......
class CreateJobs < ActiveRecord::Migration[6.1] class CreateJobs < ActiveRecord::Migration[6.1]
def change def change
create_table :jobs do |t| create_table :jobs do |t|
t.string :title t.string :title, null: false
t.string :job_type t.string :job_type
t.string :salary t.string :salary
t.string :experience t.string :experience
......
require 'open-uri' require 'open-uri'
require 'csv'
require 'zip'
namespace :crawler do namespace :crawler do
# command: rails crawler:jobs TYPE=TEST / ALL
desc 'crawler from CareerBuilder' desc 'crawler from CareerBuilder'
task jobs: :environment do task jobs: :environment do
ARGV.each { |a| task a.to_sym { ; } } unless %w[ALL TEST].include?(ENV['TYPE'])
abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
total_pages = 0 end
if ARGV.length == 1 && ARGV[0] == 'TEST' logger = Logger.new("#{Rails.root}/log/job_crawler.log")
total_pages = 5 logger.info "Start crawler job at: #{Time.current}"
elsif ARGV.length == 1 && ARGV[0] == 'ALL' total_pages = 5 # default = TEST
if ENV['TYPE'] == 'ALL'
first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body) first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
jobs_per_page = first_page.css('div.job-item').count jobs_per_page = first_page.css('div.job-item').count
total_jobs = first_page.css('.search-result-list .job-found p').text.split(' ').first.gsub(',', '').to_i total_jobs = first_page.css('.search-result-list .job-found-amout p').text.tr('^0-9', '')
total_pages = (total_jobs.to_f / jobs_per_page).round total_pages = (total_jobs.to_f / jobs_per_page).round
else
exit
end end
(1..total_pages).each do |page| (1..total_pages).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body) parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
logger.info("Page: #{page}")
jobs_item = parsed_page.css('div.job-item .job_link') jobs_item = parsed_page.css('div.job-item .job_link')
jobs_item.each do |item| jobs_item.each do |item|
job_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/vi/tim-viec-lam/' + retries ||= 0
CGI.escape(item.attribute('href').text.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))).body) url ||= item.attribute('href').text
job_detail = job_page.css('section.job-detail-content') logger.info("job link: #{url}")
# title - company job_page = Nokogiri::HTML(HTTParty.get(url).body)
title = job_page.css('div.job-desc h1.title').text # Job
company = job_page.css('div.job-desc a.job-company-name').text job_title = job_page.css('div.job-desc h1.title').text
# info box if job_title.blank?
info_box_item = job_detail.css('.detail-box ul li') logger.info 'Remove this job because title is empty'
# city, update_at, industry, type, salary, experience, level, expiration_date next
job_industries = [] end
# update_at, job_industries, job_type, salary, experience, level, expiration_date
detail_box_items = job_page.css('.job-detail-content .detail-box ul li')
# init
update_at, job_type, salary, experience, level, expiration_date = '' update_at, job_type, salary, experience, level, expiration_date = ''
job_cities = [] industries = []
job_detail.css('.detail-box .map p a').each do |part| detail_box_items.each do |info_item|
city = part.text key = info_item.css('strong').text.strip
job_cities << city default_value = info_item.css('p').text.squish
end # case/when
case key
info_box_item.each do |info_item| when 'Ngày cập nhật'
info = info_item.text update_at = default_value.to_time
if info.include?(key = 'Ngày cập nhật') when 'Ngành nghề'
update_at = info.squish.remove(key).strip industries = default_value.split(' , ')
elsif info.include?(key = 'Ngành nghề') when 'Hình thức'
job_industries = info.squish.remove(key).strip.split(' , ') job_type = default_value
elsif info.include?(key = 'Hình thức') when 'Lương'
job_type = info.squish.remove(key).strip salary = default_value
elsif info.include?(key = 'Lương') when 'Kinh nghiệm'
salary = info.squish.remove(key).strip experience = default_value.squish
elsif info.include?(key = 'Kinh nghiệm') when 'Cấp bậc'
experience = info.squish.remove(key).strip level = default_value
elsif info.include?(key = 'Cấp bậc') when 'Hết hạn nộp'
level = info.squish.remove(key).strip expiration_date = default_value.to_time
elsif info.include?(key = 'Hết hạn nộp') end
expiration_date = info.squish.remove(key).strip end
end # benefits, description, requirement, other_info
end job_detail_rows = job_page.css('section.job-detail-content div.detail-row')
benefits, description, requirement, other_info = ''
# benefit job_detail_rows.each do |detail_row|
benefit_list = [] detail_title = detail_row.css('.detail-title').text.strip
other_info_list = [] detail_content = detail_row.css(':not(h3.detail-title)')
benefits = job_detail.css('ul.welfare-list li') case detail_title
benefits.each do |part| when 'Phúc lợi'
benefit = part.text.strip benefits = detail_row.css('ul.welfare-list li').map(&:text).map(&:squish).join('---')
benefit_list << benefit when 'Mô tả Công việc'
end description = detail_content.inner_html
# description, requirement when 'Yêu Cầu Công Việc'
description, requirement = '' requirement = detail_content.inner_html
job_detail_row = job_detail.css('div.detail-row') when 'Thông tin khác'
job_detail_row.each do |part| other_info = detail_row.css('.content_fck ul li').map(&:text).map(&:squish).join('---')
job_detail_text = part.text end
if job_detail_text.include?('Mô tả Công việc') end
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip # Company
elsif job_detail_text.include?('Yêu Cầu Công Việc') company_name = job_page.css('div.job-desc a.job-company-name').text
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip company_object = Company.find_or_create_by(name: company_name)
end job_object = Job.find_or_create_by({ title: job_title,
end
# other info
other_info = job_detail.css('div.content_fck ul li')
other_info.each do |part|
info = part.text.squish.strip
other_info_list << info
end
company = Company.find_or_create_by(name: company)
job = Job.find_or_create_by(
title: title,
job_type: job_type, job_type: job_type,
salary: salary, salary: salary,
position: level,
experience: experience, experience: experience,
position: level,
expiration_date: expiration_date, expiration_date: expiration_date,
benefit: benefit_list.each { |benefit| },
description: description, description: description,
benefit: benefits,
requirement: requirement, requirement: requirement,
other_info: other_info_list.each { |info| } other_info: other_info,
) company_id: company_object.id })
company.jobs << job industry_objects = industries.map { |industry| Industry.find_or_create_by(name: industry) }
job_industries.each do |industry| job_object.industries << industry_objects
industry_id = Industry.find_or_create_by(name: industry) # Cities
job.industries << industry_id cities = job_page.css('.job-detail-content .detail-box .map p a').map(&:text)
end city_objects = cities.map { |city| City.find_or_create_by(name: city) }
job_cities.each do |city| job_object.cities << city_objects
city_id = City.find_or_create_by(name: city) rescue URI::InvalidURIError => e
job.cities << city_id puts "[Error] #{e.message}"
end logger.error "URI must be ascii only : #{url}"
end encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
end url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
retry if (retries += 1) < 2
rescue StandardError => e
puts e.message
puts e.backtrace.inspect
end
end
logger.info "Finished at: #{Time.current}"
end end
desc 'crawler industry form CareerBuilder' desc 'crawler industry form CareerBuilder'
task industries: :environment do task industries: :environment do
parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a') list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')
industry_list = []
list_job.each do |part| list_job.each do |part|
industry = part.text.squish.strip industry = part.text.squish.strip
industry_list << industry Industry.find_or_create_by(name: industry)
end
industry_list.each do |industry|
Industry.create(name: industry)
end end
end end
desc 'crawler city form CareerBuilder' desc 'crawler city form CareerBuilder'
task cities: :environment do task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li') list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = [] list_location.each do |city|
city_name = city.text
list_location.each do |part| region = :international
city_item = part.text if city_name.start_with?('Việc làm tại')
region = 1 city_name = city_name.remove('Việc làm tại').strip
if city_item.include?(key = 'Việc làm tại') region = :vietnam
city_item = city_item.remove(key).strip end
region = 0 City.find_or_create_by(
end name: city_name,
city = {
name: city_item,
region: region region: region
}
city_list << city
end
city_list.each do |city|
City.create(
name: city[:name],
region: city[:region]
) )
end end
end end
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment