Commit 4aa82d16 by Mai Hoang Thai Ha

Merge branch 'master' into 'Task/10_create_job_list_page_ID4'

# Conflicts:
#   Gemfile
#   Gemfile.lock
#   app/models/city.rb
#   db/schema.rb
#   lib/tasks/web_crawler.rake
parents 98d6bf79 d71790ab
......@@ -31,8 +31,9 @@ gem 'bootsnap', '>= 1.4.4', require: false
# Gems needed in both the development and test environments.
group :development, :test do
  # Call 'byebug' anywhere in the code to stop execution and get a debugger console
  gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
  # Each gem is declared exactly once: Bundler refuses a Gemfile that lists the
  # same gem twice with different version requirements.
  gem 'pry', '~> 0.14.1'
  # gem 'pry-nav'
  gem 'pry-rails'
end
group :development do
......
......@@ -111,7 +111,7 @@ GEM
concurrent-ruby (~> 1.0)
jbuilder (2.11.2)
activesupport (>= 5.0.0)
listen (3.5.1)
listen (3.6.0)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
loofah (2.10.0)
......@@ -120,7 +120,7 @@ GEM
mail (2.7.1)
mini_mime (>= 0.1.1)
marcel (1.0.1)
method_source (0.9.2)
method_source (1.0.0)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2021.0704)
......@@ -135,9 +135,9 @@ GEM
parallel (1.20.1)
parser (3.0.2.0)
ast (~> 2.4.1)
pry (0.12.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
pry (0.14.1)
coderay (~> 1.1)
method_source (~> 1.0)
pry-rails (0.3.9)
pry (>= 0.10.4)
public_suffix (4.0.6)
......@@ -178,7 +178,7 @@ GEM
rake (>= 0.13)
thor (~> 1.0)
rainbow (3.0.0)
rake (13.0.3)
rake (13.0.6)
rb-fsevent (0.11.0)
rb-inotify (0.10.1)
ffi (~> 1.0)
......@@ -193,7 +193,7 @@ GEM
rubocop-ast (>= 1.7.0, < 2.0)
ruby-progressbar (~> 1.7)
unicode-display_width (>= 1.4.0, < 3.0)
rubocop-ast (1.7.0)
rubocop-ast (1.8.0)
parser (>= 3.0.1.1)
rubocop-rails (2.11.3)
activesupport (>= 4.2.0)
......@@ -218,7 +218,7 @@ GEM
slim (4.1.0)
temple (>= 0.7.6, < 0.9)
tilt (>= 2.0.6, < 2.1)
slim-rails (3.2.0)
slim-rails (3.3.0)
actionpack (>= 3.1)
railties (>= 3.1)
slim (>= 3.0, < 5.0)
......@@ -275,7 +275,8 @@ DEPENDENCIES
listen (~> 3.3)
mysql2 (~> 0.5)
nokogiri (~> 1.11, >= 1.11.7)
pry-rails (~> 0.3.9)
pry (~> 0.14.1)
pry-rails
puma (~> 5.0)
rack-mini-profiler (~> 2.0)
rails (~> 6.1.3, >= 6.1.3.2)
......
class CreateJobs < ActiveRecord::Migration[6.1]
def change
create_table :jobs do |t|
t.string :title
t.string :title, null: false
t.string :job_type
t.string :salary
t.string :experience
......
require 'open-uri'
require 'csv'
require 'zip'
namespace :crawler do
# command: rails crawler:jobs TYPE=TEST / ALL
# Crawls CareerBuilder job listing pages, opens every job link found on them and
# stores the scraped job together with its company, industries and cities.
# NOTE(review): the lines below look like two revisions of this task interleaved
# (a merge diff rendered without +/- markers): both ARGV-based and ENV['TYPE']-based
# mode selection appear, as do two ways of extracting the job details and two
# persistence paths. The do/if/end nesting does not balance, so this span is
# presumably not runnable as-is; confirm against the resolved
# lib/tasks/web_crawler.rake before relying on any comment here.
desc 'crawler from CareerBuilder'
task jobs: :environment do
# NOTE(review): presumably meant to register no-op tasks for the extra command-line
# words so rake does not try to run them; TODO confirm.
ARGV.each { |a| task a.to_sym { ; } }
total_pages = 0
if ARGV.length == 1 && ARGV[0] == 'TEST'
total_pages = 5
elsif ARGV.length == 1 && ARGV[0] == 'ALL'
# Stop with a usage message unless TYPE is exactly ALL or TEST.
unless %w[ALL TEST].include?(ENV['TYPE'])
abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
end
# Progress is written to log/job_crawler.log under the Rails root.
logger = Logger.new("#{Rails.root}/log/job_crawler.log")
logger.info "Start crawler job at: #{Time.current}"
total_pages = 5 # default = TEST
if ENV['TYPE'] == 'ALL'
# For ALL, derive the page count from the first listing page:
# total number of jobs divided by the number of job items shown per page.
first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
jobs_per_page = first_page.css('div.job-item').count
total_jobs = first_page.css('.search-result-list .job-found p').text.split(' ').first.gsub(',', '').to_i
# This second assignment replaces the value above; tr('^0-9', '') drops every
# non-digit character and leaves a String, which is converted with to_f below.
total_jobs = first_page.css('.search-result-list .job-found-amout p').text.tr('^0-9', '')
total_pages = (total_jobs.to_f / jobs_per_page).round
else
exit
end
# Walk the listing pages one by one.
(1..total_pages).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
logger.info("Page: #{page}")
jobs_item = parsed_page.css('div.job-item .job_link')
jobs_item.each do |item|
# Fetch the job page; the path after the fixed prefix is CGI-escaped first.
job_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/vi/tim-viec-lam/' +
CGI.escape(item.attribute('href').text.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))).body)
job_detail = job_page.css('section.job-detail-content')
# title - company
title = job_page.css('div.job-desc h1.title').text
company = job_page.css('div.job-desc a.job-company-name').text
# info box
info_box_item = job_detail.css('.detail-box ul li')
# city, update_at, industry, type, salary, experience, level, expiration_date
job_industries = []
# NOTE: with a single right-hand value only update_at receives ''; the other
# variables in this multiple assignment are set to nil.
update_at, job_type, salary, experience, level, expiration_date = ''
job_cities = []
job_detail.css('.detail-box .map p a').each do |part|
city = part.text
job_cities << city
# `||=` leaves an already-set value alone, so the url rebuilt in the
# URI::InvalidURIError handler below is presumably kept when `retry` re-runs this code.
retries ||= 0
url ||= item.attribute('href').text
logger.info("job link: #{url}")
job_page = Nokogiri::HTML(HTTParty.get(url).body)
# Job
job_title = job_page.css('div.job-desc h1.title').text
# Jobs without a title are logged and skipped.
if job_title.blank?
logger.info 'Remove this job because title is empty'
next
end
# Match each info-box entry by the label contained in its text and keep the rest as the value.
info_box_item.each do |info_item|
info = info_item.text
if info.include?(key = 'Ngày cập nhật')
update_at = info.squish.remove(key).strip
elsif info.include?(key = 'Ngành nghề')
job_industries = info.squish.remove(key).strip.split(' , ')
elsif info.include?(key = 'Hình thức')
job_type = info.squish.remove(key).strip
elsif info.include?(key = 'Lương')
salary = info.squish.remove(key).strip
elsif info.include?(key = 'Kinh nghiệm')
experience = info.squish.remove(key).strip
elsif info.include?(key = 'Cấp bậc')
level = info.squish.remove(key).strip
elsif info.include?(key = 'Hết hạn nộp')
expiration_date = info.squish.remove(key).strip
# update_at, job_industries, job_type, salary, experience, level, expiration_date
detail_box_items = job_page.css('.job-detail-content .detail-box ul li')
# init
# NOTE: as above, only update_at becomes ''; the remaining variables become nil.
update_at, job_type, salary, experience, level, expiration_date = ''
industries = []
# Here the label is read from the <strong> element and the value from the <p> element.
detail_box_items.each do |info_item|
key = info_item.css('strong').text.strip
default_value = info_item.css('p').text.squish
# case/when
case key
when 'Ngày cập nhật'
update_at = default_value.to_time
when 'Ngành nghề'
industries = default_value.split(' , ')
when 'Hình thức'
job_type = default_value
when 'Lương'
salary = default_value
when 'Kinh nghiệm'
experience = default_value.squish
when 'Cấp bậc'
level = default_value
when 'Hết hạn nộp'
expiration_date = default_value.to_time
end
end
# benefit
benefit_list = []
other_info_list = []
benefits = job_detail.css('ul.welfare-list li')
benefits.each do |part|
benefit = part.text.strip
benefit_list << benefit
end
# description, requirement
# NOTE: only description receives ''; requirement is set to nil.
description, requirement = ''
job_detail_row = job_detail.css('div.detail-row')
job_detail_row.each do |part|
job_detail_text = part.text
if job_detail_text.include?('Mô tả Công việc')
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip
elsif job_detail_text.include?('Yêu Cầu Công Việc')
requirement = job_detail_text.partition('Yêu Cầu Công Việc').last.squish.strip
# benefits, description, requirement, other_info
job_detail_rows = job_page.css('section.job-detail-content div.detail-row')
# NOTE: only benefits receives ''; description, requirement and other_info are set to nil.
benefits, description, requirement, other_info = ''
# Dispatch on each row's title: list-style rows are joined with '---',
# description and requirement keep their inner HTML.
job_detail_rows.each do |detail_row|
detail_title = detail_row.css('.detail-title').text.strip
detail_content = detail_row.css(':not(h3.detail-title)')
case detail_title
when 'Phúc lợi'
benefits = detail_row.css('ul.welfare-list li').map(&:text).map(&:squish).join('---')
when 'Mô tả Công việc'
description = detail_content.inner_html
when 'Yêu Cầu Công Việc'
requirement = detail_content.inner_html
when 'Thông tin khác'
other_info = detail_row.css('.content_fck ul li').map(&:text).map(&:squish).join('---')
end
end
# other info
other_info = job_detail.css('div.content_fck ul li')
other_info.each do |part|
info = part.text.squish.strip
other_info_list << info
end
company = Company.find_or_create_by(name: company)
# The empty `each` blocks below do nothing per element; `each` returns the
# array itself, so the whole list is what gets passed as the attribute value.
job = Job.find_or_create_by(
title: title,
job_type: job_type,
salary: salary,
position: level,
experience: experience,
expiration_date: expiration_date,
benefit: benefit_list.each { |benefit| },
description: description,
requirement: requirement,
other_info: other_info_list.each { |info| }
)
company.jobs << job
job_industries.each do |industry|
industry_id = Industry.find_or_create_by(name: industry)
job.industries << industry_id
end
job_cities.each do |city|
city_id = City.find_or_create_by(name: city)
job.cities << city_id
end
# Company
company_name = job_page.css('div.job-desc a.job-company-name').text
company_object = Company.find_or_create_by(name: company_name)
# The job is looked up (or created) by the full set of scraped attributes plus its company id.
job_object = Job.find_or_create_by({ title: job_title,
job_type: job_type,
salary: salary,
experience: experience,
position: level,
expiration_date: expiration_date,
description: description,
benefit: benefits,
requirement: requirement,
other_info: other_info,
company_id: company_object.id })
industry_objects = industries.map { |industry| Industry.find_or_create_by(name: industry) }
job_object.industries << industry_objects
# Cities
cities = job_page.css('.job-detail-content .detail-box .map p a').map(&:text)
city_objects = cities.map { |city| City.find_or_create_by(name: city) }
job_object.cities << city_objects
# An invalid (e.g. non-ASCII) URL is rebuilt with its path CGI-escaped and the
# fetch is retried; the counter check allows a single retry.
rescue URI::InvalidURIError => e
puts "[Error] #{e.message}"
logger.error "URI must be ascii only : #{url}"
encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
retry if (retries += 1) < 2
# Any other error is printed with its backtrace and not re-raised.
rescue StandardError => e
puts e.message
puts e.backtrace.inspect
end
end
logger.info "Finished at: #{Time.current}"
end
desc 'crawler industry form CareerBuilder'
# Reads the industry links on CareerBuilder's job-search landing page and makes
# sure an Industry record exists for every industry name found there.
task industries: :environment do
  parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
  industry_links = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')

  # squish already trims both ends and collapses inner whitespace, so no extra
  # strip is needed; blank and repeated names are dropped before touching the DB.
  industry_names = industry_links.map { |link| link.text.squish }.reject(&:blank?).uniq

  # Only find_or_create_by: an unconditional create before it inserted a
  # duplicate row for every industry each time the task was run.
  industry_names.each { |name| Industry.find_or_create_by(name: name) }
end
# Reads the location list on CareerBuilder's job-search landing page and stores
# each entry as a City record with a region.
# NOTE(review): the lines below look like two revisions of this task interleaved
# (a merge diff rendered without +/- markers): the list is iterated twice, a hash
# literal is mixed with a City.find_or_create_by call, and the `end`s do not
# balance. Presumably not runnable as-is; confirm against the resolved
# lib/tasks/web_crawler.rake.
desc 'crawler city form CareerBuilder'
task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = []
# Variant using integer regions: 1 is the default, 0 is set when the entry
# text contains 'Việc làm tại', and that phrase is removed from the name.
list_location.each do |part|
city_item = part.text
region = 1
if city_item.include?(key = 'Việc làm tại')
city_item = city_item.remove(key).strip
region = 0
# Variant using symbolic regions: :international is the default, :vietnam is set
# when the entry text starts with 'Việc làm tại', and that phrase is removed.
# NOTE(review): 0 and :vietnam (1 and :international) presumably denote the same
# regions; verify against the City model.
list_location.each do |city|
city_name = city.text
region = :international
if city_name.start_with?('Việc làm tại')
city_name = city_name.remove('Việc làm tại').strip
region = :vietnam
end
city = {
name: city_item,
City.find_or_create_by(
name: city_name,
region: region
}
city_list << city
end
# Creates one City per collected hash without checking for an existing record.
city_list.each do |city|
City.create(
name: city[:name],
region: city[:region]
)
end
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment