fix bug crawling

parent fde8ff8e
Pipeline #1358 failed with stages
in 0 seconds
...@@ -8,6 +8,7 @@ gem 'rails', '~> 6.1.3', '>= 6.1.3.2' ...@@ -8,6 +8,7 @@ gem 'rails', '~> 6.1.3', '>= 6.1.3.2'
gem 'bootstrap', '~> 5.0.1' gem 'bootstrap', '~> 5.0.1'
gem 'nokogiri', '~> 1.11', '>= 1.11.7' gem 'nokogiri', '~> 1.11', '>= 1.11.7'
# Use sqlite3 as the database for Active Record # Use sqlite3 as the database for Active Record
gem 'mysql2', '~> 0.5.3' gem 'mysql2', '~> 0.5.3'
# Use Puma as the app server # Use Puma as the app server
...@@ -30,6 +31,7 @@ gem 'jbuilder', '~> 2.7' ...@@ -30,6 +31,7 @@ gem 'jbuilder', '~> 2.7'
# Reduces boot times through caching; required in config/boot.rb # Reduces boot times through caching; required in config/boot.rb
gem 'bootsnap', '>= 1.4.4', require: false gem 'bootsnap', '>= 1.4.4', require: false
gem 'whenever', require: false
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call 'byebug' anywhere in the code to stop execution and get a debugger console
......
...@@ -82,6 +82,7 @@ GEM ...@@ -82,6 +82,7 @@ GEM
regexp_parser (>= 1.5, < 3.0) regexp_parser (>= 1.5, < 3.0)
xpath (~> 3.2) xpath (~> 3.2)
childprocess (3.0.0) childprocess (3.0.0)
chronic (0.10.2)
concurrent-ruby (1.1.9) concurrent-ruby (1.1.9)
crass (1.0.6) crass (1.0.6)
erubi (1.10.0) erubi (1.10.0)
...@@ -91,7 +92,6 @@ GEM ...@@ -91,7 +92,6 @@ GEM
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
i18n (1.8.10) i18n (1.8.10)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
io-wait (0.1.0)
jbuilder (2.11.2) jbuilder (2.11.2)
activesupport (>= 5.0.0) activesupport (>= 5.0.0)
listen (3.5.1) listen (3.5.1)
...@@ -108,12 +108,6 @@ GEM ...@@ -108,12 +108,6 @@ GEM
minitest (5.14.4) minitest (5.14.4)
msgpack (1.4.2) msgpack (1.4.2)
mysql2 (0.5.3) mysql2 (0.5.3)
net-http (0.1.1)
net-protocol
uri
net-protocol (0.1.1)
io-wait
timeout
nio4r (2.5.7) nio4r (2.5.7)
nokogiri (1.11.7-x86_64-linux) nokogiri (1.11.7-x86_64-linux)
racc (~> 1.4) racc (~> 1.4)
...@@ -185,13 +179,11 @@ GEM ...@@ -185,13 +179,11 @@ GEM
sprockets (>= 3.0.0) sprockets (>= 3.0.0)
thor (1.1.0) thor (1.1.0)
tilt (2.0.10) tilt (2.0.10)
timeout (0.1.1)
turbolinks (5.2.1) turbolinks (5.2.1)
turbolinks-source (~> 5.2) turbolinks-source (~> 5.2)
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (2.0.4) tzinfo (2.0.4)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
uri (0.10.1)
web-console (4.1.0) web-console (4.1.0)
actionview (>= 6.0.0) actionview (>= 6.0.0)
activemodel (>= 6.0.0) activemodel (>= 6.0.0)
...@@ -209,6 +201,8 @@ GEM ...@@ -209,6 +201,8 @@ GEM
websocket-driver (0.7.5) websocket-driver (0.7.5)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5) websocket-extensions (0.1.5)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0) xpath (3.2.0)
nokogiri (~> 1.8) nokogiri (~> 1.8)
zeitwerk (2.4.2) zeitwerk (2.4.2)
...@@ -224,7 +218,6 @@ DEPENDENCIES ...@@ -224,7 +218,6 @@ DEPENDENCIES
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (~> 3.3) listen (~> 3.3)
mysql2 (~> 0.5.3) mysql2 (~> 0.5.3)
net-http
nokogiri (~> 1.11, >= 1.11.7) nokogiri (~> 1.11, >= 1.11.7)
puma (~> 5.0) puma (~> 5.0)
rack-mini-profiler (~> 2.0) rack-mini-profiler (~> 2.0)
...@@ -237,6 +230,7 @@ DEPENDENCIES ...@@ -237,6 +230,7 @@ DEPENDENCIES
web-console (>= 4.1.0) web-console (>= 4.1.0)
webdrivers webdrivers
webpacker (~> 5.0) webpacker (~> 5.0)
whenever
RUBY VERSION RUBY VERSION
ruby 3.0.1p64 ruby 3.0.1p64
......
# Daily crawl: run the `crawler:all` rake task every day at 08:18 am.
every(1.day, at: '08:18 am') { rake 'crawler:all' }
# Migration on the `jobs` table, shown here as a side-by-side diff: most lines carry the
# old and the new column next to each other, while some lines appear only once.
# `up` changes `salary` to :string and removes `industries_type` and `location`;
# `down` changes `salary` to :integer and calls change_column on the two other columns.
class ChangeJobs < ActiveRecord::Migration[6.1] class ChangeJobs < ActiveRecord::Migration[6.1]
def up def up
change_column :jobs, :salary, :string change_column :jobs, :salary, :string
remove_column :jobs, :industries_type, :text
remove_column :jobs, :location, :text
end end
def down def down
change_column :jobs, :salary, :integer change_column :jobs, :salary, :integer
# NOTE(review): `up` removes these two columns, but `down` uses change_column rather than
# add_column — presumably a rollback would fail because the columns no longer exist.
# Confirm, and consider add_column here.
change_column :jobs, :industries_type, :text
change_column :jobs, :location, :text
end end
end end
require 'open-uri' require 'open-uri'
require 'logger'
namespace :crawler do namespace :crawler do
desc 'Crawl Jobs and Companies' desc 'Crawl Jobs and Companies'
...@@ -19,20 +20,23 @@ namespace :crawler do ...@@ -19,20 +20,23 @@ namespace :crawler do
pagination_job_listing.each do |detail_jobs| pagination_job_listing.each do |detail_jobs|
company_url = detail_jobs.css('a.company-name').attribute('href').text company_url = detail_jobs.css('a.company-name').attribute('href').text
next if company_url == 'javascript:void(0);' next if company_url == 'javascript:void(0);'
slug_company = CGI.escape(company_url.gsub('https://careerbuilder.vn/vi/nha-tuyen-dung/', '').strip) slug_company = CGI.escape(company_url.gsub('https://careerbuilder.vn/vi/nha-tuyen-dung/', '').strip)
company_page = "https://careerbuilder.vn/vi/nha-tuyen-dung/#{slug_company}" company_page = "https://careerbuilder.vn/vi/nha-tuyen-dung/#{slug_company}"
puts company_page parse_company_page = Nokogiri::HTML(URI.open(company_page).read)
parse_company_page = Nokogiri::HTML(URI.open(company_page))
company = parse_company_page.css('div.container') company = parse_company_page.css('div.container')
company_name = company.css('div.company-info div.content p.name') company_name = company.css('div.company-info div.content p.name')
next if company_name.nil? next if company_name.nil?
name = company.css('div.company-info div.content p.name').text
logger = Logger.new("#{Rails.root}/log/crawler_jobs.log")
logger.info("Link company: #{company_page}.to_s")
company_info = company.css('div.company-info div.content') company_info = company.css('div.company-info div.content')
address = company_info.css('p')[1].try(:text) address = company_info.css('p')[1].try(:text)
description = company_info.css('ul li').text description = company_info.css('ul li').text
overview = company.css('div.row div.content p').text.squish.strip overview = company.css('div.row div.content p').text.squish.strip
Company.find_or_create_by( Company.find_or_create_by(
name: name, name: company_name.text,
address: address, address: address,
description: description, description: description,
overview: overview overview: overview
...@@ -41,27 +45,28 @@ namespace :crawler do ...@@ -41,27 +45,28 @@ namespace :crawler do
slug_job = CGI.escape(detail_jobs.css('a.job_link').attribute('href').text slug_job = CGI.escape(detail_jobs.css('a.job_link').attribute('href').text
.gsub('https://careerbuilder.vn/vi/tim-viec-lam/', '').strip) .gsub('https://careerbuilder.vn/vi/tim-viec-lam/', '').strip)
job_detail_page = "https://careerbuilder.vn/vi/tim-viec-lam/#{slug_job}" job_detail_page = "https://careerbuilder.vn/vi/tim-viec-lam/#{slug_job}"
puts job_detail_page parse_job_detail_page = Nokogiri::HTML(URI.open(job_detail_page).read)
parse_job_detail_page = Nokogiri::HTML(URI.open(job_detail_page))
detail_job = parse_job_detail_page.css('div.container') detail_job = parse_job_detail_page.css('div.container')
title = detail_job.css('div.job-desc h1.title') title = detail_job.css('div.job-desc h1.title')
next if title.nil? next if title.nil?
title_job = detail_job.css('div.job-desc h1.title').text
logger.info("Link job: #{job_detail_page}")
salary, experience, type, level, expired_at = '' salary, experience, type, level, expired_at = ''
detail_content = detail_job.css('div.detail-box.has-background ul li') detail_content = detail_job.css('div.col-lg-4 col-sm-6 item-blue ul li')
detail_content.each do |content| detail_content.each do |content|
case content.css('strong').text case content.css('strong').text
when 'Lương' when 'Lương'
salary = content.css('p').text salary = content.css('p').text
when 'Kinh nghiệm' when 'Kinh nghiệm'
experience = content.css('p').text puts content.css('p').text
when 'Hình thức' when 'Hình thức'
type = content.css('p').text puts content.css('p').text
when 'Cấp bậc' when 'Cấp bậc'
level = content.css('p').text level = content.css('p').text
when 'Hết hạn nộp' when 'Hết hạn nộp'
expired_at = content.css('p').text expired_at = content.css('p').text
end end
end end
benefits, overview, requirement, other_requirement = '' benefits, overview, requirement, other_requirement = ''
detail_require = detail_job.css('div.detail-row') detail_require = detail_job.css('div.detail-row')
...@@ -79,7 +84,7 @@ namespace :crawler do ...@@ -79,7 +84,7 @@ namespace :crawler do
end end
job = Job.find_or_create_by( job = Job.find_or_create_by(
title: title_job, title: title.text,
salary: salary, salary: salary,
experience: experience, experience: experience,
type: type, type: type,
...@@ -89,25 +94,25 @@ namespace :crawler do ...@@ -89,25 +94,25 @@ namespace :crawler do
overview: overview, overview: overview,
requirement: requirement, requirement: requirement,
other_requirement: other_requirement, other_requirement: other_requirement,
company_id: Company.find_by(name: name).id company_id: Company.find_by(name: company_name.text).id
) )
industries = detail_job.css('div.detail-box.has-background ul li p a') industries = detail_job.css('div.detail-box.has-background ul li p a')
industries.each do |industry| industries.each do |industry|
industry_name = industry.text.squish name = industry.text.squish
industries = Industry.find_or_create_by( industry_name = Industry.find_or_create_by(
name: industry_name name: name
) )
job.industries << industries job.industries << industry_name
end end
location = detail_job.css('div.map p a') location = detail_job.css('div.map p a')
location.each do |city| location.each do |city|
city_name = city.text name = city.text
cities = City.find_or_create_by( city_name = City.find_or_create_by(
name: city_name name: name
) )
job.cities << cities job.cities << city_name
end end
end end
page += 1 page += 1
...@@ -156,9 +161,12 @@ namespace :crawler do ...@@ -156,9 +161,12 @@ namespace :crawler do
end end
end end
# Aggregate task: `all` lists regions, cities, industries and jobs as its prerequisites,
# so invoking it runs those four tasks.
desc 'Crawl regions, cities, industries, jobs and companies'
task all: %i[regions cities industries jobs]
# Opens https://careerbuilder.vn/ and parses it with Nokogiri, reads the href of the element
# at index 1 among the `div.menu div.dropdown-menu ul li a` matches, then opens that URL and
# returns it parsed as HTML.
# Each line below carries both diff columns; they differ only on the href line
# (`.value` in the left column, `.text` in the right).
# NOTE(review): `[1]` presumably yields nil when fewer than two links match, in which case
# `.attributes` would raise — confirm the page layout is stable or guard against nil.
def parse_base_url def parse_base_url
base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/')) base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
industries_url = base_url.css('div.menu div.dropdown-menu ul li a')[1].attributes['href'].value industries_url = base_url.css('div.menu div.dropdown-menu ul li a')[1].attributes['href'].text
Nokogiri::HTML(URI.open(industries_url)) Nokogiri::HTML(URI.open(industries_url))
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment