Fix crawling bug

parent fde8ff8e
Pipeline #1358 failed with stages
in 0 seconds
......@@ -8,6 +8,7 @@ gem 'rails', '~> 6.1.3', '>= 6.1.3.2'
gem 'bootstrap', '~> 5.0.1'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
# Use MySQL (via the mysql2 adapter) as the database for Active Record
gem 'mysql2', '~> 0.5.3'
# Use Puma as the app server
......@@ -30,6 +31,7 @@ gem 'jbuilder', '~> 2.7'
# Reduces boot times through caching; required in config/boot.rb
gem 'bootsnap', '>= 1.4.4', require: false
gem 'whenever', require: false
group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console
......
......@@ -82,6 +82,7 @@ GEM
regexp_parser (>= 1.5, < 3.0)
xpath (~> 3.2)
childprocess (3.0.0)
chronic (0.10.2)
concurrent-ruby (1.1.9)
crass (1.0.6)
erubi (1.10.0)
......@@ -91,7 +92,6 @@ GEM
activesupport (>= 4.2.0)
i18n (1.8.10)
concurrent-ruby (~> 1.0)
io-wait (0.1.0)
jbuilder (2.11.2)
activesupport (>= 5.0.0)
listen (3.5.1)
......@@ -108,12 +108,6 @@ GEM
minitest (5.14.4)
msgpack (1.4.2)
mysql2 (0.5.3)
net-http (0.1.1)
net-protocol
uri
net-protocol (0.1.1)
io-wait
timeout
nio4r (2.5.7)
nokogiri (1.11.7-x86_64-linux)
racc (~> 1.4)
......@@ -185,13 +179,11 @@ GEM
sprockets (>= 3.0.0)
thor (1.1.0)
tilt (2.0.10)
timeout (0.1.1)
turbolinks (5.2.1)
turbolinks-source (~> 5.2)
turbolinks-source (5.2.0)
tzinfo (2.0.4)
concurrent-ruby (~> 1.0)
uri (0.10.1)
web-console (4.1.0)
actionview (>= 6.0.0)
activemodel (>= 6.0.0)
......@@ -209,6 +201,8 @@ GEM
websocket-driver (0.7.5)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.4.2)
......@@ -224,7 +218,6 @@ DEPENDENCIES
jbuilder (~> 2.7)
listen (~> 3.3)
mysql2 (~> 0.5.3)
net-http
nokogiri (~> 1.11, >= 1.11.7)
puma (~> 5.0)
rack-mini-profiler (~> 2.0)
......@@ -237,6 +230,7 @@ DEPENDENCIES
web-console (>= 4.1.0)
webdrivers
webpacker (~> 5.0)
whenever
RUBY VERSION
ruby 3.0.1p64
......
# Daily schedule entry: runs the `crawler:all` rake task once a day at 08:18 am.
every(1.day, at: '08:18 am') { rake 'crawler:all' }
# Changes the type of jobs.salary to string and drops the free-text
# industries_type and location columns from the jobs table.
class ChangeJobs < ActiveRecord::Migration[6.1]
  # Applies the schema change.
  def up
    change_column :jobs, :salary, :string
    remove_column :jobs, :industries_type, :text
    remove_column :jobs, :location, :text
  end

  # Reverts the schema change made by #up.
  #
  # The two dropped columns no longer exist at this point, so they have to be
  # re-created with add_column; change_column would fail on a missing column.
  # The re-created columns start out empty: the data removed by #up is not
  # restored.
  def down
    # NOTE(review): converting back to integer presumably fails or coerces if
    # non-numeric salary strings were stored in the meantime; confirm.
    change_column :jobs, :salary, :integer
    add_column :jobs, :industries_type, :text
    add_column :jobs, :location, :text
  end
end
require 'open-uri'
require 'logger'
namespace :crawler do
desc 'Crawl Jobs and Companies'
......@@ -19,20 +20,23 @@ namespace :crawler do
pagination_job_listing.each do |detail_jobs|
company_url = detail_jobs.css('a.company-name').attribute('href').text
next if company_url == 'javascript:void(0);'
slug_company = CGI.escape(company_url.gsub('https://careerbuilder.vn/vi/nha-tuyen-dung/', '').strip)
company_page = "https://careerbuilder.vn/vi/nha-tuyen-dung/#{slug_company}"
puts company_page
parse_company_page = Nokogiri::HTML(URI.open(company_page))
parse_company_page = Nokogiri::HTML(URI.open(company_page).read)
company = parse_company_page.css('div.container')
company_name = company.css('div.company-info div.content p.name')
next if company_name.nil?
name = company.css('div.company-info div.content p.name').text
logger = Logger.new("#{Rails.root}/log/crawler_jobs.log")
logger.info("Link company: #{company_page}.to_s")
company_info = company.css('div.company-info div.content')
address = company_info.css('p')[1].try(:text)
description = company_info.css('ul li').text
overview = company.css('div.row div.content p').text.squish.strip
Company.find_or_create_by(
name: name,
name: company_name.text,
address: address,
description: description,
overview: overview
......@@ -41,27 +45,28 @@ namespace :crawler do
slug_job = CGI.escape(detail_jobs.css('a.job_link').attribute('href').text
.gsub('https://careerbuilder.vn/vi/tim-viec-lam/', '').strip)
job_detail_page = "https://careerbuilder.vn/vi/tim-viec-lam/#{slug_job}"
puts job_detail_page
parse_job_detail_page = Nokogiri::HTML(URI.open(job_detail_page))
parse_job_detail_page = Nokogiri::HTML(URI.open(job_detail_page).read)
detail_job = parse_job_detail_page.css('div.container')
title = detail_job.css('div.job-desc h1.title')
next if title.nil?
title_job = detail_job.css('div.job-desc h1.title').text
logger.info("Link job: #{job_detail_page}")
salary, experience, type, level, expired_at = ''
detail_content = detail_job.css('div.detail-box.has-background ul li')
detail_content = detail_job.css('div.col-lg-4 col-sm-6 item-blue ul li')
detail_content.each do |content|
case content.css('strong').text
when 'Lương'
salary = content.css('p').text
when 'Kinh nghiệm'
experience = content.css('p').text
puts content.css('p').text
when 'Hình thức'
type = content.css('p').text
puts content.css('p').text
when 'Cấp bậc'
level = content.css('p').text
when 'Hết hạn nộp'
expired_at = content.css('p').text
end
end
benefits, overview, requirement, other_requirement = ''
detail_require = detail_job.css('div.detail-row')
......@@ -79,7 +84,7 @@ namespace :crawler do
end
job = Job.find_or_create_by(
title: title_job,
title: title.text,
salary: salary,
experience: experience,
type: type,
......@@ -89,25 +94,25 @@ namespace :crawler do
overview: overview,
requirement: requirement,
other_requirement: other_requirement,
company_id: Company.find_by(name: name).id
company_id: Company.find_by(name: company_name.text).id
)
industries = detail_job.css('div.detail-box.has-background ul li p a')
industries.each do |industry|
industry_name = industry.text.squish
industries = Industry.find_or_create_by(
name: industry_name
name = industry.text.squish
industry_name = Industry.find_or_create_by(
name: name
)
job.industries << industries
job.industries << industry_name
end
location = detail_job.css('div.map p a')
location.each do |city|
city_name = city.text
cities = City.find_or_create_by(
name: city_name
name = city.text
city_name = City.find_or_create_by(
name: name
)
job.cities << cities
job.cities << city_name
end
end
page += 1
......@@ -156,9 +161,12 @@ namespace :crawler do
end
end
# Aggregate task: has no body of its own and only depends on the regions,
# cities, industries and jobs tasks, listed in that order.
desc 'Crawl regions, cities, industries, jobs and companies'
task all: %i[regions cities industries jobs]
# Downloads the careerbuilder.vn home page, reads the href of the second link
# matched by the dropdown-menu selector, then downloads and parses that page.
#
# @return [Nokogiri::HTML::Document] the parsed document of the linked page
#   (presumably the industries listing, going by the variable name; confirm)
# @raise [NoMethodError] if the selector matches fewer than two links or the
#   second link has no href attribute
def parse_base_url
  # Block form of URI.open closes the downloaded stream once it is parsed.
  home_page = URI.open('https://careerbuilder.vn/') { |io| Nokogiri::HTML(io) }
  # Index 1 selects the second anchor matched inside the dropdown menu.
  menu_links = home_page.css('div.menu div.dropdown-menu ul li a')
  industries_url = menu_links[1].attributes['href'].text
  URI.open(industries_url) { |io| Nokogiri::HTML(io) }
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment