Commit 6f684333 by Trịnh Hoàng Phúc

Crawl all jobs using Parallel and a Mutex

parent ab24f6a1
Pipeline #621 failed with stages
in 0 seconds
...@@ -29,11 +29,13 @@ gem 'rsolr' ...@@ -29,11 +29,13 @@ gem 'rsolr'
gem 'carrierwave' gem 'carrierwave'
gem 'activerecord-import' gem 'activerecord-import', require: false
gem 'will_paginate' gem 'will_paginate'
gem 'settingslogic' gem 'settingslogic'
gem 'parallel', require: false
# Use Active Storage variant # Use Active Storage variant
# gem 'image_processing', '~> 1.2' # gem 'image_processing', '~> 1.2'
......
...@@ -160,6 +160,7 @@ GEM ...@@ -160,6 +160,7 @@ GEM
nokogiri (1.10.9-x86-mingw32) nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
orm_adapter (0.5.0) orm_adapter (0.5.0)
parallel (1.19.1)
pry (0.13.0) pry (0.13.0)
coderay (~> 1.1) coderay (~> 1.1)
method_source (~> 1.0) method_source (~> 1.0)
...@@ -305,6 +306,7 @@ DEPENDENCIES ...@@ -305,6 +306,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
meta-tags meta-tags
mysql2 mysql2
parallel
pry pry
puma (~> 4.1) puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2) rails (~> 6.0.2, >= 6.0.2.2)
......
require "nokogiri" require "nokogiri"
require "open-uri" require "open-uri"
require "parallel"
require "activerecord-import"
namespace :crawler do namespace :crawler do
desc "Crawler Careerbuilder" desc "Crawler Careerbuilder"
...@@ -7,13 +9,15 @@ namespace :crawler do ...@@ -7,13 +9,15 @@ namespace :crawler do
task job: :environment do task job: :environment do
# Define crawler logger # Define crawler logger
logger = Logger.new("log/crawler_logger.log") logger = Logger.new("log/crawler_logger.log")
html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
# Loop page # Loop page
(1..2).each do |page| (1..total_page).each do |page|
# Fetch and parse HTML document # Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item # Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item| Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
begin
url = item.css(".figure .figcaption .title .job_link @href").text url = item.css(".figure .figcaption .title .job_link @href").text
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url))) html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil? if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
...@@ -62,7 +66,7 @@ namespace :crawler do ...@@ -62,7 +66,7 @@ namespace :crawler do
end end
next if item.at_css(".figure .image a @href").text == "javascript:void(0);" next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
# Company attributes # Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text))) html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
next if html_company_detail.at_css(".jobsby-company").nil? next if html_company_detail.at_css(".jobsby-company").nil?
company_css = ".jobsby-company .company-introduction .company-info .info " company_css = ".jobsby-company .company-introduction .company-info .info "
company_attributes = { company_attributes = {
...@@ -80,19 +84,24 @@ namespace :crawler do ...@@ -80,19 +84,24 @@ namespace :crawler do
industry.text.tr(",","").squish industry.text.tr(",","").squish
end end
sleep rand
Mutex.new.synchronize {
result = CrawlerService.imports(job_attributes, company_attributes, cities, industries) result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
}
sleep rand
logger.info "Crawl success url : #{url}" logger.info "Crawl success url : #{url}"
rescue Exception => e rescue => e
logger.error e logger.error e
next next
end end
}
end end
end end
task city: :environment do task city: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get city in country # Get city in country
cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title| cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
{ {
...@@ -115,7 +124,7 @@ namespace :crawler do ...@@ -115,7 +124,7 @@ namespace :crawler do
task industry: :environment do task industry: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get industry # Get industry
industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title| industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment