Commit 6f684333 by Trịnh Hoàng Phúc

Crawl all jobs using parallel and mutex

parent ab24f6a1
Pipeline #621 failed with stages
in 0 seconds
...@@ -29,11 +29,13 @@ gem 'rsolr' ...@@ -29,11 +29,13 @@ gem 'rsolr'
gem 'carrierwave' gem 'carrierwave'
gem 'activerecord-import' gem 'activerecord-import', require: false
gem 'will_paginate' gem 'will_paginate'
gem 'settingslogic' gem 'settingslogic'
gem 'parallel', require: false
# Use Active Storage variant # Use Active Storage variant
# gem 'image_processing', '~> 1.2' # gem 'image_processing', '~> 1.2'
......
...@@ -160,6 +160,7 @@ GEM ...@@ -160,6 +160,7 @@ GEM
nokogiri (1.10.9-x86-mingw32) nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
orm_adapter (0.5.0) orm_adapter (0.5.0)
parallel (1.19.1)
pry (0.13.0) pry (0.13.0)
coderay (~> 1.1) coderay (~> 1.1)
method_source (~> 1.0) method_source (~> 1.0)
...@@ -305,6 +306,7 @@ DEPENDENCIES ...@@ -305,6 +306,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
meta-tags meta-tags
mysql2 mysql2
parallel
pry pry
puma (~> 4.1) puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2) rails (~> 6.0.2, >= 6.0.2.2)
......
require "nokogiri" require "nokogiri"
require "open-uri" require "open-uri"
require "parallel"
require "activerecord-import"
namespace :crawler do namespace :crawler do
desc "Crawler Careerbuilder" desc "Crawler Careerbuilder"
...@@ -7,92 +9,99 @@ namespace :crawler do ...@@ -7,92 +9,99 @@ namespace :crawler do
task job: :environment do task job: :environment do
# Define crawler logger # Define crawler logger
logger = Logger.new("log/crawler_logger.log") logger = Logger.new("log/crawler_logger.log")
html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
# Loop page # Loop page
(1..2).each do |page| (1..total_page).each do |page|
# Fetch and parse HTML document # Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item # Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item| Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
url = item.css(".figure .figcaption .title .job_link @href").text begin
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url))) url = item.css(".figure .figcaption .title .job_link @href").text
if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil? html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
logger.warn "Another template #{url}" if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
next logger.warn "Another template #{url}"
end next
# Set salary, min-salary, max-salary
if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
logger.warn "Another template #{url}"
next
end
salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
min_salary, max_salary = CrawlerService.convert_salary(salary)
# Job attributes
job_attributes = {
title: item.at_css(".figure .figcaption .title a @title").text,
updated_date_job: item.at_css(".bottom-right-icon .time time").text,
salary: salary,
min_salary: min_salary,
max_salary: max_salary
}
html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
type = ele.at_css("strong").text
case type
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.at_css("p").text.squish
when "Cấp bậc"
job_attributes[:level] = ele.at_css("p").text.squish
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.at_css("p").text.squish
end end
end # Set salary, min-salary, max-salary
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele| if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
next if ele.at_css(".detail-title").nil? logger.warn "Another template #{url}"
type = ele.at_css(".detail-title").text next
case type
when "Phúc lợi "
job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
when "Mô tả Công việc"
job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
when "Yêu Cầu Công Việc"
job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
when "Thông tin khác"
job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
end end
end salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
next if item.at_css(".figure .image a @href").text == "javascript:void(0);" min_salary, max_salary = CrawlerService.convert_salary(salary)
# Company attributes # Job attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text))) job_attributes = {
next if html_company_detail.at_css(".jobsby-company").nil? title: item.at_css(".figure .figcaption .title a @title").text,
company_css = ".jobsby-company .company-introduction .company-info .info " updated_date_job: item.at_css(".bottom-right-icon .time time").text,
company_attributes = { salary: salary,
title: html_company_detail.at_css(company_css + ".content .name").text, min_salary: min_salary,
address: html_company_detail.css(company_css + ".content p")[1].text, max_salary: max_salary
logo: html_company_detail.at_css(company_css + ".img @src").text, }
description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
} type = ele.at_css("strong").text
# Defind cities array case type
cities = item.css(".figure .figcaption .caption .location ul li").map do |city| when "Hết hạn nộp"
city.text.squish job_attributes[:expiration_date] = ele.at_css("p").text.squish
end when "Cấp bậc"
# Defind industries array job_attributes[:level] = ele.at_css("p").text.squish
industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry| when "Kinh nghiệm"
industry.text.tr(",","").squish job_attributes[:years_of_experience] = ele.at_css("p").text.squish
end end
end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
next if ele.at_css(".detail-title").nil?
type = ele.at_css(".detail-title").text
case type
when "Phúc lợi "
job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
when "Mô tả Công việc"
job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
when "Yêu Cầu Công Việc"
job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
when "Thông tin khác"
job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
end
end
next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
next if html_company_detail.at_css(".jobsby-company").nil?
company_css = ".jobsby-company .company-introduction .company-info .info "
company_attributes = {
title: html_company_detail.at_css(company_css + ".content .name").text,
address: html_company_detail.css(company_css + ".content p")[1].text,
logo: html_company_detail.at_css(company_css + ".img @src").text,
description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
}
# Define cities array
cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
city.text.squish
end
# Define industries array
industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
industry.text.tr(",","").squish
end
result = CrawlerService.imports(job_attributes, company_attributes, cities, industries) sleep rand
Mutex.new.synchronize {
logger.info "Crawl success url : #{url}" result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
rescue Exception => e }
sleep rand
logger.info "Crawl success url : #{url}"
rescue => e
logger.error e logger.error e
next next
end end
}
end end
end end
task city: :environment do task city: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get city in country # Get city in country
cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title| cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
{ {
...@@ -115,7 +124,7 @@ namespace :crawler do ...@@ -115,7 +124,7 @@ namespace :crawler do
task industry: :environment do task industry: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get industry # Get industry
industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title| industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment