Commit 6f684333 by Trịnh Hoàng Phúc

Crawl all jobs using Parallel and a Mutex

parent ab24f6a1
Pipeline #621 failed with stages
in 0 seconds
......@@ -29,11 +29,13 @@ gem 'rsolr'
gem 'carrierwave'
# Declared once, with autoloading disabled: the crawler Rake file pulls it in
# explicitly via `require "activerecord-import"`.
gem 'activerecord-import', require: false
gem 'will_paginate'
gem 'settingslogic'
# Not auto-required by Bundler; the crawler Rake file does `require "parallel"`.
gem 'parallel', require: false
# Use Active Storage variant
# gem 'image_processing', '~> 1.2'
......
......@@ -160,6 +160,7 @@ GEM
nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0)
orm_adapter (0.5.0)
parallel (1.19.1)
pry (0.13.0)
coderay (~> 1.1)
method_source (~> 1.0)
......@@ -305,6 +306,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2)
meta-tags
mysql2
parallel
pry
puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2)
......
require "nokogiri"
require "open-uri"
require "parallel"
require "activerecord-import"
namespace :crawler do
desc "Crawler Careerbuilder"
......@@ -7,92 +9,99 @@ namespace :crawler do
# Crawls the careerbuilder.vn job listing and imports every job it can parse.
#
# Flow:
#   1. Read the total job counter from the first listing page and derive the
#      number of listing pages from it (the code divides by 50.0, i.e. it
#      presumes 50 jobs per listing page -- confirm against the site).
#   2. For each listing page, process its job items with Parallel on 5 threads.
#   3. For each item, fetch the job detail page and the company page, build
#      the attribute hashes and hand them to CrawlerService.imports.
#
# Items that use a different page template, quote the salary in USD, or have
# no usable company link are skipped. Any StandardError raised while handling
# one item is logged and that item is abandoned; the other items continue.
task job: :environment do
  # Define crawler logger
  logger = Logger.new("log/crawler_logger.log")
  # A single lock shared by all worker threads. It must be created once, out
  # here: calling Mutex.new inside the parallel block would give every call
  # its own private lock, which serializes nothing.
  import_lock = Mutex.new
  html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
  # String#tr with an empty replacement deletes the listed characters, leaving
  # only the digits of the job counter.
  total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
  # Loop page
  (1..total_page).each do |page|
    # Fetch and parse HTML document
    html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
    # Loop item
    Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) do |item|
      begin
        url = item.css(".figure .figcaption .title .job_link @href").text
        # NOTE(review): URI.encode is obsolete and was removed in Ruby 3.0;
        # it needs a replacement before this project upgrades Ruby.
        html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
        if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
          logger.warn "Another template #{url}"
          next
        end
        # Set salary, min-salary, max-salary
        if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
          logger.warn "Another template #{url}"
          next
        end
        salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
        min_salary, max_salary = CrawlerService.convert_salary(salary)
        # Job attributes
        job_attributes = {
          title: item.at_css(".figure .figcaption .title a @title").text,
          updated_date_job: item.at_css(".bottom-right-icon .time time").text,
          salary: salary,
          min_salary: min_salary,
          max_salary: max_salary
        }
        # Summary box: each <li> has a <strong> label and a <p> value.
        html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
          type = ele.at_css("strong").text
          case type
          when "Hết hạn nộp"
            job_attributes[:expiration_date] = ele.at_css("p").text.squish
          when "Cấp bậc"
            job_attributes[:level] = ele.at_css("p").text.squish
          when "Kinh nghiệm"
            job_attributes[:years_of_experience] = ele.at_css("p").text.squish
          end
        end
        # Detail rows: the section heading decides which attribute the row's
        # HTML is stored in; the heading markup itself is stripped out.
        html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
          next if ele.at_css(".detail-title").nil?
          type = ele.at_css(".detail-title").text
          case type
          when "Phúc lợi "
            job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
          when "Mô tả Công việc"
            job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
          when "Yêu Cầu Công Việc"
            job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
          when "Thông tin khác"
            job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
          end
        end
        # A placeholder link means there is no company page to crawl.
        next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
        # Company attributes
        html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
        next if html_company_detail.at_css(".jobsby-company").nil?
        company_css = ".jobsby-company .company-introduction .company-info .info "
        company_attributes = {
          title: html_company_detail.at_css(company_css + ".content .name").text,
          address: html_company_detail.css(company_css + ".content p")[1].text,
          logo: html_company_detail.at_css(company_css + ".img @src").text,
          description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
        }
        # Define cities array
        cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
          city.text.squish
        end
        # Define industries array
        industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
          industry.text.tr(",","").squish
        end
        # Random sub-second pauses before and after the import stagger the
        # worker threads.
        sleep rand
        # Only one thread at a time may run the import.
        import_lock.synchronize do
          CrawlerService.imports(job_attributes, company_attributes, cities, industries)
        end
        sleep rand
        logger.info "Crawl success url : #{url}"
      rescue => e
        # Rescue StandardError only, so signals and exit requests still stop
        # the task; log the failure and move on to the next item.
        logger.error e
      end
    end
  end
end
task city: :environment do
# Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get city in country
cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
{
......@@ -115,7 +124,7 @@ namespace :crawler do
task industry: :environment do
# Fetch and parse HTML document
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get industry
industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment