Commit 629f8bc6 by Trịnh Hoàng Phúc

Merge branch 'feature/crawl_all_job_using_parallel_mutex' into 'master'

Crawl all jobs using Parallel and a Mutex

See merge request !20
parents ab24f6a1 6f684333
Pipeline #622 failed with stages
in 0 seconds
......@@ -29,11 +29,13 @@ gem 'rsolr'
gem 'carrierwave'
gem 'activerecord-import'
gem 'activerecord-import', require: false
gem 'will_paginate'
gem 'settingslogic'
gem 'parallel', require: false
# Use Active Storage variant
# gem 'image_processing', '~> 1.2'
......
......@@ -160,6 +160,7 @@ GEM
nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0)
orm_adapter (0.5.0)
parallel (1.19.1)
pry (0.13.0)
coderay (~> 1.1)
method_source (~> 1.0)
......@@ -305,6 +306,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2)
meta-tags
mysql2
parallel
pry
puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2)
......
require "nokogiri"
require "open-uri"
require "parallel"
require "activerecord-import"
namespace :crawler do
desc "Crawler Careerbuilder"
......@@ -7,13 +9,15 @@ namespace :crawler do
task job: :environment do
# Define crawler logger
logger = Logger.new("log/crawler_logger.log")
html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
# Loop page
(1..2).each do |page|
(1..total_page).each do |page|
# Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item|
Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
begin
url = item.css(".figure .figcaption .title .job_link @href").text
html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
......@@ -62,7 +66,7 @@ namespace :crawler do
end
next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
next if html_company_detail.at_css(".jobsby-company").nil?
company_css = ".jobsby-company .company-introduction .company-info .info "
company_attributes = {
......@@ -80,19 +84,24 @@ namespace :crawler do
industry.text.tr(",","").squish
end
sleep rand
Mutex.new.synchronize {
result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
}
sleep rand
logger.info "Crawl success url : #{url}"
rescue Exception => e
rescue => e
logger.error e
next
end
}
end
end
task city: :environment do
# Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get city in country
cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
{
......@@ -115,7 +124,7 @@ namespace :crawler do
task industry: :environment do
# Fetch and parse HTML document
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
# Get industry
industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment