Crawl all jobs using Parallel and Mutex

6f684333 · Trịnh Hoàng Phúc · ab24f6a1 · 6f684333 · 6f684333 · 6f684333
Commit 6f684333 authored May 15, 2020 by Trịnh Hoàng Phúc
Show whitespace changes
Inline Side-by-side

Showing 3 changed files with 21 additions and 8 deletions

Gemfile
+3 -1

Gemfile.lock
+2 -0

lib/tasks/crawler.rake
+16 -7

No files found.
--- a/Gemfile
+++ b/Gemfile
@@ -29,11 +29,13 @@ gem 'rsolr'
 gem 'carrierwave'
-gem 'activerecord-import'
+gem 'activerecord-import', require: false
 gem 'will_paginate'
 gem 'settingslogic'
+gem 'parallel', require: false
 # Use Active Storage variant
 # gem 'image_processing', '~> 1.2'

--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -160,6 +160,7 @@ GEM
    nokogiri (1.10.9-x86-mingw32)
      mini_portile2 (~> 2.4.0)
    orm_adapter (0.5.0)
+    parallel (1.19.1)
    pry (0.13.0)
      coderay (~> 1.1)
      method_source (~> 1.0)
@@ -305,6 +306,7 @@ DEPENDENCIES
  listen (>= 3.0.5, < 3.2)
  meta-tags
  mysql2
+  parallel
  pry
  puma (~> 4.1)
  rails (~> 6.0.2, >= 6.0.2.2)

--- a/lib/tasks/crawler.rake
+++ b/lib/tasks/crawler.rake
 require "nokogiri"
 require "open-uri"
+require "parallel"
+require "activerecord-import"
 namespace :crawler do
  desc "Crawler Careerbuilder"
@@ -7,13 +9,15 @@ namespace :crawler do
  task job: :environment do
    # Define crawler logger
    logger = Logger.new("log/crawler_logger.log")
+    html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
+    total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
    # Loop page
-    (1..2).each do |page|
+    (1..total_page).each do |page|
      # Fetch and parse HTML document
      html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
      # Loop item
-      html_jobs.css(".jobs-side-list .job-item").each do |item|
+      Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
+        begin
          url = item.css(".figure .figcaption .title .job_link @href").text
          html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
          if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
@@ -62,7 +66,7 @@ namespace :crawler do
          end
          next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
          # Company attributes
-        html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
+          html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
          next if html_company_detail.at_css(".jobsby-company").nil?
          company_css = ".jobsby-company .company-introduction .company-info .info "
          company_attributes = {
@@ -80,19 +84,24 @@ namespace :crawler do
            industry.text.tr(",","").squish
          end              
+          sleep rand
+          Mutex.new.synchronize {
            result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
+          }
+          sleep rand
          logger.info "Crawl success url : #{url}"
-        rescue Exception => e
+        rescue => e
          logger.error e
          next
        end
+      }
    end
  end
  task city: :environment do
    # Fetch and parse HTML document
-    html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
+    html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
    # Get city in country
    cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
      {
@@ -115,7 +124,7 @@ namespace :crawler do
  task industry: :environment do
    # Fetch and parse HTML document
-    html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
+    html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
    # Get industry
    industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
      {