Crawl all jobs using parallel and mutex

6f684333 · Trịnh Hoàng Phúc · ab24f6a1 · 6f684333 · 6f684333 · 6f684333
Commit 6f684333 authored May 15, 2020 by Trịnh Hoàng Phúc
Hide whitespace changes
Inline Side-by-side

Showing with 87 additions and 74 deletions

Gemfile
+3 -1

Gemfile.lock
+2 -0

lib/tasks/crawler.rake
+82 -73

No files found.
--- a/Gemfile
+++ b/Gemfile
@@ -29,11 +29,13 @@ gem 'rsolr'
 gem 'carrierwave'
-gem 'activerecord-import'
+gem 'activerecord-import', require: false
 gem 'will_paginate'
 gem 'settingslogic'
+gem 'parallel', require: false
 # Use Active Storage variant
 # gem 'image_processing', '~> 1.2'

--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -160,6 +160,7 @@ GEM
    nokogiri (1.10.9-x86-mingw32)
      mini_portile2 (~> 2.4.0)
    orm_adapter (0.5.0)
+    parallel (1.19.1)
    pry (0.13.0)
      coderay (~> 1.1)
      method_source (~> 1.0)
@@ -305,6 +306,7 @@ DEPENDENCIES
  listen (>= 3.0.5, < 3.2)
  meta-tags
  mysql2
+  parallel
  pry
  puma (~> 4.1)
  rails (~> 6.0.2, >= 6.0.2.2)

--- a/lib/tasks/crawler.rake
+++ b/lib/tasks/crawler.rake
 require "nokogiri"
 require "open-uri"
+require "parallel"
+require "activerecord-import"
 namespace :crawler do
  desc "Crawler Careerbuilder"
@@ -7,92 +9,99 @@ namespace :crawler do
  task job: :environment do
    # Define crawler logger
    logger = Logger.new("log/crawler_logger.log")
+    html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
+    total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
    # Loop page
-    (1..2).each do |page|
+    (1..total_page).each do |page|
      # Fetch and parse HTML document
      html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
      # Loop item
-      html_jobs.css(".jobs-side-list .job-item").each do |item|
+      Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
-        url = item.css(".figure .figcaption .title .job_link @href").text
+        begin
-        html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
+          url = item.css(".figure .figcaption .title .job_link @href").text
-        if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
+          html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
-          logger.warn "Another template #{url}"
+          if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
-          next
+            logger.warn "Another template #{url}"
-        end
+            next
-        # Set salary, min-salary, max-salary
-        if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
-          logger.warn "Another template #{url}"
-          next
-        end
-        salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
-        min_salary, max_salary = CrawlerService.convert_salary(salary)
-        # Job attributes
-        job_attributes = {
-          title: item.at_css(".figure .figcaption .title a @title").text,
-          updated_date_job: item.at_css(".bottom-right-icon .time time").text,
-          salary: salary,
-          min_salary: min_salary,
-          max_salary: max_salary
-        }
-        html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
-          type = ele.at_css("strong").text
-          case type
-          when "Hết hạn nộp"
-            job_attributes[:expiration_date] = ele.at_css("p").text.squish
-          when "Cấp bậc"
-            job_attributes[:level] = ele.at_css("p").text.squish
-          when "Kinh nghiệm"
-            job_attributes[:years_of_experience] = ele.at_css("p").text.squish
          end
-        end
+          # Set salary, min-salary, max-salary
-        html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
+          if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
-          next if ele.at_css(".detail-title").nil?
+            logger.warn "Another template #{url}"
-          type = ele.at_css(".detail-title").text
+            next
-          case type
-          when "Phúc lợi "
-            job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
-          when "Mô tả Công việc"
-            job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
-          when "Yêu Cầu Công Việc"
-            job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
-          when "Thông tin khác"
-            job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
          end
-        end
+          salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
-        next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
+          min_salary, max_salary = CrawlerService.convert_salary(salary)
-        # Company attributes
+          # Job attributes
-        html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
+          job_attributes = {
-        next if html_company_detail.at_css(".jobsby-company").nil?
+            title: item.at_css(".figure .figcaption .title a @title").text,
-        company_css = ".jobsby-company .company-introduction .company-info .info "
+            updated_date_job: item.at_css(".bottom-right-icon .time time").text,
-        company_attributes = {
+            salary: salary,
-          title: html_company_detail.at_css(company_css + ".content .name").text,
+            min_salary: min_salary,
-          address: html_company_detail.css(company_css + ".content p")[1].text,
+            max_salary: max_salary
-          logo: html_company_detail.at_css(company_css + ".img @src").text,
+          }
-          description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
+          html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
-        }
+            type = ele.at_css("strong").text
-        # Defind cities array
+            case type
-        cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
+            when "Hết hạn nộp"
-          city.text.squish
+              job_attributes[:expiration_date] = ele.at_css("p").text.squish
-        end
+            when "Cấp bậc"
-        # Defind industries array
+              job_attributes[:level] = ele.at_css("p").text.squish
-        industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
+            when "Kinh nghiệm"
-          industry.text.tr(",","").squish
+              job_attributes[:years_of_experience] = ele.at_css("p").text.squish
-        end
+            end
+          end
+          html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
+            next if ele.at_css(".detail-title").nil?
+            type = ele.at_css(".detail-title").text
+            case type
+            when "Phúc lợi "
+              job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
+            when "Mô tả Công việc"
+              job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
+            when "Yêu Cầu Công Việc"
+              job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
+            when "Thông tin khác"
+              job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
+            end
+          end
+          next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
+          # Company attributes
+          html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
+          next if html_company_detail.at_css(".jobsby-company").nil?
+          company_css = ".jobsby-company .company-introduction .company-info .info "
+          company_attributes = {
+            title: html_company_detail.at_css(company_css + ".content .name").text,
+            address: html_company_detail.css(company_css + ".content p")[1].text,
+            logo: html_company_detail.at_css(company_css + ".img @src").text,
+            description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
+          }
+          # Define cities array
+          cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
+            city.text.squish
+          end
+          # Define industries array
+          industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
+            industry.text.tr(",","").squish
+          end              
-        result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
+          sleep rand
+          Mutex.new.synchronize {
-        logger.info "Crawl success url : #{url}"
+            result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
-        rescue Exception => e
+          }
+          sleep rand
+          logger.info "Crawl success url : #{url}"
+        rescue => e
          logger.error e
          next
-      end
+        end
+      }
    end
  end
  task city: :environment do
    # Fetch and parse HTML document
-    html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
+    html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
    # Get city in country
    cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
      {
@@ -115,7 +124,7 @@ namespace :crawler do
  task industry: :environment do
    # Fetch and parse HTML document
-    html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html"))
+    html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
    # Get industry
    industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
      {