Commit a0abd223 by Ngô Trung Hưng

use Template Method Pattern

parent 0ac0989b
Pipeline #746 failed with stages
in 0 seconds
......@@ -4,5 +4,5 @@
# City where a job is located. Linked to jobs through the city_jobs
# join table. `area` distinguishes domestic (Vietnamese) cities from
# international ones.
class City < ApplicationRecord
  has_many :city_jobs
  has_many :jobs, through: :city_jobs

  # BUG FIX: the enum was declared twice (a stale copy carried the magic
  # sentinel `range: 69`, which is not a real area). Keep the single clean
  # declaration; callers that used City.areas['range'] must use a plain
  # constant (e.g. RANGE = 69) instead.
  enum area: { international: 0, domestic: 1 }
end
......@@ -9,8 +9,9 @@ Bundler.require(*Rails.groups)
module Venjob
class Application < Rails::Application
# Initialize configuration defaults for originally generated Rails version.
config.autoload_paths << Rails.root.join('lib')
config.eager_load_paths << Rails.root.join('lib')
config.load_defaults 5.2
# Settings in config/environments/* take precedence over those specified here.
# Application configuration can go into files in config/initializers
# -- all .rb files in that directory are automatically loaded after loading
......
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Example:
#
# set :output, "/path/to/my/cron_log.log"
#
# every 2.hours do
# command "/usr/bin/some_great_command"
# runner "MyModel.some_method"
# rake "some:great:rake:task"
# end
#
# every 4.days do
# runner "AnotherModel.prune_old_records"
# end
# Learn more: http://github.com/javan/whenever
# Let cron inherit the current PATH so rake/bundler resolve correctly.
env :PATH, ENV['PATH']

# Re-run the crawler every hour to pick up newly posted jobs.
every 1.hour do
  rake 'crawler:populate'
end
\ No newline at end of file
# frozen_string_literal: true
require 'nokogiri'
require 'open-uri'
require 'logger'
# Template Method base class: scrapes one careerbuilder.vn job-detail page
# into a Hash. Subclasses (RedInterface, BlueInterface, GreenInterface)
# override the fill_* hooks for page layouts whose CSS/XPath selectors
# differ; RedInterface uses these defaults unchanged.
class Base
# Fallback company id used when a scraped company cannot be matched.
# NOTE(review): duplicated in Crawler — consider centralising.
COMPANY_SECURITY = 1
attr_accessor :job, :page
# page: a parsed HTML document (presumably Nokogiri) for one job posting.
def initialize(page)
@job = {}
@page = page
end
# Lazily opened log for scrape failures.
def logger
@logger ||= Logger.new(Rails.root.join('log', 'crawl.log'))
end
# Template method: runs all fill_* hooks and returns the populated Hash.
# On any scrape error it logs and falls through — the method then returns
# logger.error's value (truthy but NOT a Hash), so callers must type-check.
def create_data
take_data
job
rescue StandardError => e
logger.error "Crawler data job have error: #{e}"
end
private
# Populates every job attribute via the overridable fill_* hooks.
# NOTE(review): :level is filled by `fill_lever` and :exprience is
# misspelled — both names are kept because other classes read these keys.
def take_data
job[:name] = fill_name
job[:company_name] = fill_company_name
job[:city_name] = fill_city_name
job[:created_date] = fill_created_date
job[:expiration_date] = fill_expiration_date
job[:salary] = fill_salary
job[:industry_name] = fill_industry_name
job[:description] = fill_description
job[:level] = fill_lever
job[:exprience] = fill_experience
end
# Job title from the page header.
def fill_name
page.search('.apply-now-content .job-desc .title').text
end
def fill_company_name
page.search('.apply-now-content .job-desc .job-company-name').text
end
# All location links joined with commas (split again when persisting).
def fill_city_name
page.search('.detail-box .map p a').map(&:text).join(',')
end
# First <p> of the first detail item; [0] is nil when the node is missing,
# so .text raises and create_data's rescue logs it.
def fill_created_date
page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
end
def fill_expiration_date
page.xpath('//ul//li[last()]//p').last.text
end
def fill_salary
page.xpath('//ul//li[position()=1]//p')[1].text
end
def fill_industry_name
industries = page.xpath('//ul//li[position()=2]//p//a').map(&:text)
industries.map(&:strip).join(',')
end
# Keeps the raw HTML of the description section. Also assigns
# job[:description] directly — redundant, take_data assigns it again.
def fill_description
job[:description] = page.search('.tabs .tab-content .detail-row').to_s
end
# True when the detail list mentions experience ('Kinh nghiệm'), which
# shifts the position of the level/experience rows.
def check
noname = page.search('//ul//li').text
noname.include?('Kinh nghiệm')
end
# NOTE(review): "lever" is a typo for "level"; kept because subclasses
# override this exact name.
def fill_lever
if check
page.xpath('//ul//li[position()=3]//p')[1].text.strip
else
page.xpath('//ul//li[position()=2]//p')[1].text
end
end
def fill_experience
check ? page.xpath('//ul//li[position()=2]//p')[1].text.strip : ''
end
end
# frozen_string_literal: true
require 'open-uri'
# Crawls careerbuilder.vn listing pages for company links and seeds the
# City / Company tables. CrawlerJob extends this for job pages.
class Crawler
  # Fallback company id when a job's company cannot be matched.
  COMPANY_SECURITY = 1
  # Maximum number of listing pages to walk when collecting links.
  NUMBER_LINK = 100
  # Expected <li> count identifying the "green" job-page layout.
  SIZE_LI = 8
  # Index in the #location dropdown after which cities are international.
  RANGE = 69

  # File remembering the newest job link seen on the previous run.
  def path_to_first_link
    Rails.root.join('tmp', 'link.txt')
  end

  def logger
    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
  end

  # Returns the link recorded on the previous run, or 'NOT' (a value that
  # can never match a real href) when nothing has been recorded yet.
  def link_make_stop_crawler
    # BUG FIX: File.readlines' second argument is the line *separator*, not
    # a mode string — passing 'r' split the file on the letter "r" instead
    # of on newlines.
    file = File.readlines(path_to_first_link) if File.exist?(path_to_first_link)
    file.blank? ? 'NOT' : file.join
  end

  # Fetches and parses a URL, escaping unsafe characters first.
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3.0); replace
  # with CGI.escape/Addressable when upgrading the Ruby version.
  def safe_link(url)
    Nokogiri::HTML(URI.open(URI.parse(URI.escape(url))))
  end

  # Walks up to `page` listing pages and returns the unique, non-blank
  # company hrefs found, stopping early once the previously seen newest
  # job link reappears.
  def crawl_link(page)
    website_companies = []
    page.times do |i|
      # Use a distinct local instead of shadowing the `page` parameter.
      listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      website_companies += listing.search('.figcaption .caption @href').map(&:value).uniq
      link_jobs = listing.search('.figcaption .title .job_link @href').text
      break if link_jobs.include?(link_make_stop_crawler)
    end
    website_companies.select(&:present?)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
    # BUG FIX: previously the rescue clause returned logger.error's truthy
    # value, so callers iterated over `true` and crashed. Return whatever
    # was collected before the failure instead.
    website_companies.select(&:present?)
  end

  # Seeds the City table from the location dropdown; entries past RANGE in
  # the list are international.
  def craw_data_cities
    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    locations = page.search('#location option').map(&:text)
    locations.each_with_index do |val, index|
      area = index > RANGE ? City.areas['international'] : City.areas['domestic']
      City.find_or_create_by(name: val) { |city| city.area = area }
    end
  end

  # Visits each collected company page and persists name/address/summary.
  def craw_data_companies
    crawl_link(NUMBER_LINK).each do |url|
      next if url.include?('javascript:void(0);')
      page = safe_link(url)
      company_name = page.search('.company-info .content .name').text
      Company.find_or_create_by(name: company_name) do |company|
        company.address = page.search('.company-info .info .content p:nth-child(3)').text
        company.short_description = page.search('.main-about-us .content').text
      end
    rescue StandardError => e
      # Per-URL rescue: one bad company page must not abort the whole run.
      logger.error "Crawler data companies has error: #{e}"
    end
  end
end
# frozen_string_literal: true
require 'src/crawler.rb'
require_relative '../src/interface/red_interface.rb'
require_relative '../src/interface/blue_interface.rb'
require_relative '../src/interface/green_interface.rb'
# Crawls individual job pages, dispatches each page to the layout-specific
# scraper (Red/Blue/Green interface), and persists jobs plus their
# industry/city relations.
class CrawlerJob < Crawler
  # Collects job hrefs from up to `page` listing pages, stopping once the
  # link recorded on the previous run reappears; remembers the newest link
  # for the next run. Returns non-blank hrefs (newest first).
  def crawl_link(page)
    website_jobs = []
    page.times do |i|
      # Distinct local — do not shadow the `page` parameter.
      listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      website_jobs += listing.search('.figcaption .title .job_link @href').map(&:value)
      break if website_jobs.include?(link_make_stop_crawler)
    end
    File.write(path_to_first_link, website_jobs[0])
    website_jobs.select(&:present?)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
    # BUG FIX: return the partial list instead of logger.error's value so
    # callers can always iterate.
    website_jobs.select(&:present?)
  end

  # Oldest-first ordering so jobs are inserted in posting order.
  def reverse_arr
    crawl_link(NUMBER_LINK).reverse
  end

  # Scrapes every collected job page with the scraper matching its layout.
  def craw_data_jobs
    reverse_arr.each do |path|
      page = safe_link(path)
      data =
        if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
          RedInterface.new(page).create_data
        elsif page.search('section .template-200').text.present?
          BlueInterface.new(page).create_data
        elsif page.search('.DetailJobNew ul li').size == SIZE_LI && page.search('.right-col ul li').text.exclude?('Độ tuổi')
          GreenInterface.new(page).create_data
        end
      # BUG FIX: previously a stale @data from the prior iteration was
      # re-inserted whenever no layout matched (or scraping failed, in which
      # case create_data returns a non-Hash). Skip those pages instead.
      add_data(data) if data.is_a?(Hash)
    end
  end

  # Persists one scraped job Hash; unknown companies fall back to
  # COMPANY_SECURITY. Errors are logged, not raised.
  def add_data(data)
    return if data.blank?
    company = Company.find_by name: data[:company_name]
    company_id = company.present? ? company.id : COMPANY_SECURITY
    job = Job.create!(name: data[:name],
                      company_id: company_id,
                      level: data[:level],
                      experience: data[:exprience], # key misspelled by Base — kept for compatibility
                      salary: data[:salary],
                      create_date: data[:created_date],
                      expiration_date: data[:expiration_date],
                      description: data[:description])
    create_industry_relation(data[:industry_name], job.id)
    create_city_relation(data[:city_name], job.id)
  rescue StandardError => e
    logger.error "Crawler data jobs has error: #{e}"
  end

  # Splits the comma-joined industry names and links each to the job.
  def create_industry_relation(data, id_job)
    # BUG FIX: was `&&`, which let a nil `data` through when id_job was
    # present (NoMethodError on split) and allowed nil job ids.
    return if data.blank? || id_job.blank?
    data.split(',').each do |val|
      val.gsub!('&amp;', '&') if val.include?('&amp;')
      industry = Industry.find_or_create_by name: val.strip
      IndustryJob.create(industry_id: industry.id, job_id: id_job)
    end
  end

  # Splits the comma-joined city names and links each to the job.
  def create_city_relation(data, id_job)
    # BUG FIX: no guard existed — nil data raised NoMethodError on split.
    return if data.blank? || id_job.blank?
    data.split(',').each do |name|
      # BUG FIX: area was part of the lookup, so a city already stored with
      # a different area was duplicated. Look up by name only and default
      # the area just for newly created rows (matches craw_data_cities).
      city = City.find_or_create_by(name: name.strip) { |c| c.area = City.areas['domestic'] }
      CityJob.create(job_id: id_job, city_id: city.id)
    end
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper hooks for the "blue" page layout — overrides the Base selectors
# that differ on this template.
class BlueInterface < Base
  def fill_company_name
    page.search('.top-job .top-job-info .tit_company').text
  end

  # All workplace links, comma-joined.
  def fill_city_name
    names = page.search('.info-workplace .value a').map { |node| node.text }
    names.join(',')
  end

  # This layout exposes no creation date.
  def fill_created_date
    nil
  end

  def fill_expiration_date
    page.xpath('//ul//li[position()=4]//div').text
  end

  def fill_salary
    page.xpath('//ul//li[position()=3]//div').text
  end

  def fill_industry_name
    page.xpath('//ul//li[position()=5]//div').text
  end

  # Raw HTML of the whole left column.
  def fill_description
    page.search('.left-col').to_s
  end

  # True when the second list item is labelled 'Cấp bậc' (level).
  def check
    label = page.xpath('//ul//li[position()=2]/b').last.text
    label.include?('Cấp bậc')
  end

  def fill_lever
    if check
      page.xpath('//ul//li[position()=2]/div').last.text
    else
      ''
    end
  end

  def fill_experience
    page.xpath('//ul//li[position()=7]/b').text
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper hooks for the "green" (.DetailJobNew) page layout.
class GreenInterface < Base
  def fill_name
    page.search('.info-company h1').text
  end

  def fill_company_name
    page.search('.info-company .text-job h2').text
  end

  def fill_city_name
    page.search('.DetailJobNew ul li:nth-child(1) a').text
  end

  # This layout carries no creation date.
  def fill_created_date
    nil
  end

  def fill_expiration_date
    page.xpath('//ul//li[last()-1]//span').children[1].text
  end

  def fill_salary
    page.xpath('//ul//li[last()-2]//span').text
  end

  def fill_industry_name
    page.search('.DetailJobNew li:nth-child(3) span').text.strip
  end

  def fill_description
    page.search('.left-col .detail-row').text
  end

  def fill_lever
    page.search('.DetailJobNew li:nth-child(2) span').text.strip
  end

  # True when the detail list mentions experience ('Kinh nghiệm').
  def check_exp
    page.search('.DetailJobNew li span').text.include?('Kinh nghiệm')
  end

  def fill_experience
    return '' unless check_exp
    page.search('.DetailJobNew li:nth-child(5) span').text.strip
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper for the default "red" page layout: Base's selectors already match
# this template, so no fill_* hooks need overriding.
class RedInterface < Base
end
# frozen_string_literal: true
require 'open-uri'
# Older combined crawler: collects company and job links in a single pass
# and seeds the City / Company tables.
class Crawler
  # Fallback company id when a job's company cannot be matched.
  COMPANY_SECURITY = 1
  # Maximum number of listing pages to walk.
  NUMBER_LINK = 2
  # Expected <li> count identifying the "interface 5" job-page layout.
  SIZE_LI_INTERFACE_5 = 10
  # Index in the #location dropdown after which cities are international.
  RANGE = 69

  # File remembering the newest job link seen on the previous run.
  def path_to_first_link
    Rails.root.join('tmp', 'link.txt')
  end

  def logger
    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
  end

  # Link recorded on the previous run, or '' when nothing was recorded.
  def stop_crawler
    # BUG FIX: File.readlines' second argument is the line *separator*, not
    # a mode string — passing 'r' split the file on the letter "r".
    file = File.readlines(path_to_first_link) if File.exist?(path_to_first_link)
    file.blank? ? '' : file.join
  end

  # Fetches and parses a URL, escaping unsafe characters first.
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3.0).
  def safe_link(url)
    Nokogiri::HTML(URI.open(URI.parse(URI.escape(url))))
  end

  # Walks up to `page` listing pages; returns [company_links, job_links]
  # (both non-blank), stopping early once the previously recorded newest
  # job link reappears. Also records the newest job link for the next run.
  def crawl_link(page)
    website_companies = []
    website_jobs = []
    begin
      page.times do |i|
        # Distinct local — do not shadow the `page` parameter.
        listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
        website_companies += listing.search('.figcaption .caption @href').map(&:value).uniq
        website_jobs += listing.search('.figcaption .title .job_link @href').map(&:value)
        break if website_jobs.include?(stop_crawler)
      end
    rescue StandardError => e
      logger.error "Crawler link on page have error #{e}"
    end
    website_companies = website_companies.select(&:present?)
    website_jobs = website_jobs.select(&:present?)
    File.write(path_to_first_link, website_jobs[0])
    [website_companies, website_jobs]
  end

  # Memoizes one crawl so companies and jobs share a single pass.
  def link_job_and_companies
    @link_job_and_companies ||= crawl_link(NUMBER_LINK)
  end

  # Seeds the City table from the location dropdown.
  def craw_data_cities
    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    locations = page.search('#location option').map(&:text)
    locations.each_with_index do |val, index|
      # BUG FIX: City.areas['range'] relied on the removed `range: 69` enum
      # value and would yield nil (raising on the comparison); use the
      # RANGE constant, matching the newer Crawler implementation.
      area = index > RANGE ? City.areas['international'] : City.areas['domestic']
      City.find_or_create_by(name: val) { |city| city.area = area }
    end
  end

  # Visits each collected company page and persists name/address/summary.
  def craw_data_companies
    link_crawl = link_job_and_companies
    link_crawl[0].each do |url|
      page = safe_link(url)
      company_name = page.search('.company-info .content .name').text
      Company.find_or_create_by(name: company_name) do |company|
        company.address = page.search('.company-info .info .content p:nth-child(3)').text
        company.short_description = page.search('.main-about-us .content').text
      end
    end
  rescue StandardError => e
    logger.error "Crawler data companies has error: #{e}"
  end
end
# def make_data
# puts 'Please wait for crawl jobs data! . . .'
# link_crawl = link_job_and_companies
# arr_link = []
# link_crawl[1].each do |val|
# break if stop_crawler == val
# arr_link << val
# end
# arr_link.reverse!.each_with_index do |path, i|
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
# if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
# crawl_data_jobs_interface_1(page)
# elsif page.search('section .template-200').text.present?
# crawl_data_jobs_interface_2(page)
# elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi')
# crawl_data_jobs_interface_5(page)
# end
# puts "#{i} - #{path}"
# end
# puts 'Crawler data jobs success!'
# end
# private
# def add_data(data)
# id_company = Company.find_by name: data[:company_name]
# id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
# id_job = Job.create!(name: data[:name],
# company_id: id_company,
# level: data[:level],
# experience: data[:exprience],
# salary: data[:salary],
# create_date: data[:created_date],
# expiration_date: data[:expiration_date],
# description: data[:description])
# make_foreign_industries_table(data[:industry_name], id_job.id)
# make_foreign_cities_table(data[:city_name], id_job.id)
# rescue StandardError => e
# puts e
# end
# def crawl_data_jobs_interface_1(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.apply-now-content .job-desc .job-company-name').text
# location = []
# length = page.search('.detail-box .map p a').size
# length.times do |n|
# location << page.search(".detail-box .map p a:nth-child(#{n + 1})").text
# end
# data[:city_name] = location.join(',')
# data[:created_date] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
# data[:expiration_date] = page.search('.item-blue .detail-box ul li:last')[1].text.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[1].text
# industries = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a').text
# industries = industries.delete!("[\n,\t,\r]").split(' ').select(&:present?)
# data[:industry_name] = industries.join(',')
# data[:description] = page.search('.tabs .tab-content .detail-row:nth-child(n)').to_s
# get_level = page.search('.item-blue .detail-box:last ul li:nth-child(3)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# get_level = get_level[1].to_s.strip
# if get_level.blank?
# g_level = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# data[:level] = g_level[1].to_s.strip
# else
# data[:level] = get_level
# end
# exp = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
# exp = exp[1].to_s.strip
# data[:exprience] = exp
# add_data(data)
# end
# def crawl_data_jobs_interface_2(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.top-job .top-job-info .tit_company').text
# locations = []
# length = page.search('.info-workplace .value a').size
# length.times do |n|
# locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text
# end
# data[:city_name] = locations.join(',')
# data[:created_date] = ''
# expiration_date = page.search('.info li:nth-child(4)').text
# data[:expiration_date] = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.info li:nth-child(3)').text.split('Lương').last.strip
# data[:industry_name] = page.search('.info li:nth-child(5) .value').text
# data[:description] = page.search('.left-col').to_s
# lv = page.search('.boxtp .info li:nth-child(2)').text
# data[:level] = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
# exp = page.search('.info li:nth-child(6)').text
# data[:exprience] = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# add_data(data)
# end
# def crawl_data_jobs_interface_5(page)
# data = {}
# data[:name] = page.search('.info-company h1').text
# data[:company_name] = page.search('.info-company .text-job h2').text
# data[:city_name] = page.search('.DetailJobNew ul li:nth-child(1) a').text
# data[:created_date] = ''
# data[:expiration_date] = page.search('.DetailJobNew li:nth-child(9) span').text.strip
# data[:salary] = page.search('.DetailJobNew li:nth-child(3) span').text.strip
# data[:industry_name] = page.search('.DetailJobNew li:nth-child(2) span').text.strip
# data[:description] = page.search('.left-col .detail-row')
# data[:level] = page.search('.DetailJobNew ul li:nth-child(6) span').text.strip
# data[:exprience] = page.search('.DetailJobNew li:nth-child(5) span').text.strip
# add_data(data)
# end
# def make_foreign_industries_table(data, id_job)
# unless data.blank? && id_job.blank?
# content = data.split(',')
# content.each do |val|
# val.gsub!('&amp;', '&') if val.include?('&amp;')
# data_industry = Industry.find_by name: val.strip
# id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id
# IndustryJob.create!(industry_id: id_industry, job_id: id_job)
# end
# end
# end
# def make_foreign_cities_table(data, id_job)
# return if data.blank? && id_job.blank?
# cities = data.split(',')
# cities.each do |city|
# data_city = City.find_by name: city.strip
# id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id
# CityJob.create!(job_id: id_job, city_id: id_cities)
# end
# end
# frozen_string_literal: true
require 'open-uri'
require 'src/interface_web'
require 'src/crawler'
require 'src/crawler_job'
# rake task
namespace :crawler do
......@@ -13,5 +14,6 @@ namespace :crawler do
cw = Crawler.new
cw.craw_data_cities
cw.craw_data_companies
CrawlerJob.new.craw_data_jobs
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment