Merge branch 'crawler' into 'master'

done crawler See merge request !2

Merge branch 'crawler' into 'master'
done crawler See merge request !2
5131650d · Hoang Phuc Do · 2e9845a6 · 0b319ea9 · 5131650d · 5131650d
Commit 5131650d authored Aug 04, 2020 by Hoang Phuc Do
19 changed files
--- a/Gemfile
+++ b/Gemfile
@@ -26,7 +26,8 @@ gem 'jbuilder', '~> 2.5'
 # gem 'redis', '~> 4.0'
 # Use ActiveModel has_secure_password
 # gem 'bcrypt', '~> 3.1.7'
-
+gem 'nokogiri'
+gem 'whenever'
 # Use ActiveStorage variant
 # gem 'mini_magick', '~> 4.8'


--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -64,6 +64,7 @@ GEM
    chromedriver-helper (2.1.1)
      archive-zip (~> 0.10)
      nokogiri (~> 1.8)
+    chronic (0.10.2)
    coffee-rails (4.2.2)
      coffee-script (>= 2.2.0)
      railties (>= 4.0.0)
@@ -183,6 +184,8 @@ GEM
    websocket-driver (0.7.3)
      websocket-extensions (>= 0.1.0)
    websocket-extensions (0.1.5)
+    whenever (1.0.0)
+      chronic (>= 0.6.3)
    xpath (3.2.0)
      nokogiri (~> 1.8)

@@ -198,6 +201,7 @@ DEPENDENCIES
  jbuilder (~> 2.5)
  listen (>= 3.0.5, < 3.2)
  mysql2 (>= 0.4.4, < 0.6.0)
+  nokogiri
  puma (~> 3.11)
  rails (~> 5.2.4, >= 5.2.4.3)
  sass-rails (~> 5.0)
@@ -208,6 +212,7 @@ DEPENDENCIES
  tzinfo-data
  uglifier (>= 1.3.0)
  web-console (>= 3.3.0)
+  whenever

 RUBY VERSION
   ruby 2.6.6p146

--- a/app/assets/javascripts/home.coffee
+++ b/app/assets/javascripts/home.coffee
+# Place all the behaviors and hooks related to the matching controller here.
+# All this logic will automatically be available in application.js.
+# You can use CoffeeScript in this file: http://coffeescript.org/
--- a/app/assets/javascripts/job.coffee
+++ b/app/assets/javascripts/job.coffee
+# Place all the behaviors and hooks related to the matching controller here.
+# All this logic will automatically be available in application.js.
+# You can use CoffeeScript in this file: http://coffeescript.org/
--- a/app/assets/stylesheets/home.scss
+++ b/app/assets/stylesheets/home.scss
+// Place all the styles related to the home controller here.
+// They will automatically be included in application.css.
+// You can use Sass (SCSS) here: http://sass-lang.com/
--- a/app/assets/stylesheets/job.scss
+++ b/app/assets/stylesheets/job.scss
+// Place all the styles related to the job controller here.
+// They will automatically be included in application.css.
+// You can use Sass (SCSS) here: http://sass-lang.com/
--- a/app/models/city.rb
+++ b/app/models/city.rb
@@ -2,6 +2,7 @@

 # Description/Explanation of Person class
 class City < ApplicationRecord
+  RANGE = 69
  has_many :city_jobs
  has_many :jobs, through: :city_jobs
  scope :all_cities, -> { select :id, :name }

--- a/app/models/company.rb
+++ b/app/models/company.rb
@@ -2,5 +2,6 @@

 # Description/Explanation of Person class
 class Company < ApplicationRecord
+  COMPANY_SECURITY = 1
  has_many :jobs
 end
--- a/config/application.rb
+++ b/config/application.rb
@@ -10,7 +10,7 @@ module Venjob
  class Application < Rails::Application
    # Initialize configuration defaults for originally generated Rails version.
    config.load_defaults 5.2
-
+    config.autoload_paths << Rails.root.join('lib/src')
    # Settings in config/environments/* take precedence over those specified here.
    # Application configuration can go into files in config/initializers
    # -- all .rb files in that directory are automatically loaded after loading

--- a/config/schedule.rb
+++ b/config/schedule.rb
+# Use this file to easily define all of your cron jobs.
+#
+# It's helpful, but not entirely necessary to understand cron before proceeding.
+# http://en.wikipedia.org/wiki/Cron
+
+# Example:
+#
+# set :output, "/path/to/my/cron_log.log"
+#
+# every 2.hours do
+#   command "/usr/bin/some_great_command"
+#   runner "MyModel.some_method"
+#   rake "some:great:rake:task"
+# end
+#
+# every 4.days do
+#   runner "AnotherModel.prune_old_records"
+# end
+
+# Learn more: http://github.com/javan/whenever
+# Pass the PATH of the process that generates the crontab on to the cron
+# entries (see the whenever README for `env`).
+env(:PATH, ENV['PATH'])
+
+# Run the crawler rake task at a one-hour interval.
+every(1.hours) { rake 'crawler:populate' }
\ No newline at end of file
--- a/db/migrate/20200729064551_change_column_table_city.rb
+++ b/db/migrate/20200729064551_change_column_table_city.rb
+class ChangeColumnTableCity < ActiveRecord::Migration[5.2]
+  def change
+    change_column :cities, :area, :integer
+    #Ex:- change_column("admin_users", "email", :string, :limit =>25)
+  end
+end
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.

-ActiveRecord::Schema.define(version: 2020_07_28_021412) do
+ActiveRecord::Schema.define(version: 2020_07_29_064551) do

  create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
    t.bigint "user_id"
@@ -26,7 +26,7 @@ ActiveRecord::Schema.define(version: 2020_07_28_021412) do

  create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
    t.string "name"
-    t.boolean "area"
+    t.integer "area"
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
  end

--- a/lib/src/base/base.rb
+++ b/lib/src/base/base.rb
+# frozen_string_literal: true
+
+require 'nokogiri'
+require 'open-uri'
+require 'logger'
+
+# Parsers that turn one crawled job page into a plain Hash of job fields.
+module Base
+  # Reads the job fields out of a parsed page using the selectors below.
+  # Subclasses override individual fill_* methods to support other page
+  # layouts.
+  class Base
+    attr_accessor :job, :page
+
+    # page - parsed document; it must respond to #search and #xpath
+    #        (callers pass the result of Nokogiri::HTML).
+    def initialize(page)
+      @job = {}
+      @page = page
+    end
+
+    # Logger writing to log/crawl.log, created on first use.
+    def logger
+      @logger ||= Logger.new(Rails.root.join('log', 'crawl.log'))
+    end
+
+    # Extracts every field from the page.
+    #
+    # Returns the populated job Hash, or nil when any extractor raised a
+    # StandardError (the error is logged). Previously the logger's own return
+    # value leaked out as the result on failure.
+    def create_data
+      take_data
+      job
+    rescue StandardError => e
+      logger.error "Crawler data job have error: #{e}"
+      nil
+    end
+
+    private
+
+    # Fills the job Hash field by field. The :exprience key keeps its
+    # historical spelling because CrawlerJob#add_data reads it under that name.
+    def take_data
+      job[:name] = fill_name
+      job[:company_name] = fill_company_name
+      job[:city_name] = fill_city_name
+      job[:created_date] = fill_created_date
+      job[:expiration_date] = fill_expiration_date
+      job[:salary] = fill_salary
+      job[:industry_name] = fill_industry_name
+      job[:description] = fill_description
+      job[:level] = fill_lever
+      job[:exprience] = fill_experience
+    end
+
+    def fill_name
+      page.search('.apply-now-content .job-desc .title').text
+    end
+
+    def fill_company_name
+      page.search('.apply-now-content .job-desc .job-company-name').text
+    end
+
+    # City names joined with ',' (split again by the caller).
+    def fill_city_name
+      page.search('.detail-box .map p a').map(&:text).join(',')
+    end
+
+    # Returns nil instead of raising when the node is missing.
+    def fill_created_date
+      page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].try(:text)
+    end
+
+    def fill_expiration_date
+      page.xpath('//ul//li[last()]//p').last.text
+    end
+
+    def fill_salary
+      page.xpath('//ul//li[position()=1]//p')[1].text
+    end
+
+    # Industry names, each stripped, joined with ','.
+    def fill_industry_name
+      industries = page.xpath('//ul//li[position()=2]//p//a').map(&:text)
+      industries.map(&:strip).join(',')
+    end
+
+    # Description markup as an HTML string. The value is stored by take_data,
+    # so it is no longer also assigned to job[:description] here.
+    def fill_description
+      page.search('.tabs .tab-content .detail-row').to_s
+    end
+
+    # True when any list item on the page mentions 'Kinh nghiệm'.
+    def exist_experience?
+      page.search('//ul//li').text.include?('Kinh nghiệm')
+    end
+
+    # The level is read from the third list item when an experience entry is
+    # present, otherwise from the second one.
+    def fill_lever
+      return page.xpath('//ul//li[position()=3]//p')[1].text.strip if exist_experience?
+
+      page.xpath('//ul//li[position()=2]//p')[1].text
+    end
+
+    # Experience text, or '' when the page has no experience entry.
+    def fill_experience
+      return '' unless exist_experience?
+
+      page.xpath('//ul//li[position()=2]//p')[1].text.strip
+    end
+  end
+end
--- a/lib/src/crawler.rb
+++ b/lib/src/crawler.rb
+# frozen_string_literal: true
+
+require 'open-uri'
+
+# Crawls careerbuilder.vn listing pages and stores cities and companies.
+class Crawler
+  attr_accessor :number_link
+
+  # number_link - how many listing pages to walk through.
+  def initialize(number_link)
+    @number_link = number_link
+  end
+
+  # File holding the job link at which crawling stops.
+  def path_to_first_link
+    Rails.root.join('tmp', 'link.txt')
+  end
+
+  # Logger writing to log/crawler.log, created on first use.
+  def logger
+    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
+  end
+
+  # Returns the stored stop link, or the sentinel 'NOT' when the marker file
+  # is missing or blank.
+  #
+  # The file is read with File.read: the former File.readlines(path, 'r')
+  # passed 'r' as a line separator rather than an open mode and only worked
+  # because the pieces were joined back together.
+  def link_make_stop_crawler
+    return 'NOT' unless File.exist?(path_to_first_link)
+
+    stored_link = File.read(path_to_first_link)
+    stored_link.blank? ? 'NOT' : stored_link
+  end
+
+  # Downloads and parses url, escaping characters that are not valid in a URI.
+  # URI::DEFAULT_PARSER.escape is what the obsolete URI.escape delegated to;
+  # URI.escape itself was removed in Ruby 3.0.
+  def safe_link(url)
+    Nokogiri::HTML(URI.open(URI::DEFAULT_PARSER.escape(url)))
+  end
+
+  # Collects company page links from the listing pages, stopping after the
+  # page that contains the stored stop link.
+  #
+  # Always returns an Array: on any StandardError the error is logged and []
+  # is returned, so callers can iterate the result safely.
+  def crawl_link
+    stop_link = link_make_stop_crawler # read once instead of once per page
+    website_companies = []
+    number_link.times do |i|
+      page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
+      website_companies += page.search('.figcaption .caption @href').map(&:value)
+      link_jobs = page.search('.figcaption .title .job_link @href').text
+      break if link_jobs.include?(stop_link)
+    end
+    # De-duplicate across pages so each company page is fetched only once.
+    website_companies.select(&:present?).uniq
+  rescue StandardError => e
+    logger.error "Crawler link on page have error #{e}"
+    []
+  end
+
+  # Creates a City for every option of the #location select that does not
+  # exist yet. Options after index City::RANGE get the 'international' area,
+  # the others 'domestic'.
+  def craw_data_cities
+    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
+    locations = page.search('#location option').map(&:text)
+    locations.each_with_index do |name, index|
+      area_key = index > City::RANGE ? 'international' : 'domestic'
+      City.find_or_create_by(name: name) { |city| city.area = City.areas[area_key] }
+    end
+  end
+
+  # Visits every crawled company link and creates the Company when no record
+  # with that name exists. A failure on one company is logged and does not
+  # stop the remaining ones.
+  def craw_data_companies
+    crawl_link.each do |url|
+      page = safe_link(url)
+      company_name = page.search('.company-info .content .name').text
+      Company.find_or_create_by(name: company_name) do |company|
+        company.address = page.search('.company-info .info .content p:nth-child(3)').text
+        company.short_description = page.search('.main-about-us .content').text
+      end
+    rescue StandardError => e
+      logger.error "Crawler data companies has error: #{e}"
+    end
+  end
+end
--- a/lib/src/crawler_job.rb
+++ b/lib/src/crawler_job.rb
+# frozen_string_literal: true
+
+# Crawls job detail pages and stores them as Job records together with their
+# industry and city relations.
+class CrawlerJob < Crawler
+  # Number of list items that identifies the "green" page layout.
+  SIZE_LI = 8
+
+  # Collects job links from the listing pages in page order, stopping at the
+  # stored stop link.
+  #
+  # Always returns an Array: on any StandardError the error is logged and []
+  # is returned, so #parse_data no longer fails on a non-Array result.
+  def crawl_link
+    stop_link = link_make_stop_crawler # read once instead of once per link
+    website_jobs = []
+    number_link.times do |i|
+      page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
+      page.search('.figcaption .title .job_link @href').each do |href|
+        link = href.value
+        return website_jobs if link.include?(stop_link)
+
+        website_jobs << link
+      end
+    end
+    website_jobs
+  rescue StandardError => e
+    logger.error "Crawler link jobs on page have error #{e}"
+    []
+  end
+
+  # Crawled job links in reverse listing order, memoized.
+  def parse_data
+    @parse_data ||= crawl_link.reverse!
+  end
+
+  # Stores the last entry of #parse_data as the stop link for the next run.
+  def refresh_first_link
+    File.write(path_to_first_link, parse_data.last)
+  end
+
+  # Parses every crawled job page and stores it.
+  #
+  # The parsed data is a local, so a page with an unsupported layout or a
+  # failed parse is logged and skipped instead of re-inserting the previous
+  # page's job.
+  # NOTE(review): refresh_first_link writes the same (newest) link after
+  # every job, so an abort part-way through marks unprocessed jobs as done -
+  # confirm whether that is intended.
+  def craw_data_jobs
+    parse_data.each do |path|
+      data = build_job_data(safe_link(path))
+      if data.is_a?(Hash)
+        add_data(data)
+      else
+        logger.warn "Crawler data jobs skipped, no data parsed: #{path}"
+      end
+      refresh_first_link
+    end
+  end
+
+  # Creates the Job described by data and links its industries and cities.
+  # Falls back to Company::COMPANY_SECURITY when no company with the crawled
+  # name exists. Errors are logged, not raised.
+  def add_data(data)
+    id_company = Company.find_by(name: data[:company_name])&.id || Company::COMPANY_SECURITY
+    job = Job.create(name: data[:name],
+                    company_id: id_company,
+                    level: data[:level],
+                    experience: data[:exprience],
+                    salary: data[:salary],
+                    create_date: data[:created_date],
+                    expiration_date: data[:expiration_date],
+                    description: data[:description])
+    create_industry_relation(data[:industry_name], job)
+    create_city_relation(data[:city_name], job)
+  rescue StandardError => e
+    logger.error "Crawler data jobs has error: #{e}"
+  end
+
+  # Links job to every industry named in the comma-separated data string,
+  # creating missing industries. '&amp;' is decoded to '&'; blank names are
+  # skipped.
+  def create_industry_relation(data, job)
+    data.split(',').each do |raw_name|
+      name = raw_name.gsub('&amp;', '&').strip
+      next if name.empty?
+
+      job.industries << Industry.find_or_create_by(name: name)
+    end
+  end
+
+  # Links job to every city named in the comma-separated data string.
+  #
+  # Cities are looked up by name only; the 'domestic' area is applied just
+  # when a new record is created. Looking up by name and area duplicated
+  # cities already stored with a different area.
+  def create_city_relation(data, job)
+    data.split(',').map(&:strip).reject(&:empty?).each do |name|
+      city = City.find_or_create_by(name: name) { |record| record.area = City.areas['domestic'] }
+      job.cities << city
+    end
+  end
+
+  private
+
+  # Runs the parser matching the page layout; nil when no layout matches.
+  def build_job_data(page)
+    interface = interface_for(page)
+    interface&.new(page)&.create_data
+  end
+
+  # Picks the parser class from markers found in the page, or nil.
+  def interface_for(page)
+    if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
+      Interface::RedInterface
+    elsif page.search('section .template-200').text.present?
+      Interface::BlueInterface
+    elsif green_layout?(page)
+      Interface::GreenInterface
+    end
+  end
+
+  # Green layout: exactly SIZE_LI detail items and no 'Độ tuổi' entry in the
+  # right column.
+  def green_layout?(page)
+    page.search('.DetailJobNew ul li').size == SIZE_LI &&
+      page.search('.right-col ul li').text.exclude?('Độ tuổi')
+  end
+end
--- a/lib/src/interface/blue_interface.rb
+++ b/lib/src/interface/blue_interface.rb
+# frozen_string_literal: true
+
+# Inherience from base
+module Interface
+  class BlueInterface < Base::Base
+    def fill_company_name
+      page.search('.top-job .top-job-info .tit_company').text
+    end
+
+    def fill_city_name
+      page.search('.info-workplace .value a').map(&:text).join(',')
+    end
+
+    def fill_expiration_date
+      page.xpath('//ul//li[position()=4]//div').text
+    end
+
+    def fill_salary
+      page.xpath('//ul//li[position()=3]//div').text
+    end
+
+    def fill_industry_name
+      page.xpath('//ul//li[position()=5]//div').text
+    end
+
+    def fill_description
+      page.search('.left-col').to_s
+    end
+
+    def exist_level?
+      noname = page.xpath('//ul//li[position()=2]/b').last.text
+      noname.include?('Cấp bậc')
+    end
+
+    def fill_lever
+      exist_level? ? page.xpath('//ul//li[position()=2]/div').last.text : ''
+    end
+
+    def fill_experience
+      page.xpath('//ul//li[position()=7]/b').text
+    end
+  end
+end
--- a/lib/src/interface/green_interface.rb
+++ b/lib/src/interface/green_interface.rb
+# frozen_string_literal: true
+
+# Green Interface
+module Interface
+  class GreenInterface < Base::Base
+    def fill_name
+      page.search('.info-company h1').text
+    end
+
+    def fill_company_name
+      page.search('.info-company .text-job h2').text
+    end
+
+    def fill_city_name
+      page.search('.DetailJobNew ul li:nth-child(1) a').text
+    end
+
+    def fill_expiration_date
+      page.xpath('//ul//li[last()-1]//span').children[1].text
+    end
+
+    def fill_salary
+      page.xpath('//ul//li[last()-2]//span').text
+    end
+
+    def fill_industry_name
+      page.search('.DetailJobNew li:nth-child(3) span').text.strip
+    end
+
+    def fill_description
+      page.search('.left-col .detail-row').text
+    end
+
+    def fill_lever
+      page.search('.DetailJobNew li:nth-child(2) span').text.strip
+    end
+
+    def exist_experience?
+      noname = page.search('.DetailJobNew li span').text
+      noname.include?('Kinh nghiệm')
+    end
+
+    def fill_experience
+      exist_experience? ? page.search('.DetailJobNew li:nth-child(5) span').text.strip : ''
+    end
+  end
+end
--- a/lib/src/interface/red_interface.rb
+++ b/lib/src/interface/red_interface.rb
+# frozen_string_literal: true
+
+# Parser for the "red" job page layout; inherits from Base::Base.
+module Interface
+  # Uses every extractor of Base::Base unchanged.
+  class RedInterface < Base::Base
+  end
+end
--- a/lib/tasks/crawler.rake
+++ b/lib/tasks/crawler.rake
+# frozen_string_literal: true
+
+require 'open-uri'
+
+# rake task
+namespace :crawler do
+  task populate: :environment do
+    NUMBER_LINK_WILL_BE_CRAWLER = 5
+    Company.find_or_create_by(name: 'Bảo mật') do |company|
+      company.address = 'Vui lòng xem trong mô tả công việc'
+      company.short_description = 'Vui lòng xem trong mô tả công việc'
+    end
+    cw = Crawler.new(NUMBER_LINK_WILL_BE_CRAWLER)
+    cw.craw_data_cities
+    cw.craw_data_companies
+    CrawlerJob.new(NUMBER_LINK_WILL_BE_CRAWLER).craw_data_jobs            
+  end
+end