Commit a0abd223 by Ngô Trung Hưng

use Template Method Pattern

parent 0ac0989b
Pipeline #746 failed with stages
in 0 seconds
......@@ -4,5 +4,5 @@
# City where a job is located. Linked to jobs through the city_jobs
# join table. `area` distinguishes domestic (Vietnamese) cities from
# international ones.
class City < ApplicationRecord
  has_many :city_jobs
  has_many :jobs, through: :city_jobs

  # BUG FIX: the enum was declared twice (a stale copy carried the magic
  # sentinel `range: 69`, which is not a real area). Keep the single clean
  # declaration; callers that used City.areas['range'] must use a plain
  # constant (e.g. RANGE = 69) instead.
  enum area: { international: 0, domestic: 1 }
end
......@@ -9,8 +9,9 @@ Bundler.require(*Rails.groups)
module Venjob
class Application < Rails::Application
# Initialize configuration defaults for originally generated Rails version.
config.autoload_paths << Rails.root.join('lib')
config.eager_load_paths << Rails.root.join('lib')
config.load_defaults 5.2
# Settings in config/environments/* take precedence over those specified here.
# Application configuration can go into files in config/initializers
# -- all .rb files in that directory are automatically loaded after loading
......
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Example:
#
# set :output, "/path/to/my/cron_log.log"
#
# every 2.hours do
# command "/usr/bin/some_great_command"
# runner "MyModel.some_method"
# rake "some:great:rake:task"
# end
#
# every 4.days do
# runner "AnotherModel.prune_old_records"
# end
# Learn more: http://github.com/javan/whenever
# Let cron inherit the current PATH so rake/bundler resolve correctly.
env :PATH, ENV['PATH']

# Re-run the crawler every hour to pick up newly posted jobs.
every 1.hour do
  rake 'crawler:populate'
end
\ No newline at end of file
# frozen_string_literal: true
require 'nokogiri'
require 'open-uri'
require 'logger'
# Template Method base class: scrapes one careerbuilder.vn job-detail page
# into a Hash. Subclasses (RedInterface, BlueInterface, GreenInterface)
# override the fill_* hooks for page layouts whose CSS/XPath selectors
# differ; RedInterface uses these defaults unchanged.
class Base
# Fallback company id used when a scraped company cannot be matched.
# NOTE(review): duplicated in Crawler — consider centralising.
COMPANY_SECURITY = 1
attr_accessor :job, :page
# page: a parsed HTML document (presumably Nokogiri) for one job posting.
def initialize(page)
@job = {}
@page = page
end
# Lazily opened log for scrape failures.
def logger
@logger ||= Logger.new(Rails.root.join('log', 'crawl.log'))
end
# Template method: runs all fill_* hooks and returns the populated Hash.
# On any scrape error it logs and falls through — the method then returns
# logger.error's value (truthy but NOT a Hash), so callers must type-check.
def create_data
take_data
job
rescue StandardError => e
logger.error "Crawler data job have error: #{e}"
end
private
# Populates every job attribute via the overridable fill_* hooks.
# NOTE(review): :level is filled by `fill_lever` and :exprience is
# misspelled — both names are kept because other classes read these keys.
def take_data
job[:name] = fill_name
job[:company_name] = fill_company_name
job[:city_name] = fill_city_name
job[:created_date] = fill_created_date
job[:expiration_date] = fill_expiration_date
job[:salary] = fill_salary
job[:industry_name] = fill_industry_name
job[:description] = fill_description
job[:level] = fill_lever
job[:exprience] = fill_experience
end
# Job title from the page header.
def fill_name
page.search('.apply-now-content .job-desc .title').text
end
def fill_company_name
page.search('.apply-now-content .job-desc .job-company-name').text
end
# All location links joined with commas (split again when persisting).
def fill_city_name
page.search('.detail-box .map p a').map(&:text).join(',')
end
# First <p> of the first detail item; [0] is nil when the node is missing,
# so .text raises and create_data's rescue logs it.
def fill_created_date
page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
end
def fill_expiration_date
page.xpath('//ul//li[last()]//p').last.text
end
def fill_salary
page.xpath('//ul//li[position()=1]//p')[1].text
end
def fill_industry_name
industries = page.xpath('//ul//li[position()=2]//p//a').map(&:text)
industries.map(&:strip).join(',')
end
# Keeps the raw HTML of the description section. Also assigns
# job[:description] directly — redundant, take_data assigns it again.
def fill_description
job[:description] = page.search('.tabs .tab-content .detail-row').to_s
end
# True when the detail list mentions experience ('Kinh nghiệm'), which
# shifts the position of the level/experience rows.
def check
noname = page.search('//ul//li').text
noname.include?('Kinh nghiệm')
end
# NOTE(review): "lever" is a typo for "level"; kept because subclasses
# override this exact name.
def fill_lever
if check
page.xpath('//ul//li[position()=3]//p')[1].text.strip
else
page.xpath('//ul//li[position()=2]//p')[1].text
end
end
def fill_experience
check ? page.xpath('//ul//li[position()=2]//p')[1].text.strip : ''
end
end
# frozen_string_literal: true
require 'open-uri'
# Crawls careerbuilder.vn listing pages for company links and seeds the
# City / Company tables. CrawlerJob extends this for job pages.
class Crawler
  # Fallback company id when a job's company cannot be matched.
  COMPANY_SECURITY = 1
  # Maximum number of listing pages to walk when collecting links.
  NUMBER_LINK = 100
  # Expected <li> count identifying the "green" job-page layout.
  SIZE_LI = 8
  # Index in the #location dropdown after which cities are international.
  RANGE = 69

  # File remembering the newest job link seen on the previous run.
  def path_to_first_link
    Rails.root.join('tmp', 'link.txt')
  end

  def logger
    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
  end

  # Returns the link recorded on the previous run, or 'NOT' (a value that
  # can never match a real href) when nothing has been recorded yet.
  def link_make_stop_crawler
    # BUG FIX: File.readlines' second argument is the line *separator*, not
    # a mode string — passing 'r' split the file on the letter "r" instead
    # of on newlines.
    file = File.readlines(path_to_first_link) if File.exist?(path_to_first_link)
    file.blank? ? 'NOT' : file.join
  end

  # Fetches and parses a URL, escaping unsafe characters first.
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3.0); replace
  # with CGI.escape/Addressable when upgrading the Ruby version.
  def safe_link(url)
    Nokogiri::HTML(URI.open(URI.parse(URI.escape(url))))
  end

  # Walks up to `page` listing pages and returns the unique, non-blank
  # company hrefs found, stopping early once the previously seen newest
  # job link reappears.
  def crawl_link(page)
    website_companies = []
    page.times do |i|
      # Use a distinct local instead of shadowing the `page` parameter.
      listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      website_companies += listing.search('.figcaption .caption @href').map(&:value).uniq
      link_jobs = listing.search('.figcaption .title .job_link @href').text
      break if link_jobs.include?(link_make_stop_crawler)
    end
    website_companies.select(&:present?)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
    # BUG FIX: previously the rescue clause returned logger.error's truthy
    # value, so callers iterated over `true` and crashed. Return whatever
    # was collected before the failure instead.
    website_companies.select(&:present?)
  end

  # Seeds the City table from the location dropdown; entries past RANGE in
  # the list are international.
  def craw_data_cities
    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    locations = page.search('#location option').map(&:text)
    locations.each_with_index do |val, index|
      area = index > RANGE ? City.areas['international'] : City.areas['domestic']
      City.find_or_create_by(name: val) { |city| city.area = area }
    end
  end

  # Visits each collected company page and persists name/address/summary.
  def craw_data_companies
    crawl_link(NUMBER_LINK).each do |url|
      next if url.include?('javascript:void(0);')
      page = safe_link(url)
      company_name = page.search('.company-info .content .name').text
      Company.find_or_create_by(name: company_name) do |company|
        company.address = page.search('.company-info .info .content p:nth-child(3)').text
        company.short_description = page.search('.main-about-us .content').text
      end
    rescue StandardError => e
      # Per-URL rescue: one bad company page must not abort the whole run.
      logger.error "Crawler data companies has error: #{e}"
    end
  end
end
# frozen_string_literal: true
require 'src/crawler.rb'
require_relative '../src/interface/red_interface.rb'
require_relative '../src/interface/blue_interface.rb'
require_relative '../src/interface/green_interface.rb'
# Crawls individual job pages, dispatches each page to the layout-specific
# scraper (Red/Blue/Green interface), and persists jobs plus their
# industry/city relations.
class CrawlerJob < Crawler
  # Collects job hrefs from up to `page` listing pages, stopping once the
  # link recorded on the previous run reappears; remembers the newest link
  # for the next run. Returns non-blank hrefs (newest first).
  def crawl_link(page)
    website_jobs = []
    page.times do |i|
      # Distinct local — do not shadow the `page` parameter.
      listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      website_jobs += listing.search('.figcaption .title .job_link @href').map(&:value)
      break if website_jobs.include?(link_make_stop_crawler)
    end
    File.write(path_to_first_link, website_jobs[0])
    website_jobs.select(&:present?)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
    # BUG FIX: return the partial list instead of logger.error's value so
    # callers can always iterate.
    website_jobs.select(&:present?)
  end

  # Oldest-first ordering so jobs are inserted in posting order.
  def reverse_arr
    crawl_link(NUMBER_LINK).reverse
  end

  # Scrapes every collected job page with the scraper matching its layout.
  def craw_data_jobs
    reverse_arr.each do |path|
      page = safe_link(path)
      data =
        if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
          RedInterface.new(page).create_data
        elsif page.search('section .template-200').text.present?
          BlueInterface.new(page).create_data
        elsif page.search('.DetailJobNew ul li').size == SIZE_LI && page.search('.right-col ul li').text.exclude?('Độ tuổi')
          GreenInterface.new(page).create_data
        end
      # BUG FIX: previously a stale @data from the prior iteration was
      # re-inserted whenever no layout matched (or scraping failed, in which
      # case create_data returns a non-Hash). Skip those pages instead.
      add_data(data) if data.is_a?(Hash)
    end
  end

  # Persists one scraped job Hash; unknown companies fall back to
  # COMPANY_SECURITY. Errors are logged, not raised.
  def add_data(data)
    return if data.blank?
    company = Company.find_by name: data[:company_name]
    company_id = company.present? ? company.id : COMPANY_SECURITY
    job = Job.create!(name: data[:name],
                      company_id: company_id,
                      level: data[:level],
                      experience: data[:exprience], # key misspelled by Base — kept for compatibility
                      salary: data[:salary],
                      create_date: data[:created_date],
                      expiration_date: data[:expiration_date],
                      description: data[:description])
    create_industry_relation(data[:industry_name], job.id)
    create_city_relation(data[:city_name], job.id)
  rescue StandardError => e
    logger.error "Crawler data jobs has error: #{e}"
  end

  # Splits the comma-joined industry names and links each to the job.
  def create_industry_relation(data, id_job)
    # BUG FIX: was `&&`, which let a nil `data` through when id_job was
    # present (NoMethodError on split) and allowed nil job ids.
    return if data.blank? || id_job.blank?
    data.split(',').each do |val|
      val.gsub!('&amp;', '&') if val.include?('&amp;')
      industry = Industry.find_or_create_by name: val.strip
      IndustryJob.create(industry_id: industry.id, job_id: id_job)
    end
  end

  # Splits the comma-joined city names and links each to the job.
  def create_city_relation(data, id_job)
    # BUG FIX: no guard existed — nil data raised NoMethodError on split.
    return if data.blank? || id_job.blank?
    data.split(',').each do |name|
      # BUG FIX: area was part of the lookup, so a city already stored with
      # a different area was duplicated. Look up by name only and default
      # the area just for newly created rows (matches craw_data_cities).
      city = City.find_or_create_by(name: name.strip) { |c| c.area = City.areas['domestic'] }
      CityJob.create(job_id: id_job, city_id: city.id)
    end
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper hooks for the "blue" page layout — overrides the Base selectors
# that differ on this template.
class BlueInterface < Base
  def fill_company_name
    page.search('.top-job .top-job-info .tit_company').text
  end

  # All workplace links, comma-joined.
  def fill_city_name
    names = page.search('.info-workplace .value a').map { |node| node.text }
    names.join(',')
  end

  # This layout exposes no creation date.
  def fill_created_date
    nil
  end

  def fill_expiration_date
    page.xpath('//ul//li[position()=4]//div').text
  end

  def fill_salary
    page.xpath('//ul//li[position()=3]//div').text
  end

  def fill_industry_name
    page.xpath('//ul//li[position()=5]//div').text
  end

  # Raw HTML of the whole left column.
  def fill_description
    page.search('.left-col').to_s
  end

  # True when the second list item is labelled 'Cấp bậc' (level).
  def check
    label = page.xpath('//ul//li[position()=2]/b').last.text
    label.include?('Cấp bậc')
  end

  def fill_lever
    if check
      page.xpath('//ul//li[position()=2]/div').last.text
    else
      ''
    end
  end

  def fill_experience
    page.xpath('//ul//li[position()=7]/b').text
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper hooks for the "green" (.DetailJobNew) page layout.
class GreenInterface < Base
  def fill_name
    page.search('.info-company h1').text
  end

  def fill_company_name
    page.search('.info-company .text-job h2').text
  end

  def fill_city_name
    page.search('.DetailJobNew ul li:nth-child(1) a').text
  end

  # This layout carries no creation date.
  def fill_created_date
    nil
  end

  def fill_expiration_date
    page.xpath('//ul//li[last()-1]//span').children[1].text
  end

  def fill_salary
    page.xpath('//ul//li[last()-2]//span').text
  end

  def fill_industry_name
    page.search('.DetailJobNew li:nth-child(3) span').text.strip
  end

  def fill_description
    page.search('.left-col .detail-row').text
  end

  def fill_lever
    page.search('.DetailJobNew li:nth-child(2) span').text.strip
  end

  # True when the detail list mentions experience ('Kinh nghiệm').
  def check_exp
    page.search('.DetailJobNew li span').text.include?('Kinh nghiệm')
  end

  def fill_experience
    return '' unless check_exp
    page.search('.DetailJobNew li:nth-child(5) span').text.strip
  end
end
# frozen_string_literal: true
require_relative '../base/base.rb'
# Scraper for the default "red" page layout: Base's selectors already match
# this template, so no fill_* hooks need overriding.
class RedInterface < Base
end
# frozen_string_literal: true
require 'open-uri'
# Older combined crawler: collects company and job links in a single pass
# and seeds the City / Company tables.
class Crawler
  # Fallback company id when a job's company cannot be matched.
  COMPANY_SECURITY = 1
  # Maximum number of listing pages to walk.
  NUMBER_LINK = 2
  # Expected <li> count identifying the "interface 5" job-page layout.
  SIZE_LI_INTERFACE_5 = 10
  # Index in the #location dropdown after which cities are international.
  RANGE = 69

  # File remembering the newest job link seen on the previous run.
  def path_to_first_link
    Rails.root.join('tmp', 'link.txt')
  end

  def logger
    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
  end

  # Link recorded on the previous run, or '' when nothing was recorded.
  def stop_crawler
    # BUG FIX: File.readlines' second argument is the line *separator*, not
    # a mode string — passing 'r' split the file on the letter "r".
    file = File.readlines(path_to_first_link) if File.exist?(path_to_first_link)
    file.blank? ? '' : file.join
  end

  # Fetches and parses a URL, escaping unsafe characters first.
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3.0).
  def safe_link(url)
    Nokogiri::HTML(URI.open(URI.parse(URI.escape(url))))
  end

  # Walks up to `page` listing pages; returns [company_links, job_links]
  # (both non-blank), stopping early once the previously recorded newest
  # job link reappears. Also records the newest job link for the next run.
  def crawl_link(page)
    website_companies = []
    website_jobs = []
    begin
      page.times do |i|
        # Distinct local — do not shadow the `page` parameter.
        listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
        website_companies += listing.search('.figcaption .caption @href').map(&:value).uniq
        website_jobs += listing.search('.figcaption .title .job_link @href').map(&:value)
        break if website_jobs.include?(stop_crawler)
      end
    rescue StandardError => e
      logger.error "Crawler link on page have error #{e}"
    end
    website_companies = website_companies.select(&:present?)
    website_jobs = website_jobs.select(&:present?)
    File.write(path_to_first_link, website_jobs[0])
    [website_companies, website_jobs]
  end

  # Memoizes one crawl so companies and jobs share a single pass.
  def link_job_and_companies
    @link_job_and_companies ||= crawl_link(NUMBER_LINK)
  end

  # Seeds the City table from the location dropdown.
  def craw_data_cities
    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    locations = page.search('#location option').map(&:text)
    locations.each_with_index do |val, index|
      # BUG FIX: City.areas['range'] relied on the removed `range: 69` enum
      # value and would yield nil (raising on the comparison); use the
      # RANGE constant, matching the newer Crawler implementation.
      area = index > RANGE ? City.areas['international'] : City.areas['domestic']
      City.find_or_create_by(name: val) { |city| city.area = area }
    end
  end

  # Visits each collected company page and persists name/address/summary.
  def craw_data_companies
    link_crawl = link_job_and_companies
    link_crawl[0].each do |url|
      page = safe_link(url)
      company_name = page.search('.company-info .content .name').text
      Company.find_or_create_by(name: company_name) do |company|
        company.address = page.search('.company-info .info .content p:nth-child(3)').text
        company.short_description = page.search('.main-about-us .content').text
      end
    end
  rescue StandardError => e
    logger.error "Crawler data companies has error: #{e}"
  end
end
# def make_data
# puts 'Please wait for crawl jobs data! . . .'
# link_crawl = link_job_and_companies
# arr_link = []
# link_crawl[1].each do |val|
# break if stop_crawler == val
# arr_link << val
# end
# arr_link.reverse!.each_with_index do |path, i|
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
# if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
# crawl_data_jobs_interface_1(page)
# elsif page.search('section .template-200').text.present?
# crawl_data_jobs_interface_2(page)
# elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi')
# crawl_data_jobs_interface_5(page)
# end
# puts "#{i} - #{path}"
# end
# puts 'Crawler data jobs success!'
# end
# private
# def add_data(data)
# id_company = Company.find_by name: data[:company_name]
# id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
# id_job = Job.create!(name: data[:name],
# company_id: id_company,
# level: data[:level],
# experience: data[:exprience],
# salary: data[:salary],
# create_date: data[:created_date],
# expiration_date: data[:expiration_date],
# description: data[:description])
# make_foreign_industries_table(data[:industry_name], id_job.id)
# make_foreign_cities_table(data[:city_name], id_job.id)
# rescue StandardError => e
# puts e
# end
# def crawl_data_jobs_interface_1(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.apply-now-content .job-desc .job-company-name').text
# location = []
# length = page.search('.detail-box .map p a').size
# length.times do |n|
# location << page.search(".detail-box .map p a:nth-child(#{n + 1})").text
# end
# data[:city_name] = location.join(',')
# data[:created_date] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
# data[:expiration_date] = page.search('.item-blue .detail-box ul li:last')[1].text.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[1].text
# industries = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a').text
# industries = industries.delete!("[\n,\t,\r]").split(' ').select(&:present?)
# data[:industry_name] = industries.join(',')
# data[:description] = page.search('.tabs .tab-content .detail-row:nth-child(n)').to_s
# get_level = page.search('.item-blue .detail-box:last ul li:nth-child(3)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# get_level = get_level[1].to_s.strip
# if get_level.blank?
# g_level = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# data[:level] = g_level[1].to_s.strip
# else
# data[:level] = get_level
# end
# exp = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
# exp = exp[1].to_s.strip
# data[:exprience] = exp
# add_data(data)
# end
# def crawl_data_jobs_interface_2(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.top-job .top-job-info .tit_company').text
# locations = []
# length = page.search('.info-workplace .value a').size
# length.times do |n|
# locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text
# end
# data[:city_name] = locations.join(',')
# data[:created_date] = ''
# expiration_date = page.search('.info li:nth-child(4)').text
# data[:expiration_date] = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.info li:nth-child(3)').text.split('Lương').last.strip
# data[:industry_name] = page.search('.info li:nth-child(5) .value').text
# data[:description] = page.search('.left-col').to_s
# lv = page.search('.boxtp .info li:nth-child(2)').text
# data[:level] = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
# exp = page.search('.info li:nth-child(6)').text
# data[:exprience] = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# add_data(data)
# end
# def crawl_data_jobs_interface_5(page)
# data = {}
# data[:name] = page.search('.info-company h1').text
# data[:company_name] = page.search('.info-company .text-job h2').text
# data[:city_name] = page.search('.DetailJobNew ul li:nth-child(1) a').text
# data[:created_date] = ''
# data[:expiration_date] = page.search('.DetailJobNew li:nth-child(9) span').text.strip
# data[:salary] = page.search('.DetailJobNew li:nth-child(3) span').text.strip
# data[:industry_name] = page.search('.DetailJobNew li:nth-child(2) span').text.strip
# data[:description] = page.search('.left-col .detail-row')
# data[:level] = page.search('.DetailJobNew ul li:nth-child(6) span').text.strip
# data[:exprience] = page.search('.DetailJobNew li:nth-child(5) span').text.strip
# add_data(data)
# end
# def make_foreign_industries_table(data, id_job)
# unless data.blank? && id_job.blank?
# content = data.split(',')
# content.each do |val|
# val.gsub!('&amp;', '&') if val.include?('&amp;')
# data_industry = Industry.find_by name: val.strip
# id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id
# IndustryJob.create!(industry_id: id_industry, job_id: id_job)
# end
# end
# end
# def make_foreign_cities_table(data, id_job)
# return if data.blank? && id_job.blank?
# cities = data.split(',')
# cities.each do |city|
# data_city = City.find_by name: city.strip
# id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id
# CityJob.create!(job_id: id_job, city_id: id_cities)
# end
# end
# frozen_string_literal: true
require 'open-uri'
require 'src/interface_web'
require 'src/crawler'
require 'src/crawler_job'
# rake task
namespace :crawler do
......@@ -13,5 +14,6 @@ namespace :crawler do
cw = Crawler.new
cw.craw_data_cities
cw.craw_data_companies
CrawlerJob.new.craw_data_jobs
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment