Commit 3c7e899c by Ngô Trung Hưng

fix autoload

parent 37488f20
Pipeline #789 canceled with stages
in 0 seconds
...@@ -10,7 +10,7 @@ module Venjob ...@@ -10,7 +10,7 @@ module Venjob
class Application < Rails::Application class Application < Rails::Application
# Initialize configuration defaults for originally generated Rails version. # Initialize configuration defaults for originally generated Rails version.
config.load_defaults 5.2 config.load_defaults 5.2
config.autoload_paths << Rails.root.join('lib') config.autoload_paths << Rails.root.join('lib/src')
# Settings in config/environments/* take precedence over those specified here. # Settings in config/environments/* take precedence over those specified here.
# Application configuration can go into files in config/initializers # Application configuration can go into files in config/initializers
# -- all .rb files in that directory are automatically loaded after loading # -- all .rb files in that directory are automatically loaded after loading
......
...@@ -5,87 +5,85 @@ require 'open-uri' ...@@ -5,87 +5,85 @@ require 'open-uri'
require 'logger' require 'logger'
# Crawler data # Crawler data
# Crawler data: base scraper for CareerBuilder job detail pages.
# Subclasses (Interface::*Interface) override the fill_* hooks for page
# templates whose CSS/XPath selectors differ from this default layout.
module Base
  class Base
    attr_accessor :job, :page

    # page: parsed HTML document of one job detail page (Nokogiri::HTML
    # document — assumed from callers; confirm against CrawlerJob).
    def initialize(page)
      @job = {}
      @page = page
    end

    def logger
      @logger ||= Logger.new(Rails.root.join('log', 'crawl.log'))
    end

    # Scrapes every field into the job hash and returns it.
    # On any scraping error the failure is logged instead of raised.
    def create_data
      take_data
      job
    rescue StandardError => e
      logger.error "Crawler data job have error: #{e}"
    end

    private

    # Populates all job fields from the page.
    # NOTE(review): :exprience and fill_lever are long-standing typos
    # ("experience"/"level"); kept as-is because add_data and the
    # Interface subclasses depend on these exact names/keys.
    def take_data
      job[:name] = fill_name
      job[:company_name] = fill_company_name
      job[:city_name] = fill_city_name
      job[:created_date] = fill_created_date
      job[:expiration_date] = fill_expiration_date
      job[:salary] = fill_salary
      job[:industry_name] = fill_industry_name
      job[:description] = fill_description
      job[:level] = fill_lever
      job[:exprience] = fill_experience
    end

    def fill_name
      page.search('.apply-now-content .job-desc .title').text
    end

    def fill_company_name
      page.search('.apply-now-content .job-desc .job-company-name').text
    end

    # Comma-joined list of workplace cities.
    def fill_city_name
      page.search('.detail-box .map p a').map(&:text).join(',')
    end

    def fill_created_date
      page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].try(:text)
    end

    def fill_expiration_date
      page.xpath('//ul//li[last()]//p').last.text
    end

    def fill_salary
      page.xpath('//ul//li[position()=1]//p')[1].text
    end

    def fill_industry_name
      industries = page.xpath('//ul//li[position()=2]//p//a').map(&:text)
      industries.map(&:strip).join(',')
    end

    # Fix: return the markup like every other fill_* helper instead of
    # assigning job[:description] itself — take_data already stores the
    # return value, so the old self-assignment was a redundant double write.
    def fill_description
      page.search('.tabs .tab-content .detail-row').to_s
    end

    # True when the page lists an experience row ("Kinh nghiệm").
    def exist_experience?
      noname = page.search('//ul//li').text
      noname.include?('Kinh nghiệm')
    end

    # Level sits one <li> later when an experience row is present.
    def fill_lever
      exist_experience? ? page.xpath('//ul//li[position()=3]//p')[1].text.strip : page.xpath('//ul//li[position()=2]//p')[1].text
    end

    def fill_experience
      exist_experience? ? page.xpath('//ul//li[position()=2]//p')[1].text.strip : ''
    end
  end
end
...@@ -3,65 +3,63 @@ ...@@ -3,65 +3,63 @@
require 'open-uri' require 'open-uri'
# Crawler data # Crawler data
# Crawls careerbuilder.vn listing pages for company profile links and
# seeds City/Company records. Subclassed by CrawlerJob for job pages.
class Crawler
  attr_accessor :number_link

  # number_link: how many listing pages to visit (Integer).
  def initialize(number_link)
    @number_link = number_link
  end

  # Marker file holding the most recently crawled job link.
  def path_to_first_link
    Rails.root.join('tmp', 'link.txt')
  end

  def logger
    @logger ||= Logger.new(Rails.root.join('log', 'crawler.log'))
  end

  # Content of the marker file, or 'NOT' when the file is absent/empty.
  # Fix: File.readlines' second argument is a line SEPARATOR, not a file
  # mode — the old `File.readlines(path, 'r')` silently split the content
  # on the letter "r" (the later join happened to hide the damage).
  # Also: nil/empty? replaces ActiveSupport's blank? with the stdlib
  # equivalent for an Array-or-nil value.
  def link_make_stop_crawler
    file = File.readlines(path_to_first_link) if File.exist?(path_to_first_link)
    file.nil? || file.empty? ? 'NOT' : file.join
  end

  # Fetches and parses a URL, escaping it first.
  # NOTE(review): URI.escape is deprecated and removed in Ruby 3.0 —
  # replace with pre-encoded URLs or Addressable when upgrading Ruby.
  def safe_link(url)
    Nokogiri::HTML(URI.open(URI.escape(url)))
  end

  # Walks up to number_link listing pages collecting company profile
  # links, stopping early once the previously crawled job link reappears.
  # Returns the non-blank links. Fix: on error the old rescue returned
  # logger.error's result (true), which crashed callers that iterate the
  # result — now the links gathered so far are returned instead.
  def crawl_link
    website_companies = []
    number_link.times do |i|
      page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      link_companies = page.search('.figcaption .caption @href')
      website_companies += link_companies.map(&:value).uniq
      link_jobs = page.search('.figcaption .title .job_link @href').text
      break if link_jobs.include?(link_make_stop_crawler)
    end
    website_companies.select(&:present?)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
    website_companies.select(&:present?)
  end

  # Creates City records from the location dropdown; entries past
  # City::RANGE are classified as international.
  def craw_data_cities
    page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    locations = page.search('#location option').map(&:text)
    locations.each_with_index do |val, index|
      area = index > City::RANGE ? City.areas['international'] : City.areas['domestic']
      City.find_or_create_by(name: val) { |city| city.area = area }
    end
  end

  # Creates Company records from each crawled company profile page.
  # Failures for one company are logged and do not abort the rest.
  def craw_data_companies
    crawl_link.each do |url|
      page = safe_link(url)
      company_name = page.search('.company-info .content .name').text
      Company.find_or_create_by(name: company_name) do |company|
        company.address = page.search('.company-info .info .content p:nth-child(3)').text
        company.short_description = page.search('.main-about-us .content').text
      end
    rescue StandardError => e
      logger.error "Crawler data companies has error: #{e}"
    end
  end
end
# frozen_string_literal: true # frozen_string_literal: true
# Autoload # Crawler job
# Crawler job: collects job-detail links and stores Job records,
# dispatching each page layout to the matching Interface scraper.
class CrawlerJob < Crawler
  # Number of <li> items identifying the "green" page template.
  SIZE_LI = 8

  # Collects job-detail links from up to number_link listing pages,
  # stopping at the previously crawled link. Returns an array of URLs.
  # Fix: on error the old rescue returned logger.error's result (true),
  # which crashed parse_data's reverse! — now the links gathered so far
  # are returned instead.
  def crawl_link
    website_jobs = []
    number_link.times do |i|
      page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      link_jobs = page.search('.figcaption .title .job_link @href')
      link_jobs.each do |val|
        link = val.value
        return website_jobs if link.include?(link_make_stop_crawler)
        website_jobs << link
      end
    end
    website_jobs
  rescue StandardError => e
    logger.error "Crawler link jobs on page have error #{e}"
    website_jobs
  end

  # Links oldest-first (memoized), so the marker file can track progress.
  def parse_data
    @parse_data ||= crawl_link.reverse!
  end

  # Records the newest crawled link as the next run's stop marker.
  def refresh_first_link
    File.write(path_to_first_link, parse_data.last)
  end

  # Scrapes and stores every collected job page.
  # Fix: the old code kept the scraped data in @data, so a page matching
  # none of the three templates silently re-saved the PREVIOUS page's
  # data; a local variable plus a nil guard skips such pages instead.
  def craw_data_jobs
    parse_data.each do |path|
      page = safe_link(path)
      data =
        if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
          Interface::RedInterface.new(page).create_data
        elsif page.search('section .template-200').text.present?
          Interface::BlueInterface.new(page).create_data
        elsif page.search('.DetailJobNew ul li').size == SIZE_LI && page.search('.right-col ul li').text.exclude?('Độ tuổi')
          Interface::GreenInterface.new(page).create_data
        end
      add_data(data) if data
      refresh_first_link
    end
  end

  # Persists one scraped job hash plus its industry/city associations.
  # Unknown companies fall back to Company::COMPANY_SECURITY.
  def add_data(data)
    id_company = (Company.find_by name: data[:company_name]).try(:id) || Company::COMPANY_SECURITY
    job = Job.create(name: data[:name],
                     company_id: id_company,
                     level: data[:level],
                     experience: data[:exprience],
                     salary: data[:salary],
                     create_date: data[:created_date],
                     expiration_date: data[:expiration_date],
                     description: data[:description])
    create_industry_relation(data[:industry_name], job)
    create_city_relation(data[:city_name], job)
  rescue StandardError => e
    logger.error "Crawler data jobs has error: #{e}"
  end

  # data: comma-joined industry names; links each (HTML-unescaped) to job.
  def create_industry_relation(data, job)
    industries = data.split(',')
    industries.each do |val|
      val.gsub!('&amp;', '&') if val.include?('&amp;')
      industry = Industry.find_or_create_by name: val.strip
      job.industries << industry
    end
  end

  # data: comma-joined city names; links each (as domestic) to job.
  # (Renamed the block variable so it no longer shadows the City record.)
  def create_city_relation(data, job)
    cities = data.split(',')
    cities.each do |name|
      city = City.find_or_create_by(name: name.strip, area: City.areas['domestic'])
      job.cities << city
    end
  end
end
# frozen_string_literal: true # frozen_string_literal: true
# Inherience from base # Inherience from base
# Scraper for the "blue" job-page template (top-job / .info-* layout);
# selectors not overridden here fall back to Base::Base.
module Interface
  class BlueInterface < Base::Base
    def fill_company_name
      page.search('.top-job .top-job-info .tit_company').text
    end

    # Workplace cities, comma-joined.
    def fill_city_name
      page.search('.info-workplace .value a').map(&:text).join(',')
    end

    def fill_expiration_date
      page.xpath('//ul//li[position()=4]//div').text
    end

    def fill_salary
      page.xpath('//ul//li[position()=3]//div').text
    end

    def fill_industry_name
      page.xpath('//ul//li[position()=5]//div').text
    end

    def fill_description
      page.search('.left-col').to_s
    end

    # True when the second info row is labelled "Cấp bậc" (level).
    def exist_level?
      label = page.xpath('//ul//li[position()=2]/b').last.text
      label.include?('Cấp bậc')
    end

    def fill_lever
      return '' unless exist_level?

      page.xpath('//ul//li[position()=2]/div').last.text
    end

    def fill_experience
      page.xpath('//ul//li[position()=7]/b').text
    end
  end
end
# frozen_string_literal: true # frozen_string_literal: true
# ahihi # Green Interface
# Scraper for the "green" job-page template (.DetailJobNew layout);
# selectors not overridden here fall back to Base::Base.
module Interface
  class GreenInterface < Base::Base
    def fill_name
      page.search('.info-company h1').text
    end

    def fill_company_name
      page.search('.info-company .text-job h2').text
    end

    def fill_city_name
      page.search('.DetailJobNew ul li:nth-child(1) a').text
    end

    def fill_expiration_date
      page.xpath('//ul//li[last()-1]//span').children[1].text
    end

    def fill_salary
      page.xpath('//ul//li[last()-2]//span').text
    end

    def fill_industry_name
      page.search('.DetailJobNew li:nth-child(3) span').text.strip
    end

    def fill_description
      page.search('.left-col .detail-row').text
    end

    def fill_lever
      page.search('.DetailJobNew li:nth-child(2) span').text.strip
    end

    # True when any detail row mentions "Kinh nghiệm" (experience).
    def exist_experience?
      labels = page.search('.DetailJobNew li span').text
      labels.include?('Kinh nghiệm')
    end

    def fill_experience
      return '' unless exist_experience?

      page.search('.DetailJobNew li:nth-child(5) span').text.strip
    end
  end
end
# frozen_string_literal: true # frozen_string_literal: true
# Inherience from base # Inherience from base
# Scraper for the "red" job-page template: Base::Base's default
# selectors already match this layout, so nothing is overridden.
module Interface
  class RedInterface < Base::Base
  end
end
...@@ -10,9 +10,9 @@ namespace :crawler do ...@@ -10,9 +10,9 @@ namespace :crawler do
company.address = 'Vui lòng xem trong mô tả công việc' company.address = 'Vui lòng xem trong mô tả công việc'
company.short_description = 'Vui lòng xem trong mô tả công việc' company.short_description = 'Vui lòng xem trong mô tả công việc'
end end
cw = Src::Crawler.new(NUMBER_LINK_WILL_BE_CRAWLER) cw = Crawler.new(NUMBER_LINK_WILL_BE_CRAWLER)
cw.craw_data_cities cw.craw_data_cities
cw.craw_data_companies cw.craw_data_companies
Src::CrawlerJob.new(NUMBER_LINK_WILL_BE_CRAWLER).craw_data_jobs CrawlerJob.new(NUMBER_LINK_WILL_BE_CRAWLER).craw_data_jobs
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment