Commit 0ac0989b by Ngô Trung Hưng

fix code

parent 8fecd429
Pipeline #736 failed with stages
in 0 seconds
...@@ -4,4 +4,5 @@ ...@@ -4,4 +4,5 @@
# A city a job can be located in. Joined to jobs through city_jobs.
class City < ApplicationRecord
  has_many :city_jobs
  has_many :jobs, through: :city_jobs

  # Backed by the integer `area` column.
  # Fix: `domestic:1` was missing the space after the colon (style-guide).
  # NOTE(review): `range: 69` is not a real area — the crawler uses it as the
  # option-index cutoff separating domestic from international locations.
  # Consider moving that sentinel out of the enum into a named constant.
  enum area: { international: 0, domestic: 1, range: 69 }
end
# Switches cities.area from boolean to integer so the column can back the
# integer-based `enum area:` declared on the City model.
class ChangeColumnTableCity < ActiveRecord::Migration[5.2]
  def change
    change_column :cities, :area, :integer
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_07_28_021412) do ActiveRecord::Schema.define(version: 2020_07_29_064551) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
...@@ -26,7 +26,7 @@ ActiveRecord::Schema.define(version: 2020_07_28_021412) do ...@@ -26,7 +26,7 @@ ActiveRecord::Schema.define(version: 2020_07_28_021412) do
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "name" t.string "name"
t.boolean "area" t.integer "area"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
end end
......
...@@ -5,222 +5,204 @@ require 'open-uri' ...@@ -5,222 +5,204 @@ require 'open-uri'
# Crawler data # Crawler data
class Crawler class Crawler
COMPANY_SECURITY = 1 COMPANY_SECURITY = 1
NUMBER_LINK = 1 NUMBER_LINK = 2
SIZE_LI_INTERFACE_5 = 10 SIZE_LI_INTERFACE_5 = 10
INTERNATIONAL = 0
DOMESTIC = 1 def path_to_first_link
RANGE = 69 Rails.root.join('tmp', 'link.txt')
end
# Memoized logger dedicated to crawler runs, writing to log/crawler.log.
def logger
  return @logger if @logger

  @logger = Logger.new(Rails.root.join('log', 'crawler.log'))
end
# Returns the job link saved by the previous crawl, or '' when the marker
# file does not exist yet (first run) or is empty.
#
# Fix: the original called File.readlines(path, 'r') — the second positional
# argument of readlines is the line *separator*, not a file mode, so the
# content was split on every 'r' character and re-joined. File.read states
# the intent (read the whole file) directly.
def stop_crawler
  return '' unless File.exist?(path_to_first_link)

  File.read(path_to_first_link)
end
# Fetches +url+ and returns the parsed Nokogiri document.
#
# Fix: URI.escape is deprecated and removed in Ruby 3.0;
# URI::DEFAULT_PARSER.escape is the supported drop-in replacement.
def safe_link(url)
  Nokogiri::HTML(URI.open(URI.parse(URI::DEFAULT_PARSER.escape(url))))
end
# Walks the first +page+ listing pages collecting company and job
# detail-page links. Stops early once a job link matches the marker saved
# by the previous run (stop_crawler). Returns [company_links, job_links].
#
# Fixes:
# * the method parameter `page` was reassigned to a Nokogiri document
#   inside the loop — a fresh local `doc` now holds each parsed page;
# * File.write(path, nil) raises TypeError — the marker is only written
#   when at least one job link was collected.
def crawl_link(page)
  data = []
  website_companies = []
  website_jobs = []
  page.times do |i|
    doc = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
    website_companies += doc.search('.figcaption .caption @href').map(&:value).uniq
    website_jobs += doc.search('.figcaption .title .job_link @href').map(&:value)
    break if website_jobs.include?(stop_crawler)
  rescue StandardError => e
    logger.error "Crawler link on page have error #{e}"
  end
  website_companies = website_companies.select(&:present?)
  website_jobs = website_jobs.select(&:present?)
  # Remember the newest job link so the next run can stop early.
  File.write(path_to_first_link, website_jobs[0]) if website_jobs[0]
  data << website_companies << website_jobs
end
# Memoized [company_links, job_links] pair — the link crawl runs at most
# once per Crawler instance.
def link_job_and_companies
  return @link_job_and_companies if @link_job_and_companies

  @link_job_and_companies = crawl_link(NUMBER_LINK)
end
def self.safe_link(url)
Nokogiri::HTML(URI.parse(URI.escape(url)))
end
# Scrapes the location <select> on the all-jobs page and persists every
# option as a City. Options whose index falls past the City.areas['range']
# cutoff are marked international; the rest are domestic.
def craw_data_cities
  page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
  cutoff = City.areas['range']
  page.search('#location option').map(&:text).each_with_index do |name, idx|
    City.find_or_create_by(name: name) do |city|
      city.area = idx > cutoff ? City.areas['international'] : City.areas['domestic']
    end
  end
end
# Visits every company link collected by crawl_link and persists the
# company's name, address and short description.
#
# Fix: guard against a blank scraped name — without it,
# find_or_create_by(name: "") funnels every nameless page onto a single
# junk Company record (the pre-refactor code had this guard).
def craw_data_companies
  link_crawl = link_job_and_companies
  link_crawl[0].each do |url|
    page = safe_link(url)
    company_name = page.search('.company-info .content .name').text
    next if company_name.blank?

    Company.find_or_create_by(name: company_name) do |company|
      company.address = page.search('.company-info .info .content p:nth-child(3)').text
      company.short_description = page.search('.main-about-us .content').text
    end
  end
rescue StandardError => e
  logger.error "Crawler data companies has error: #{e}"
end
def make_data # def make_data
puts 'Please wait for crawl jobs data! . . .' # puts 'Please wait for crawl jobs data! . . .'
link_crawl = link_job_and_companies # link_crawl = link_job_and_companies
arr_link = [] # arr_link = []
link_crawl[1].each do |val| # link_crawl[1].each do |val|
break if @@stop_crawl == val # break if stop_crawler == val
arr_link << val # arr_link << val
end # end
arr_link.reverse!.each_with_index do |path, i| # arr_link.reverse!.each_with_index do |path, i|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path)))) # page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present? # if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
crawl_data_jobs_interface_1(page) # crawl_data_jobs_interface_1(page)
elsif page.search('section .template-200').text.present? # elsif page.search('section .template-200').text.present?
crawl_data_jobs_interface_2(page) # crawl_data_jobs_interface_2(page)
elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi') # elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi')
crawl_data_jobs_interface_5(page) # crawl_data_jobs_interface_5(page)
end # end
puts "#{i} - #{path}" # puts "#{i} - #{path}"
end # end
puts 'Crawler data jobs success!' # puts 'Crawler data jobs success!'
end # end
private # private
def add_data(data)
id_company = Company.find_by name: data[:company_name]
id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
id_job = Job.create!(name: data[:name],
company_id: id_company,
level: data[:level],
experience: data[:exprience],
salary: data[:salary],
create_date: data[:created_date],
expiration_date: data[:expiration_date],
description: data[:description])
make_foreign_industries_table(data[:industry_name], id_job.id)
make_foreign_cities_table(data[:city_name], id_job.id)
rescue StandardError => e
puts e
end
def crawl_data_jobs_interface_1(page) # def add_data(data)
data = {} # id_company = Company.find_by name: data[:company_name]
data[:name] = page.search('.apply-now-content .job-desc .title').text # id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
data[:company_name] = page.search('.apply-now-content .job-desc .job-company-name').text # id_job = Job.create!(name: data[:name],
location = [] # company_id: id_company,
length = page.search('.detail-box .map p a').size # level: data[:level],
length.times do |n| # experience: data[:exprience],
location << page.search(".detail-box .map p a:nth-child(#{n + 1})").text # salary: data[:salary],
end # create_date: data[:created_date],
data[:city_name] = location.join(',') # expiration_date: data[:expiration_date],
data[:created_date] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text # description: data[:description])
data[:expiration_date] = page.search('.item-blue .detail-box ul li:last')[1].text.delete!("[\n,\t,\r]").split(' ').last # make_foreign_industries_table(data[:industry_name], id_job.id)
data[:salary] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[1].text # make_foreign_cities_table(data[:city_name], id_job.id)
industries = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a').text # rescue StandardError => e
industries = industries.delete!("[\n,\t,\r]").split(' ').select(&:present?) # puts e
data[:industry_name] = industries.join(',') # end
data[:description] = page.search('.tabs .tab-content .detail-row:nth-child(n)').to_s
get_level = page.search('.item-blue .detail-box:last ul li:nth-child(3)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
get_level = get_level[1].to_s.strip
if get_level.blank?
g_level = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
data[:level] = g_level[1].to_s.strip
else
data[:level] = get_level
end
exp = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
exp = exp[1].to_s.strip
data[:exprience] = exp
add_data(data)
end
def crawl_data_jobs_interface_2(page) # def crawl_data_jobs_interface_1(page)
data = {} # data = {}
data[:name] = page.search('.apply-now-content .job-desc .title').text # data[:name] = page.search('.apply-now-content .job-desc .title').text
data[:company_name] = page.search('.top-job .top-job-info .tit_company').text # data[:company_name] = page.search('.apply-now-content .job-desc .job-company-name').text
locations = [] # location = []
length = page.search('.info-workplace .value a').size # length = page.search('.detail-box .map p a').size
length.times do |n| # length.times do |n|
locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text # location << page.search(".detail-box .map p a:nth-child(#{n + 1})").text
end # end
data[:city_name] = locations.join(',') # data[:city_name] = location.join(',')
data[:created_date] = '' # data[:created_date] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
expiration_date = page.search('.info li:nth-child(4)').text # data[:expiration_date] = page.search('.item-blue .detail-box ul li:last')[1].text.delete!("[\n,\t,\r]").split(' ').last
data[:expiration_date] = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last # data[:salary] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[1].text
data[:salary] = page.search('.info li:nth-child(3)').text.split('Lương').last.strip # industries = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a').text
data[:industry_name] = page.search('.info li:nth-child(5) .value').text # industries = industries.delete!("[\n,\t,\r]").split(' ').select(&:present?)
data[:description] = page.search('.left-col').to_s # data[:industry_name] = industries.join(',')
lv = page.search('.boxtp .info li:nth-child(2)').text # data[:description] = page.search('.tabs .tab-content .detail-row:nth-child(n)').to_s
data[:level] = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip # get_level = page.search('.item-blue .detail-box:last ul li:nth-child(3)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
exp = page.search('.info li:nth-child(6)').text # get_level = get_level[1].to_s.strip
data[:exprience] = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip # if get_level.blank?
add_data(data) # g_level = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
end # data[:level] = g_level[1].to_s.strip
# else
# data[:level] = get_level
# end
# exp = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
# exp = exp[1].to_s.strip
# data[:exprience] = exp
# add_data(data)
# end
def crawl_data_jobs_interface_5(page) # def crawl_data_jobs_interface_2(page)
data = {} # data = {}
data[:name] = page.search('.info-company h1').text # data[:name] = page.search('.apply-now-content .job-desc .title').text
data[:company_name] = page.search('.info-company .text-job h2').text # data[:company_name] = page.search('.top-job .top-job-info .tit_company').text
data[:city_name] = page.search('.DetailJobNew ul li:nth-child(1) a').text # locations = []
data[:created_date] = '' # length = page.search('.info-workplace .value a').size
data[:expiration_date] = page.search('.DetailJobNew li:nth-child(9) span').text.strip # length.times do |n|
data[:salary] = page.search('.DetailJobNew li:nth-child(3) span').text.strip # locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text
data[:industry_name] = page.search('.DetailJobNew li:nth-child(2) span').text.strip # end
data[:description] = page.search('.left-col .detail-row') # data[:city_name] = locations.join(',')
data[:level] = page.search('.DetailJobNew ul li:nth-child(6) span').text.strip # data[:created_date] = ''
data[:exprience] = page.search('.DetailJobNew li:nth-child(5) span').text.strip # expiration_date = page.search('.info li:nth-child(4)').text
add_data(data) # data[:expiration_date] = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last
end # data[:salary] = page.search('.info li:nth-child(3)').text.split('Lương').last.strip
# data[:industry_name] = page.search('.info li:nth-child(5) .value').text
# data[:description] = page.search('.left-col').to_s
# lv = page.search('.boxtp .info li:nth-child(2)').text
# data[:level] = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
# exp = page.search('.info li:nth-child(6)').text
# data[:exprience] = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# add_data(data)
# end
def make_foreign_industries_table(data, id_job) # def crawl_data_jobs_interface_5(page)
unless data.blank? && id_job.blank? # data = {}
content = data.split(',') # data[:name] = page.search('.info-company h1').text
content.each do |val| # data[:company_name] = page.search('.info-company .text-job h2').text
val.gsub!('&amp;', '&') if val.include?('&amp;') # data[:city_name] = page.search('.DetailJobNew ul li:nth-child(1) a').text
data_industry = Industry.find_by name: val.strip # data[:created_date] = ''
id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id # data[:expiration_date] = page.search('.DetailJobNew li:nth-child(9) span').text.strip
IndustryJob.create!(industry_id: id_industry, job_id: id_job) # data[:salary] = page.search('.DetailJobNew li:nth-child(3) span').text.strip
end # data[:industry_name] = page.search('.DetailJobNew li:nth-child(2) span').text.strip
end # data[:description] = page.search('.left-col .detail-row')
end # data[:level] = page.search('.DetailJobNew ul li:nth-child(6) span').text.strip
# data[:exprience] = page.search('.DetailJobNew li:nth-child(5) span').text.strip
# add_data(data)
# end
def make_foreign_cities_table(data, id_job) # def make_foreign_industries_table(data, id_job)
return if data.blank? && id_job.blank? # unless data.blank? && id_job.blank?
cities = data.split(',') # content = data.split(',')
cities.each do |city| # content.each do |val|
data_city = City.find_by name: city.strip # val.gsub!('&amp;', '&') if val.include?('&amp;')
id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id # data_industry = Industry.find_by name: val.strip
CityJob.create!(job_id: id_job, city_id: id_cities) # id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id
end # IndustryJob.create!(industry_id: id_industry, job_id: id_job)
end # end
end # end
# end
# def make_foreign_cities_table(data, id_job)
# return if data.blank? && id_job.blank?
# cities = data.split(',')
# cities.each do |city|
# data_city = City.find_by name: city.strip
# id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id
# CityJob.create!(job_id: id_job, city_id: id_cities)
# end
# end
...@@ -13,6 +13,5 @@ namespace :crawler do ...@@ -13,6 +13,5 @@ namespace :crawler do
cw = Crawler.new cw = Crawler.new
cw.craw_data_cities cw.craw_data_cities
cw.craw_data_companies cw.craw_data_companies
cw.make_data
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment