Commit 5683fa11 by Ngô Trung Hưng

fix crawler

parent b5e63c5d
Pipeline #718 canceled with stages
in 0 seconds
...@@ -26,7 +26,8 @@ gem 'jbuilder', '~> 2.5' ...@@ -26,7 +26,8 @@ gem 'jbuilder', '~> 2.5'
# gem 'redis', '~> 4.0' # gem 'redis', '~> 4.0'
# Use ActiveModel has_secure_password # Use ActiveModel has_secure_password
# gem 'bcrypt', '~> 3.1.7' # gem 'bcrypt', '~> 3.1.7'
gem 'nokogiri'
gem 'whenever'
# Use ActiveStorage variant # Use ActiveStorage variant
# gem 'mini_magick', '~> 4.8' # gem 'mini_magick', '~> 4.8'
......
...@@ -64,6 +64,7 @@ GEM ...@@ -64,6 +64,7 @@ GEM
chromedriver-helper (2.1.1) chromedriver-helper (2.1.1)
archive-zip (~> 0.10) archive-zip (~> 0.10)
nokogiri (~> 1.8) nokogiri (~> 1.8)
chronic (0.10.2)
coffee-rails (4.2.2) coffee-rails (4.2.2)
coffee-script (>= 2.2.0) coffee-script (>= 2.2.0)
railties (>= 4.0.0) railties (>= 4.0.0)
...@@ -183,6 +184,8 @@ GEM ...@@ -183,6 +184,8 @@ GEM
websocket-driver (0.7.3) websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5) websocket-extensions (0.1.5)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0) xpath (3.2.0)
nokogiri (~> 1.8) nokogiri (~> 1.8)
...@@ -198,6 +201,7 @@ DEPENDENCIES ...@@ -198,6 +201,7 @@ DEPENDENCIES
jbuilder (~> 2.5) jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mysql2 (>= 0.4.4, < 0.6.0) mysql2 (>= 0.4.4, < 0.6.0)
nokogiri
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0) sass-rails (~> 5.0)
...@@ -208,6 +212,7 @@ DEPENDENCIES ...@@ -208,6 +212,7 @@ DEPENDENCIES
tzinfo-data tzinfo-data
uglifier (>= 1.3.0) uglifier (>= 1.3.0)
web-console (>= 3.3.0) web-console (>= 3.3.0)
whenever
RUBY VERSION RUBY VERSION
ruby 2.6.6p146 ruby 2.6.6p146
......
...@@ -3,15 +3,19 @@ ...@@ -3,15 +3,19 @@
require 'open-uri' require 'open-uri'
# Crawler data # Crawler data
class InterfaceWeb class InterfaceWeb
def self.crawl_link_for_companies_jobs(page) INTERNATION = 0
VIETNAM = 1
RANGE = 69
def self.crawl_link(page)
puts "Crawling link on page...\nPLease wait...\n" puts "Crawling link on page...\nPLease wait...\n"
data = [] data = []
website_companies = [] website_companies = []
website_jobs = [] website_jobs = []
file = File.readlines('tmp/link.txt', 'r') if File.exist?('tmp/link.txt') file = File.readlines('tmp/link.txt', 'r') if File.exist?('tmp/link.txt')
@@stop_crawl = file.blank? ? '' : file.join @@stop_crawl = file.blank? ? '' : file.join
page.times do |i| page.times do |i|
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html")) page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
link_companies = page.search('.figcaption .caption @href') link_companies = page.search('.figcaption .caption @href')
...@@ -22,15 +26,15 @@ class InterfaceWeb ...@@ -22,15 +26,15 @@ class InterfaceWeb
end end
website_companies = website_companies.select { |val| val.present? && val != 'javascript:void(0);' } website_companies = website_companies.select { |val| val.present? && val != 'javascript:void(0);' }
website_jobs = website_jobs.select(&:present?) website_jobs = website_jobs.select(&:present?)
puts "Result:\nCompany: #{website_companies.length} link\nJob : #{website_jobs.length} link\n------------------------" puts "Result:\nCompany: #{website_companies.length} link\nJob : #{website_jobs.length} link\n--------------"
File.write('tmp/link.txt', website_jobs[0]) File.write('tmp/link.txt', website_jobs[0])
data << website_companies << website_jobs data << website_companies << website_jobs
end end
def self.get_link_job_and_companies def self.link_job_and_companies
@crawl_link_for_companies_jobs ||= crawl_link_for_companies_jobs(2) @link_job_and_companies ||= crawl_link(3)
end end
def self.safe_link(url) def self.safe_link(url)
Nokogiri::HTML(URI.parse(URI.escape(url))) Nokogiri::HTML(URI.parse(URI.escape(url)))
end end
...@@ -46,7 +50,7 @@ class InterfaceWeb ...@@ -46,7 +50,7 @@ class InterfaceWeb
end end
puts 'Save data to database...' puts 'Save data to database...'
data_list_cities.each_with_index do |val, index| data_list_cities.each_with_index do |val, index|
area = index > 69 ? 0 : 1 area = index > RANGE ? INTERNATION : VIETNAM
City.find_or_create_by(name: val) do |city| City.find_or_create_by(name: val) do |city|
city.name = val city.name = val
city.area = area city.area = area
...@@ -56,18 +60,19 @@ class InterfaceWeb ...@@ -56,18 +60,19 @@ class InterfaceWeb
def self.craw_data_companies def self.craw_data_companies
puts 'Crawl data companies' puts 'Crawl data companies'
link_crawl = get_link_job_and_companies link_crawl = link_job_and_companies
link_crawl[0].each do |url| link_crawl[0].each do |url|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(url)))) page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(url))))
name = '' name = ''
address = '' address = ''
desc = '' desc = ''
if page.search('.company-info .info .content .name').text == '' company_name = page.search('.company-info .info .content .name').text
name = page.search('.section-page #cp_company_name').text if company_name.blank?
name = page.search('.section-page #cp_company_name').text.strip
address = page.search('.section-page .cp_basic_info_details ul li:nth-child(1)').text address = page.search('.section-page .cp_basic_info_details ul li:nth-child(1)').text
desc = page.search('.cp_aboutus_item .content_fck').text desc = page.search('.cp_aboutus_item .content_fck').text
else else
name = page.search('.company-info .info .content .name').text name = company_name.strip
address = page.search('.company-info .info .content p:nth-child(3)').text address = page.search('.company-info .info .content p:nth-child(3)').text
desc = page.search('.main-about-us .content').text desc = page.search('.main-about-us .content').text
end end
...@@ -85,7 +90,7 @@ class InterfaceWeb ...@@ -85,7 +90,7 @@ class InterfaceWeb
end end
end end
end end
def self.add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) def self.add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
begin begin
id_company = Company.find_by name: company_name id_company = Company.find_by name: company_name
...@@ -139,34 +144,22 @@ class InterfaceWeb ...@@ -139,34 +144,22 @@ class InterfaceWeb
def self.crawl_data_jobs_interface_2(page) def self.crawl_data_jobs_interface_2(page)
name = page.search('.apply-now-content .job-desc .title').text name = page.search('.apply-now-content .job-desc .title').text
company_name = page.search('.top-job .top-job-info .tit_company').text company_name = page.search('.top-job .top-job-info .tit_company').text
location = [] locations = []
length = page.search('.info-workplace .value a').size length = page.search('.info-workplace .value a').size
length.times do |n| length.times do |n|
location << page.search(".info-workplace .value a:nth-child(#{n + 1})").text locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text
end end
city_name = location.join(',') city_name = locations.join(',')
created_date = '' created_date = ''
expiration_date = page.search('.info li:nth-child(4)').text expiration_date = page.search('.info li:nth-child(4)').text
expiration_date = if expiration_date.blank? expiration_date = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last
''
else
expiration_date.to_s.delete!("[\n,\t,\r]").split(' ').last
end
salary = page.search('.info li:nth-child(3)').text.split('Lương').last.strip salary = page.search('.info li:nth-child(3)').text.split('Lương').last.strip
industry_name = page.search('.info li:nth-child(5) .value').text industry_name = page.search('.info li:nth-child(5) .value').text
description = page.search('.left-col').to_s description = page.search('.left-col').to_s
lv = page.search('.boxtp .info li:nth-child(2)').text lv = page.search('.boxtp .info li:nth-child(2)').text
level = if lv.blank? level = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
''
else
lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
end
exp = page.search('.info li:nth-child(6)').text exp = page.search('.info li:nth-child(6)').text
exprience = if exp.blank? exprience = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
''
else
exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
end
add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
end end
...@@ -187,7 +180,7 @@ class InterfaceWeb ...@@ -187,7 +180,7 @@ class InterfaceWeb
def self.make_foreign_industries_table(data, id_job) def self.make_foreign_industries_table(data, id_job)
content = data.split(',') content = data.split(',')
content.each do |val| content.each do |val|
val.gsub!('&amp;','&') if val.include?('&amp;') val.gsub!('&amp;', '&') if val.include?('&amp;')
id_industry = Industry.find_by name: val.strip id_industry = Industry.find_by name: val.strip
id_industry = id_industry.blank? ? Industry.create!(name: val.strip).id : id_industry.id id_industry = id_industry.blank? ? Industry.create!(name: val.strip).id : id_industry.id
IndustryJob.create!(industry_id: id_industry, job_id: id_job) IndustryJob.create!(industry_id: id_industry, job_id: id_job)
...@@ -205,7 +198,7 @@ class InterfaceWeb ...@@ -205,7 +198,7 @@ class InterfaceWeb
def self.make_data def self.make_data
puts 'Please wait for crawl jobs data! . . .' puts 'Please wait for crawl jobs data! . . .'
link_crawl = get_link_job_and_companies link_crawl = link_job_and_companies
arr_link = [] arr_link = []
link_crawl[1].each do |val| link_crawl[1].each do |val|
break if @@stop_crawl == val break if @@stop_crawl == val
...@@ -215,7 +208,7 @@ class InterfaceWeb ...@@ -215,7 +208,7 @@ class InterfaceWeb
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path)))) page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
if !page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].nil? if !page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].nil?
crawl_data_jobs_interface_1(page) crawl_data_jobs_interface_1(page)
elsif page.search('section .template-200').text != '' elsif page.search('section .template-200').text.present?
crawl_data_jobs_interface_2(page) crawl_data_jobs_interface_2(page)
elsif page.search('.DetailJobNew ul li').size == 10 && !page.search('.right-col ul li').text.include?('Độ tuổi') elsif page.search('.DetailJobNew ul li').size == 10 && !page.search('.right-col ul li').text.include?('Độ tuổi')
crawl_data_jobs_interface_5(page) crawl_data_jobs_interface_5(page)
......
# frozen_string_literal: true # frozen_string_literal: true
require 'open-uri' require 'open-uri'
require 'logger'
require 'src/interface_web' require 'src/interface_web'
# rake task # rake task
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment