Commit 814a4af9 by Ngô Trung Hưng

fix part 2

parent 5683fa11
Pipeline #719 canceled with stages
in 0 seconds
...@@ -4,11 +4,13 @@ require 'open-uri' ...@@ -4,11 +4,13 @@ require 'open-uri'
# Crawler data # Crawler data
class InterfaceWeb class InterfaceWeb
INTERNATION = 0 COMPANY_SECURITY = 1
VIETNAM = 1 SIZE_LI_INTERFACE_5 = 10
INTERNATIONAL = 0
DOMESTIC = 1
RANGE = 69 RANGE = 69
def self.crawl_link(page) def crawl_link(page)
puts "Crawling link on page...\nPLease wait...\n" puts "Crawling link on page...\nPLease wait...\n"
data = [] data = []
website_companies = [] website_companies = []
...@@ -31,15 +33,16 @@ class InterfaceWeb ...@@ -31,15 +33,16 @@ class InterfaceWeb
data << website_companies << website_jobs data << website_companies << website_jobs
end end
def self.link_job_and_companies
@link_job_and_companies ||= crawl_link(3) def link_job_and_companies
@link_job_and_companies ||= crawl_link(2)
end end
def self.safe_link(url) def self.safe_link(url)
Nokogiri::HTML(URI.parse(URI.escape(url))) Nokogiri::HTML(URI.parse(URI.escape(url)))
end end
def self.craw_data_cities def craw_data_cities
page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')) page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
puts "Crawling data location... \n. \n. \n." puts "Crawling data location... \n. \n. \n."
data_list_cities = [] data_list_cities = []
...@@ -50,7 +53,7 @@ class InterfaceWeb ...@@ -50,7 +53,7 @@ class InterfaceWeb
end end
puts 'Save data to database...' puts 'Save data to database...'
data_list_cities.each_with_index do |val, index| data_list_cities.each_with_index do |val, index|
area = index > RANGE ? INTERNATION : VIETNAM area = index > RANGE ? INTERNATIONAL : DOMESTIC
City.find_or_create_by(name: val) do |city| City.find_or_create_by(name: val) do |city|
city.name = val city.name = val
city.area = area city.area = area
...@@ -58,7 +61,7 @@ class InterfaceWeb ...@@ -58,7 +61,7 @@ class InterfaceWeb
end end
end end
def self.craw_data_companies def craw_data_companies
puts 'Crawl data companies' puts 'Crawl data companies'
link_crawl = link_job_and_companies link_crawl = link_job_and_companies
link_crawl[0].each do |url| link_crawl[0].each do |url|
...@@ -91,10 +94,11 @@ class InterfaceWeb ...@@ -91,10 +94,11 @@ class InterfaceWeb
end end
end end
def self.add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) private
begin
def add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
id_company = Company.find_by name: company_name id_company = Company.find_by name: company_name
id_company = id_company.present? ? id_company.id : 1 id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
id_job = Job.create!(name: name, id_job = Job.create!(name: name,
company_id: id_company, company_id: id_company,
level: level, level: level,
...@@ -108,9 +112,10 @@ class InterfaceWeb ...@@ -108,9 +112,10 @@ class InterfaceWeb
rescue StandardError => e rescue StandardError => e
puts e puts e
end end
end
def self.crawl_data_jobs_interface_1(page) private
def crawl_data_jobs_interface_1(page)
name = page.search('.apply-now-content .job-desc .title').text name = page.search('.apply-now-content .job-desc .title').text
company_name = page.search('.apply-now-content .job-desc .job-company-name').text company_name = page.search('.apply-now-content .job-desc .job-company-name').text
location = [] location = []
...@@ -141,7 +146,9 @@ class InterfaceWeb ...@@ -141,7 +146,9 @@ class InterfaceWeb
add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
end end
def self.crawl_data_jobs_interface_2(page) private
def crawl_data_jobs_interface_2(page)
name = page.search('.apply-now-content .job-desc .title').text name = page.search('.apply-now-content .job-desc .title').text
company_name = page.search('.top-job .top-job-info .tit_company').text company_name = page.search('.top-job .top-job-info .tit_company').text
locations = [] locations = []
...@@ -163,7 +170,9 @@ class InterfaceWeb ...@@ -163,7 +170,9 @@ class InterfaceWeb
add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
end end
def self.crawl_data_jobs_interface_5(page) private
def crawl_data_jobs_interface_5(page)
name = page.search('.info-company h1').text name = page.search('.info-company h1').text
company_name = page.search('.info-company .text-job h2').text company_name = page.search('.info-company .text-job h2').text
city_name = page.search('.DetailJobNew ul li:nth-child(1) a').text city_name = page.search('.DetailJobNew ul li:nth-child(1) a').text
...@@ -177,26 +186,36 @@ class InterfaceWeb ...@@ -177,26 +186,36 @@ class InterfaceWeb
add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience) add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
end end
def self.make_foreign_industries_table(data, id_job) private
def make_foreign_industries_table(data, id_job)
unless data.blank? && id_job.blank?
content = data.split(',') content = data.split(',')
content.each do |val| content.each do |val|
val.gsub!('&amp;', '&') if val.include?('&amp;') val.gsub!('&amp;', '&') if val.include?('&amp;')
id_industry = Industry.find_by name: val.strip data_industry = Industry.find_by name: val.strip
id_industry = id_industry.blank? ? Industry.create!(name: val.strip).id : id_industry.id id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id
IndustryJob.create!(industry_id: id_industry, job_id: id_job) IndustryJob.create!(industry_id: id_industry, job_id: id_job)
end end
end end
end
def self.make_foreign_cities_table(data, id_job) private
def make_foreign_cities_table(data, id_job)
unless data.blank? && id_job.blank?
cities = data.split(',') cities = data.split(',')
cities.each do |city| cities.each do |city|
id_cities = City.find_by name: city.strip data_city = City.find_by name: city.strip
id_cities = id_cities.blank? ? City.create!(name: city.strip, area: 1).id : id_cities.id id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id
CityJob.create!(job_id: id_job, city_id: id_cities) CityJob.create!(job_id: id_job, city_id: id_cities)
end end
end end
end
public
def self.make_data def make_data
puts 'Please wait for crawl jobs data! . . .' puts 'Please wait for crawl jobs data! . . .'
link_crawl = link_job_and_companies link_crawl = link_job_and_companies
arr_link = [] arr_link = []
...@@ -206,11 +225,11 @@ class InterfaceWeb ...@@ -206,11 +225,11 @@ class InterfaceWeb
end end
arr_link.reverse!.each_with_index do |path, i| arr_link.reverse!.each_with_index do |path, i|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path)))) page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
if !page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].nil? if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
crawl_data_jobs_interface_1(page) crawl_data_jobs_interface_1(page)
elsif page.search('section .template-200').text.present? elsif page.search('section .template-200').text.present?
crawl_data_jobs_interface_2(page) crawl_data_jobs_interface_2(page)
elsif page.search('.DetailJobNew ul li').size == 10 && !page.search('.right-col ul li').text.include?('Độ tuổi') elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi')
crawl_data_jobs_interface_5(page) crawl_data_jobs_interface_5(page)
end end
puts "#{i} - #{path}" puts "#{i} - #{path}"
......
...@@ -6,8 +6,14 @@ require 'src/interface_web' ...@@ -6,8 +6,14 @@ require 'src/interface_web'
# rake task # rake task
namespace :crawler do namespace :crawler do
task populate: :environment do task populate: :environment do
InterfaceWeb.craw_data_cities Company.find_or_create_by(name: 'Bảo mật') do |company|
InterfaceWeb.craw_data_companies company.name = 'Bảo mật'
InterfaceWeb.make_data company.address = 'Vui lòng xem trong mô tả công việc'
company.short_description = 'Vui lòng xem trong mô tả công việc'
end
cw = InterfaceWeb.new
cw.craw_data_cities
cw.craw_data_companies
cw.make_data
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment