Commit 4a66e48a by Ngô Trung Hưng

craw 90%

parent a1f70d89
...@@ -6,6 +6,7 @@ class HomeController < ApplicationController ...@@ -6,6 +6,7 @@ class HomeController < ApplicationController
# crawl_data_jobs_interface_3() # crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4() # crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5() # crawl_data_jobs_interface_5()
make_data make_data
# craw_data_companies
end end
end end
...@@ -36,7 +36,7 @@ module CrawlerHelper ...@@ -36,7 +36,7 @@ module CrawlerHelper
end end
website_companies = website_companies.join(",") website_companies = website_companies.join(",")
website_companies = website_companies.split(",").uniq! website_companies = website_companies.split(",")
website_companies = website_companies.select { |val| val != ''} website_companies = website_companies.select { |val| val != ''}
website_jobs = website_jobs.join(",") website_jobs = website_jobs.join(",")
...@@ -83,14 +83,21 @@ module CrawlerHelper ...@@ -83,14 +83,21 @@ module CrawlerHelper
end end
@data_companies[:description] = @data_companies_description @data_companies[:description] = @data_companies_description
render plain: "#{@data_companies[:name]} -- #{@data_companies[:address]} -- #{@data_companies[:description]}" # render plain: "#{@data_companies[:name]} -- #{@data_companies[:address]} -- #{@data_companies[:description]}"
render plain: "#{@data_companies[:name]} = #{@data_companies[:name].length} "
# render plain: @data_companies # render plain: @data_companies
end end
def base_link(url) def base_link(url)
Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}")))) Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
end end
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/ky-su-dau-thau-mep.35B45617.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/tuyen-tai-xe-van-phong-cho-sep-han-quoc-tu-binh-thanh.35B45A41.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-nam-phuoc-son-hiep-duc-thang-binh.35B4572D.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/program-management-executive.35B428B5.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-tong-vu-phong-van-va-lam-test-truc-tuyen-nhan-viec-ngay-sau-3-5-ngay-nop-ho-so.35B44E79.html"))))
# Crawler job # Crawler job
def crawl_data_jobs_interface_1(url) def crawl_data_jobs_interface_1(url)
page = base_link(url) page = base_link(url)
...@@ -177,148 +184,109 @@ module CrawlerHelper ...@@ -177,148 +184,109 @@ module CrawlerHelper
end end
def crawl_data_jobs_interface_3(url) def crawl_data_jobs_interface_3(url)
link_crawl = crawl_link_for_companies_jobs() page = base_link(url)
@data = {} @name << page.search(".intro_job h1").text
@name = [] @data[:name] = @name
@company_name = []
@level = []
@exprience = []
@salary = []
@create_date = []
@expiration_date = []
@description = []
@industry_name = []
@city_name = []
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/tuyen-tai-xe-van-phong-cho-sep-han-quoc-tu-binh-thanh.35B45A41.html"))))
#interface1
name = page.search(".info-company h1").text
@data[:name] = name
company_name = page.search(".zone-company .text-job h2").text
@data[:company_name] = company_name
city_name = page.search(".DetailJobNew ul li:nth-child(1) a").text @company_name << page.search(".info-company .text-job h2").text
@data[:city_name] = city_name @data[:company_name] = @company_name
@city_name << page.search(".DetailJobNew ul li:nth-child(1) a").text
@data[:created_date] = "" @data[:city_name] = @city_name
expiration_date = page.search(".DetailJobNew li:nth-child(7) span").text @created_date << ""
@data[:expiration_date] = expiration_date @data[:created_date] = @created_date
salary = page.search(".DetailJobNew li:nth-child(6) span").text @expiration_date << page.search(".DetailJobNew .info ul li:nth-child(3) p").text.strip
@data[:salary] = salary @data[:expiration_date] = @expiration_date
industry_name = page.search(".DetailJobNew li:nth-child(3) span a").text @salary << page.search(".DetailJobNew .salary ul li:nth-child(3) p").text.strip
@data[:industry_name] = industry_name.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''} @data[:salary] = @salary
description = page.search(".left-col").to_s.delete!("[\n,\t,\r]") @industry_name << page.search(".DetailJobNew .salary ul li:nth-child(2) p").text.strip
@data[:description] = description @data[:industry_name] = @industry_name
level = page.search(".DetailJobNew ul li:nth-child(2) span").text @description << page.search(".content_job .detail-row").to_s.delete!("[\n,\t,\r]")
@data[:level] = level @data[:description] = @description
@data[:exprience] = ""
render plain: "#{@data}" @level << page.search(".DetailJobNew .info ul li:nth-child(2) p").text.strip
@data[:level] = @level
@exprience << page.search(".DetailJobNew .info ul li:nth-child(1) p").text.strip
@data[:exprience] = @exprience
end end
def crawl_data_jobs_interface_4(url) def crawl_data_jobs_interface_4(url)
# link_crawl = crawl_link_for_companies_jobs() page = base_link(url)
@data = {}
@name = []
@company_name = []
@level = []
@exprience = []
@salary = []
@create_date = []
@expiration_date = []
@description = []
@industry_name = []
@city_name = []
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-nam-phuoc-son-hiep-duc-thang-binh.35B4572D.html"))))
#interface1
name = page.search(".info-company h1").text
@data[:name] = name
company_name = page.search(".info-company .text-job h2").text
@data[:company_name] = company_name
city_name = page.search(".DetailJobNew ul li:nth-child(1) a").text @name << page.search(".info-company h1").text
@data[:city_name] = city_name @data[:name] = @name
if page.search(".zone-company .text-job h2").text == ""
@company_name << page.search(".info-company .text-job h2").text
@industry_name << page.search(".DetailJobNew li:nth-child(3) span").text.strip
else
@company_name << page.search(".zone-company .text-job h2").text.strip
industry_name = page.search(".DetailJobNew li:nth-child(3) span a").text
@industry_name << industry_name.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''}
end
@data[:created_date] = "" @data[:company_name] = @company_name
@data[:industry_name] = @industry_name
expiration_date = page.search(".DetailJobNew li:nth-child(7) span").text @city_name << page.search(".DetailJobNew ul li:nth-child(1) a").text
@data[:expiration_date] = expiration_date @data[:city_name] = @city_name
salary = page.search(".DetailJobNew li:nth-child(6) span").text @created_date << ""
@data[:salary] = salary @data[:created_date] = @created_date
industry_name = page.search(".DetailJobNew li:nth-child(3) span").text.strip @expiration_date << page.search(".DetailJobNew li:nth-child(7) span").text
@data[:industry_name] = industry_name @data[:expiration_date] = @expiration_date
description = page.search(".left-col").to_s.delete!("[\n,\t,\r]") @salary << page.search(".DetailJobNew li:nth-child(6) span").text
@data[:description] = description @data[:salary] = @salary
level = page.search(".DetailJobNew ul li:nth-child(2) span").text @description << page.search(".left-col").to_s.delete!("[\n,\t,\r]")
@data[:level] = level @data[:description] = @description
@data[:exprience] = ""
render plain: "#{@data}" @level << page.search(".DetailJobNew ul li:nth-child(2) span").text
@data[:level] = @level
@exprience << ""
@data[:exprience] = @exprience
end end
def crawl_data_jobs_interface_5(url) def crawl_data_jobs_interface_5(url)
# link_crawl = crawl_link_for_companies_jobs() page = base_link(url)
@data = {} @name << page.search(".info-company h1").text
@name = [] @data[:name] = @name
@company_name = []
@level = []
@exprience = []
@salary = []
@create_date = []
@expiration_date = []
@description = []
@industry_name = []
@city_name = []
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/program-management-executive.35B428B5.html"))))
#interface1
name = page.search(".info-company h1").text
@data[:name] = name
company_name = page.search(".info-company .text-job h2").text @company_name << page.search(".info-company .text-job h2").text
@data[:company_name] = company_name @data[:company_name] = @company_name
city_name = page.search(".DetailJobNew ul li:nth-child(1) a").text @city_name << page.search(".DetailJobNew ul li:nth-child(1) a").text
@data[:city_name] = city_name @data[:city_name] = @city_name
@created_date << ""
@data[:created_date] = "" @data[:created_date] = @created_date
expiration_date = page.search(".DetailJobNew li:nth-child(9) span").text.strip @expiration_date << page.search(".DetailJobNew li:nth-child(9) span").text.strip
@data[:expiration_date] = expiration_date @data[:expiration_date] = @expiration_date
salary = page.search(".DetailJobNew li:nth-child(3) span").text.strip @salary << page.search(".DetailJobNew li:nth-child(3) span").text.strip
@data[:salary] = salary @data[:salary] = @salary
industry_name = page.search(".DetailJobNew li:nth-child(2) span").text.strip @industry_name << page.search(".DetailJobNew li:nth-child(2) span").text.strip
@data[:industry_name] = industry_name @data[:industry_name] = @industry_name
description = page.search(".left-col .detail-row").to_s.delete!("[\n,\t,\r]") @description << page.search(".left-col .detail-row").to_s.delete!("[\n,\t,\r]")
@data[:description] = description @data[:description] = @description
level = page.search(".DetailJobNew ul li:nth-child(6) span").text.strip @level << page.search(".DetailJobNew ul li:nth-child(6) span").text.strip
@data[:level] = level @data[:level] = @level
exprience = page.search(".DetailJobNew li:nth-child(5) span").text.strip @exprience << page.search(".DetailJobNew li:nth-child(5) span").text.strip
@data[:exprience] = exprience @data[:exprience] = @exprience
render plain: "#{@data}"
end end
def make_data def make_data
...@@ -341,26 +309,32 @@ module CrawlerHelper ...@@ -341,26 +309,32 @@ module CrawlerHelper
if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil
crawl_data_jobs_interface_1(path) crawl_data_jobs_interface_1(path)
elsif page.search(".DetailJobNew li:nth-child(5) span").to_s == "" elsif page.search(".info li:nth-child(6)").text != ""
crawl_data_jobs_interface_2(path) crawl_data_jobs_interface_2(path)
# elsif page.search(".DetailJobNew li:nth-child(5) span").to_s != "" elsif page.search(".DetailJobNew ul li").size == 10
# crawl_data_jobs_interface_5(path) crawl_data_jobs_interface_5(path)
elsif page.search(".DetailJobNew ul li").size == 8
crawl_data_jobs_interface_4(path)
else
crawl_data_jobs_interface_3(path)
end end
end end
# render plain: "#{link_crawl[1]} --- #{link_crawl[1].length}" render plain: "#{@data}"
render plain: "#{@data} => #{@data[:name][0]} # render plain: "#{@data[:company_name]}--#{@data[:company_name].length}"
=> #{@data[:company_name][0]} # name: => #{@data[:name][0]} -- #{@data[:name].length}
=> #{@data[:level][0]} # company: => #{@data[:company_name][0]} -- #{@data[:company_name].length}
=> #{@data[:industry_name][0]} # level: => #{@data[:level][0]} -- #{@data[:level].length}
=> #{@data[:exprience][0]} # industry: => #{@data[:industry_name][0]} -- #{@data[:industry_name].length}
=> #{@data[:expiration_date][0]} # exprience: => #{@data[:exprience][0]} -- #{@data[:exprience].length}
=> #{@data[:created_date][0]} # expiration date: => #{@data[:expiration_date][0]} -- #{@data[:expiration_date].length}
=> #{@data[:city_name][0]} # created date: => #{@data[:created_date][0]} -- #{@data[:created_date].length}
=> #{@data[:description][0]} # city: => #{@data[:city_name][0]} -- #{@data[:city_name].length}
=> #{@data[:salary][0]}" # salary: => #{@data[:salary][0]} -- #{@data[:salary].length}
# description: => #{@data[:description][0]} -- #{@data[:description].length} "
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/truong-tram-y-te-cong-ty.35B44FDF.html")))) # page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/truong-tram-y-te-cong-ty.35B44FDF.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/hr-admin-executive.35B45B43.html")))) # page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/hr-admin-executive.35B45B43.html"))))
end end
end end
\ No newline at end of file
...@@ -14,7 +14,7 @@ default: &default ...@@ -14,7 +14,7 @@ default: &default
encoding: utf8 encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root username: root
password: '12345678' password: '1'
socket: /var/run/mysqld/mysqld.sock socket: /var/run/mysqld/mysqld.sock
......
require 'open-uri'
class Clawler
@page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# FILL DATA CITIES
# Creates one City row per <option> found under "#location" on the page held
# in @page.
#
# The option markup is split on "</option>"; the opening tag and any newlines
# are stripped from each piece and the cleaned names are kept in
# @data_list_cities. Entries at index 0..69 are created with area 1, all later
# entries with area 0.
#
# Returns the number of names processed.
def self.make_cities
  option_chunks = @page.search("#location option").to_s.split("</option>")
  @data_list_cities = option_chunks.map do |chunk|
    chunk.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end
  @data_list_cities.each_with_index do |city, index|
    area = index <= 69 ? 1 : 0
    City.create!(name: city.to_s, area: area)
  end
  @data_list_cities.length
end
# FILL DATA INDUSTRIES
# Creates one Industry row per <option> found under "#industry" on the page
# held in @page.
#
# Each option has its opening tag and newlines removed and trailing whitespace
# trimmed; the cleaned names are kept in @data_list_industries.
#
# Returns the number of names processed.
def self.make_industries
  raw_options = @page.search("#industry option").to_s
  @data_list_industries = raw_options.split("</option>").map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end
  @data_list_industries.each { |label| Industry.create!(name: label.to_s) }
  @data_list_industries.length
end
# CRAWLER LINK JOB & COMPANIES
# Collects company and job links from the first listing page of
# careerbuilder.vn.
#
# The href attributes returned by each selector are concatenated by `text`, so
# they are separated again by splitting on the site's base URL; what remains
# are site-relative paths.
#
# Returns a two-element Array: [unique company paths, job paths]. Empty
# fragments are dropped from both lists.
def self.crawl_link_for_companies_jobs
  base_url = 'https://careerbuilder.vn/'
  company_links = []
  job_links = []
  num_page_will_crawl = 1
  num_page_will_crawl.times do |i|
    listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i+1}-vi.html"))
    company_links.concat(listing.search(".figcaption .caption a/@href").text.to_s.split(base_url))
    job_links.concat(listing.search(".figcaption .title .job_link @href").text.to_s.split(base_url))
  end
  # Non-bang `uniq` on purpose: `uniq!` returns nil when the list has no
  # duplicates, which made the following filter raise NoMethodError.
  [company_links.reject(&:empty?).uniq, job_links.reject(&:empty?)]
end
# CRAWLER DATA COMPANIES
# Scrapes name, address and description for every company link returned by
# crawl_link_for_companies_jobs.
#
# Two page layouts are handled: when the ".company-info" name selector yields
# no text, the ".section-page" / ".cp_aboutus_item" selectors are used
# instead. A company is kept only when all three fields are non-empty.
#
# Returns (and stores in @data_companies) a Hash with the parallel Arrays
# :name, :address and :description.
def self.craw_data_companies
  link_crawl = crawl_link_for_companies_jobs()
  @data_companies_name = []
  @data_companies_address = []
  @data_companies_description = []
  link_crawl[0].each do |url|
    # URI.escape was removed in Ruby 3.0; the default parser's #escape is the
    # routine it delegated to and is available on older versions as well.
    escaped_url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
    page = Nokogiri::HTML(URI.open(URI.parse(escaped_url)))
    name = page.search(".company-info .info .content .name").text
    if name == ""
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end
    next if name == "" || address == "" || desc == ""

    @data_companies_name << name.to_s.rstrip
    @data_companies_address << address.to_s.rstrip
    @data_companies_description << desc
  end
  # NOTE(review): the character set also removes literal '[', ',' and ']',
  # not only whitespace controls — kept as in the original; confirm intent.
  @data_companies_description.each do |val|
    val.delete!("[\n,\t,\r]")
    val.strip!
  end
  @data_companies = {
    name: @data_companies_name,
    address: @data_companies_address,
    description: @data_companies_description
  }
end
# FILL DATA COMPANIES
# Creates one Company row per entry returned by craw_data_companies.
#
# The crawl result is kept in @data; its :name, :address and :description
# Arrays are read index by index.
#
# Returns the number of companies processed.
def self.make_companies
  @data = self.craw_data_companies()
  total = @data[:name].length
  total.times do |n|
    Company.create!(name: @data[:name][n],
                    address: @data[:address][n],
                    short_description: @data[:description][n])
  end
end
end
\ No newline at end of file
# Placeholder class: it currently defines no methods or state.
class Linkcrawl
# CRAWLER LINK JOB & COMPANIES
end
\ No newline at end of file
require 'open-uri'
require 'src/interface_web'
class Clawler
@page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# FILL DATA CITIES
# Seeds City rows from the <option> entries under "#location" on the page
# held in @page.
#
# Every option is reduced to its label (opening tag and newlines removed,
# trailing whitespace trimmed) and remembered in @data_list_cities. The first
# 70 labels (index 0..69) are stored with area 1, the remaining ones with
# area 0.
#
# Returns the number of labels processed.
def self.make_cities
  markup = @page.search("#location option").to_s
  @data_list_cities = markup.split("</option>").map do |piece|
    piece.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end
  @data_list_cities.each.with_index do |label, position|
    if position <= 69
      City.create!(name: label.to_s, area: 1)
    else
      City.create!(name: label.to_s, area: 0)
    end
  end
  @data_list_cities.length
end
# FILL DATA INDUSTRIES
# Seeds Industry rows from the <option> entries under "#industry" on the page
# held in @page.
#
# Every option is reduced to its label (opening tag and newlines removed,
# surrounding whitespace trimmed) and remembered in @data_list_industries.
# The HTML entity "&amp;" is turned back into "&" before the row is created.
#
# Returns the number of labels processed.
def self.make_industries
  option_markup = @page.search("#industry option").to_s
  @data_list_industries = option_markup.split("</option>").map do |piece|
    piece.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').strip
  end
  @data_list_industries.each do |label|
    # In-place substitution, so the stored list holds the decoded label too.
    label.gsub!('&amp;','&')
    Industry.create!(name: label.to_s)
  end
  @data_list_industries.length
end
# FILL DATA COMPANIES
# Seeds the companies table.
#
# A placeholder company is created first; make_jobs falls back to company id 1
# when a job's company is not found (presumably this row — confirm that it
# receives id 1). After that, one Company is created per entry returned by
# Interface_web.craw_data_companies, which is kept in @data.
#
# Returns the number of crawled companies processed.
def self.make_companies
  Company.create!(name: "Company name",
                  address: "Vui lòng trong mô tả công việc",
                  short_description: "Vui lòng trong mô tả công việc")
  @data = Interface_web.craw_data_companies()
  @data[:name].each_with_index do |company_name, n|
    Company.create!(name: company_name,
                    address: @data[:address][n],
                    short_description: @data[:description][n])
  end
  @data[:name].length
end
# FILL DATA JOBS
# Creates one Job row per job returned by Interface_web.make_data and links
# it to its industries and cities.
#
# The crawl result (a Hash of parallel Arrays) is kept in @data_jobs. The
# company is looked up by its stripped name; when no company matches, company
# id 1 is used. Every scalar field is converted with to_s before saving.
#
# Returns the number of jobs processed.
def self.make_jobs
  @data_jobs = Interface_web.make_data()
  field = ->(key, n) { @data_jobs[key][n].to_s }
  total = @data_jobs[:name].length
  total.times do |n|
    job_name = field.(:name, n)
    employer = Company.find_by name: field.(:company_name, n).strip
    company_id = employer.nil? ? 1 : employer.id
    job = Job.create!(name: job_name,
                      company_id: company_id,
                      level: field.(:level, n),
                      experience: field.(:exprience, n),
                      salary: field.(:salary, n),
                      create_date: field.(:created_date, n),
                      expiration_date: field.(:expiration_date, n),
                      description: field.(:description, n))
    self.make_foreign_industries_table(@data_jobs[:industry_name][n], job.id)
    self.make_cities_cities_table(@data_jobs[:city_name][n], job.id)
  end
end
# Links a job to each industry named in `data`, creating missing Industry
# rows on the way.
#
# data   - comma-separated industry names as a String. An Array of names is
#          accepted as well (crawl_data_jobs_interface_4 pushes one), and nil
#          is treated as "no industries".
# id_job - id of the job to link.
#
# Names are stripped; blank and repeated names are skipped so no empty
# Industry and no duplicate IndustryJob rows are created. The cleaned names
# are kept in @content.
#
# Returns the number of links created.
def self.make_foreign_industries_table(data,id_job)
  @content = Array(data).flat_map { |entry| entry.to_s.split(',') }
                        .map(&:strip)
                        .reject(&:empty?)
                        .uniq
  @content.each do |industry_name|
    industry = Industry.find_by(name: industry_name) || Industry.create!(name: industry_name)
    IndustryJob.create!(industry_id: industry.id,
                        job_id: id_job)
  end
  @content.length
end
# Links a job to each city named in `data`, creating missing City rows on the
# way.
#
# data   - city names in one String, separated by ',' when the String
#          contains a comma and by '|' otherwise. nil is treated as "no
#          cities".
# id_job - id of the job to link.
#
# Names are stripped; blank and repeated names are skipped so no empty City
# and no duplicate CityJob rows are created. The cleaned names are kept in
# @content.
#
# Returns the number of links created.
def self.make_cities_cities_table(data,id_job)
  raw = data.to_s
  separator = raw.include?(',') ? ',' : '|'
  @content = raw.split(separator).map(&:strip).reject(&:empty?).uniq
  @content.each do |city_name|
    city = City.find_by(name: city_name) || City.create!(name: city_name)
    CityJob.create!(job_id: id_job,
                    city_id: city.id)
  end
  @content.length
end
end
\ No newline at end of file
class Interface_web
# Collects company and job links from the first `page` listing pages of
# careerbuilder.vn.
#
# page - number of listing pages to fetch (pages 1..page).
#
# The href attributes returned by each selector are concatenated by `text`, so
# they are separated again by splitting on the site's base URL; what remains
# are site-relative paths.
#
# Returns a two-element Array: [unique company paths, job paths]. Empty
# fragments are dropped from both lists.
def self.crawl_link_for_companies_jobs(page)
  base_url = 'https://careerbuilder.vn/'
  company_links = []
  job_links = []
  page.times do |i|
    # A separate local keeps the `page` argument from being overwritten.
    listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i+1}-vi.html"))
    company_links.concat(listing.search(".figcaption .caption a/@href").text.to_s.split(base_url))
    job_links.concat(listing.search(".figcaption .title .job_link @href").text.to_s.split(base_url))
  end
  # Non-bang `uniq` on purpose: `uniq!` returns nil when the list has no
  # duplicates, which made the following filter raise NoMethodError.
  [company_links.reject(&:empty?).uniq, job_links.reject(&:empty?)]
end
# Scrapes name, address and description for every company link found on the
# first listing page.
#
# Two page layouts are handled: when the ".company-info" name selector yields
# no text, the ".section-page" / ".cp_aboutus_item" selectors are used
# instead. A company is kept only when all three fields are non-empty.
# Descriptions are cleaned in place after the crawl.
#
# Returns @data_companies, a Hash with the parallel Arrays :name, :address
# and :description.
def self.craw_data_companies
  link_crawl = crawl_link_for_companies_jobs(1)
  @data_companies = {}
  @data_companies_name = []
  @data_companies_address = []
  @data_companies_description = []
  link_crawl[0].each do |url|
    page = base_link(url)
    name = page.search(".company-info .info .content .name").text
    if name == ""
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end
    next if name == "" || address == "" || desc == ""

    @data_companies_name << name.to_s.rstrip
    @data_companies_address << address.to_s.rstrip
    @data_companies_description << desc
  end
  @data_companies_description.each do |text|
    text.to_s.delete!("[\n,\t,\r]")
    text.strip!
  end
  @data_companies[:name] = @data_companies_name
  @data_companies[:address] = @data_companies_address
  @data_companies[:description] = @data_companies_description
  @data_companies
end
# Fetches a careerbuilder.vn page and parses it with Nokogiri.
#
# url - site-relative path; it is appended to "https://careerbuilder.vn/" and
#       percent-escaped before the request is made.
#
# Returns the value of Nokogiri::HTML for the fetched page.
def self.base_link(url)
  # URI.escape was removed in Ruby 3.0; the default parser's #escape is the
  # routine it delegated to and is available on older versions as well.
  absolute_url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  Nokogiri::HTML(URI.open(URI.parse(absolute_url)))
end
# Scrapes one job page that uses the ".item-blue" / ".apply-now-content"
# layout and appends its fields to the accumulators prepared by make_data
# (@name, @company_name, ... each also published under the same key in @data).
#
# url - site-relative path of the job page.
#
# All values are extracted before anything is appended, so a selector that
# fails midway does not leave the parallel arrays with different lengths.
def self.crawl_data_jobs_interface_1(url)
  page = base_link(url)
  # Non-bang `delete`: `delete!` returns nil when it removes nothing, which
  # crashed every chained call on text without those characters.
  # NOTE(review): the set also removes literal '[', ',' and ']' — kept as in
  # the original; confirm intent.
  squash = ->(text) { text.to_s.delete("[\n,\t,\r]") }
  level_from = lambda do |selector|
    squash.(page.search(selector).text).lstrip.split('Cấp bậc')[1].to_s.strip
  end

  summary = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")
  industries = squash.(page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a").text).split(' ')
  # The level sits in the 3rd item of the last box, or in the 2nd when the
  # 3rd yields nothing.
  level = level_from.(".item-blue .detail-box:last ul li:nth-child(3)")
  level = level_from.(".item-blue .detail-box:last ul li:nth-child(2)") if level == ""
  experience = squash.(page.search(".item-blue .detail-box:last ul li:nth-child(2)").text)
                     .split('Kinh nghiệm')[1].to_s.strip

  record = {
    name: page.search(".apply-now-content .job-desc .title").text,
    company_name: page.search(".apply-now-content .job-desc .job-company-name").text,
    city_name: page.search(".detail-box .map p a").text,
    created_date: summary[0].text,
    expiration_date: squash.(page.search(".item-blue .detail-box ul li:last")[1].text).split(' ').last,
    salary: summary[1].text,
    industry_name: industries.join(','),
    description: squash.(page.search(".tabs .tab-content .detail-row:nth-child(n)").to_s),
    level: level,
    exprience: experience
  }
  # Each key names the accumulator ivar (@name, @salary, ...): publish that
  # array under the key in @data and append the new value to it.
  record.each { |field, value| (@data[field] = instance_variable_get("@#{field}")) << value }
end
# Scrapes one job page that uses the ".top-job" / ".info li" layout and
# appends its fields to the accumulators prepared by make_data (@name,
# @company_name, ... each also published under the same key in @data).
#
# url - site-relative path of the job page.
#
# This layout has no creation date, so an empty string is stored for it.
# All values are extracted before anything is appended, so a selector that
# fails midway does not leave the parallel arrays with different lengths.
def self.crawl_data_jobs_interface_2(url)
  page = base_link(url)
  # Non-bang `delete`: `delete!` returns nil when it removes nothing, which
  # crashed every chained call on text without those characters.
  # NOTE(review): the set also removes literal '[', ',' and ']' — kept as in
  # the original; confirm intent.
  squash = ->(text) { text.to_s.delete("[\n,\t,\r]") }
  # Text following the last occurrence of `label`; "" when the text is empty.
  after_label = ->(text, label) { text.split(label).last.to_s.strip }

  record = {
    name: page.search(".apply-now-content .job-desc .title").text,
    company_name: page.search(".top-job .top-job-info .tit_company").text,
    city_name: page.search(".info-workplace .value a").text,
    created_date: "",
    expiration_date: squash.(page.search(".info li:nth-child(4)").text).split(' ').last,
    salary: after_label.(page.search(".info li:nth-child(3)").text, "Lương"),
    industry_name: page.search(".info li:nth-child(5) .value").text,
    description: squash.(page.search(".left-col").to_s),
    level: after_label.(squash.(page.search(".boxtp .info li:nth-child(2)").text).lstrip, 'Cấp bậc'),
    exprience: after_label.(squash.(page.search(".info li:nth-child(6)").text), 'Kinh nghiệm')
  }
  # Each key names the accumulator ivar (@name, @salary, ...): publish that
  # array under the key in @data and append the new value to it.
  record.each { |field, value| (@data[field] = instance_variable_get("@#{field}")) << value }
end
# Scrapes one job page that uses the ".intro_job" / ".DetailJobNew .info"
# layout and appends its fields to the accumulators prepared by make_data
# (@name, @company_name, ... each also published under the same key in @data).
#
# url - site-relative path of the job page.
#
# This layout has no creation date, so an empty string is stored for it.
def self.crawl_data_jobs_interface_3(url)
  page = base_link(url)
  record = {
    name: page.search(".intro_job h1").text,
    company_name: page.search(".info-company .text-job h2").text,
    city_name: page.search(".DetailJobNew ul li:nth-child(1) a").text,
    created_date: "",
    expiration_date: page.search(".DetailJobNew .info ul li:nth-child(3) p").text.strip,
    salary: page.search(".DetailJobNew .salary ul li:nth-child(3) p").text.strip,
    industry_name: page.search(".DetailJobNew .salary ul li:nth-child(2) p").text.strip,
    # Non-bang `delete`: `delete!` returns nil when it removes nothing, which
    # stored nil instead of the markup.
    # NOTE(review): the set also removes literal '[', ',' and ']' — kept as
    # in the original; confirm intent.
    description: page.search(".content_job .detail-row").to_s.delete("[\n,\t,\r]"),
    level: page.search(".DetailJobNew .info ul li:nth-child(2) p").text.strip,
    exprience: page.search(".DetailJobNew .info ul li:nth-child(1) p").text.strip
  }
  # Each key names the accumulator ivar (@name, @salary, ...): publish that
  # array under the key in @data and append the new value to it.
  record.each { |field, value| (@data[field] = instance_variable_get("@#{field}")) << value }
end
# Scrapes one job page that uses the eight-item ".DetailJobNew" layout and
# appends its fields to the accumulators prepared by make_data (@name,
# @company_name, ... each also published under the same key in @data).
#
# url - site-relative path of the job page.
#
# The company name comes from ".zone-company" when that block has text and
# from ".info-company" otherwise; the industry selector changes with it.
# This layout provides neither creation date nor experience, so empty strings
# are stored for both.
def self.crawl_data_jobs_interface_4(url)
  page = base_link(url)
  zone_company = page.search(".zone-company .text-job h2").text
  if zone_company == ""
    company = page.search(".info-company .text-job h2").text
    industry = page.search(".DetailJobNew li:nth-child(3) span").text.strip
  else
    company = zone_company.strip
    # Joined with ',' like crawl_data_jobs_interface_1 does: consumers split
    # the industry String on commas, and the Array pushed previously made
    # them raise. Non-bang `delete` avoids the nil that `delete!` returns
    # when it removes nothing.
    industry = page.search(".DetailJobNew li:nth-child(3) span a").text
                   .delete("[\n,\t,\r]").split(' ').join(',')
  end

  record = {
    name: page.search(".info-company h1").text,
    company_name: company,
    industry_name: industry,
    city_name: page.search(".DetailJobNew ul li:nth-child(1) a").text,
    created_date: "",
    expiration_date: page.search(".DetailJobNew li:nth-child(7) span").text,
    salary: page.search(".DetailJobNew li:nth-child(6) span").text,
    # NOTE(review): the set also removes literal '[', ',' and ']' — kept as
    # in the original; confirm intent.
    description: page.search(".left-col").to_s.delete("[\n,\t,\r]"),
    level: page.search(".DetailJobNew ul li:nth-child(2) span").text,
    exprience: ""
  }
  # Each key names the accumulator ivar (@name, @salary, ...): publish that
  # array under the key in @data and append the new value to it.
  record.each { |field, value| (@data[field] = instance_variable_get("@#{field}")) << value }
end
# Scrapes one job page that uses the ten-item ".DetailJobNew" layout and
# appends its fields to the accumulators prepared by make_data (@name,
# @company_name, ... each also published under the same key in @data).
#
# url - site-relative path of the job page.
#
# This layout has no creation date, so an empty string is stored for it.
def self.crawl_data_jobs_interface_5(url)
  page = base_link(url)
  record = {
    name: page.search(".info-company h1").text,
    company_name: page.search(".info-company .text-job h2").text,
    city_name: page.search(".DetailJobNew ul li:nth-child(1) a").text,
    created_date: "",
    expiration_date: page.search(".DetailJobNew li:nth-child(9) span").text.strip,
    salary: page.search(".DetailJobNew li:nth-child(3) span").text.strip,
    industry_name: page.search(".DetailJobNew li:nth-child(2) span").text.strip,
    # Non-bang `delete`: `delete!` returns nil when it removes nothing, which
    # stored nil instead of the markup.
    # NOTE(review): the set also removes literal '[', ',' and ']' — kept as
    # in the original; confirm intent.
    description: page.search(".left-col .detail-row").to_s.delete("[\n,\t,\r]"),
    level: page.search(".DetailJobNew ul li:nth-child(6) span").text.strip,
    exprience: page.search(".DetailJobNew li:nth-child(5) span").text.strip
  }
  # Each key names the accumulator ivar (@name, @salary, ...): publish that
  # array under the key in @data and append the new value to it.
  record.each { |field, value| (@data[field] = instance_variable_get("@#{field}")) << value }
end
# Crawls every job link found on the first listing page and collects the
# scraped fields.
#
# The accumulators (@name, @company_name, ...) and @data are reset first.
# Each job page is then fetched to detect which of the five known layouts it
# uses, and the matching crawl_data_jobs_interface_N method appends the job's
# fields.
#
# NOTE(review): each interface method fetches the page again through
# base_link, so every job page is requested twice.
#
# Returns @data, a Hash of parallel Arrays filled in by the interface
# methods.
def self.make_data
  @data = {}
  @name = []
  @company_name = []
  @level = []
  @exprience = []
  @salary = []
  @created_date = []
  @expiration_date = []
  @description = []
  @industry_name = []
  @city_name = []
  link_crawl = crawl_link_for_companies_jobs(1)
  link_crawl[1].each do |path|
    # Same fetch as before, now through the shared helper instead of an
    # inline copy of it.
    page = base_link(path)
    if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0]
      crawl_data_jobs_interface_1(path)
    elsif page.search(".info li:nth-child(6)").text != ""
      crawl_data_jobs_interface_2(path)
    elsif page.search(".DetailJobNew ul li").size == 10
      crawl_data_jobs_interface_5(path)
    elsif page.search(".DetailJobNew ul li").size == 8
      crawl_data_jobs_interface_4(path)
    else
      crawl_data_jobs_interface_3(path)
    end
  end
  @data
end
end
\ No newline at end of file
require 'helper/crawler' require 'src/crawler'
namespace :db do namespace :db do
task populate: :environment do task populate: :environment do
Clawler.make_cities Clawler.make_cities
Clawler.make_industries Clawler.make_industries
Clawler.make_companies Clawler.make_companies
Clawler.make_jobs
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment