import data

parent 20dff5d8
Pipeline #1353 failed with stages
in 0 seconds
require 'open-uri'
namespace :crawler do namespace :crawler do
desc "TODO" desc "TODO"
task jobs: :environment do task jobs: :environment do
...@@ -11,46 +12,46 @@ namespace :crawler do ...@@ -11,46 +12,46 @@ namespace :crawler do
last_page = (total.to_f / per_page.to_f).round last_page = (total.to_f / per_page.to_f).round
# Walks every listing page from `page` through `last_page` (both are set
# earlier in this task) and, for each job card, scrapes the job detail page
# and the employer's company page.
#
# Scraped values are collected into the `job` and `company` hashes. Plain
# locals assigned inside the `each` blocks would be block-local and lost as
# soon as the block ends.

# Label shown in the job summary box => key the value is stored under.
job_fact_keys = {
  "Lương" => :salary,
  "Kinh nghiệm" => :experience,
  "Cấp bậc" => :level,
  "Hết hạn nộp" => :expired_at
}

# Collapses every run of whitespace to one space and trims both ends.
squish = ->(text) { text.gsub(/\s+/, " ").strip }

while page <= last_page
  list_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"
  list_page = Nokogiri::HTML(URI.open(list_url))

  list_page.css('div.job-item').each do |job_item|
    links = job_item.css('a')
    # The second link of a card is used as the job URL and the first as the
    # company URL; skip cards that do not carry both.
    next unless links[0] && links[1]

    job_page = Nokogiri::HTML(URI.open(links[1].attributes["href"].value)).css('div.container')
    job = { title: job_page.css('div.job-desc h1.title')[0]&.text }

    job_page.css('div.detail-box.has-background ul li').each do |fact|
      key = job_fact_keys[fact.css('strong').text]
      job[key] = squish.call(fact.css('p').text) if key
    end

    job_page.css('div.detail-row').each do |section|
      case section.css('h3').text
      when "Mô tả Công việc"
        job[:overview] = squish.call(section.css('p').text)
      when "Yêu Cầu Công Việc"
        job[:requirement] = squish.call(section.css('p').text)
      when "Thông tin khác"
        job[:other_requirement] = squish.call(section.css('div.content_fck ul li').text)
      end
    end

    company_page = Nokogiri::HTML(URI.open(links[0].attributes["href"].value)).css('div.container')
    company_info = company_page.css('div.company-info div.info div.content')
    # Kept separate from `job` so the company overview no longer overwrites
    # the job overview.
    company = {
      name: company_page.css('div.company-info div.info div.content p.name').text,
      address: company_info.css('p')[1]&.text,
      description: company_info.css('ul li').text,
      overview: squish.call(company_page.css('div.row div.content p').text)
    }

    # NOTE(review): `job` and `company` are not persisted here; the target
    # models and their columns are not visible in this file, so confirm them
    # before adding the save step.
  end

  page += 1
end
...@@ -60,20 +61,45 @@ namespace :crawler do ...@@ -60,20 +61,45 @@ namespace :crawler do
desc "TODO"
# Prints each industry name found in the "list of working positions" columns
# of the base page.
#
# Each `li` is read on its own: calling `text` on the whole node set would
# return every item of a column concatenated into one string.
# NOTE(review): this task only prints; nothing is written to the database
# even though the message says "Added".
task industries: :environment do
  columns = parse_base_url.css('div.container div.list-of-working-positions div.col-md-6.col-lg-4.cus-col')
  columns.each do |column|
    column.css('ul.list-jobs li').each do |item|
      industry_name = item.text.gsub(/\s+/, ' ').strip
      next if industry_name.empty?

      puts "Added: #{industry_name}"
    end
  end
end
desc "TODO"
# Creates a City record for every domestic and overseas location link on the
# base page, and makes sure the matching Region record exists.
#
# NOTE(review): the region is only looked up or created; it is not linked to
# the city because the City columns are not visible here. Confirm whether a
# region reference should be passed to `City.find_or_create_by`.
task cities: :environment do
  base_page = parse_base_url

  sources = [
    { region: 'Trong nước',
      selector: 'div.container div.col-xl-3 div.main-jobs-by-location div.jobs-in-country li a',
      prefix: 'Việc làm tại' },
    { region: 'Nước Ngoài',
      selector: 'div.container div.overseas-jobs li a',
      prefix: nil }
  ]

  sources.each do |source|
    links = base_page.css(source[:selector])
    next if links.empty?

    # One lookup per group instead of one per city.
    Region.find_or_create_by(name: source[:region])

    links.each do |link|
      raw_name = source[:prefix] ? link.text.gsub(source[:prefix], '') : link.text
      # Removing the prefix leaves surrounding whitespace behind; trim it so
      # the stored name is clean.
      city_name = raw_name.strip
      next if city_name.empty?

      City.find_or_create_by(name: city_name)
      puts "Added: #{city_name}"
    end
  end
end
desc "TODO"
# Creates a Region record for each heading in the jobs-by-location panel of
# the base page, using the heading text with 'Việc Làm' removed.
task regions: :environment do
  headings = parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location h3')
  headings.each do |heading|
    # The assignment was missing its `=`, which made this line a call to an
    # undefined `region_name` method.
    region_name = heading.text.gsub('Việc Làm', '').strip
    next if region_name.empty?

    Region.find_or_create_by(name: region_name)
  end
end
def parse_base_url def parse_base_url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment