Commit ea11d8e6 by Mai Hoang Thai Ha

rename namespace, use length instead of count,...

parent 423678ed
...@@ -2,18 +2,17 @@ require 'open-uri' ...@@ -2,18 +2,17 @@ require 'open-uri'
require 'csv' require 'csv'
require 'zip' require 'zip'
namespace :job do namespace :crawler do
desc 'importjob' desc 'importjob'
task web_job_crawler: :environment do task jobs: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
job_item = parsed_page.css('div.job-item') jobs_item = parsed_page.css('div.job-item .job_link')
(0..job_item.count - 1).each do |item| (0..jobs_item.length - 1).each do |item|
job_link = job_item[item].css('div.title a').attribute('href').text job_link = jobs_item[item].attribute('href').text
unparsed_job_link = HTTParty.get(job_link) job_page = Nokogiri::HTML(HTTParty.get(job_link).body)
parsed_job_link ||= Nokogiri::HTML(unparsed_job_link.body) job_desc = job_page.css('div.job-desc')
job_desc = parsed_job_link.css('div.job-desc') job_detail = job_page.css('section.job-detail-content')
job_detail = parsed_job_link.css('section.job-detail-content')
# title - company # title - company
title = job_desc.css('h1.title').text title = job_desc.css('h1.title').text
company = job_desc.css('a.job-company-name').text company = job_desc.css('a.job-company-name').text
...@@ -24,7 +23,7 @@ namespace :job do ...@@ -24,7 +23,7 @@ namespace :job do
# city, update_at, industry, type, salary, experience, level, expiration_date # city, update_at, industry, type, salary, experience, level, expiration_date
update_at, industry, type, salary, experience, level, expiration_date = '' update_at, industry, type, salary, experience, level, expiration_date = ''
city = city_box.text city = city_box.text
(0..info_box_item.count - 1).each do |part| (0..info_box_item.length - 1).each do |part|
info = info_box_item[part].text info = info_box_item[part].text
if info.include?(key = 'Ngày cập nhật') if info.include?(key = 'Ngày cập nhật')
update_at = info.squish.remove(key).strip update_at = info.squish.remove(key).strip
...@@ -47,13 +46,13 @@ namespace :job do ...@@ -47,13 +46,13 @@ namespace :job do
benefit_list = [] benefit_list = []
other_info_list = [] other_info_list = []
benefits = job_detail.css('ul.welfare-list li') benefits = job_detail.css('ul.welfare-list li')
(0..benefits.count - 1).each do |part| (0..benefits.length - 1).each do |part|
benefit = benefits[part].text.strip benefit = benefits[part].text.strip
benefit_list << benefit benefit_list << benefit
end end
# description, requirement # description, requirement
description, requirement = '' description, requirement = ''
(0..job_detail_row.count - 1).each do |part| (0..job_detail_row.length - 1).each do |part|
job_detail_text = job_detail_row[part].text job_detail_text = job_detail_row[part].text
if job_detail_text.include?('Mô tả Công việc') if job_detail_text.include?('Mô tả Công việc')
description = job_detail_text.partition('Mô tả Công việc').last.squish.strip description = job_detail_text.partition('Mô tả Công việc').last.squish.strip
...@@ -63,7 +62,7 @@ namespace :job do ...@@ -63,7 +62,7 @@ namespace :job do
end end
# benefit # benefit
other_info = job_detail.css('div.content_fck ul li') other_info = job_detail.css('div.content_fck ul li')
(0..other_info.count - 1).each do |part| (0..other_info.length - 1).each do |part|
info = other_info[part].text.squish.strip info = other_info[part].text.squish.strip
other_info_list << info other_info_list << info
end end
...@@ -89,11 +88,11 @@ namespace :job do ...@@ -89,11 +88,11 @@ namespace :job do
end end
desc 'crawler industry form CareerBuilder' desc 'crawler industry form CareerBuilder'
task web_industry_crawler: :environment do task industries: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a') list_job = parsed_page.css('div.list-of-working-positions ul.list-jobs li a')
industry_list = [] industry_list = []
(0..list_job.count - 1).each do |part| (0..list_job.length - 1).each do |part|
industry = list_job[part].text.squish.strip industry = list_job[part].text.squish.strip
industry_list << industry industry_list << industry
end end
...@@ -101,11 +100,11 @@ namespace :job do ...@@ -101,11 +100,11 @@ namespace :job do
end end
desc 'crawler city form CareerBuilder' desc 'crawler city form CareerBuilder'
task web_city_crawler: :environment do task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li') list_location = parsed_page.css('div.main-jobs-by-location ul li')
city_list = [] city_list = []
(0..list_location.count - 1).each do |part| (0..list_location.length - 1).each do |part|
city_item = list_location[part].text city_item = list_location[part].text
region = 1 region = 1
if city_item.include?(key = 'Việc làm tại') if city_item.include?(key = 'Việc làm tại')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment