Commit f54f8365 by Mai Hoang Thai Ha

created sample rake task for crawler

parent c4f21c57
......@@ -57,3 +57,5 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
gem 'slim-rails', '~> 3.2'
# HTML parser (Nokogiri::HTML) and HTTP client (HTTParty.get) used by the
# job:web_job_import crawler rake task.
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
\ No newline at end of file
......@@ -83,6 +83,9 @@ GEM
ffi (1.15.3)
globalid (0.4.2)
activesupport (>= 4.2.0)
httparty (0.18.1)
mime-types (~> 3.0)
multi_xml (>= 0.5.2)
i18n (1.8.10)
concurrent-ruby (~> 1.0)
jbuilder (2.11.2)
......@@ -97,9 +100,13 @@ GEM
mini_mime (>= 0.1.1)
marcel (1.0.1)
method_source (0.9.2)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2021.0704)
mini_mime (1.1.0)
minitest (5.14.4)
msgpack (1.4.2)
multi_xml (0.6.0)
nio4r (2.5.7)
nokogiri (1.11.7-x86_64-linux)
racc (~> 1.4)
......@@ -219,8 +226,10 @@ DEPENDENCIES
bootsnap (>= 1.4.4)
byebug
capybara (>= 3.26)
httparty (~> 0.18.1)
jbuilder (~> 2.7)
listen (~> 3.3)
nokogiri (~> 1.11, >= 1.11.7)
pry-nav (~> 0.3.0)
pry-rails (~> 0.3.9)
puma (~> 5.0)
......
require 'open-uri'
require 'csv'
require 'zip'
namespace :job do
  # Sample crawler task: downloads one hard-coded careerbuilder.vn job posting,
  # extracts its fields with CSS selectors and prints the resulting hash.
  #
  # Fields that cannot be found on the page are left as empty strings (scalar
  # fields) or empty arrays (list fields). Nothing is persisted.
  #
  # Usage: rake job:web_job_import
  desc "import job"
  task web_job_import: :environment do
    url = "https://careerbuilder.vn/vi/tim-viec-lam/nhan-vien-thiet-ke-thoi-trang.35B6D3AD.html"
    response = HTTParty.get(url)
    # Do not try to scrape an error page: every selector would silently miss.
    abort "Failed to fetch #{url}: HTTP #{response.code}" unless response.success?

    parsed_page = Nokogiri::HTML(response.body)
    job_desc = parsed_page.css('div.job-desc')
    job_detail = parsed_page.css('section.job-detail-content')

    # Title
    title = job_desc.css('h1.title').text
    company = job_desc.css('a.job-company-name').text

    # Info box
    info_box = job_detail.css('div.detail-box')
    # The first detail box holds the location; tolerate a page without one.
    city = info_box.first&.text.to_s.squish.remove("Địa điểm").strip

    # Label shown on the page => key in the resulting job hash. Order matters:
    # the first label contained in an item's text wins.
    info_labels = {
      "Ngày cập nhật" => :update_at,
      "Ngành nghề" => :industry,
      "Hình thức" => :type,
      "Lương" => :salary,
      "Kinh nghiệm" => :experience,
      "Cấp bậc" => :level,
      "Hết hạn nộp" => :expiration_date
    }
    # Every field starts as an empty string so that missing items never
    # surface as nil.
    details = info_labels.values.map { |key| [key, ''] }.to_h
    info_box.css('ul li').each do |item|
      text = item.text
      label, key = info_labels.find { |candidate, _| text.include?(candidate) }
      next unless label

      details[key] = text.squish.remove(label).strip
    end

    # Benefits
    benefit_list = job_detail.css('ul.welfare-list li').map { |item| item.text.strip }

    # Description / requirement: only the paragraphs of the row whose heading
    # matches belong to that section, so query the row, not the whole set.
    description_list = []
    requirement_list = []
    job_detail.css('div.detail-row').each do |row|
      row_text = row.text
      paragraphs = row.css('p').map { |para| para.text.strip }
      if row_text.include?("Mô tả Công việc")
        description_list.concat(paragraphs)
      elsif row_text.include?("Yêu Cầu Công Việc")
        requirement_list.concat(paragraphs)
      end
    end

    # Other info
    other_info_list = job_detail.css('div.content_fck ul li').map { |item| item.text.squish }

    job = {
      title: title,
      company: company,
      city: city,
      update_at: details[:update_at],
      industry: details[:industry],
      type: details[:type],
      salary: details[:salary],
      experience: details[:experience],
      level: details[:level],
      expiration_date: details[:expiration_date],
      benefit: benefit_list,
      description: description_list,
      requirement: requirement_list,
      other_info: other_info_list
    }

    # Print the result instead of dropping into a debugger, so the task can
    # run unattended and in environments where byebug is not loaded.
    pp job
  end
end
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment