Commit d71790ab by phuctmZigexn

Merge branch 'Task/6_create_crawler' into 'master'

Created a sample rake task for the crawler

See merge request !4
parents ca4035a9 135b4c82
...@@ -31,8 +31,9 @@ gem 'bootsnap', '>= 1.4.4', require: false ...@@ -31,8 +31,9 @@ gem 'bootsnap', '>= 1.4.4', require: false
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call 'byebug' anywhere in the code to stop execution and get a debugger console
gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
gem 'pry-rails', '~> 0.3.9' gem 'pry', '~> 0.14.1'
gem 'pry-nav', '~> 0.3.0' # gem 'pry-nav'
gem 'pry-rails'
end end
group :development do group :development do
...@@ -44,6 +45,7 @@ group :development do ...@@ -44,6 +45,7 @@ group :development do
gem 'listen', '~> 3.3' gem 'listen', '~> 3.3'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring' gem 'spring'
gem 'rubocop-rails', '~> 2.11', '>= 2.11.3'
end end
group :test do group :test do
...@@ -57,3 +59,5 @@ end ...@@ -57,3 +59,5 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem # Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
gem 'slim-rails', '~> 3.2' gem 'slim-rails', '~> 3.2'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'httparty', '~> 0.18.1'
\ No newline at end of file
...@@ -62,6 +62,7 @@ GEM ...@@ -62,6 +62,7 @@ GEM
zeitwerk (~> 2.3) zeitwerk (~> 2.3)
addressable (2.8.0) addressable (2.8.0)
public_suffix (>= 2.0.2, < 5.0) public_suffix (>= 2.0.2, < 5.0)
ast (2.4.2)
bindex (0.8.1) bindex (0.8.1)
bootsnap (1.7.5) bootsnap (1.7.5)
msgpack (~> 1.0) msgpack (~> 1.0)
...@@ -83,11 +84,14 @@ GEM ...@@ -83,11 +84,14 @@ GEM
ffi (1.15.3) ffi (1.15.3)
globalid (0.4.2) globalid (0.4.2)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
httparty (0.18.1)
mime-types (~> 3.0)
multi_xml (>= 0.5.2)
i18n (1.8.10) i18n (1.8.10)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
jbuilder (2.11.2) jbuilder (2.11.2)
activesupport (>= 5.0.0) activesupport (>= 5.0.0)
listen (3.5.1) listen (3.6.0)
rb-fsevent (~> 0.10, >= 0.10.3) rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10) rb-inotify (~> 0.9, >= 0.9.10)
loofah (2.10.0) loofah (2.10.0)
...@@ -96,19 +100,24 @@ GEM ...@@ -96,19 +100,24 @@ GEM
mail (2.7.1) mail (2.7.1)
mini_mime (>= 0.1.1) mini_mime (>= 0.1.1)
marcel (1.0.1) marcel (1.0.1)
method_source (0.9.2) method_source (1.0.0)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2021.0704)
mini_mime (1.1.0) mini_mime (1.1.0)
minitest (5.14.4) minitest (5.14.4)
msgpack (1.4.2) msgpack (1.4.2)
multi_xml (0.6.0)
mysql2 (0.5.3) mysql2 (0.5.3)
nio4r (2.5.7) nio4r (2.5.7)
nokogiri (1.11.7-x86_64-linux) nokogiri (1.11.7-x86_64-linux)
racc (~> 1.4) racc (~> 1.4)
pry (0.12.2) parallel (1.20.1)
coderay (~> 1.1.0) parser (3.0.2.0)
method_source (~> 0.9.0) ast (~> 2.4.1)
pry-nav (0.3.0) pry (0.14.1)
pry (>= 0.9.10, < 0.13.0) coderay (~> 1.1)
method_source (~> 1.0)
pry-rails (0.3.9) pry-rails (0.3.9)
pry (>= 0.10.4) pry (>= 0.10.4)
public_suffix (4.0.6) public_suffix (4.0.6)
...@@ -148,11 +157,29 @@ GEM ...@@ -148,11 +157,29 @@ GEM
method_source method_source
rake (>= 0.13) rake (>= 0.13)
thor (~> 1.0) thor (~> 1.0)
rake (13.0.3) rainbow (3.0.0)
rake (13.0.6)
rb-fsevent (0.11.0) rb-fsevent (0.11.0)
rb-inotify (0.10.1) rb-inotify (0.10.1)
ffi (~> 1.0) ffi (~> 1.0)
regexp_parser (2.1.1) regexp_parser (2.1.1)
rexml (3.2.5)
rubocop (1.18.3)
parallel (~> 1.10)
parser (>= 3.0.0.0)
rainbow (>= 2.2.2, < 4.0)
regexp_parser (>= 1.8, < 3.0)
rexml
rubocop-ast (>= 1.7.0, < 2.0)
ruby-progressbar (~> 1.7)
unicode-display_width (>= 1.4.0, < 3.0)
rubocop-ast (1.8.0)
parser (>= 3.0.1.1)
rubocop-rails (2.11.3)
activesupport (>= 4.2.0)
rack (>= 1.1)
rubocop (>= 1.7.0, < 2.0)
ruby-progressbar (1.11.0)
rubyzip (2.3.2) rubyzip (2.3.2)
sass-rails (6.0.0) sass-rails (6.0.0)
sassc-rails (~> 2.1, >= 2.1.1) sassc-rails (~> 2.1, >= 2.1.1)
...@@ -171,7 +198,7 @@ GEM ...@@ -171,7 +198,7 @@ GEM
slim (4.1.0) slim (4.1.0)
temple (>= 0.7.6, < 0.9) temple (>= 0.7.6, < 0.9)
tilt (>= 2.0.6, < 2.1) tilt (>= 2.0.6, < 2.1)
slim-rails (3.2.0) slim-rails (3.3.0)
actionpack (>= 3.1) actionpack (>= 3.1)
railties (>= 3.1) railties (>= 3.1)
slim (>= 3.0, < 5.0) slim (>= 3.0, < 5.0)
...@@ -191,6 +218,7 @@ GEM ...@@ -191,6 +218,7 @@ GEM
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (2.0.4) tzinfo (2.0.4)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
unicode-display_width (2.0.0)
web-console (4.1.0) web-console (4.1.0)
actionview (>= 6.0.0) actionview (>= 6.0.0)
activemodel (>= 6.0.0) activemodel (>= 6.0.0)
...@@ -219,14 +247,17 @@ DEPENDENCIES ...@@ -219,14 +247,17 @@ DEPENDENCIES
bootsnap (>= 1.4.4) bootsnap (>= 1.4.4)
byebug byebug
capybara (>= 3.26) capybara (>= 3.26)
httparty (~> 0.18.1)
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (~> 3.3) listen (~> 3.3)
mysql2 (~> 0.5) mysql2 (~> 0.5)
pry-nav (~> 0.3.0) nokogiri (~> 1.11, >= 1.11.7)
pry-rails (~> 0.3.9) pry (~> 0.14.1)
pry-rails
puma (~> 5.0) puma (~> 5.0)
rack-mini-profiler (~> 2.0) rack-mini-profiler (~> 2.0)
rails (~> 6.1.3, >= 6.1.3.2) rails (~> 6.1.3, >= 6.1.3.2)
rubocop-rails (~> 2.11, >= 2.11.3)
sass-rails (~> 6.0) sass-rails (~> 6.0)
selenium-webdriver selenium-webdriver
slim-rails (~> 3.2) slim-rails (~> 3.2)
......
class City < ApplicationRecord class City < ApplicationRecord
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
enum region: { vietnam: 0, international: 1 }
end end
class CreateJobs < ActiveRecord::Migration[6.1] class CreateJobs < ActiveRecord::Migration[6.1]
def change def change
create_table :jobs do |t| create_table :jobs do |t|
t.string :title t.string :title, null: false
t.string :type t.string :job_type
t.string :salary t.string :salary
t.string :experience t.string :experience
t.string :position t.string :position
......
...@@ -105,8 +105,8 @@ ActiveRecord::Schema.define(version: 2021_07_20_055614) do ...@@ -105,8 +105,8 @@ ActiveRecord::Schema.define(version: 2021_07_20_055614) do
end end
create_table "jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title" t.string "title", null: false
t.string "type" t.string "job_type"
t.string "salary" t.string "salary"
t.string "experience" t.string "experience"
t.string "position" t.string "position"
......
require 'open-uri'
namespace :crawler do
# Crawls job postings from CareerBuilder and stores each one together with its
# company, industries and cities.
#
# Usage:
#   rails crawler:jobs TYPE=TEST   # crawl only the first 5 listing pages
#   rails crawler:jobs TYPE=ALL    # crawl every listing page
#
# Progress is written to log/job_crawler.log. A posting that raises while it
# is being fetched, parsed or saved is reported and skipped, so one bad
# posting does not stop the crawl.
desc 'crawler from CareerBuilder'
task jobs: :environment do
  unless %w[ALL TEST].include?(ENV['TYPE'])
    abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
  end

  logger = Logger.new("#{Rails.root}/log/job_crawler.log")
  logger.info "Start crawler job at: #{Time.current}"

  total_pages = 5 # default = TEST
  if ENV['TYPE'] == 'ALL'
    first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
    jobs_per_page = first_page.css('div.job-item').count
    # Keep only the digits of the "jobs found" counter text.
    total_jobs = first_page.css('.search-result-list .job-found-amout p').text.tr('^0-9', '').to_i

    if jobs_per_page.zero?
      # Dividing by zero would raise FloatDomainError further down; stop with
      # a readable message instead.
      logger.error 'No job item found on the first listing page'
      abort 'Could not find any job on the first listing page, so the number of pages is unknown.'
    end

    # Round up so that a last, partially filled page is crawled as well.
    total_pages = (total_jobs.to_f / jobs_per_page).ceil
  end

  (1..total_pages).each do |page|
    parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
    logger.info("Page: #{page}")

    parsed_page.css('div.job-item .job_link').each do |item|
      # `||=` keeps both values when `retry` re-runs this block body, so the
      # retry counter is not reset and the re-encoded URL is the one fetched.
      retries ||= 0
      url ||= item.attribute('href').text
      logger.info("job link: #{url}")
      job_page = Nokogiri::HTML(HTTParty.get(url).body)

      # Job
      job_title = job_page.css('div.job-desc h1.title').text
      if job_title.blank?
        logger.info 'Remove this job because title is empty'
        next
      end

      # job_industries, job_type, salary, experience, level, expiration_date
      job_type = salary = experience = level = expiration_date = nil
      industries = []
      job_page.css('.job-detail-content .detail-box ul li').each do |info_item|
        key = info_item.css('strong').text.strip
        default_value = info_item.css('p').text.squish
        # The "Ngày cập nhật" (updated on) entry is not persisted, so it is
        # not parsed here.
        case key
        when 'Ngành nghề'
          industries = default_value.split(' , ')
        when 'Hình thức'
          job_type = default_value
        when 'Lương'
          salary = default_value
        when 'Kinh nghiệm'
          experience = default_value
        when 'Cấp bậc'
          level = default_value
        when 'Hết hạn nộp'
          expiration_date = default_value.to_time
        end
      end

      # benefits, description, requirement, other_info
      # A missing benefits section is stored as '' and the other sections as
      # nil; these defaults are part of the find_or_create_by lookup below.
      benefits = ''
      description = requirement = other_info = nil
      job_page.css('section.job-detail-content div.detail-row').each do |detail_row|
        detail_title = detail_row.css('.detail-title').text.strip
        detail_content = detail_row.css(':not(h3.detail-title)')
        case detail_title
        when 'Phúc lợi'
          benefits = detail_row.css('ul.welfare-list li').map(&:text).map(&:squish).join('---')
        when 'Mô tả Công việc'
          description = detail_content.inner_html
        when 'Yêu Cầu Công Việc'
          requirement = detail_content.inner_html
        when 'Thông tin khác'
          other_info = detail_row.css('.content_fck ul li').map(&:text).map(&:squish).join('---')
        end
      end

      # Company
      company_name = job_page.css('div.job-desc a.job-company-name').text
      company_object = Company.find_or_create_by(name: company_name)

      job_object = Job.find_or_create_by(title: job_title,
                                         job_type: job_type,
                                         salary: salary,
                                         experience: experience,
                                         position: level,
                                         expiration_date: expiration_date,
                                         description: description,
                                         benefit: benefits,
                                         requirement: requirement,
                                         other_info: other_info,
                                         company_id: company_object.id)

      # Industries
      # Append only records that are not linked yet: `<<` adds every record it
      # is given, so re-crawling an existing job would duplicate its links.
      industry_objects = industries.uniq.map { |industry| Industry.find_or_create_by(name: industry) }
      job_object.industries << (industry_objects - job_object.industries.to_a)

      # Cities
      cities = job_page.css('.job-detail-content .detail-box .map p a').map(&:text)
      city_objects = cities.uniq.map { |city| City.find_or_create_by(name: city) }
      job_object.cities << (city_objects - job_object.cities.to_a)
    rescue URI::InvalidURIError => e
      puts "[Error] #{e.message}"
      logger.error "URI must be ascii only : #{url}"
      # Percent-encode the part after the common prefix and try once more.
      encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
      url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
      retry if (retries += 1) < 2
    rescue StandardError => e
      # Report the failure and move on to the next posting.
      logger.error "Failed to crawl #{url}: #{e.class}: #{e.message}"
      puts e.message
      puts e.backtrace.inspect
    end
  end

  logger.info "Finished at: #{Time.current}"
end
# Seeds the Industry table with the industry names linked from
# CareerBuilder's job-search landing page. Names that already exist are left
# untouched.
desc 'crawler industry form CareerBuilder'
task industries: :environment do
  response = HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html')
  document = Nokogiri::HTML(response.body)
  industry_links = document.css('div.list-of-working-positions ul.list-jobs li a')

  # squish already trims both ends and collapses inner whitespace runs.
  industry_links.map { |link| link.text.squish }.each do |industry_name|
    Industry.find_or_create_by(name: industry_name)
  end
end
# Seeds the City table from the "jobs by location" list on CareerBuilder's
# job-search landing page.
#
# Entries whose text starts with 'Việc làm tại' ("Jobs in") are stored with
# that prefix removed and region :vietnam; every other entry is stored as
# :international.
desc 'crawler city form CareerBuilder'
task cities: :environment do
  parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)

  parsed_page.css('div.main-jobs-by-location ul li').each do |city|
    # Normalise whitespace first so surrounding newlines or indentation in the
    # markup cannot defeat the prefix test below.
    city_name = city.text.squish
    region = :international
    if city_name.start_with?('Việc làm tại')
      city_name = city_name.remove('Việc làm tại').strip
      region = :vietnam
    end
    next if city_name.blank?

    # Look the city up by name only (the same key crawler:jobs uses) and then
    # set its region, so a city first created without a region is updated
    # rather than inserted a second time.
    City.find_or_initialize_by(name: city_name).update(region: region)
  end
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment