fix crawler

parent fe91b8cf
every 1.day, at: '08:18 am' do every 1.day, at: '08:00 am' do
rake 'crawler:all' rake 'crawler:all'
end end
# Changes the companies.address column to the :text type.
#
# change_column cannot be reversed automatically, so a single `change`
# method would make this migration fail on rollback with
# ActiveRecord::IrreversibleMigration. Explicit up/down steps keep the
# forward migration identical while making rollback possible.
class ChangeCompanies < ActiveRecord::Migration[6.1]
  # Forward step: store addresses in a text column.
  def up
    change_column :companies, :address, :text
  end

  # Rollback step: return the column to a string type.
  # NOTE(review): assumes the column was a plain :string before this
  # migration (the schema dump suggests so) - confirm. Addresses longer than
  # the string limit may be truncated or rejected by the database on rollback.
  def down
    change_column :companies, :address, :string
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2021_07_20_145646) do ActiveRecord::Schema.define(version: 2021_07_23_035105) do
create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false t.bigint "user_id", null: false
...@@ -44,7 +44,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_145646) do ...@@ -44,7 +44,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_145646) do
t.text "description" t.text "description"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.string "address" t.text "address"
t.text "overview" t.text "overview"
end end
......
...@@ -4,6 +4,8 @@ require 'logger' ...@@ -4,6 +4,8 @@ require 'logger'
namespace :crawler do namespace :crawler do
desc 'Crawl Jobs and Companies' desc 'Crawl Jobs and Companies'
task jobs: :environment do task jobs: :environment do
logger = Logger.new("#{Rails.root}/log/crawler_jobs.log")
logger.info "Start crawler job at: #{Time.current}"
base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/')) base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
job_page = base_url.css('div.menu div.dropdown-menu ul li a')[0].attributes['href'].value job_page = base_url.css('div.menu div.dropdown-menu ul li a')[0].attributes['href'].value
parse_job_page = Nokogiri::HTML(URI.open(job_page)) parse_job_page = Nokogiri::HTML(URI.open(job_page))
...@@ -28,8 +30,8 @@ namespace :crawler do ...@@ -28,8 +30,8 @@ namespace :crawler do
company_name = company.css('div.company-info div.content p.name') company_name = company.css('div.company-info div.content p.name')
next if company_name.nil? next if company_name.nil?
logger = Logger.new("#{Rails.root}/log/crawler_jobs.log") logger.info("Link company: #{company_page}")
logger.info("Link company: #{company_page}.to_s")
company_info = company.css('div.company-info div.content') company_info = company.css('div.company-info div.content')
address = company_info.css('p')[1].try(:text) address = company_info.css('p')[1].try(:text)
description = company_info.css('ul li').text description = company_info.css('ul li').text
...@@ -51,22 +53,22 @@ namespace :crawler do ...@@ -51,22 +53,22 @@ namespace :crawler do
next if title.nil? next if title.nil?
logger.info("Link job: #{job_detail_page}") logger.info("Link job: #{job_detail_page}")
salary, experience, type, level, expired_at = '' salary, experience, type, level, expired_at = ''
detail_content = detail_job.css('div.col-lg-4 col-sm-6 item-blue ul li') detail_content = detail_job.css('div.row div.detail-box.has-background ul li')
detail_content.each do |content| detail_content.each do |content|
case content.css('strong').text case content.css('strong').text
when 'Lương' when 'Lương'
salary = content.css('p').text salary = content.css('p').text
when 'Kinh nghiệm' when 'Kinh nghiệm'
puts content.css('p').text experience = content.css('p').text
when 'Hình thức' when 'Hình thức'
puts content.css('p').text type = content.css('p').text
when 'Cấp bậc' when 'Cấp bậc'
level = content.css('p').text level = content.css('p').text
when 'Hết hạn nộp' when 'Hết hạn nộp'
expired_at = content.css('p').text expired_at = content.css('p').text
end end
end end
benefits, overview, requirement, other_requirement = '' benefits, overview, requirement, other_requirement = ''
detail_require = detail_job.css('div.detail-row') detail_require = detail_job.css('div.detail-row')
...@@ -97,26 +99,25 @@ namespace :crawler do ...@@ -97,26 +99,25 @@ namespace :crawler do
company_id: Company.find_by(name: company_name.text).id company_id: Company.find_by(name: company_name.text).id
) )
job_industries = []
industries = detail_job.css('div.detail-box.has-background ul li p a') industries = detail_job.css('div.detail-box.has-background ul li p a')
industries.each do |industry| industries.each do |industry|
name = industry.text.squish name = industry.text.squish
industry_name = Industry.find_or_create_by( job_industries << Industry.find__by(name: name)
name: name
)
job.industries << industry_name
end end
job.industries << job_industries
job_cities = []
location = detail_job.css('div.map p a') location = detail_job.css('div.map p a')
location.each do |city| location.each do |city|
name = city.text name = city.text
city_name = City.find_or_create_by( job_cities << City.find_by(name: name)
name: name
)
job.cities << city_name
end end
job.cities << job_cities unless job_cities.nil?
end end
page += 1 page += 1
end end
logger.info "End crawler job at: #{Time.current}"
end end
desc 'Crawl Industries' desc 'Crawl Industries'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment