Commit b8abb807 by Mai Hoang Thai Ha

add logger

parent 8328deb3
class CreateJobs < ActiveRecord::Migration[6.1] class CreateJobs < ActiveRecord::Migration[6.1]
def change def change
create_table :jobs do |t| create_table :jobs do |t|
t.string :title t.string :title, null: false
t.string :job_type t.string :job_type
t.string :salary t.string :salary
t.string :experience t.string :experience
......
...@@ -105,7 +105,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_055614) do ...@@ -105,7 +105,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_055614) do
end end
create_table "jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title" t.string "title", null: false
t.string "job_type" t.string "job_type"
t.string "salary" t.string "salary"
t.string "experience" t.string "experience"
......
...@@ -7,6 +7,8 @@ namespace :crawler do ...@@ -7,6 +7,8 @@ namespace :crawler do
unless %w[ALL TEST].include?(ENV['TYPE']) unless %w[ALL TEST].include?(ENV['TYPE'])
abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.' abort 'Do you want to crawl all pages (ALL) or some pages (TEST)? Please ONLY pass ONE argument.'
end end
logger = Logger.new("#{Rails.root}/log/job_crawler.log")
logger.info "Start crawler job at: #{Time.current}"
total_pages = 5 # default = TEST total_pages = 5 # default = TEST
if ENV['TYPE'] == 'ALL' if ENV['TYPE'] == 'ALL'
first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body) first_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html').body)
...@@ -16,10 +18,12 @@ namespace :crawler do ...@@ -16,10 +18,12 @@ namespace :crawler do
end end
(1..total_pages).each do |page| (1..total_pages).each do |page|
parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body) parsed_page = Nokogiri::HTML(HTTParty.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html").body)
logger.info("Page: #{page}")
jobs_item = parsed_page.css('div.job-item .job_link') jobs_item = parsed_page.css('div.job-item .job_link')
jobs_item.each do |item| jobs_item.each do |item|
retries ||= 0 retries ||= 0
url ||= item.attribute('href').text url ||= item.attribute('href').text
logger.info("job link: #{url}")
job_page = Nokogiri::HTML(HTTParty.get(url).body) job_page = Nokogiri::HTML(HTTParty.get(url).body)
# Job # Job
job_title = job_page.css('div.job-desc h1.title').text job_title = job_page.css('div.job-desc h1.title').text
...@@ -88,6 +92,7 @@ namespace :crawler do ...@@ -88,6 +92,7 @@ namespace :crawler do
job_object.cities << city_objects job_object.cities << city_objects
rescue URI::InvalidURIError => e rescue URI::InvalidURIError => e
puts "[Error] #{e.message}" puts "[Error] #{e.message}"
logger.error "URI must be ascii only : #{url}"
encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/')) encode_url = CGI.escape(url.remove('https://careerbuilder.vn/vi/tim-viec-lam/'))
url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}" url = "https://careerbuilder.vn/vi/tim-viec-lam/#{encode_url}"
retry if (retries += 1) < 2 retry if (retries += 1) < 2
...@@ -96,6 +101,7 @@ namespace :crawler do ...@@ -96,6 +101,7 @@ namespace :crawler do
puts e.backtrace.inspect puts e.backtrace.inspect
end end
end end
logger.info "Finished at: #{Time.current}"
end end
desc 'crawler industry form CareerBuilder' desc 'crawler industry form CareerBuilder'
...@@ -110,7 +116,7 @@ namespace :crawler do ...@@ -110,7 +116,7 @@ namespace :crawler do
desc 'crawler city form CareerBuilder' desc 'crawler city form CareerBuilder'
task cities: :environment do task cities: :environment do
parsed_page ||= Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body) parsed_page = Nokogiri::HTML(HTTParty.get('https://careerbuilder.vn/tim-viec-lam.html').body)
list_location = parsed_page.css('div.main-jobs-by-location ul li') list_location = parsed_page.css('div.main-jobs-by-location ul li')
list_location.each do |city| list_location.each do |city|
city_name = city.text city_name = city.text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment