Commit ed8bc042 by Tô Ngọc Ánh

Merge branch 'crawler' into 'master'

refactor code

See merge request !4
parents 735a713b 102be3eb
Pipeline #698 failed with stages
in 0 seconds
......@@ -64,4 +64,5 @@ gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
##
gem "nokogiri"
gem 'whenever', require: false
##
......@@ -64,6 +64,7 @@ GEM
chromedriver-helper (2.1.1)
archive-zip (~> 0.10)
nokogiri (~> 1.8)
chronic (0.10.2)
coffee-rails (4.2.2)
coffee-script (>= 2.2.0)
railties (>= 4.0.0)
......@@ -187,6 +188,8 @@ GEM
websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0)
nokogiri (~> 1.8)
......@@ -214,6 +217,7 @@ DEPENDENCIES
tzinfo-data
uglifier (>= 1.3.0)
web-console (>= 3.3.0)
whenever
RUBY VERSION
ruby 2.6.6p146
......
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Example:
#
# set :output, "/path/to/my/cron_log.log"
#
# every 2.hours do
# command "/usr/bin/some_great_command"
# runner "MyModel.some_method"
# rake "some:great:rake:task"
# end
#
# every 4.days do
# runner "AnotherModel.prune_old_records"
# end
# Learn more: http://github.com/javan/whenever
# Hand the PATH of the process evaluating this schedule to whenever's `env`
# setting, so the generated cron jobs get the same PATH value.
env(:PATH, ENV['PATH'])

# Daily job at 12:00 pm: invoke the crawl rake task, passing 4 as its
# bracketed task argument.
every(:day, at: '12:00 pm') { rake('crawl:crawl_industries_locations_jobs[4]') }
\ No newline at end of file
require "open-uri"
@logger ||= Logger.new("#{Rails.root}/log/crawler.log")
namespace :crawl do
desc "crawl industries and locations"
task crawl_industries_locations: :environment do
desc "crawl industries locations jobs"
task :crawl_industries_locations_jobs, [:page, :link] => [:environment] do |task, args|
args.with_defaults(link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawl_industries_and_locations
job_links = get_job_links(args[:page].to_i, args[:link])
job_links.each do |link|
next if link.empty?
crawl_job(link)
end
end
desc "crawl jobs"
task crawl_jobs: :environment do
job_links = get_job_links(1)
crawl_jobs(job_links)
end
desc "crawl all"
task all: [:crawl_industries_locations, :crawl_jobs]
end
def get_job_links(page)
def get_job_links(page, link)
job_links = []
page.times do |i|
document = Nokogiri::HTML(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i}-vi.html"))
page.times do
document = Nokogiri::HTML(open(link))
jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
jobs_xml.each { |item| job_links << item.value}
next_page = document.at_css('.next-page a')
break if next_page.nil?
link = next_page[:href]
end
job_links
end
......@@ -31,25 +32,20 @@ def crawl_company(company_link)
uri = URI.parse(URI.escape(company_link)) #fix error: uri must be ascii only
document = Nokogiri::HTML(open(uri))
company_name = document.css(".content .name").text
return nil if company_name.empty?
return if company_name.empty?
puts company_name
company_address = document.css(".content p")[1].text
company_description = document.css(".main-about-us").css('.content').text
Company.find_or_create_by(name: company_name) do |c|
c.address = company_address
c.description = company_description
Company.find_or_create_by(name: company_name) do |company|
company.address = company_address
company.description = company_description
end
rescue => exception
puts exception
return nil
end
end
def crawl_jobs(job_links)
job_links.each do |link|
crawl_job(link)
@logger.error "#{exception.message} - Company link: #{uri}"
return
end
end
......@@ -64,41 +60,24 @@ def crawl_job(job_link)
job_company = crawl_company(job_company_link)
return if job_company.nil?
## parse data
job_detail = document.css('.job-detail-content div.detail-box')
job_location_name = job_detail[0].css('p a').map{ |val| val.text.strip }
job_location_name = document.css('.map p a').map{ |val| val.text.strip }
job_locations = Location.where(city: job_location_name)
job_industry_names = job_detail[1].css('ul li')[1].css('p a').map{ |val| val.text.strip }
job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map{ |val| val.text.strip }
job_industries = Industry.where(name: job_industry_names)
job_created_at = job_detail[1].css('ul li')[0].css('p').text.strip
job_salary = ""
job_level = ""
job_experience = ""
job_expiration_date = ""
info_detail = job_detail[2].css('ul li')
info_detail.count.times do |i|
data = info_detail[i].css('p').text.strip
job_salary = data if info_detail[i].css('.fa.fa-usd').present?
job_experience = data if info_detail[i].css('.fa.fa-briefcase').present?
job_level = data if info_detail[i].css('.mdi.mdi-account').present?
job_expiration_date = data if info_detail[i].css('.mdi.mdi-calendar-check').present?
end
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
job_expiration_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_description = document.css('.job-detail-content .detail-row').to_s
##
Job.find_or_create_by(title: job_title, company_id: job_company.id) do |job|
job.salary = job_salary
job.experience = job_experience
job.level = job_level
job.expiration_date = job_expiration_date
job.created_at = job_created_at
job.description = job_description
job.industries << job_industries
job.locations << job_locations
......@@ -106,6 +85,7 @@ def crawl_job(job_link)
puts job_title
rescue => exception
puts exception
@logger.error "#{exception.message} - Job link: #{uri}"
return exception
end
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment