Commit 9707d2d0 by Thanh Hung Pham

Resolve thread

parent d6fa267c
require 'nokogiri' require 'thread'
require 'open-uri' require 'open-uri'
require 'rake' require 'nokogiri'
require 'logger'
namespace :crawler_data do class Careerbuilder
task load_page: :environment do attr_reader :domain, :thread_count, :logger
# Builds a crawler instance.
#
# @param domain [String] base URL of the site; stored and exposed through the +domain+ reader
# @param thread_count [Integer, #to_i] number of worker threads #crawl starts;
#   values below 1 (including nil) are raised to 1
def initialize(domain, thread_count = 1)
  @domain = domain
  # A count of 0 (for example a non-numeric THREAD_COUNT passed through #to_i) would make
  # #crawl start no workers at all and still log a finished crawl, so keep at least one.
  @thread_count = [thread_count.to_i, 1].max
  # Handed to the import methods, which wrap their work in @mutex.synchronize.
  @mutex = Mutex.new
  @logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log")
end
# Runs the crawl on +thread_count+ worker threads and waits for all of them.
#
# Each worker fetches the home page, imports areas and categories from it, then
# passes the href of the "Việc làm mới nhất" menu link to #inport_job.
# Errors other than ThreadError still propagate out of the worker and are
# re-raised by Thread#join.
def crawl
  @logger.info('Start crawl')
  workers = thread_count.times.map do
    Thread.new do
      begin
        doc = Nokogiri::HTML(open('http://careerbuilder.vn'))
        import_area
        import_category(doc)
        new_jobs_url = doc.xpath("//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href")
        inport_job(new_jobs_url.to_s)
      rescue ThreadError => e
        # This used to be an empty rescue; log it so a failed worker leaves a trace.
        @logger.error(e.message)
        @logger.error(e.backtrace)
      end
    end
  end
  # each, not map: the joined threads are not used afterwards.
  workers.each(&:join)
  @logger.info('Crawl finished')
end
# Makes sure the two areas 'Viet Nam' and 'International' exist.
#
# The check-and-insert runs inside @mutex.synchronize, so only one worker
# thread performs it at a time. Any StandardError is logged instead of raised.
def import_area
  @mutex.synchronize do
    ['Viet Nam', 'International'].each do |name|
      # exists? asks for a single row rather than loading every match to test blank?
      Area.create(name: name) unless Area.exists?(name: name)
    end
  end
rescue StandardError => e
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a Category for every industry <option> on the home page that is not stored yet.
#
# @param doc [#xpath] parsed home page (a Nokogiri document in #crawl)
#
# The first and last <option> of the select are skipped. The work runs inside
# @mutex.synchronize; any StandardError is logged instead of raised.
def import_category(doc)
  @mutex.synchronize do
    categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
    # Keep indices 1..size-2, i.e. drop the first and the last option.
    categories = categories.slice(1..categories.size - 2)
    categories.each do |category|
      name = category.text.strip
      Category.create(name: name) unless Category.exists?(name: name)
    end
  end
rescue StandardError => e
  # The label must be a literal: interpolating #{import_category} called this method
  # again without its argument and raised ArgumentError from inside the handler.
  logger.error('[method] import_category')
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a City for each location <option> found in +doc+ (the first option is
# dropped) when no City with that stripped name exists yet; each new City is
# attached to Area.find(area_id).
#
# NOTE(review): this text comes from a side-by-side diff view, so unchanged lines
# appear twice per line, and the "...@@" hunk marker below hides at least one line
# of the loop body. How area_id changes from its initial 1 is not visible here.
#
# The work runs inside @mutex.synchronize; any StandardError is logged, not raised.
def import_city(doc) def import_city(doc)
@mutex.synchronize do
cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1) cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
area_id = 1 area_id = 1
cities.each do |city| cities.each do |city|
...@@ -34,9 +67,15 @@ namespace :crawler_data do ...@@ -34,9 +67,15 @@ namespace :crawler_data do
City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank? City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank?
end end
end end
rescue StandardError => e
# FIXME(review): the interpolation below calls import_city again with no argument,
# which raises ArgumentError inside this handler and hides the original error.
# A literal method name was presumably intended.
logger.error("[method: ] #{import_city}")
logger.error(e.message)
logger.error(e.backtrace)
end
# Walks up to 10 result pages starting at +url+: each pass fetches and parses the
# page, then replaces +url+ with the href of the pagination link a.right.
#
# NOTE(review): this text comes from a side-by-side diff view, so unchanged lines
# appear twice per line, and the "...@@" hunk marker below hides the part of the
# loop body that processes each page.
# NOTE(review): the name is presumably a typo for "import_job"; #crawl calls it
# under this spelling, so a rename has to touch both.
#
# Any StandardError is logged together with the URL being processed, not raised.
def inport_job(url) def inport_job(url)
10.times do 10.times do
# The page fetch below happens while @mutex is held, so workers fetch one at a time.
@mutex.synchronize do
doc_new_jobs = Nokogiri::HTML(open(url)) doc_new_jobs = Nokogiri::HTML(open(url))
doc_new_jobs.encoding = 'utf-8' doc_new_jobs.encoding = 'utf-8'
...@@ -80,4 +119,9 @@ namespace :crawler_data do ...@@ -80,4 +119,9 @@ namespace :crawler_data do
# Advance to the next page; an empty string results when no a.right link matches.
url = doc_new_jobs.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s url = doc_new_jobs.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
end end
end end
rescue StandardError => e
logger.error("[URL] #{url}")
logger.error(e.message)
logger.error(e.backtrace)
end
end end
namespace :crawler do
  # Usage: rake crawler:load THREAD_COUNT=4
  # THREAD_COUNT is optional; when unset, blank, non-numeric or below 1, one worker is used.
  desc 'client crawler'
  task load: :environment do
    # Loaded here rather than at file level because the path depends on Rails.root.
    require "#{Rails.root}/lib/tasks/careerbuilder"
    # String#to_i returns 0 for blank or non-numeric input, which would start no
    # workers at all, so never go below one.
    thread_count = [ENV.fetch('THREAD_COUNT', 1).to_i, 1].max
    Careerbuilder.new('http://careerbuilder.vn', thread_count).crawl
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment