Commit 09bd6db3 by Thanh Hung Pham

Resolve thread

parent 9707d2d0
......@@ -6,25 +6,39 @@ require 'logger'
class Careerbuilder
attr_reader :domain, :thread_count, :logger
def initialize(domain, thread_count=1)
# Sets up the crawl queue, bookkeeping and logging for one crawler instance.
#
# @param domain [Object] stored as-is and exposed through the `domain` reader
# @param thread_count [Integer] number of worker threads `crawl` will start
def initialize(domain, thread_count = 1)
  @domain = domain
  @thread_count = thread_count
  # The listing page is queued once per worker.
  # NOTE(review): presumably so every thread finds a link on its first
  # `next_link` call instead of exiting immediately - confirm.
  @links = Array.new(thread_count) do
    { url: 'http://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html', handler: :top }
  end
  # URLs that have already been handed out to a worker.
  @crawled = []
  # Guards @links and @crawled, which are shared by all worker threads.
  @mutex = Mutex.new
  @logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log")
end
def crawl
@logger.info('Start crawl')
workers = (0...thread_count).map do
Thread.new do
begin
doc = Nokogiri::HTML(open('http://careerbuilder.vn'))
import_area
import_category(doc)
import_city(doc)
new_jobs_url = doc.xpath("//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href")
inport_job(new_jobs_url.to_s)
workers = (0...thread_count).map do
Thread.new do
begin
while link = next_link
crawl_url = link[:url]
begin
doc_new_jobs = Nokogiri::HTML(open(crawl_url))
doc_new_jobs.encoding = 'utf-8'
send(link[:handler], doc_new_jobs, crawl_url)
rescue StandardError => e
logger.error("[URL: ] #{crawl_url}")
logger.error(e.message)
logger.error(e.backtrace)
end
end
puts '=======Thread End======='
rescue ThreadError
end
end
......@@ -34,77 +48,62 @@ class Careerbuilder
@logger.info('Crawl finished')
end
def import_area
# Takes the next queued link and records its URL as crawled.
#
# The queue and the crawled list are only touched while holding @mutex,
# because several worker threads call this method.
#
# @return [Hash, nil] the `{url:, handler:}` entry taken from the front of
#   the queue; nil when the queue is empty or when more than 500 URLs have
#   already been handed out
def next_link
  @mutex.synchronize do
    # Hard cap on the number of pages handed out per crawl.
    next if @crawled.count > 500

    link = @links.shift
    @crawled.push(link[:url]) if link
    link
  end
end
def import_category(doc)
# Appends a link to the END of the crawl queue unless it was already crawled.
#
# @param link [#to_s] URL to queue; coerced to a String so the comparison
#   against the crawled list (which holds the `:url` values handed out by
#   `next_link`) also works when the caller passes a parsed attribute node
# @param handler [Symbol] name of the method that will process the page
# @return [void]
def push_link(link, handler)
  url = link.to_s.strip
  # An empty href (for example, no "next page" anchor) is not a crawlable URL.
  return if url.empty?

  @mutex.synchronize do
    @links.push({ url: url, handler: handler }) unless @crawled.include?(url)
  end
end
def import_city(doc)
# Inserts a link at the FRONT of the crawl queue unless it was already
# crawled, so it is picked up before links added with `push_link`.
#
# @param link [#to_s] URL to queue; coerced to a String so the comparison
#   against the crawled list (which holds the `:url` values handed out by
#   `next_link`) also works when the caller passes a parsed attribute node
# @param handler [Symbol] name of the method that will process the page
# @return [void]
def shift_link(link, handler)
  url = link.to_s.strip
  # An empty href is not a crawlable URL.
  return if url.empty?

  @mutex.synchronize do
    @links.unshift({ url: url, handler: handler }) unless @crawled.include?(url)
  end
end
# Handler for a job-listing page: queues every job detail link found on the
# page, then queues the next listing page if there is one.
#
# @param doc [#xpath] parsed listing page (the crawl loop passes a Nokogiri
#   document)
# @param _link [String] URL of the page; unused
# @return [void]
def top(doc, _link)
  job_hrefs = doc.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href")
  # Hand over plain strings rather than attribute nodes, so the queue holds
  # URLs that can be compared and opened directly.
  job_hrefs.each { |href| shift_link(href.to_s, :detail) }

  next_page = doc.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
  # `to_s` gives "" when the anchor is missing, and "" is truthy in Ruby, so
  # test for emptiness explicitly to avoid queuing a blank URL on the last page.
  push_link(next_page, :top) unless next_page.strip.empty?
end
def detail(doc, _link)
# Company Information
company_name = doc_job_details.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc_job_details.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc_job_details.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
company_name = doc.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank?
# Job Information
job_name = doc_job_details.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
job_description = doc_job_details.xpath("//div[@class='MarBot20']").text.strip # Job description
job_name = doc.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
job_description = doc.xpath("//div[@class='MarBot20']").text.strip # Job description
job_location = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip
job_location = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip
job_level = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip
job_level = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip
job_experience = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()")
job_experience = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()")
job_salary = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " +
doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
job_salary = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " +
doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
job_category = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip
job_category = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip
job_expiry_date = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s
job_expiry_date = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s
Job.new(name: job_name, description: job_description,
salary: job_salary,
......@@ -116,11 +115,37 @@ class Careerbuilder
JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(category)).save
end
end
url = doc_new_jobs.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
# Ensures the two Area records used by the crawler ('Viet Nam' and
# 'International') exist, creating whichever is missing.
#
# Any StandardError is logged and swallowed, so a failure here does not
# abort the caller.
#
# @return [void]
def import_area
  ['Viet Nam', 'International'].each do |name|
    Area.find_or_create_by(name: name)
  end
rescue StandardError => e
  # Log the method name as a literal: interpolating a method call here would
  # invoke that method from inside the error handler.
  logger.error('[method: ] import_area')
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a Category for every industry option in the search form, skipping
# the first and the last <option> of the select.
#
# Any StandardError is logged and swallowed, so a failure here does not
# abort the caller.
#
# @param doc [#xpath] parsed page containing the industry select box
# @return [void]
def import_category(doc)
  options = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option").to_a
  # Drop the first and last entries; this form stays an (empty) Array when
  # the select has fewer than three options instead of yielding nil.
  options.drop(1)[0...-1].each do |option|
    Category.find_or_create_by(name: option.text.strip)
  end
rescue StandardError => e
  # Log the method name as a literal: interpolating `import_category` here
  # would call it without its argument and raise from inside the handler.
  logger.error('[method: ] import_category')
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a City for every location option in the search form (the first
# <option> is skipped) and attaches each new city to an Area.
#
# Options before 'Angola' are attached to Area id 1; 'Angola' and every
# option after it are attached to Area id 2.
# NOTE(review): assumes ids 1 and 2 are the 'Viet Nam' and 'International'
# areas and that 'Angola' is the first international entry - confirm.
#
# Any StandardError is logged and swallowed, so a failure here does not
# abort the caller.
#
# @param doc [#xpath] parsed page containing the location select box
# @return [void]
def import_city(doc)
  options = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
  area_id = 1
  options.each do |option|
    name = option.text.strip
    area_id = 2 if name == 'Angola'
    # The block only runs when the record has to be created, so the Area
    # lookup is skipped for cities that already exist.
    City.find_or_create_by(name: name) { |city| city.area = Area.find(area_id) }
  end
rescue StandardError => e
  # Log the method name as a literal: interpolating `import_city` here would
  # call it without its argument and raise from inside the handler.
  logger.error('[method: ] import_city')
  logger.error(e.message)
  logger.error(e.backtrace)
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment