Commit 9707d2d0 by Thanh Hung Pham

Resolve thread

parent d6fa267c
require 'logger'
require 'nokogiri'
require 'open-uri'
require 'rake'
require 'thread'
# Crawls careerbuilder.vn and stores areas, categories, cities, companies and
# jobs through the application's ActiveRecord models (Area, Category, City,
# Company, Job, JobCategory).
#
# NOTE(review): every worker thread started by #crawl runs the complete crawl,
# and all database work happens while holding one shared mutex, so raising
# thread_count repeats the work rather than dividing it. Jobs are inserted
# without an existence check, so each extra worker (and each re-run) adds the
# same jobs again. Confirm whether that is intended.
#
# NOTE(review): Kernel#open on a URL and URI.encode are both unavailable on
# Ruby 3.0+; they are kept because the target Ruby version is not visible here.
class Careerbuilder
  # Number of listing pages followed by #inport_job before it stops.
  MAX_LISTING_PAGES = 10

  attr_reader :domain, :thread_count, :logger

  # @param domain [String] URL of the site's landing page
  # @param thread_count [Integer] number of worker threads started by #crawl
  def initialize(domain, thread_count=1)
    @domain = domain
    @thread_count = thread_count
    @mutex = Mutex.new
    @logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log")
  end

  # Starts thread_count workers, waits for all of them and logs start/finish.
  # Failures inside a worker are written to the log instead of being raised.
  def crawl
    logger.info('Start crawl')
    workers = (0...thread_count).map do
      Thread.new { crawl_site }
    end
    workers.each(&:join)
    logger.info('Crawl finished')
  end

  # Creates the 'Viet Nam' and 'International' areas when they do not exist.
  def import_area
    @mutex.synchronize do
      ['Viet Nam', 'International'].each do |name|
        Area.create(name: name) unless Area.where(name: name).exists?
      end
    end
  rescue StandardError => e
    log_failure('[method] import_area', e)
  end

  # Creates a Category for each industry <option> on the landing page.
  #
  # @param doc [Nokogiri::HTML::Document] parsed landing page
  def import_category(doc)
    @mutex.synchronize do
      options = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
      # The first and the last option are skipped (presumably non-category
      # entries of the select box -- TODO confirm against the live page).
      options = options.slice(1..options.size - 2)
      options.each do |option|
        name = option.text.strip
        Category.create(name: name) unless Category.where(name: name).exists?
      end
    end
  rescue StandardError => e
    log_failure('[method] import_category', e)
  end

  # Creates a City for each location <option> on the landing page. Cities
  # listed before 'Angola' are attached to the 'Viet Nam' area; 'Angola' and
  # everything after it are attached to 'International'.
  #
  # @param doc [Nokogiri::HTML::Document] parsed landing page
  def import_city(doc)
    @mutex.synchronize do
      options = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
      # Areas are looked up by name so the result does not depend on the
      # primary keys the database happened to assign.
      area = area_named('Viet Nam')
      options.each do |option|
        name = option.text.strip
        area = area_named('International') if name == 'Angola'
        City.create(name: name, area: area) unless City.where(name: name).exists?
      end
    end
  rescue StandardError => e
    log_failure('[method] import_city', e)
  end

  # Walks up to MAX_LISTING_PAGES listing pages starting at url, importing
  # every job linked from each page and following the "next page" link.
  #
  # @param url [String] URL of the first listing page
  def inport_job(url)
    MAX_LISTING_PAGES.times do
      # An empty pagination link means there is no further page to fetch.
      break if url.to_s.empty?

      @mutex.synchronize do
        listing = Nokogiri::HTML(open(url))
        listing.encoding = 'utf-8'
        listing.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href").each do |link|
          import_job_detail(link.to_s)
        end
        url = listing.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
      end
    end
  rescue StandardError => e
    log_failure("[URL] #{url}", e)
  end
  # Correctly spelled alias; the original name is kept for existing callers.
  alias_method :import_job, :inport_job

  private

  # Body of one worker thread: fetches the landing page and runs every import.
  def crawl_site
    doc = Nokogiri::HTML(open(domain))
    import_area
    import_category(doc)
    import_city(doc)
    new_jobs_url = doc.xpath("//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href")
    inport_job(new_jobs_url.to_s)
  rescue StandardError => e
    log_failure('[method] crawl', e)
  end

  # Imports the company and the job described by one job-detail page. A
  # failure is logged and skipped so one bad page does not end the crawl.
  # Runs inside the lock taken by #inport_job, so it must not lock again.
  def import_job_detail(job_url)
    page = Nokogiri::HTML(open(URI.encode(job_url)))
    import_company(page)
    job = create_job(page)
    return unless job.persisted?

    category_names = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip
    category_names.split(',').map(&:strip).each do |category_name|
      JobCategory.create(job: job, category: Category.find_by_name(category_name))
    end
  rescue StandardError => e
    log_failure("[JOB URL] #{job_url}", e)
  end

  # Creates the Company shown on a job-detail page unless the name exists.
  def import_company(page)
    name = page.xpath("//div[@class='tit_company']").text.strip
    return if Company.where(name: name).exists?

    address = page.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip
    description = page.xpath("//div[@class='desc_company content_fck']").text.strip
    Company.create(name: name, address: address, description: description)
  end

  # Builds and saves the Job described by a job-detail page; returns the record.
  def create_job(page)
    name = page.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip
    description = page.xpath("//div[@class='MarBot20']").text.strip
    location = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip
    level = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip
    experience = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()")
    salary = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " +
             page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
    expiry_date = page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s
    Job.create(name: name, description: description,
               salary: salary,
               city: City.find_by_name(location),
               level: level, experience: experience, status: 0,
               expiry_date: expiry_date.to_datetime)
  end

  # Returns the Area with the given name, raising when it has not been created.
  def area_named(name)
    Area.where(name: name).first ||
      raise(ArgumentError, "Area #{name.inspect} is missing; run import_area first")
  end

  # Writes a context line, the error message and the backtrace to the log.
  def log_failure(context, error)
    logger.error(context)
    logger.error(error.message)
    logger.error(Array(error.backtrace).join("\n"))
  end
end
namespace :crawler do
  desc 'client crawler'
  # Runs the Careerbuilder crawler. The number of worker threads is read from
  # the THREAD_COUNT environment variable and defaults to 1.
  task load: :environment do
    require "#{Rails.root}/lib/tasks/careerbuilder"
    raw_thread_count = ENV.fetch('THREAD_COUNT', '1')
    thread_count = raw_thread_count.to_i
    # String#to_i turns non-numeric input into 0, which would start no worker
    # at all and make the task look successful while crawling nothing.
    abort "THREAD_COUNT must be a positive integer (got #{raw_thread_count.inspect})" if thread_count < 1
    Careerbuilder.new('http://careerbuilder.vn', thread_count).crawl
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment