Commit 09bd6db3 by Thanh Hung Pham

Resolve thread

parent 9707d2d0
...@@ -6,25 +6,39 @@ require 'logger' ...@@ -6,25 +6,39 @@ require 'logger'
class Careerbuilder class Careerbuilder
attr_reader :domain, :thread_count, :logger attr_reader :domain, :thread_count, :logger
def initialize(domain, thread_count=1) def initialize(domain, thread_count = 1)
@links = [url: 'http://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html', handler: :top] * thread_count
@crawled = []
@mutex = Mutex.new
@domain = domain @domain = domain
@thread_count = thread_count @thread_count = thread_count
@mutex = Mutex.new
@logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log") @logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log")
end end
def crawl def crawl
@logger.info('Start crawl') @logger.info('Start crawl')
doc = Nokogiri::HTML(open('http://careerbuilder.vn'))
import_area
import_category(doc)
import_city(doc)
workers = (0...thread_count).map do workers = (0...thread_count).map do
Thread.new do Thread.new do
begin begin
doc = Nokogiri::HTML(open('http://careerbuilder.vn')) while link = next_link
import_area crawl_url = link[:url]
import_category(doc) begin
import_city(doc) doc_new_jobs = Nokogiri::HTML(open(crawl_url))
doc_new_jobs.encoding = 'utf-8'
new_jobs_url = doc.xpath("//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href") send(link[:handler], doc_new_jobs, crawl_url)
inport_job(new_jobs_url.to_s) rescue StandardError => e
logger.error("[URL: ] #{crawl_url}")
logger.error(e.message)
logger.error(e.backtrace)
end
end
puts '=======Thread End======='
rescue ThreadError rescue ThreadError
end end
end end
...@@ -34,94 +48,105 @@ class Careerbuilder ...@@ -34,94 +48,105 @@ class Careerbuilder
@logger.info('Crawl finished') @logger.info('Crawl finished')
end end
def import_area def next_link
link = nil
@mutex.synchronize do @mutex.synchronize do
Area.new(name: 'Viet Nam').save if Area.where(name: 'Viet Nam').blank? return if @crawled.count > 500
Area.new(name: 'International').save if Area.where(name: 'International').blank? link = @links.shift
@crawled.push(link[:url]) if link
end end
rescue StandardError => e link
logger.error(e.message)
logger.error(e.backtrace)
end end
def import_category(doc) def push_link(link, handler)
@mutex.synchronize do @mutex.synchronize do
categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option") @links.push({url: link, handler: handler}) unless @crawled.include?(link)
categories = categories.slice(1..categories.size - 2)
categories.each do |category|
Category.new(name: category.text.strip).save if Category.where(name: category.text.strip).blank?
end
end end
rescue StandardError => e
logger.error("[method: ] #{import_category}")
logger.error(e.message)
logger.error(e.backtrace)
end end
def import_city(doc) def shift_link(link, handler)
@mutex.synchronize do @mutex.synchronize do
cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1) @links.unshift({url: link, handler: handler}) unless @crawled.include?(link)
area_id = 1
cities.each do |city|
area_id = 2 if city.text == 'Angola'
City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank?
end
end end
rescue StandardError => e
logger.error("[method: ] #{import_city}")
logger.error(e.message)
logger.error(e.backtrace)
end end
def inport_job(url) def top(doc, _link)
10.times do doc.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href").each do |link|
@mutex.synchronize do shift_link(link, :detail)
doc_new_jobs = Nokogiri::HTML(open(url)) end
doc_new_jobs.encoding = 'utf-8'
doc_new_jobs.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href").each do |link| next_page = doc.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
encoded_url = URI.encode(link.to_s) push_link(next_page, :top) if next_page
doc_job_details = Nokogiri::HTML(open(encoded_url))
# Company Information end
company_name = doc_job_details.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc_job_details.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc_job_details.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank?
# Job Information def detail(doc, _link)
job_name = doc_job_details.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name # Company Information
job_description = doc_job_details.xpath("//div[@class='MarBot20']").text.strip # Job description company_name = doc.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank?
job_location = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip # Job Information
job_name = doc.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
job_description = doc.xpath("//div[@class='MarBot20']").text.strip # Job description
job_level = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip job_location = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip
job_experience = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()") job_level = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip
job_salary = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " + job_experience = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()")
doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
job_category = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip job_salary = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " +
doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
job_expiry_date = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s job_category = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip
Job.new(name: job_name, description: job_description, job_expiry_date = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s
salary: job_salary,
city: City.find_by_name(job_location),
level: job_level, experience: job_experience, status: 0,
expiry_date: job_expiry_date.to_datetime).save
job_category.split(',').each do |category| Job.new(name: job_name, description: job_description,
JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(category)).save salary: job_salary,
end city: City.find_by_name(job_location),
end level: job_level, experience: job_experience, status: 0,
url = doc_new_jobs.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s expiry_date: job_expiry_date.to_datetime).save
end
job_category.split(',').each do |category|
JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(category)).save
end
end
# Seeds the Area table with the 'Viet Nam' and 'International' rows, creating
# each one only when no row with that name exists yet.
#
# Any StandardError raised while seeding is logged (method tag, message,
# backtrace) and swallowed, so it does not propagate to the caller.
def import_area
  ['Viet Nam', 'International'].each do |name|
    Area.new(name: name).save if Area.where(name: name).blank?
  end
rescue StandardError => e
  # Tag the log entry with this method's own name. The previous handler
  # interpolated a call to import_category with no argument, which raised
  # inside the handler and hid the original error.
  logger.error("[method: ] #{__method__}")
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a Category row for every industry <option> found in the search form
# of the given page, skipping names that already exist.
#
# The first and the last <option> are dropped before importing.
# NOTE(review): presumably these are placeholder / "all" entries; confirm
# against the live page.
#
# Any StandardError is logged (method tag, message, backtrace) and swallowed.
#
# @param doc [#xpath] parsed HTML document of the site's home page
def import_category(doc)
  categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
  categories = categories.slice(1..categories.size - 2)
  categories.each do |category|
    name = category.text.strip
    Category.new(name: name).save if Category.where(name: name).blank?
  end
rescue StandardError => e
  # Tag the log entry with this method's own name. The previous handler
  # interpolated a call to import_category with no argument, which raised
  # ArgumentError inside the handler and hid the original error.
  logger.error("[method: ] #{__method__}")
  logger.error(e.message)
  logger.error(e.backtrace)
end
# Creates a City row for every location <option> found in the search form of
# the given page, skipping names that already exist. The first <option> is
# dropped before importing.
#
# Options are assigned to Area id 1 until the 'Angola' entry is reached; that
# entry and every one after it are assigned to Area id 2.
# NOTE(review): the hard-coded ids 1 and 2 presumably correspond to the
# 'Viet Nam' and 'International' rows created by import_area; confirm, since
# this only holds if those rows received exactly these ids.
#
# Any StandardError is logged (method tag, message, backtrace) and swallowed.
#
# @param doc [#xpath] parsed HTML document of the site's home page
def import_city(doc)
  cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
  area_id = 1
  cities.each do |city|
    name = city.text.strip
    # Compare the stripped text, the same form that is stored as the city
    # name, so surrounding whitespace in the markup cannot hide the boundary.
    area_id = 2 if name == 'Angola'
    City.new(name: name, area: Area.find(area_id)).save if City.where(name: name).blank?
  end
rescue StandardError => e
  # Tag the log entry with this method's own name. The previous handler
  # interpolated a call to import_city with no argument, which raised
  # ArgumentError inside the handler and hid the original error.
  logger.error("[method: ] #{__method__}")
  logger.error(e.message)
  logger.error(e.backtrace)
end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment