Commit 95480c2e by Thanh Hung Pham

Merge remote-tracking branch 'origin/master'

parents 42dc461e 454004db
...@@ -14,5 +14,7 @@ module VeNJOB ...@@ -14,5 +14,7 @@ module VeNJOB
# Settings in config/environments/* take precedence over those specified here. # Settings in config/environments/* take precedence over those specified here.
# Application configuration should go into files in config/initializers # Application configuration should go into files in config/initializers
# -- all .rb files in that directory are automatically loaded. # -- all .rb files in that directory are automatically loaded.
config.autoload_paths += %W[#{config.root}/lib]
config.eager_load_paths += %W[#{config.root}/lib]
end end
end end
...@@ -24,5 +24,4 @@ set :output, { error: 'log/cron_error_log.log', standard: 'log/cron_log.log' } ...@@ -24,5 +24,4 @@ set :output, { error: 'log/cron_error_log.log', standard: 'log/cron_log.log' }
every 1.day, at: '12:00 pm' do every 1.day, at: '12:00 pm' do
rake 'crawler:load' rake 'crawler:load'
rake 'import:csv'
end end
...@@ -5,7 +5,7 @@ class CreateApplies < ActiveRecord::Migration[5.1] ...@@ -5,7 +5,7 @@ class CreateApplies < ActiveRecord::Migration[5.1]
t.references :job, index: true t.references :job, index: true
t.datetime :applied_at t.datetime :applied_at
t.string :ip_address t.string :ip_address
t.string :user_agrent t.string :user_agent
t.timestamps t.timestamps
end end
......
...@@ -17,7 +17,7 @@ ActiveRecord::Schema.define(version: 20170628020034) do ...@@ -17,7 +17,7 @@ ActiveRecord::Schema.define(version: 20170628020034) do
t.bigint "job_id" t.bigint "job_id"
t.datetime "applied_at" t.datetime "applied_at"
t.string "ip_address" t.string "ip_address"
t.string "user_agrent" t.string "user_agent"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.index ["job_id"], name: "index_applies_on_job_id" t.index ["job_id"], name: "index_applies_on_job_id"
......
...@@ -5,3 +5,5 @@ ...@@ -5,3 +5,5 @@
# #
# movies = Movie.create([{ name: 'Star Wars' }, { name: 'Lord of the Rings' }]) # movies = Movie.create([{ name: 'Star Wars' }, { name: 'Lord of the Rings' }])
# Character.create(name: 'Luke', movie: movies.first) # Character.create(name: 'Luke', movie: movies.first)
Area.new(name: 'Viet Nam').save
Area.new(name: 'International').save
...@@ -3,7 +3,7 @@ require 'open-uri' ...@@ -3,7 +3,7 @@ require 'open-uri'
require 'nokogiri' require 'nokogiri'
require 'logger' require 'logger'
class Careerbuilder class Crawler::Careerbuilder
attr_reader :domain, :thread_count, :logger attr_reader :domain, :thread_count, :logger
def initialize(domain, thread_count = 1) def initialize(domain, thread_count = 1)
...@@ -18,8 +18,7 @@ class Careerbuilder ...@@ -18,8 +18,7 @@ class Careerbuilder
def crawl def crawl
@logger.info('Start crawl') @logger.info('Start crawl')
doc = Nokogiri::HTML(open('http://careerbuilder.vn')) doc = Nokogiri::HTML(open(@domain))
import_area
import_category(doc) import_category(doc)
import_city(doc) import_city(doc)
...@@ -86,7 +85,12 @@ class Careerbuilder ...@@ -86,7 +85,12 @@ class Careerbuilder
company_name = doc.xpath("//div[@class='tit_company']").text.strip # Company name company_name = doc.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address company_address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description company_description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank?
company = Company.find_or_create_by(name: company_name)
company.name = company_name
company.address = company_address
company.description = company_description
company.save
# Job Information # Job Information
job_name = doc.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name job_name = doc.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
...@@ -105,31 +109,28 @@ class Careerbuilder ...@@ -105,31 +109,28 @@ class Careerbuilder
job_expiry_date = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s job_expiry_date = doc.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").to_s
Job.new(name: job_name, description: job_description, city = City.find_by_name(job_location)
salary: job_salary,
city: City.find_by_name(job_location), job = Job.find_or_create_by(name: job_name, city: city, company: company)
level: job_level, experience: job_experience, status: 0, job.description = job_description
expiry_date: job_expiry_date.to_datetime).save job.salary = job_salary
job.level = job_level
job.experience = job_experience
job.status = 0
job.expiry_date = job_expiry_date.to_datetime
job.save
job_category.split(',').each do |category| job_category.split(',').each do |category|
JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(category)).save category = Category.find_by_name(category)
JobCategory.find_or_create_by(job: job, category: category)
end end
end end
def import_area
Area.new(name: 'Viet Nam').save if Area.where(name: 'Viet Nam').blank?
Area.new(name: 'International').save if Area.where(name: 'International').blank?
rescue StandardError => e
logger.error("[method: ] #{import_category}")
logger.error(e.message)
logger.error(e.backtrace)
end
def import_category(doc) def import_category(doc)
categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option") categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
categories = categories.slice(1..categories.size - 2) categories = categories.drop(1)
categories.each do |category| categories.each do |category|
Category.new(name: category.text.strip).save if Category.where(name: category.text.strip).blank? Category.find_or_create_by(name: category.text.strip)
end end
rescue StandardError => e rescue StandardError => e
logger.error("[method: ] #{import_category}") logger.error("[method: ] #{import_category}")
...@@ -139,10 +140,10 @@ class Careerbuilder ...@@ -139,10 +140,10 @@ class Careerbuilder
def import_city(doc) def import_city(doc)
cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1) cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
area_id = 1 area = Area.find_by_name('Viet Nam')
cities.each do |city| cities.each do |city|
area_id = 2 if city.text == 'Angola' area = Area.find_by_name('International') if city.text == 'Angola'
City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank? City.find_or_create_by(name: city.text.strip, area: area)
end end
rescue StandardError => e rescue StandardError => e
logger.error("[method: ] #{import_city}") logger.error("[method: ] #{import_city}")
......
namespace :crawler do namespace :crawler do
desc 'client crawler' desc 'client crawler'
task load: :environment do task load: :environment do
require "#{Rails.root}/lib/tasks/careerbuilder"
thread_count = ENV['THREAD_COUNT'] || 1 thread_count = ENV['THREAD_COUNT'] || 1
Careerbuilder.new('http://careerbuilder.vn', thread_count.to_i).crawl Crawler::Careerbuilder.new('http://careerbuilder.vn', thread_count.to_i).crawl
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment