Commit b03a88fd by Thanh Hung Pham

Merge remote-tracking branch 'refs/remotes/origin/master'

Conflicts:
	lib/tasks/import_csv.rake
parents 5fc4c779 01626a94
......@@ -14,5 +14,7 @@ module VeNJOB
# Settings in config/environments/* take precedence over those specified here.
# Application configuration should go into files in config/initializers
# -- all .rb files in that directory are automatically loaded.
config.autoload_paths += %W[#{config.root}/lib]
config.eager_load_paths += %W[#{config.root}/lib]
end
end
......@@ -5,3 +5,5 @@
#
# movies = Movie.create([{ name: 'Star Wars' }, { name: 'Lord of the Rings' }])
# Character.create(name: 'Luke', movie: movies.first)
# Seed the two areas that cities are grouped under.
# find_or_create_by keeps the seed idempotent: re-running it does not insert
# a second 'Viet Nam' / 'International' row the way Area.new(...).save would.
Area.find_or_create_by(name: 'Viet Nam')
Area.find_or_create_by(name: 'International')
......@@ -3,7 +3,7 @@ require 'open-uri'
require 'nokogiri'
require 'logger'
class Careerbuilder
class Crawler::Careerbuilder
attr_reader :domain, :thread_count, :logger
def initialize(domain, thread_count = 1)
......@@ -18,8 +18,7 @@ class Careerbuilder
def crawl
@logger.info('Start crawl')
doc = Nokogiri::HTML(open('http://careerbuilder.vn'))
import_area
doc = Nokogiri::HTML(open(@domain))
import_category(doc)
import_city(doc)
......@@ -83,12 +82,15 @@ class Careerbuilder
def detail(doc, _link)
# Company Information
company = Company.new
company_name = doc.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
company = Company.find_or_create_by(name: company_name)
company.name = company_name
company.address = doc.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company.description = doc.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
company.save if Company.where(name: company_name).blank?
company.address = company_address
company.description = company_description
company.save
# Job Information
job_name = doc.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
......@@ -109,38 +111,26 @@ class Careerbuilder
city = City.find_by_name(job_location)
job = Job.new
job.name = job_name
job = Job.find_or_create_by(name: job_name, city: city, company: company)
job.description = job_description
job.salary = job_salary
job.city = city
job.company = company
job.level = job_level
job.experience = job_experience
job.status = 0
job.expiry_date = job_expiry_date.to_datetime
job.save if Job.where(name: job_name, city: city, company: company).blank?
job.save
job_category.split(',').each do |category|
category = Category.find_by_name(category)
JobCategory.new(job: job, category: category).save if JobCategory.where(job: job, category: category).blank?
end
JobCategory.find_or_create_by(job: job, category: category)
end
# Ensures the 'Viet Nam' and 'International' areas exist.
#
# Any StandardError is logged (label, message, backtrace) and swallowed, so a
# failure here does not abort the caller.
def import_area
  Area.find_or_create_by(name: 'Viet Nam')
  Area.find_or_create_by(name: 'International')
rescue StandardError => e
  # The label is a plain string: interpolating a method call here would invoke
  # that method from inside the error handler and mask the original error.
  logger.error('[method: import_area]')
  logger.error(e.message)
  logger.error(e.backtrace)
end
def import_category(doc)
categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
categories = categories.slice(1..categories.size - 2)
categories = categories.drop(1)
categories.each do |category|
Category.new(name: category.text.strip).save if Category.where(name: category.text.strip).blank?
Category.find_or_create_by(name: category.text.strip)
end
rescue StandardError => e
logger.error("[method: ] #{import_category}")
......@@ -150,10 +140,10 @@ class Careerbuilder
def import_city(doc)
cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
area_id = 1
area = Area.find_by_name('Viet Nam')
cities.each do |city|
area_id = 2 if city.text == 'Angola'
City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank?
area = Area.find_by_name('International') if city.text == 'Angola'
City.find_or_create_by(name: city.text.strip, area: area)
end
rescue StandardError => e
logger.error("[method: ] #{import_city}")
......
require 'thread'
require 'logger'
require 'csv'
# Imports job postings from a CSV file (first line is the header row) into the
# JobType, Contact, Company, Category, City, Job and JobCategory models.
#
# Columns read: 'type', 'contact name', 'contact email', 'contact phone',
# 'company address', 'company district', 'company name', 'company province',
# 'category', 'work place', 'benefit', 'description', 'level', 'name',
# 'requirement', 'salary'.
class Import::CSVReader
  attr_reader :logger

  # file - path of the CSV file to import.
  def initialize(file)
    @file = file
    @logger = Logger.new("#{Rails.root}/log/csv_reader.log")
  end

  # Reads the whole file and imports it row by row. A row that raises a
  # StandardError is logged and skipped; the remaining rows are still imported.
  def import
    @logger.info('Start read data')
    puts '=======Start read data======='
    csv_text = File.read(@file)
    # `headers: true` (a boolean, not the Symbol :true) treats line 1 as headers.
    CSV.parse(csv_text, headers: true).each { |row| import_row(row) }
    puts '=======End read data======='
    @logger.info('End read data')
  end

  # Finds or creates the JobType named in the 'type' column and returns it.
  def import_job_type(row)
    JobType.find_or_create_by(name: stripped(row, 'type'))
  end

  # Finds or creates the Contact by e-mail, refreshes its name and phone from
  # the row, and returns it.
  def import_contact(row)
    contact = Contact.find_or_create_by(email: stripped(row, 'contact email'))
    contact.update(
      email: stripped(row, 'contact email'),
      name: stripped(row, 'contact name'),
      phone: stripped(row, 'contact phone')
    )
    contact
  end

  # Finds or creates the Company by name, refreshes its address fields from
  # the row, and returns it.
  def import_company(row)
    company_name = stripped(row, 'company name')
    company = Company.find_or_create_by(name: company_name)
    company.update(
      address: stripped(row, 'company address'),
      district: stripped(row, 'company district'),
      name: company_name,
      province: stripped(row, 'company province')
    )
    company
  end

  # Finds or creates the Category named in the 'category' column and returns it.
  def import_category(row)
    category_name = stripped(row, 'category')
    category = Category.find_or_create_by(name: category_name)
    category.update(name: category_name)
    category
  end

  # Finds or creates the City named in the 'work place' column, attaches it to
  # the 'Viet Nam' area, and returns it.
  #
  # Raises ArgumentError when the column is empty, so the row is skipped with a
  # readable log entry instead of a NoMethodError on nil.
  def import_city(row)
    raw_name = stripped(row, 'work place')
    raise ArgumentError, "row has no 'work place' value" if raw_name.nil?

    city_name = raw_name.delete('"[]') # '["text"]' -> 'text'
    city = City.find_or_create_by(name: city_name)
    city.update(name: city_name, area: default_area)
    city
  end

  # Finds or creates the Job identified by (name, city, company), refreshes
  # the remaining attributes from the row, and returns it.
  def import_job(row, city, job_type, contact, company)
    job_name = stripped(row, 'name')
    job = Job.find_or_create_by(name: job_name, city: city, company: company)
    job.update(
      benefit: stripped(row, 'benefit'),
      description: stripped(row, 'description'),
      level: stripped(row, 'level'),
      name: job_name,
      requirement: stripped(row, 'requirement'),
      salary: stripped(row, 'salary'),
      city: city,
      job_type: job_type,
      contact: contact,
      company: company
    )
    job
  end

  # Links the job to the category (no duplicate link is created).
  # Returns the result of saving the link record.
  def import_job_category(job, category)
    JobCategory.find_or_create_by(job: job, category: category).save
  end

  private

  # Imports every record described by one CSV row; logs and swallows errors so
  # that a bad row does not stop the import.
  def import_row(row)
    job_type = import_job_type(row)
    contact = import_contact(row)
    company = import_company(row)
    category = import_category(row)
    city = import_city(row)
    job = import_job(row, city, job_type, contact, company)
    import_job_category(job, category)
  rescue StandardError => e
    logger.error(e.message)
    logger.error(e.backtrace)
  end

  # Returns the stripped value of +column+ in +row+, or nil when it is absent.
  def stripped(row, column)
    value = row[column]
    value.strip unless value.nil?
  end

  # The area every imported city is attached to; looked up once per reader
  # instead of once per row.
  def default_area
    @default_area ||= Area.find_by_name('Viet Nam')
  end
end
namespace :crawler do
  desc 'client crawler'
  # Crawls http://careerbuilder.vn with Crawler::Careerbuilder.
  # THREAD_COUNT (environment variable, default 1) is passed as the crawler's
  # thread count.
  task load: :environment do
    # NOTE(review): confirm this path still matches where Crawler::Careerbuilder
    # is defined now that the class is namespaced.
    require "#{Rails.root}/lib/tasks/careerbuilder"
    thread_count = ENV['THREAD_COUNT'] || 1
    # Only the namespaced class is invoked; the former top-level `Careerbuilder`
    # constant was renamed, and calling both would run the crawl twice.
    Crawler::Careerbuilder.new('http://careerbuilder.vn', thread_count.to_i).crawl
  end
end
require 'net/ftp'
require 'rubygems'
require 'zip'
namespace :import do
desc 'Import CSV'
task csv: :environment do
require "#{Rails.root}/lib/tasks/csv_reader"
thread_count = ENV['THREAD_COUNT'] || 1
ftp = Net::FTP.new
ftp.connect('192.168.1.156')
ftp.login('training', 'training')
ftp.passive = true
files = ftp.nlst('*.zip')
path_zip = "#{Rails.root}/lib/tasks/jobs.zip"
files.each do |file_name|
ftp.getbinaryfile(file_name, path_zip)
end
ftp.close
Utils::Download.new('192.168.1.156', 'training', 'training', path_zip, '*.zip').download_ftp
path_csv = "#{Rails.root}/lib/tasks/jobs.csv"
Zip::File.open(path_zip) do |zipfile|
......@@ -28,7 +14,7 @@ namespace :import do
end
end
CSVReader.new(path_csv, thread_count).import
Import::CSVReader.new(path_csv).import
File.delete(path_zip)
File.delete(path_csv)
......
require 'net/ftp'
require 'zip'
# Downloads files from an FTP server to a local path.
class Utils::Download
  attr_reader :url, :user, :password, :path, :file_type

  # url       - FTP host to connect to
  # user      - login name
  # password  - login password
  # path      - local file path each downloaded file is written to
  # file_type - pattern passed to the server's NLST command (e.g. '*.zip')
  def initialize(url, user, password, path, file_type)
    @url = url
    @user = user
    @password = password
    @path = path
    @file_type = file_type
  end

  # Fetches every remote file matching +file_type+ in passive mode.
  #
  # Every match is written to the same local +path+, so when several files
  # match, only the last one downloaded remains on disk.
  #
  # The connection is closed even when connecting, logging in, listing or
  # downloading raises; the error itself still propagates to the caller.
  # Returns nil.
  def download_ftp
    ftp = Net::FTP.new
    begin
      ftp.connect(@url)
      ftp.login(@user, @password)
      ftp.passive = true
      ftp.nlst(@file_type).each do |file_name|
        ftp.getbinaryfile(file_name, @path)
      end
      nil
    ensure
      ftp.close
    end
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment