Commit c99884c7 by Tô Ngọc Ánh

Separate method definitions into classes

parent ee6a23cd
Pipeline #721 canceled with stages
in 0 seconds
require 'open-uri'

# Crawls careerbuilder.vn and persists industries, locations, companies and
# jobs through the ActiveRecord models Industry, Location, Company and Job.
# Per-record failures are logged (via the injected logger) instead of
# aborting the whole crawl.
class Crawler
  # logger: any Logger-compatible object; receives error lines per failed link.
  def initialize(logger)
    @logger = logger
  end

  # Entry point. Seeds industries/locations, then crawls up to +page+ listing
  # pages starting from +base_link+ and imports every job found.
  def crawl_data(page, base_link)
    crawl_industries_locations
    job_links = get_job_links(page, base_link)
    job_links.each do |link|
      next if link.empty?
      crawl_job(link)
    end
  end

  # Collects job detail URLs from up to +page+ listing pages, following the
  # ".next-page" pagination link. Stops early when there is no next page.
  # Returns an Array of href strings (may contain duplicates).
  def get_job_links(page, link)
    job_links = []
    page.times do
      document = Nokogiri::HTML(URI.open(link))
      jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
      jobs_xml.each { |item| job_links << item.value }
      next_page = document.at_css('.next-page a')
      break if next_page.nil?
      link = next_page[:href]
    end
    job_links
  end

  # Scrapes one company page and upserts it by name.
  # Returns the Company, or nil when the page has no company name.
  # NOTE(review): URI.escape was removed in Ruby 3.0 — confirm the runtime is
  # Ruby < 3, or switch to Addressable / URI::DEFAULT_PARSER.escape.
  def crawl_company(company_link)
    uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only
    document = Nokogiri::HTML(URI.open(uri))
    company_name = document.css('.content .name').text
    return if company_name.empty?
    company_address = document.css('.content p')[1].text
    company_description = document.css('.main-about-us').css('.content').text
    Company.find_or_create_by(name: company_name) do |company|
      company.address = company_address
      company.description = company_description
    end
  rescue StandardError => e
    @logger.error "#{e.message} - Company link: #{uri}"
  end

  # Scrapes one job detail page, upserts its company, then upserts the Job
  # together with its industries and locations. Skips pages without a title
  # or without a resolvable company.
  def crawl_job(job_link)
    uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only
    document = Nokogiri::HTML(URI.open(uri))
    job_title = document.at_css('.job-desc p.title').text
    return if job_title.empty?
    job_company_link = document.at_css('.job-desc a.job-company-name')[:href]
    job_company = crawl_company(job_company_link)
    return if job_company.nil?
    job_location_name = document.css('.map p a').map { |val| val.text.strip }
    job_locations = Location.where(city: job_location_name)
    job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map { |val| val.text.strip }
    job_industries = Industry.where(name: job_industry_names)
    # The remaining attributes are keyed off icon classes in the detail list;
    # try(...) tolerates pages where a field is absent.
    job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
    job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
    job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
    job_exp_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
    job_description = document.css('.job-detail-content .detail-row').to_s
    Job.find_or_create_by(title: job_title,
                          company_id: job_company.id,
                          level: job_level,
                          experience: job_experience,
                          salary: job_salary,
                          expiration_date: job_exp_date) do |job|
      job.description = job_description
      job.industries << job_industries
      job.locations << job_locations
    end
  rescue StandardError => e
    @logger.error "#{e.message} - Job link: #{uri}"
  end

  # Seeds the Industry and Location tables from the search form's dropdowns.
  # The first CITY_VIETNAM_NUMBER locations are domestic (oversea: false),
  # the rest are marked oversea: true.
  def crawl_industries_locations
    document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
    industries = document.css('#industry option').map(&:text)
    locations = document.css('#location option').map(&:text)
    industries.each do |val|
      Industry.find_or_create_by(name: val)
    end
    locations.take(Location::CITY_VIETNAM_NUMBER).each do |val|
      Location.find_or_create_by(city: val) do |location|
        location.oversea = false
      end
    end
    locations.last(locations.count - Location::CITY_VIETNAM_NUMBER).each do |val|
      Location.find_or_create_by(city: val) do |location|
        location.oversea = true
      end
    end
  end
end
require 'csv'
require 'zip'
require_relative '../common/ftp'
require './lib/common/extract_zip'

# Imports jobs from a CSV dump (lib/data/jobs.csv) into the database.
# Unzipping is provided by the ExtractZip mixin; row-level failures are
# logged with the row index instead of aborting the import.
class CsvImport
  include ExtractZip

  # logger: any Logger-compatible object; receives one error line per bad row.
  def initialize(logger)
    @logger = logger
  end

  # Reads "#{direction}/jobs.csv" (headers expected) and upserts a Company,
  # Industry, Locations and a Job per row. Rows whose 'category' column is
  # purely numeric are treated as malformed and skipped.
  def import_job(direction)
    index = 0
    CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
      index += 1
      next if integer?(row['category'])
      title = row['name'].strip
      company = Company.find_or_create_by(name: row['company name']) do |c|
        c.description = "Contact email: #{row['contact email']}\n"\
                        "Contact name: #{row['contact name']}\n"\
                        "Contact phone: #{row['contact phone']}"
        c.address = "#{row['company address']}, #{row['company province']}"
      end
      industry = Industry.find_or_create_by(name: row['category'].strip)
      level = row['level'].try(:strip)
      salary = row['salary'].try(:strip)
      # 'work place' arrives as a JSON-ish list, e.g. ["Hanoi","Da Nang"];
      # strip the quoting/brackets and split into city names.
      locations_name = row['work place'].tr('"[]', '').split(',')
      locations = Location.where(city: locations_name)
      # Unknown cities are created as domestic; the crawler marks oversea ones.
      locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?
      description = "Benefits:\n#{row['benefit']}\n"\
                    "Descriptions:\n#{row['description']}\n"\
                    "Requirements:\n#{row['requirement']}"
      Job.find_or_create_by(title: title, company_id: company.id, level: level, salary: salary) do |job|
        job.industries << industry
        job.locations << locations
        job.description = description
      end
      puts title
    end
  rescue StandardError => e
    puts e
    @logger.error "Job #{index}: #{e.message}"
  end

  private

  # True when +str+ round-trips through Integer, i.e. it is a plain integer
  # string like "42" (no sign handling beyond what to_i/to_s preserve).
  def integer?(str)
    str.to_i.to_s == str
  end
end
require 'zip'
# Mixin providing zip extraction on top of the rubyzip gem.
module ExtractZip
  # Unpacks +file+ into +destination+, creating the directory first.
  # Entries already present on disk are left untouched (no overwrite).
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    Zip::File.open(file) do |archive|
      archive.each do |entry|
        target = File.join(destination, entry.name)
        next if File.exist?(target)
        archive.extract(entry, target)
      end
    end
  end
end
# Rake tasks that populate the database from two sources: a live crawl of
# careerbuilder.vn (Crawler) and a CSV dump fetched over FTP (CsvImport).
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'
namespace :import_data do
  # One shared logger injected into both importers so all failures land in
  # a single file.
  logger ||= Logger.new('./log/import_data.log')
  desc 'crawl industries locations jobs'
  # Usage: rake "import_data:crawler[<pages>,<start link>]" — both args optional.
  task :crawler, %i[page link] => [:environment] do |_, args|
    args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end
  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    # NOTE(review): FTP host and credentials are hard-coded — consider ENV vars.
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    ftp.download_file('jobs.zip', destination_dir)
    ftp.close
    csv = CsvImport.new(logger)
    # extract_zip comes from the ExtractZip mixin included in CsvImport.
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end
  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment