Commit c99884c7 by Tô Ngọc Ánh

Separate methods into classes

parent ee6a23cd
Pipeline #721 canceled with stages
in 0 seconds
require 'open-uri'
@logger ||= Logger.new("./log/import_data.log")
class Crawler
# Inject the logger used by the crawl methods for error reporting.
# @param logger [Logger] destination for crawl/import error messages
def initialize(logger)
@logger = logger
end
namespace :crawl do
desc 'crawl industries locations jobs'
task :crawl_industries_locations_jobs, %i[page link] => [:environment] do |_, args|
args.with_defaults(link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
def crawl_data(page, base_link)
crawl_industries_locations
job_links = get_job_links(args[:page].to_i, args[:link])
job_links = get_job_links(page, base_link)
job_links.each do |link|
next if link.empty?
crawl_job(link)
end
end
end
def get_job_links(page, link)
def get_job_links(page, link)
job_links = []
page.times do
document = Nokogiri::HTML(URI.open(link))
......@@ -28,9 +27,9 @@ def get_job_links(page, link)
link = next_page[:href]
end
job_links
end
end
def crawl_company(company_link)
def crawl_company(company_link)
uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only
document = Nokogiri::HTML(URI.open(uri))
company_name = document.css('.content .name').text
......@@ -43,11 +42,11 @@ def crawl_company(company_link)
company.address = company_address
company.description = company_description
end
rescue StandardError => e
rescue StandardError => e
@logger.error "#{e.message} - Company link: #{uri}"
end
end
def crawl_job(job_link)
def crawl_job(job_link)
uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only
document = Nokogiri::HTML(URI.open(uri))
job_title = document.at_css('.job-desc p.title').text
......@@ -66,7 +65,7 @@ def crawl_job(job_link)
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
job_expiration_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_exp_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_description = document.css('.job-detail-content .detail-row').to_s
......@@ -75,16 +74,16 @@ def crawl_job(job_link)
level: job_level,
experience: job_experience,
salary: job_salary,
expiration_date: job_expiration_date) do |job|
expiration_date: job_exp_date) do |job|
job.description = job_description
job.industries << job_industries
job.locations << job_locations
end
rescue StandardError => e
rescue StandardError => e
@logger.error "#{e.message} - Job link: #{uri}"
end
end
def crawl_industries_locations
def crawl_industries_locations
document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries = document.css('#industry option').map(&:text)
locations = document.css('#location option').map(&:text)
......@@ -104,4 +103,5 @@ def crawl_industries_locations
location.oversea = true
end
end
end
end
require 'csv'
require 'zip'
require_relative '../common/ftp'
require './lib/common/extract_zip'
@logger ||= Logger.new("./log/import_data.log")
class CsvImport
include ExtractZip
namespace :csv_import do
desc 'Download csv file from FTP and import'
task csv: :environment do
destination_dir = './lib/data'
ftp = Ftp.new('192.168.1.156', 'training', 'training')
ftp.download_file('jobs.zip', destination_dir)
ftp.close
extract_zip("#{destination_dir}/jobs.zip", destination_dir)
import_job(destination_dir)
def initialize(logger)
@logger = logger
end
end
# Extract every entry of the zip archive +file+ into +destination+.
# The destination directory is created when missing; entries whose
# target path already exists are skipped rather than overwritten.
#
# @param file [String] path to the .zip archive
# @param destination [String] directory to extract into
def extract_zip(file, destination)
  FileUtils.mkdir_p(destination)
  Zip::File.open(file) do |archive|
    archive.each do |entry|
      target = File.join(destination, entry.name)
      next if File.exist?(target)

      archive.extract(entry, target)
    end
  end
end
def import_job(direction)
def import_job(direction)
index = 0
CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
index += 1
......@@ -57,11 +38,14 @@ def import_job(direction)
end
puts title
end
rescue StandardError => e
rescue StandardError => e
puts e
@logger.error "Job #{index}: #{e.message}"
end
end
def integer?(str)
private
def integer?(str)
str.to_i.to_s == str
end
end
require 'zip'
# Mixin providing zip-archive extraction for importer classes.
module ExtractZip
  # Unpack +file+ into +destination+, creating the directory first.
  # Existing files at an entry's target path are left untouched.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    Zip::File.open(file) do |zip_file|
      zip_file.each { |entry| extract_entry(zip_file, entry, destination) }
    end
  end

  private

  # Extract a single +entry+ unless its target path already exists.
  def extract_entry(zip_file, entry, destination)
    fpath = File.join(destination, entry.name)
    zip_file.extract(entry, fpath) unless File.exist?(fpath)
  end
end
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'
# Rake entry points for the import pipeline: web crawler + CSV import.
# NOTE(review): relies on Crawler, CsvImport and Ftp being loaded via the
# require lines preceding this namespace.
namespace :import_data do
# Shared logger passed to both the crawler and the CSV importer.
logger ||= Logger.new('./log/import_data.log')
desc 'crawl industries locations jobs'
# Usage: rake "import_data:crawler[<page>,<link>]" — both args optional,
# defaulting to page 1 of the careerbuilder.vn all-jobs listing.
task :crawler, %i[page link] => [:environment] do |_, args|
args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawler = Crawler.new(logger)
crawler.crawl_data(args[:page].to_i, args[:link])
end
desc 'Download csv file from FTP and import'
task csv: :environment do
destination_dir = './lib/data'
# NOTE(review): hard-coded internal FTP host and credentials — consider
# moving these to environment variables.
ftp = Ftp.new('192.168.1.156', 'training', 'training')
ftp.download_file('jobs.zip', destination_dir)
ftp.close
csv = CsvImport.new(logger)
csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
csv.import_job(destination_dir)
end
desc 'Import data from crawler and csv file'
# Convenience task: runs :crawler then :csv in sequence.
task all: %i[crawler csv]
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment