Commit c99884c7 by Tô Ngọc Ánh

separating def into class

parent ee6a23cd
Pipeline #721 canceled with stages
in 0 seconds
require 'open-uri' require 'open-uri'
@logger ||= Logger.new("./log/import_data.log") class Crawler
# logger - a Logger instance; stored for use by the crawl methods to
#          record failures (e.g. unreachable company/job links).
def initialize(logger)
@logger = logger
end
# Entry point for a full crawl: refreshes the industry/location reference
# data, then visits every job link found across the listing pages.
#
# page      - Integer count of listing pages to walk.
# base_link - String URL of the first listing page.
#
# Blank links are skipped; per-link failures are handled (and logged)
# inside crawl_job / crawl_company rather than aborting the whole run.
def crawl_data(page, base_link)
  crawl_industries_locations
  get_job_links(page, base_link).each do |job_link|
    next if job_link.empty?

    crawl_job(job_link)
  end
end
def get_job_links(page, link) def get_job_links(page, link)
job_links = [] job_links = []
page.times do page.times do
document = Nokogiri::HTML(URI.open(link)) document = Nokogiri::HTML(URI.open(link))
...@@ -28,9 +27,9 @@ def get_job_links(page, link) ...@@ -28,9 +27,9 @@ def get_job_links(page, link)
link = next_page[:href] link = next_page[:href]
end end
job_links job_links
end end
def crawl_company(company_link) def crawl_company(company_link)
uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only
document = Nokogiri::HTML(URI.open(uri)) document = Nokogiri::HTML(URI.open(uri))
company_name = document.css('.content .name').text company_name = document.css('.content .name').text
...@@ -43,11 +42,11 @@ def crawl_company(company_link) ...@@ -43,11 +42,11 @@ def crawl_company(company_link)
company.address = company_address company.address = company_address
company.description = company_description company.description = company_description
end end
rescue StandardError => e rescue StandardError => e
@logger.error "#{e.message} - Company link: #{uri}" @logger.error "#{e.message} - Company link: #{uri}"
end end
def crawl_job(job_link) def crawl_job(job_link)
uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only
document = Nokogiri::HTML(URI.open(uri)) document = Nokogiri::HTML(URI.open(uri))
job_title = document.at_css('.job-desc p.title').text job_title = document.at_css('.job-desc p.title').text
...@@ -66,7 +65,7 @@ def crawl_job(job_link) ...@@ -66,7 +65,7 @@ def crawl_job(job_link)
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip) job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip) job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip) job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
job_expiration_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip) job_exp_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_description = document.css('.job-detail-content .detail-row').to_s job_description = document.css('.job-detail-content .detail-row').to_s
...@@ -75,16 +74,16 @@ def crawl_job(job_link) ...@@ -75,16 +74,16 @@ def crawl_job(job_link)
level: job_level, level: job_level,
experience: job_experience, experience: job_experience,
salary: job_salary, salary: job_salary,
expiration_date: job_expiration_date) do |job| expiration_date: job_exp_date) do |job|
job.description = job_description job.description = job_description
job.industries << job_industries job.industries << job_industries
job.locations << job_locations job.locations << job_locations
end end
rescue StandardError => e rescue StandardError => e
@logger.error "#{e.message} - Job link: #{uri}" @logger.error "#{e.message} - Job link: #{uri}"
end end
def crawl_industries_locations def crawl_industries_locations
document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')) document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries = document.css('#industry option').map(&:text) industries = document.css('#industry option').map(&:text)
locations = document.css('#location option').map(&:text) locations = document.css('#location option').map(&:text)
...@@ -104,4 +103,5 @@ def crawl_industries_locations ...@@ -104,4 +103,5 @@ def crawl_industries_locations
location.oversea = true location.oversea = true
end end
end end
end
end end
require 'csv' require 'csv'
require 'zip' require './lib/common/extract_zip'
require_relative '../common/ftp'
@logger ||= Logger.new("./log/import_data.log") class CsvImport
include ExtractZip
namespace :csv_import do def initialize(logger)
desc 'Download csv file from FTP and import' @logger = logger
task csv: :environment do
destination_dir = './lib/data'
ftp = Ftp.new('192.168.1.156', 'training', 'training')
ftp.download_file('jobs.zip', destination_dir)
ftp.close
extract_zip("#{destination_dir}/jobs.zip", destination_dir)
import_job(destination_dir)
end end
end
# Extracts every entry of the zip archive +file+ into +destination+,
# creating the destination directory first if it does not exist.
# Entries whose target path already exists are left untouched.
#
# NOTE(review): entry names are joined into the destination path as-is;
# presumably rubyzip's own path-traversal guard is relied on here — confirm
# the installed rubyzip version rejects "../" entries (Zip Slip).
def extract_zip(file, destination)
  FileUtils.mkdir_p(destination)
  Zip::File.open(file) do |archive|
    archive.each do |entry|
      target = File.join(destination, entry.name)
      archive.extract(entry, target) unless File.exist?(target)
    end
  end
end
def import_job(direction) def import_job(direction)
index = 0 index = 0
CSV.foreach("#{direction}/jobs.csv", headers: true) do |row| CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
index += 1 index += 1
...@@ -57,11 +38,14 @@ def import_job(direction) ...@@ -57,11 +38,14 @@ def import_job(direction)
end end
puts title puts title
end end
rescue StandardError => e rescue StandardError => e
puts e puts e
@logger.error "Job #{index}: #{e.message}" @logger.error "Job #{index}: #{e.message}"
end end
private

# Returns true when +str+ is exactly the canonical decimal form of an
# Integer (so "42" and "-3" pass, while "007", "3.5", "" and "abc" fail,
# because they do not round-trip through to_i unchanged).
def integer?(str)
  canonical = str.to_i.to_s
  canonical == str
end
end end
require 'zip'

# Mixin providing zip-archive extraction (used by CsvImport to unpack the
# jobs.zip file downloaded from the FTP server).
module ExtractZip
  # Unpacks +file+ into +destination+, creating the directory when missing.
  # Files already present at the target path are not overwritten.
  #
  # NOTE(review): entry names are used verbatim in File.join — verify the
  # bundled rubyzip version guards against "../" traversal entries.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    Zip::File.open(file) do |archive|
      archive.each do |entry|
        target = File.join(destination, entry.name)
        archive.extract(entry, target) unless File.exist?(target)
      end
    end
  end
end
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'
# Rake entry points wiring the Crawler and CsvImport classes together.
namespace :import_data do
# Single shared logger for both tasks; ||= keeps one instance if this
# file is loaded more than once.
logger ||= Logger.new('./log/import_data.log')
desc 'crawl industries locations jobs'
# Usage: rake "import_data:crawler[3,https://...]" — both args optional,
# falling back to the with_defaults values below.
task :crawler, %i[page link] => [:environment] do |_, args|
args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawler = Crawler.new(logger)
crawler.crawl_data(args[:page].to_i, args[:link])
end
desc 'Download csv file from FTP and import'
task csv: :environment do
destination_dir = './lib/data'
# NOTE(review): FTP host and credentials are hard-coded — consider
# moving them to environment variables.
ftp = Ftp.new('192.168.1.156', 'training', 'training')
ftp.download_file('jobs.zip', destination_dir)
ftp.close
csv = CsvImport.new(logger)
csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
csv.import_job(destination_dir)
end
desc 'Import data from crawler and csv file'
# Runs :crawler then :csv. Prerequisite tasks receive no arguments, so
# :crawler uses its default page/link values when invoked this way.
task all: %i[crawler csv]
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment