Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tô Ngọc Ánh
VeNJob
Commits
c99884c7
Commit
c99884c7
authored
Jul 28, 2020
by
Tô Ngọc Ánh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separating def into class
parent
ee6a23cd
Pipeline
#721
canceled with stages
in 0 seconds
Changes
4
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
162 additions
and
136 deletions
+162
-136
lib/common/crawler.rb
+80
-80
lib/common/csv.rb
+40
-56
lib/common/extract_zip.rb
+14
-0
lib/tasks/import_data.rake
+28
-0
No files found.
lib/
tasks/crawler.rake
→
lib/
common/crawler.rb
View file @
c99884c7
require
'open-uri'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
# Crawls careerbuilder.vn for industries, locations, companies and jobs.
class Crawler
  # Stores the logger used to report per-page scraping failures.
  def initialize(logger)
    @logger = logger
  end
end
# Entry point: seeds industries/locations, then walks up to +page+ listing
# pages starting from +base_link+ and crawls every job found.
def crawl_data(page, base_link)
  crawl_industries_locations
  get_job_links(page, base_link).each do |job_link|
    crawl_job(job_link) unless job_link.empty?
  end
end
# Collects job-detail URLs from up to +page+ consecutive listing pages,
# following the ".next-page" link; stops early when there is no next page.
def get_job_links(page, link)
  collected = []
  page.times do
    doc = Nokogiri::HTML(URI.open(link))
    doc.xpath('//div/a[@class="job_link"]/@href').each do |href|
      collected << href.value
    end
    next_page = doc.at_css('.next-page a')
    break unless next_page

    link = next_page[:href]
  end
  collected
end
# Scrapes one company page and upserts a Company record.
#
# Returns the Company (existing or freshly created), or nil when the page
# carries no company name or when scraping fails (failures are logged, not
# raised, so one bad company does not abort the job crawl).
def crawl_company(company_link)
  # URI.escape was deprecated and removed in Ruby 3.0; DEFAULT_PARSER.escape
  # performs the same percent-encoding (fix error: uri must be ascii only).
  uri = URI.parse(URI::DEFAULT_PARSER.escape(company_link))
  document = Nokogiri::HTML(URI.open(uri))
  company_name = document.css('.content .name').text
  return if company_name.empty?

  company_address = document.css('.content p')[1].text
  company_description = document.css('.main-about-us').css('.content').text
  Company.find_or_create_by(name: company_name) do |company|
    company.address = company_address
    company.description = company_description
  end
rescue StandardError => e
  @logger.error "#{e.message} - Company link: #{uri}"
end
# Scrapes one job-detail page and upserts a Job record together with its
# company, industries and locations. Failures are logged and swallowed so
# the surrounding crawl keeps going.
def crawl_job(job_link)
  # URI.escape was deprecated and removed in Ruby 3.0; DEFAULT_PARSER.escape
  # performs the same percent-encoding (fix error: uri must be ascii only).
  uri = URI.parse(URI::DEFAULT_PARSER.escape(job_link))
  document = Nokogiri::HTML(URI.open(uri))
  job_title = document.at_css('.job-desc p.title').text
  return if job_title.empty?

  job_company_link = document.at_css('.job-desc a.job-company-name')[:href]
  job_company = crawl_company(job_company_link)
  return if job_company.nil?

  job_location_name = document.css('.map p a').map { |val| val.text.strip }
  job_locations = Location.where(city: job_location_name)
  job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]')
                               .css('p a').map { |val| val.text.strip }
  job_industries = Industry.where(name: job_industry_names)

  Job.find_or_create_by(
    title: job_title,
    company_id: job_company.id,
    level: detail_text(document, 'mdi mdi-account'),
    experience: detail_text(document, 'fa fa-briefcase'),
    salary: detail_text(document, 'fa fa-usd'),
    expiration_date: detail_text(document, 'mdi mdi-calendar-check')
  ) do |job|
    job.description = document.css('.job-detail-content .detail-row').to_s
    job.industries << job_industries
    job.locations << job_locations
  end
rescue StandardError => e
  @logger.error "#{e.message} - Job link: #{uri}"
end

# Reads the stripped text of the <p> inside the job-detail list item whose
# <i> icon carries +icon_class+; returns nil when that item is absent.
def detail_text(document, icon_class)
  document.at_xpath("//li[./strong/i[contains(@class, \"#{icon_class}\")]]/p").try(:text).try(:strip)
end
# Seeds the Industry and Location tables from the site's search filters.
# The first Location::CITY_VIETNAM_NUMBER location options are domestic
# cities; everything after them is marked oversea.
def crawl_industries_locations
  document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
  industries = document.css('#industry option').map(&:text)
  locations = document.css('#location option').map(&:text)

  industries.each { |name| Industry.find_or_create_by(name: name) }

  # drop(N) replaces last(count - N): identical when enough options exist,
  # but it returns [] instead of raising ArgumentError (negative argument)
  # when the site lists fewer than CITY_VIETNAM_NUMBER locations.
  domestic = locations.take(Location::CITY_VIETNAM_NUMBER)
  overseas = locations.drop(Location::CITY_VIETNAM_NUMBER)

  domestic.each do |city|
    Location.find_or_create_by(city: city) { |location| location.oversea = false }
  end
  overseas.each do |city|
    Location.find_or_create_by(city: city) { |location| location.oversea = true }
  end
end
lib/
tasks/csv_import.rake
→
lib/
common/csv.rb
View file @
c99884c7
require
'csv'
require
'zip'
require_relative
'../common/ftp'
require
'./lib/common/extract_zip'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
# Imports job postings from a CSV archive downloaded over FTP; the zip
# extraction behaviour is mixed in from ExtractZip.
class CsvImport
  include ExtractZip

  # Stores the logger used to report per-row import failures.
  def initialize(logger)
    @logger = logger
  end
end
# Unpacks every entry of the zip archive +file+ into +destination+,
# creating the directory first and leaving already-extracted files alone.
def extract_zip(file, destination)
  FileUtils.mkdir_p(destination)
  Zip::File.open(file) do |archive|
    archive.each do |entry|
      target = File.join(destination, entry.name)
      archive.extract(entry, target) unless File.exist?(target)
    end
  end
end
# Imports jobs from "<direction>/jobs.csv". Each row creates/finds the
# company, industry and locations it references, then upserts the Job.
# A failing row is logged with its 1-based index and skipped so the rest
# of the file still imports.
def import_job(direction)
  index = 0
  CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
    index += 1
    # Rows whose category is a bare number are junk/summary rows; skip them.
    next if integer?(row['category'])

    title = row['name'].strip
    company = Company.find_or_create_by(name: row['company name']) do |c|
      c.description = "Contact email: #{row['contact email']}\n" \
                      "Contact name: #{row['contact name']}\n" \
                      "Contact phone: #{row['contact phone']}"
      c.address = "#{row['company address']}, #{row['company province']}"
    end
    industry = Industry.find_or_create_by(name: row['category'].strip)

    # "work place" arrives as a JSON-ish list, e.g. "[\"Hanoi\",\"Hue\"]".
    locations_name = row['work place'].tr('"[]', '').split(',')
    locations = Location.where(city: locations_name)
    locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?

    description = "Benefits:\n#{row['benefit']}\n" \
                  "Descriptions:\n#{row['description']}\n" \
                  "Requirements:\n#{row['requirement']}"

    Job.find_or_create_by(title: title,
                          company_id: company.id,
                          level: row['level'].try(:strip),
                          salary: row['salary'].try(:strip)) do |job|
      job.industries << industry
      job.locations << locations
      job.description = description
    end
    puts title
  rescue StandardError => e
    puts e
    @logger.error "Job #{index}: #{e.message}"
  end
end
# True when +str+ is exactly the canonical decimal rendering of an integer
# (so "42" and "-7" qualify, but "007", "4.2" and "abc" do not).
def integer?(str)
  canonical = str.to_i.to_s
  canonical == str
end
lib/common/extract_zip.rb
0 → 100644
View file @
c99884c7
require
'zip'
# Mixin for unpacking zip archives to a target directory.
module ExtractZip
  # Extracts every entry of +file+ into +destination+, creating the
  # directory first and skipping entries that already exist on disk.
  #
  # Raises StandardError for malicious "zip slip" entries (names such as
  # "../x" that would escape +destination+) instead of writing them.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    base = File.expand_path(destination)
    Zip::File.open(file) do |archive|
      archive.each do |entry|
        target = File.join(destination, entry.name)
        unless File.expand_path(target).start_with?(base + File::SEPARATOR)
          raise StandardError, "zip entry escapes destination: #{entry.name}"
        end

        archive.extract(entry, target) unless File.exist?(target)
      end
    end
  end
end
lib/tasks/import_data.rake
0 → 100644
View file @
c99884c7
require
'./lib/common/ftp'
require
'./lib/common/csv'
require
'./lib/common/crawler'
namespace :import_data do
  logger ||= Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page link] => [:environment] do |_, args|
    args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Always release the FTP session, even when the download raises;
      # previously a failed download leaked the open connection.
      ftp.close
    end

    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment