Commit 3a9ba394 by Tô Ngọc Ánh

Merge branch 'ftp-import' into 'master'

Import jobs from CSV; use RuboCop to fix code-convention issues

See merge request !5
parents ed8bc042 7df5cbbd
Pipeline #730 failed with stages
in 0 seconds
...@@ -25,3 +25,5 @@ ...@@ -25,3 +25,5 @@
# Ignore master key for decrypting credentials and more. # Ignore master key for decrypting credentials and more.
/config/master.key /config/master.key
/lib/data
...@@ -45,10 +45,10 @@ group :development do ...@@ -45,10 +45,10 @@ group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code. # Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'web-console', '>= 3.3.0' gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2' gem 'listen', '>= 3.0.5', '< 3.2'
gem 'dotenv-rails'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring' gem 'spring'
gem 'spring-watcher-listen', '~> 2.0.0' gem 'spring-watcher-listen', '~> 2.0.0'
gem 'dotenv-rails'
end end
group :test do group :test do
...@@ -63,6 +63,6 @@ end ...@@ -63,6 +63,6 @@ end
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
## ##
gem "nokogiri" gem 'nokogiri'
gem 'whenever', require: false gem 'whenever', require: false
## ##
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
default: &default default: &default
adapter: mysql2 adapter: mysql2
encoding: utf8 encoding: utf8mb4
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: <%= ENV['DB_USERNAME'] %> username: <%= ENV['DB_USERNAME'] %>
password: <%= ENV['DB_PASSWORD'] %> password: <%= ENV['DB_PASSWORD'] %>
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
ActiveRecord::Schema.define(version: 2020_07_20_075150) do ActiveRecord::Schema.define(version: 2020_07_20_075150) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.string "full_name" t.string "full_name"
...@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_applied_jobs_on_user_id" t.index ["user_id"], name: "index_applied_jobs_on_user_id"
end end
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name" t.string "name"
t.text "description" t.text "description"
t.string "address" t.string "address"
...@@ -33,7 +33,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -33,7 +33,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["name"], name: "index_companies_on_name", unique: true t.index ["name"], name: "index_companies_on_name", unique: true
end end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -42,7 +42,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -42,7 +42,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_favorites_on_user_id" t.index ["user_id"], name: "index_favorites_on_user_id"
end end
create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -51,20 +51,20 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -51,20 +51,20 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_histories_on_user_id" t.index ["user_id"], name: "index_histories_on_user_id"
end end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name" t.string "name"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
end end
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id" t.bigint "job_id"
t.bigint "industry_id" t.bigint "industry_id"
t.index ["industry_id"], name: "index_industries_jobs_on_industry_id" t.index ["industry_id"], name: "index_industries_jobs_on_industry_id"
t.index ["job_id"], name: "index_industries_jobs_on_job_id" t.index ["job_id"], name: "index_industries_jobs_on_job_id"
end end
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "company_id" t.bigint "company_id"
t.string "title" t.string "title"
t.string "level" t.string "level"
...@@ -77,14 +77,14 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -77,14 +77,14 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "city" t.string "city"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.boolean "oversea" t.boolean "oversea"
end end
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id" t.bigint "job_id"
t.bigint "location_id" t.bigint "location_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -93,7 +93,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -93,7 +93,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["location_id"], name: "index_locations_jobs_on_location_id" t.index ["location_id"], name: "index_locations_jobs_on_location_id"
end end
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "email" t.string "email"
t.string "full_name" t.string "full_name"
t.string "curriculum_vitae" t.string "curriculum_vitae"
......
require "open-uri" require 'open-uri'
@logger ||= Logger.new("#{Rails.root}/log/crawler.log")
class Crawler
namespace :crawl do def initialize(logger)
desc "crawl industries locations jobs" @logger = logger
task :crawl_industries_locations_jobs, [:page, :link] => [:environment] do |task, args| end
args.with_defaults(link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawl_industries_and_locations def crawl_data(page_number, base_link)
job_links = get_job_links(args[:page].to_i, args[:link]) crawl_industries_locations
job_links = get_job_links(page_number, base_link)
job_links.each do |link| job_links.each do |link|
next if link.empty? next if link.empty?
crawl_job(link) crawl_job(link)
end end
end end
end
def get_job_links(page, link) def get_job_links(page_number, link)
job_links = [] job_links = []
page.times do page_number.times do
document = Nokogiri::HTML(open(link)) document = Nokogiri::HTML(URI.open(link))
jobs_xml = document.xpath('//div/a[@class="job_link"]/@href') jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
jobs_xml.each { |item| job_links << item.value} jobs_xml.each { |item| job_links << item.value }
next_page = document.at_css('.next-page a') next_page = document.at_css('.next-page a')
break if next_page.nil? break if next_page.nil?
link = next_page[:href] link = next_page[:href]
end end
job_links job_links
end end
def crawl_company(company_link) def crawl_company(company_link)
begin uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only
uri = URI.parse(URI.escape(company_link)) #fix error: uri must be ascii only document = Nokogiri::HTML(URI.open(uri))
document = Nokogiri::HTML(open(uri)) company_name = document.css('.content .name').text
company_name = document.css(".content .name").text
return if company_name.empty? return if company_name.empty?
puts company_name company_address = document.css('.content p')[1].text
company_address = document.css(".content p")[1].text company_description = document.css('.main-about-us').css('.content').text
company_description = document.css(".main-about-us").css('.content').text
Company.find_or_create_by(name: company_name) do |company| Company.find_or_create_by(name: company_name) do |company|
company.address = company_address company.address = company_address
company.description = company_description company.description = company_description
end end
rescue => exception rescue StandardError => e
puts exception @logger.error "#{e.message} - Company link: #{uri}"
@logger.error "#{exception.message} - Company link: #{uri}"
return
end end
end
def crawl_job(job_link) def crawl_job(job_link)
begin uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only
uri = URI.parse(URI.escape(job_link)) #fix error: uri must be ascii only document = Nokogiri::HTML(URI.open(uri))
document = Nokogiri::HTML(open(uri))
job_title = document.at_css('.job-desc p.title').text job_title = document.at_css('.job-desc p.title').text
return if job_title.empty? return if job_title.empty?
...@@ -60,43 +56,37 @@ def crawl_job(job_link) ...@@ -60,43 +56,37 @@ def crawl_job(job_link)
job_company = crawl_company(job_company_link) job_company = crawl_company(job_company_link)
return if job_company.nil? return if job_company.nil?
job_location_name = document.css('.map p a').map{ |val| val.text.strip } job_location_name = document.css('.map p a').map { |val| val.text.strip }
job_locations = Location.where(city: job_location_name) job_locations = Location.where(city: job_location_name)
job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map{ |val| val.text.strip } job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map { |val| val.text.strip }
job_industries = Industry.where(name: job_industry_names) job_industries = Industry.where(name: job_industry_names)
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip) job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip) job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip) job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
job_expiration_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip) job_exp_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_description = document.css('.job-detail-content .detail-row').to_s job_description = document.css('.job-detail-content .detail-row').to_s
Job.find_or_create_by(title: job_title, company_id: job_company.id) do |job| Job.find_or_create_by(title: job_title,
job.salary = job_salary company_id: job_company.id,
job.experience = job_experience level: job_level,
job.level = job_level experience: job_experience,
job.expiration_date = job_expiration_date salary: job_salary,
expiration_date: job_exp_date) do |job|
job.description = job_description job.description = job_description
job.industries << job_industries job.industries << job_industries
job.locations << job_locations job.locations << job_locations
end end
puts job_title rescue StandardError => e
rescue => exception @logger.error "#{e.message} - Job link: #{uri}"
puts exception
@logger.error "#{exception.message} - Job link: #{uri}"
return exception
end end
end
def crawl_industries_and_locations
document = Nokogiri::HTML(open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries_xml = document.css('#industry option')
industries = industries_xml.map(&:text)
locations_xml = document.css('#location option')
locations = locations_xml.map(&:text)
def crawl_industries_locations
document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries = document.css('#industry option').map(&:text)
locations = document.css('#location option').map(&:text)
industries.each do |val| industries.each do |val|
Industry.find_or_create_by(name: val) Industry.find_or_create_by(name: val)
...@@ -113,4 +103,5 @@ def crawl_industries_and_locations ...@@ -113,4 +103,5 @@ def crawl_industries_and_locations
location.oversea = true location.oversea = true
end end
end end
end
end end
require 'csv'
require './lib/common/extract_zip'
# Imports Job records (plus their Company, Industry and Location
# associations) from a jobs.csv file previously downloaded and unzipped.
class CsvImport
  include ExtractZip

  # logger: receives one error line per failed CSV row.
  def initialize(logger)
    @logger = logger
  end

  # Reads "#{direction}/jobs.csv" ('direction' is the directory holding the
  # file; parameter name kept for caller compatibility) and upserts one Job
  # per row. Rows with a blank or purely numeric category are skipped.
  # A failing row is logged with its file line number (headers are line 1,
  # hence with_index(2)) and does not abort the rest of the import.
  def import_job(direction)
    CSV.foreach("#{direction}/jobs.csv", headers: true).with_index(2) do |row, index|
      # \A..\z anchor the whole value; the previous ^..$ anchors match a
      # single line, so a multi-line category could slip past the filter.
      # match? also skips the MatchData allocation of match(...).present?.
      next if row['category'].blank? || row['category'].match?(/\A[0-9]+\z/)

      title = row['name'].strip
      company = Company.find_or_create_by(name: row['company name']) do |c|
        c.description = "Contact email: #{row['contact email']}\n"\
                        "Contact name: #{row['contact name']}\n"\
                        "Contact phone: #{row['contact phone']}"
        c.address = "#{row['company address']}, #{row['company province']}"
      end
      industry = Industry.find_or_create_by(name: row['category'].strip)
      level = row['level'].try(:strip)
      salary = row['salary'].try(:strip)
      locations_name = row['work place'].tr('"[]', '').split(',')
      locations = Location.where(city: locations_name)
      # Fall back to creating the locations when none of them exist yet.
      locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?
      description = "Benefits:\n#{row['benefit']}\n"\
                    "Descriptions:\n#{row['description']}\n"\
                    "Requirements:\n#{row['requirement']}"
      Job.find_or_create_by(title: title, company_id: company.id, level: level, salary: salary) do |job|
        job.industries << industry
        job.locations << locations
        job.description = description
      end
    rescue StandardError => e
      # Log and continue with the next row; `puts` noise removed — the log
      # file is the single source of truth for import failures.
      @logger.error "Job #{index}: #{e.message}"
    end
  end
end
require 'zip'
# Mixin that unpacks a zip archive into a destination directory.
module ExtractZip
  # Extracts every entry of +file+ into +destination+, creating the
  # destination (and any nested entry directories) as needed. Existing
  # files are left untouched.
  #
  # The archive arrives from an external FTP server, so entry names are
  # untrusted: a "zip slip" guard rejects entries whose resolved path
  # (e.g. "../../etc/passwd") would land outside +destination+.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    root = File.expand_path(destination)
    Zip::File.open(file) do |zip_file|
      zip_file.each do |entry|
        fpath = File.join(destination, entry.name)
        unless File.expand_path(fpath).start_with?("#{root}#{File::SEPARATOR}")
          raise Zip::Error, "zip entry #{entry.name} escapes #{destination}"
        end
        next if File.exist?(fpath)

        # Entries in sub-folders need their parent directory first;
        # Zip::File#extract does not create it.
        FileUtils.mkdir_p(File.dirname(fpath))
        zip_file.extract(entry, fpath)
      end
    end
  end
end
require 'net/ftp'
# Minimal wrapper around Net::FTP: the connection is opened and
# authenticated at construction time, so an instance is immediately
# ready to transfer files.
class Ftp
  def initialize(host, username, password)
    @session = Net::FTP.new(host)
    @session.login(username, password)
  end

  # Fetches +file_name+ from the server and saves it under
  # +destination_dir+ with the same file name.
  def download_file(file_name, destination_dir)
    local_path = "#{destination_dir}/#{file_name}"
    @session.get(file_name, local_path)
  end

  # Terminates the underlying FTP session.
  def close
    @session.close
  end
end
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'
# Rake tasks for populating the jobs database: a web crawler task, a
# CSV-over-FTP import task, and an umbrella task running both.
namespace :import_data do
  # Plain assignment — the previous `||=` is a no-op on a fresh local.
  logger = Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page_number link] => [:environment] do |_, args|
    args.with_defaults(page_number: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page_number].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    # File.exists? was deprecated and removed in Ruby 3.2; mkdir_p is the
    # race-free equivalent of "mkdir unless exist".
    FileUtils.mkdir_p(destination_dir)
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Close the connection even when the download raises.
      ftp.close
    end
    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment