Commit 802a0c27 by Tô Ngọc Ánh

Import jobs from a CSV file; ran RuboCop to fix code-style conventions

parent ed8bc042
Pipeline #699 canceled with stages
in 0 seconds
......@@ -25,3 +25,5 @@
# Ignore master key for decrypting credentials and more.
/config/master.key
/lib/tasks/src
......@@ -47,8 +47,8 @@ group :development do
gem 'listen', '>= 3.0.5', '< 3.2'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring'
gem 'spring-watcher-listen', '~> 2.0.0'
gem 'dotenv-rails'
gem 'spring-watcher-listen', '~> 2.0.0'
end
group :test do
......@@ -63,6 +63,6 @@ end
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
##
gem "nokogiri"
gem 'nokogiri'
gem 'whenever', require: false
##
......@@ -11,7 +11,7 @@
#
default: &default
adapter: mysql2
encoding: utf8
encoding: utf8mb4
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: <%= ENV['DB_USERNAME'] %>
password: <%= ENV['DB_PASSWORD'] %>
......
......@@ -12,7 +12,7 @@
ActiveRecord::Schema.define(version: 2020_07_20_075150) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id"
t.bigint "job_id"
t.string "full_name"
......@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_applied_jobs_on_user_id"
end
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name"
t.text "description"
t.string "address"
......@@ -33,7 +33,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["name"], name: "index_companies_on_name", unique: true
end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id"
t.bigint "job_id"
t.datetime "created_at", null: false
......@@ -42,7 +42,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_favorites_on_user_id"
end
create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id"
t.bigint "job_id"
t.datetime "created_at", null: false
......@@ -51,20 +51,20 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_histories_on_user_id"
end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id"
t.bigint "industry_id"
t.index ["industry_id"], name: "index_industries_jobs_on_industry_id"
t.index ["job_id"], name: "index_industries_jobs_on_job_id"
end
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "company_id"
t.string "title"
t.string "level"
......@@ -77,14 +77,14 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["company_id"], name: "index_jobs_on_company_id"
end
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "city"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.boolean "oversea"
end
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id"
t.bigint "location_id"
t.datetime "created_at", null: false
......@@ -93,7 +93,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["location_id"], name: "index_locations_jobs_on_location_id"
end
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "email"
t.string "full_name"
t.string "curriculum_vitae"
......
require 'net/ftp'

# Thin wrapper around Net::FTP for fetching files from a remote server.
class Ftp
  # Connects to +host+ and authenticates with the given credentials.
  def initialize(host, username, password)
    @session = Net::FTP.new(host)
    @session.login(username, password)
  end

  # Downloads +file_name+ from the server into +destination_dir+,
  # keeping the original file name.
  def download_file(file_name, destination_dir)
    @session.get(file_name, "#{destination_dir}/#{file_name}")
  end

  # Releases the underlying FTP connection.
  def close
    @session.close
  end
end
This source diff could not be displayed because it is too large. You can view the blob instead.
require "open-uri"
require 'open-uri'
@logger ||= Logger.new("#{Rails.root}/log/crawler.log")
namespace :crawl do
desc "crawl industries locations jobs"
task :crawl_industries_locations_jobs, [:page, :link] => [:environment] do |task, args|
desc 'crawl industries locations jobs'
task :crawl_industries_locations_jobs, %i[page link] => [:environment] do |_, args|
args.with_defaults(link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawl_industries_and_locations
crawl_industries_locations
job_links = get_job_links(args[:page].to_i, args[:link])
job_links.each do |link|
next if link.empty?
crawl_job(link)
end
end
......@@ -17,42 +19,37 @@ end
# Collects job-detail hrefs by walking up to +page+ listing pages,
# starting from +link+ and following the ".next-page" pagination anchor.
# Stops early when no next page exists.
#
# Returns an Array of href strings (may contain duplicates/empties;
# the caller filters empties).
#
# NOTE: the pasted diff contained both the old `open(link)` and the new
# `URI.open(link)` lines, which would fetch and collect every page twice;
# this keeps only the intended (new) version.
def get_job_links(page, link)
  job_links = []
  page.times do
    document = Nokogiri::HTML(URI.open(link))
    jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
    jobs_xml.each { |item| job_links << item.value }
    next_page = document.at_css('.next-page a')
    break if next_page.nil?

    link = next_page[:href]
  end
  job_links
end
# Scrapes a company profile page and upserts a Company record.
#
# Returns the Company (found or created), nil when the page carries no
# company name, or nil after logging when any error occurs.
def crawl_company(company_link)
  # Escape non-ASCII characters while keeping the URL structure intact.
  # CGI.escape (used in the diff) also encodes "://" and produces an
  # un-openable relative URI; the generic parser's escape is the drop-in
  # successor of the removed URI.escape.
  uri = URI.parse(URI::DEFAULT_PARSER.escape(company_link))
  document = Nokogiri::HTML(URI.open(uri))
  company_name = document.css('.content .name').text
  return if company_name.empty?

  puts company_name
  company_address = document.css('.content p')[1].text
  company_description = document.css('.main-about-us').css('.content').text
  Company.find_or_create_by(name: company_name) do |company|
    company.address = company_address
    company.description = company_description
  end
rescue StandardError => e
  @logger.error "#{e.message} - Company link: #{uri}"
end
def crawl_job(job_link)
begin
uri = URI.parse(URI.escape(job_link)) #fix error: uri must be ascii only
document = Nokogiri::HTML(open(uri))
uri = URI.parse(CGI.escape(job_link)) # fix error: uri must be ascii only
document = Nokogiri::HTML(URI.open(uri))
job_title = document.at_css('.job-desc p.title').text
return if job_title.empty?
......@@ -60,10 +57,10 @@ def crawl_job(job_link)
job_company = crawl_company(job_company_link)
return if job_company.nil?
job_location_name = document.css('.map p a').map{ |val| val.text.strip }
job_location_name = document.css('.map p a').map { |val| val.text.strip }
job_locations = Location.where(city: job_location_name)
job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map{ |val| val.text.strip }
job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map { |val| val.text.strip }
job_industries = Industry.where(name: job_industry_names)
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
......@@ -73,30 +70,24 @@ def crawl_job(job_link)
job_description = document.css('.job-detail-content .detail-row').to_s
Job.find_or_create_by(title: job_title, company_id: job_company.id) do |job|
job.salary = job_salary
job.experience = job_experience
job.level = job_level
job.expiration_date = job_expiration_date
Job.find_or_create_by(title: job_title,
company_id: job_company.id,
level: job_level,
experience: job_experience,
salary: job_salary,
expiration_date: job_expiration_date) do |job|
job.description = job_description
job.industries << job_industries
job.locations << job_locations
end
puts job_title
rescue => exception
puts exception
@logger.error "#{exception.message} - Job link: #{uri}"
return exception
end
rescue StandardError => e
@logger.error "#{e.message} - Job link: #{uri}"
end
def crawl_industries_and_locations
document = Nokogiri::HTML(open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries_xml = document.css('#industry option')
industries = industries_xml.map(&:text)
locations_xml = document.css('#location option')
locations = locations_xml.map(&:text)
def crawl_industries_locations
document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries = document.css('#industry option').map(&:text)
locations = document.css('#location option').map(&:text)
industries.each do |val|
Industry.find_or_create_by(name: val)
......
require 'csv'
require 'zip'
require_relative '../common/ftp'
namespace :ftp_import do
  desc 'FTP import csv file'
  # Downloads the zipped CSV from the internal FTP server, unpacks it,
  # then loads the rows into the database.
  task csv: :environment do
    destination_dir = './lib/data'
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Always release the FTP connection, even when the download fails
      # (the original leaked it on error).
      ftp.close
    end
    extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    import_job(destination_dir)
  end
end
# Extracts every entry of the zip archive +file+ into +destination+,
# creating the directory if needed. Existing files are left untouched.
#
# Raises when an entry name would escape +destination+ (zip-slip),
# since the archive comes from an external server.
def extract_zip(file, destination)
  FileUtils.mkdir_p(destination)
  safe_root = File.expand_path(destination)
  Zip::File.open(file) do |zip_file|
    zip_file.each do |entry|
      fpath = File.join(destination, entry.name)
      # Guard against "../" entry names writing outside the destination.
      unless File.expand_path(fpath).start_with?(safe_root + File::SEPARATOR)
        raise "zip entry escapes destination: #{entry.name}"
      end

      zip_file.extract(entry, fpath) unless File.exist?(fpath)
    end
  end
end
# Imports jobs from "<direction>/jobs.csv" (headers expected), creating
# the associated Company, Industry, and Location records as needed.
#
# Rows missing a job name, category, or company name are skipped.
# Any unexpected error aborts the import and is logged.
def import_job(direction)
  CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
    # CSV cells are Strings or nil, so nil? is equivalent to the old
    # !is_a?(String) guard on the category column.
    next if row['name'].blank? || row['category'].nil? || row['company name'].blank?

    title = row['name'].strip
    company = Company.find_or_create_by(name: row['company name']) do |c|
      c.description = "Contact email: #{row['contact email']}\n"\
                      "Contact name: #{row['contact name']}\n"\
                      "Contact phone: #{row['contact phone']}"
      c.address = "#{row['company address']}, #{row['company province']}"
    end
    industry = Industry.find_or_create_by(name: row['category'].strip)
    level = row['level'].try(:strip)
    salary = row['salary'].try(:strip)
    # to_s guards a missing "work place" cell: nil.split raised and aborted
    # the entire import; an empty list now just creates a job without
    # locations. (The old is_a?(Array) branch was dead — CSV never
    # yields Arrays.)
    locations_name = row['work place'].to_s.split(',')
    locations = Location.where(city: locations_name)
    locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?
    description = "Benefits:\n#{row['benefit']}\n"\
                  "Descriptions:\n#{row['description']}\n"\
                  "Requirements:\n#{row['requirement']}"
    Job.find_or_create_by(title: title, company_id: company.id, level: level, salary: salary) do |job|
      job.industries << industry
      job.locations << locations
      job.description = description
    end
    puts title
  end
rescue StandardError => e
  puts e
  # @logger is only initialized by the crawler task file; don't let the
  # rescue handler itself crash on nil when that file was never loaded.
  @logger&.error(e.message)
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment