Commit 3a9ba394 by Tô Ngọc Ánh

Merge branch 'ftp-import' into 'master'

Import jobs from CSV; use RuboCop to fix code-convention issues

See merge request !5
parents ed8bc042 7df5cbbd
Pipeline #730 failed with stages
in 0 seconds
...@@ -25,3 +25,5 @@ ...@@ -25,3 +25,5 @@
# Ignore master key for decrypting credentials and more. # Ignore master key for decrypting credentials and more.
/config/master.key /config/master.key
/lib/data
...@@ -45,10 +45,10 @@ group :development do ...@@ -45,10 +45,10 @@ group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code. # Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'web-console', '>= 3.3.0' gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2' gem 'listen', '>= 3.0.5', '< 3.2'
gem 'dotenv-rails'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring' gem 'spring'
gem 'spring-watcher-listen', '~> 2.0.0' gem 'spring-watcher-listen', '~> 2.0.0'
gem 'dotenv-rails'
end end
group :test do group :test do
...@@ -63,6 +63,6 @@ end ...@@ -63,6 +63,6 @@ end
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
## ##
gem "nokogiri" gem 'nokogiri'
gem 'whenever', require: false gem 'whenever', require: false
## ##
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
default: &default default: &default
adapter: mysql2 adapter: mysql2
encoding: utf8 encoding: utf8mb4
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: <%= ENV['DB_USERNAME'] %> username: <%= ENV['DB_USERNAME'] %>
password: <%= ENV['DB_PASSWORD'] %> password: <%= ENV['DB_PASSWORD'] %>
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
ActiveRecord::Schema.define(version: 2020_07_20_075150) do ActiveRecord::Schema.define(version: 2020_07_20_075150) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.string "full_name" t.string "full_name"
...@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_applied_jobs_on_user_id" t.index ["user_id"], name: "index_applied_jobs_on_user_id"
end end
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name" t.string "name"
t.text "description" t.text "description"
t.string "address" t.string "address"
...@@ -33,7 +33,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -33,7 +33,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["name"], name: "index_companies_on_name", unique: true t.index ["name"], name: "index_companies_on_name", unique: true
end end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -42,7 +42,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -42,7 +42,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_favorites_on_user_id" t.index ["user_id"], name: "index_favorites_on_user_id"
end end
create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "histories", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
t.bigint "job_id" t.bigint "job_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -51,20 +51,20 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -51,20 +51,20 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["user_id"], name: "index_histories_on_user_id" t.index ["user_id"], name: "index_histories_on_user_id"
end end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "name" t.string "name"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
end end
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id" t.bigint "job_id"
t.bigint "industry_id" t.bigint "industry_id"
t.index ["industry_id"], name: "index_industries_jobs_on_industry_id" t.index ["industry_id"], name: "index_industries_jobs_on_industry_id"
t.index ["job_id"], name: "index_industries_jobs_on_job_id" t.index ["job_id"], name: "index_industries_jobs_on_job_id"
end end
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "company_id" t.bigint "company_id"
t.string "title" t.string "title"
t.string "level" t.string "level"
...@@ -77,14 +77,14 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -77,14 +77,14 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "city" t.string "city"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.boolean "oversea" t.boolean "oversea"
end end
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.bigint "job_id" t.bigint "job_id"
t.bigint "location_id" t.bigint "location_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
...@@ -93,7 +93,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do ...@@ -93,7 +93,7 @@ ActiveRecord::Schema.define(version: 2020_07_20_075150) do
t.index ["location_id"], name: "index_locations_jobs_on_location_id" t.index ["location_id"], name: "index_locations_jobs_on_location_id"
end end
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "email" t.string "email"
t.string "full_name" t.string "full_name"
t.string "curriculum_vitae" t.string "curriculum_vitae"
......
require "open-uri" require 'open-uri'
@logger ||= Logger.new("#{Rails.root}/log/crawler.log")
class Crawler
namespace :crawl do def initialize(logger)
desc "crawl industries locations jobs" @logger = logger
task :crawl_industries_locations_jobs, [:page, :link] => [:environment] do |task, args| end
args.with_defaults(link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
crawl_industries_and_locations def crawl_data(page_number, base_link)
job_links = get_job_links(args[:page].to_i, args[:link]) crawl_industries_locations
job_links = get_job_links(page_number, base_link)
job_links.each do |link| job_links.each do |link|
next if link.empty? next if link.empty?
crawl_job(link) crawl_job(link)
end end
end end
end
def get_job_links(page, link) def get_job_links(page_number, link)
job_links = [] job_links = []
page.times do page_number.times do
document = Nokogiri::HTML(open(link)) document = Nokogiri::HTML(URI.open(link))
jobs_xml = document.xpath('//div/a[@class="job_link"]/@href') jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
jobs_xml.each { |item| job_links << item.value} jobs_xml.each { |item| job_links << item.value }
next_page = document.at_css('.next-page a') next_page = document.at_css('.next-page a')
break if next_page.nil? break if next_page.nil?
link = next_page[:href] link = next_page[:href]
end end
job_links job_links
end end
def crawl_company(company_link) def crawl_company(company_link)
begin uri = URI.parse(URI.escape(company_link)) # fix error: uri must be ascii only
uri = URI.parse(URI.escape(company_link)) #fix error: uri must be ascii only document = Nokogiri::HTML(URI.open(uri))
document = Nokogiri::HTML(open(uri)) company_name = document.css('.content .name').text
company_name = document.css(".content .name").text
return if company_name.empty? return if company_name.empty?
puts company_name company_address = document.css('.content p')[1].text
company_address = document.css(".content p")[1].text company_description = document.css('.main-about-us').css('.content').text
company_description = document.css(".main-about-us").css('.content').text
Company.find_or_create_by(name: company_name) do |company| Company.find_or_create_by(name: company_name) do |company|
company.address = company_address company.address = company_address
company.description = company_description company.description = company_description
end end
rescue => exception rescue StandardError => e
puts exception @logger.error "#{e.message} - Company link: #{uri}"
@logger.error "#{exception.message} - Company link: #{uri}"
return
end end
end
def crawl_job(job_link) def crawl_job(job_link)
begin uri = URI.parse(URI.escape(job_link)) # fix error: uri must be ascii only
uri = URI.parse(URI.escape(job_link)) #fix error: uri must be ascii only document = Nokogiri::HTML(URI.open(uri))
document = Nokogiri::HTML(open(uri))
job_title = document.at_css('.job-desc p.title').text job_title = document.at_css('.job-desc p.title').text
return if job_title.empty? return if job_title.empty?
...@@ -60,43 +56,37 @@ def crawl_job(job_link) ...@@ -60,43 +56,37 @@ def crawl_job(job_link)
job_company = crawl_company(job_company_link) job_company = crawl_company(job_company_link)
return if job_company.nil? return if job_company.nil?
job_location_name = document.css('.map p a').map{ |val| val.text.strip } job_location_name = document.css('.map p a').map { |val| val.text.strip }
job_locations = Location.where(city: job_location_name) job_locations = Location.where(city: job_location_name)
job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map{ |val| val.text.strip } job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]').css('p a').map { |val| val.text.strip }
job_industries = Industry.where(name: job_industry_names) job_industries = Industry.where(name: job_industry_names)
job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip) job_salary = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').try(:text).try(:strip)
job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip) job_level = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').try(:text).try(:strip)
job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip) job_experience = document.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').try(:text).try(:strip)
job_expiration_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip) job_exp_date = document.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').try(:text).try(:strip)
job_description = document.css('.job-detail-content .detail-row').to_s job_description = document.css('.job-detail-content .detail-row').to_s
Job.find_or_create_by(title: job_title, company_id: job_company.id) do |job| Job.find_or_create_by(title: job_title,
job.salary = job_salary company_id: job_company.id,
job.experience = job_experience level: job_level,
job.level = job_level experience: job_experience,
job.expiration_date = job_expiration_date salary: job_salary,
expiration_date: job_exp_date) do |job|
job.description = job_description job.description = job_description
job.industries << job_industries job.industries << job_industries
job.locations << job_locations job.locations << job_locations
end end
puts job_title rescue StandardError => e
rescue => exception @logger.error "#{e.message} - Job link: #{uri}"
puts exception
@logger.error "#{exception.message} - Job link: #{uri}"
return exception
end end
end
def crawl_industries_and_locations
document = Nokogiri::HTML(open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries_xml = document.css('#industry option')
industries = industries_xml.map(&:text)
locations_xml = document.css('#location option')
locations = locations_xml.map(&:text)
def crawl_industries_locations
document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
industries = document.css('#industry option').map(&:text)
locations = document.css('#location option').map(&:text)
industries.each do |val| industries.each do |val|
Industry.find_or_create_by(name: val) Industry.find_or_create_by(name: val)
...@@ -113,4 +103,5 @@ def crawl_industries_and_locations ...@@ -113,4 +103,5 @@ def crawl_industries_and_locations
location.oversea = true location.oversea = true
end end
end end
end
end end
require 'csv'
require './lib/common/extract_zip'
# Imports Job records (plus their Company, Industry and Location
# associations) from a jobs.csv file previously downloaded and unzipped.
class CsvImport
  include ExtractZip

  # logger: receives one error line per failed CSV row.
  def initialize(logger)
    @logger = logger
  end

  # Reads "#{direction}/jobs.csv" ('direction' is the directory holding the
  # file; parameter name kept for caller compatibility) and upserts one Job
  # per row. Rows with a blank or purely numeric category are skipped.
  # A failing row is logged with its file line number (headers are line 1,
  # hence with_index(2)) and does not abort the rest of the import.
  def import_job(direction)
    CSV.foreach("#{direction}/jobs.csv", headers: true).with_index(2) do |row, index|
      # \A..\z anchor the whole value; the previous ^..$ anchors match a
      # single line, so a multi-line category could slip past the filter.
      # match? also skips the MatchData allocation of match(...).present?.
      next if row['category'].blank? || row['category'].match?(/\A[0-9]+\z/)

      title = row['name'].strip
      company = Company.find_or_create_by(name: row['company name']) do |c|
        c.description = "Contact email: #{row['contact email']}\n"\
                        "Contact name: #{row['contact name']}\n"\
                        "Contact phone: #{row['contact phone']}"
        c.address = "#{row['company address']}, #{row['company province']}"
      end
      industry = Industry.find_or_create_by(name: row['category'].strip)
      level = row['level'].try(:strip)
      salary = row['salary'].try(:strip)
      locations_name = row['work place'].tr('"[]', '').split(',')
      locations = Location.where(city: locations_name)
      # Fall back to creating the locations when none of them exist yet.
      locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?
      description = "Benefits:\n#{row['benefit']}\n"\
                    "Descriptions:\n#{row['description']}\n"\
                    "Requirements:\n#{row['requirement']}"
      Job.find_or_create_by(title: title, company_id: company.id, level: level, salary: salary) do |job|
        job.industries << industry
        job.locations << locations
        job.description = description
      end
    rescue StandardError => e
      # Log and continue with the next row; `puts` noise removed — the log
      # file is the single source of truth for import failures.
      @logger.error "Job #{index}: #{e.message}"
    end
  end
end
require 'zip'
# Mixin that unpacks a zip archive into a destination directory.
module ExtractZip
  # Extracts every entry of +file+ into +destination+, creating the
  # destination (and any nested entry directories) as needed. Existing
  # files are left untouched.
  #
  # The archive arrives from an external FTP server, so entry names are
  # untrusted: a "zip slip" guard rejects entries whose resolved path
  # (e.g. "../../etc/passwd") would land outside +destination+.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    root = File.expand_path(destination)
    Zip::File.open(file) do |zip_file|
      zip_file.each do |entry|
        fpath = File.join(destination, entry.name)
        unless File.expand_path(fpath).start_with?("#{root}#{File::SEPARATOR}")
          raise Zip::Error, "zip entry #{entry.name} escapes #{destination}"
        end
        next if File.exist?(fpath)

        # Entries in sub-folders need their parent directory first;
        # Zip::File#extract does not create it.
        FileUtils.mkdir_p(File.dirname(fpath))
        zip_file.extract(entry, fpath)
      end
    end
  end
end
require 'net/ftp'
# Minimal wrapper around Net::FTP: the connection is opened and
# authenticated at construction time, so an instance is immediately
# ready to transfer files.
class Ftp
  def initialize(host, username, password)
    @session = Net::FTP.new(host)
    @session.login(username, password)
  end

  # Fetches +file_name+ from the server and saves it under
  # +destination_dir+ with the same file name.
  def download_file(file_name, destination_dir)
    local_path = "#{destination_dir}/#{file_name}"
    @session.get(file_name, local_path)
  end

  # Terminates the underlying FTP session.
  def close
    @session.close
  end
end
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'
# Rake tasks for populating the jobs database: a web crawler task, a
# CSV-over-FTP import task, and an umbrella task running both.
namespace :import_data do
  # Plain assignment — the previous `||=` is a no-op on a fresh local.
  logger = Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page_number link] => [:environment] do |_, args|
    args.with_defaults(page_number: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page_number].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    # File.exists? was deprecated and removed in Ruby 3.2; mkdir_p is the
    # race-free equivalent of "mkdir unless exist".
    FileUtils.mkdir_p(destination_dir)
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Close the connection even when the download raises.
      ftp.close
    end
    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment