Commit b5ac3f15 by Trịnh Hoàng Phúc

Merge branch 'master' of gitlab.zigexn.vn:phucth/ven-job into feature/social_authenticate

parents 930943ec 629f8bc6
Pipeline #623 failed with stages
in 0 seconds
...@@ -29,13 +29,15 @@ gem 'rsolr' ...@@ -29,13 +29,15 @@ gem 'rsolr'
gem 'carrierwave' gem 'carrierwave'
gem 'activerecord-import' gem 'activerecord-import', require: false
gem 'will_paginate' gem 'will_paginate'
gem 'omniauth-facebook' gem 'omniauth-facebook'
gem 'settingslogic' gem 'settingslogic'
gem 'parallel', require: false
# Use Active Storage variant # Use Active Storage variant
# gem 'image_processing', '~> 1.2' # gem 'image_processing', '~> 1.2'
......
...@@ -178,6 +178,7 @@ GEM ...@@ -178,6 +178,7 @@ GEM
oauth2 (~> 1.1) oauth2 (~> 1.1)
omniauth (~> 1.9) omniauth (~> 1.9)
orm_adapter (0.5.0) orm_adapter (0.5.0)
parallel (1.19.1)
pry (0.13.0) pry (0.13.0)
coderay (~> 1.1) coderay (~> 1.1)
method_source (~> 1.0) method_source (~> 1.0)
...@@ -324,6 +325,7 @@ DEPENDENCIES ...@@ -324,6 +325,7 @@ DEPENDENCIES
meta-tags meta-tags
mysql2 mysql2
omniauth-facebook omniauth-facebook
parallel
pry pry
puma (~> 4.1) puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2) rails (~> 6.0.2, >= 6.0.2.2)
......
class City < ApplicationRecord class City < ApplicationRecord
validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Company < ApplicationRecord class Company < ApplicationRecord
validates :title, presence: true
has_many :jobs has_many :jobs
end end
class Industry < ApplicationRecord class Industry < ApplicationRecord
validates :title, presence: true, uniqueness: true
has_and_belongs_to_many :jobs has_and_belongs_to_many :jobs
end end
class Job < ApplicationRecord class Job < ApplicationRecord
scope :by_cities, -> (city_id) {includes(:cities).where("cities.id = ?", city_id).references(:cities)}
scope :by_industries, -> (industry_id) {includes(:industries).where("industries.id = ?", industry_id).references(:industries)}
scope :by_companies, -> (company_id) {where("company_id = #{company_id}")}
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze
belongs_to :company belongs_to :company
has_many :applies has_many :applies
...@@ -10,9 +16,15 @@ class Job < ApplicationRecord ...@@ -10,9 +16,15 @@ class Job < ApplicationRecord
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
has_and_belongs_to_many :cities has_and_belongs_to_many :cities
scope :by_cities, -> (city_id) {includes(:cities).where("cities.id = ?", city_id).references(:cities)} validate :updated_date_job_cannot_be_greater_than_expiration_date
scope :by_industries, -> (industry_id) {includes(:industries).where("industries.id = ?", industry_id).references(:industries)}
scope :by_companies, -> (company_id) {where("company_id = #{company_id}")} validates :title, length: { minimum: 6 }
validates :title, :updated_date_job, :level, :expiration_date, :salary, :min_salary, :max_salary, presence: true
validates :min_salary, :max_salary, numericality: { only_integer: true }
EXPORT_CSV_ATTRIBUTES = %w(title updated_date_job level years_of_experience salary expiration_date).freeze def updated_date_job_cannot_be_greater_than_expiration_date
if DateTime.parse(updated_date_job).to_i > DateTime.parse(expiration_date).to_i
errors.add(:updated_date_job, "can't be greater than expiration date")
end
end
end end
# Helpers used by the crawler rake task: normalises scraped salary labels and
# persists one crawled job together with its company, cities and industries.
class CrawlerService
  # One numeric token in a salary label, e.g. "10", "8,5" or "12.75".
  SALARY_NUMBER = /\d+(?:[.,]\d+)?/.freeze
  # Every scraped figure is scaled by this factor
  # (presumably the site quotes millions of VND -- confirm against the source).
  SALARY_UNIT = 1_000_000

  # Converts a scraped salary label into a [min_salary, max_salary] pair.
  #
  # * "Cạnh tranh" ("competitive")  => [0, 999_999_999]
  # * label containing "Dưới" ("under") => [0, first_amount]
  # * label containing "Trên" ("over")  => [first_amount, 0]
  # * anything else                     => [first_amount, second_amount]
  #
  # Amounts are nil when the label holds no such number.
  #
  # @param salary [String, nil] raw salary text
  # @return [Array<(Integer, Integer)>] entries may be nil (see above)
  def self.convert_salary(salary)
    salary = salary.to_s
    return [0, 999_999_999] if salary == "Cạnh tranh"

    # Extract real numeric tokens with a regexp (String#tr treats its argument
    # as a character set, not a pattern) and scale them with exact Rational
    # arithmetic so e.g. "8,2" becomes 8_200_000 instead of a truncated float.
    amounts = salary.scan(SALARY_NUMBER).map do |token|
      (token.tr(",", ".").to_r * SALARY_UNIT).round
    end

    return [0, amounts[0]] if salary.include?("Dưới")
    return [amounts[0], 0] if salary.include?("Trên")

    [amounts[0], amounts[1]]
  end

  # Finds or creates the company, the job, and every city/industry by title,
  # then links the job to those cities and industries. Everything runs in one
  # transaction, so a failure leaves nothing half-imported.
  #
  # @param job_attributes [Hash] attributes for Job (not mutated)
  # @param company_attributes [Hash] attributes for Company
  # @param cities [Array<String>] city titles
  # @param industries [Array<String>] industry titles
  # @return [Array<Industry>] the industry records (same value the previous
  #   implementation returned)
  # @raise [ArgumentError] when any argument is nil
  # @raise [ActiveRecord::RecordInvalid] when a record fails validation
  def self.imports(job_attributes, company_attributes, cities, industries)
    # ArgumentError is a StandardError, so callers using a plain `rescue => e`
    # actually see it (a bare Exception would bypass them).
    if [job_attributes, company_attributes, cities, industries].any?(&:nil?)
      raise ArgumentError, "Not enough data transferred"
    end

    ActiveRecord::Base.transaction do
      company = Company.find_or_create_by!(company_attributes)
      # merge instead of []= so the caller's hash is left untouched
      job = Job.find_or_create_by!(job_attributes.merge(company_id: company.id))

      city_records = cities.map { |title| City.find_or_create_by!(title: title) }
      industry_records = industries.map { |title| Industry.find_or_create_by!(title: title) }

      # Only attach associations that are missing: the job may already exist,
      # and appending again would create duplicate join rows.
      (city_records.uniq - job.cities.to_a).each { |city| job.cities << city }
      (industry_records.uniq - job.industries.to_a).each { |industry| job.industries << industry }

      industry_records
    end
  end
end
\ No newline at end of file
class AddForeignToCities < ActiveRecord::Migration[6.0] class AddForeignToCities < ActiveRecord::Migration[6.0]
def change def change
add_column :cities, :foreign, :boolean, :default => false add_column :cities, :foreign, :boolean, default: false
end end
end end
# Adds salary bounds (bigint, default 0) and three free-text detail columns
# to the jobs table.
class AddColumnsToJobs < ActiveRecord::Migration[6.0]
  def change
    change_table :jobs do |t|
      t.bigint :min_salary, default: 0
      t.bigint :max_salary, default: 0
      t.text :benefit
      t.text :job_requirements
      t.text :other_information
    end
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_04_23_044651) do ActiveRecord::Schema.define(version: 2020_05_11_055632) do
create_table "admins", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t| create_table "admins", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4", force: :cascade do |t|
t.string "email", default: "", null: false t.string "email", default: "", null: false
...@@ -89,6 +89,11 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do ...@@ -89,6 +89,11 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t.bigint "company_id" t.bigint "company_id"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.bigint "min_salary", default: 0
t.bigint "max_salary", default: 0
t.text "benefit"
t.text "job_requirements"
t.text "other_information"
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
......
require "nokogiri" require "nokogiri"
require "open-uri" require "open-uri"
require "parallel"
require "activerecord-import"
namespace :crawler do namespace :crawler do
desc "Crawler Careerbuilder" desc "Crawler Careerbuilder"
task job: :environment do task job: :environment do
# Define exception logger # Define crawler logger
exception_logger = ActiveSupport::Logger.new("log/exception_logger.log") logger = Logger.new("log/crawler_logger.log")
html_careerbuilder_list_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
# Define skip logger total_page = (html_careerbuilder_list_jobs.at_css(".search-result-list .container .job-found .job-found-amout p").text.tr(",việc làm","").to_i / 50.0).ceil
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page # Loop page
(10..12).each do |page| (1..total_page).each do |page|
# Fetch and parse HTML document # Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item # Loop item
html_jobs.css(".jobs-side-list .job-item").each do |item| Parallel.each(html_jobs.css(".jobs-side-list .job-item"), in_threads: 5) { |item|
# Job attributes begin
job_attributes = { url = item.css(".figure .figcaption .title .job_link @href").text
title: item.css(".figure .figcaption .title a @title").text, html_job_detail = Nokogiri::HTML.parse(URI.open(URI.encode(url)))
updated_date_job: item.css(".bottom-right-icon .time time").text, if html_job_detail.at_css(".search-result-list-detail .tabs div#tab-1").nil?
level: nil, logger.warn "Another template #{url}"
years_of_experience: nil, next
salary: item.css(".figure .figcaption .caption .salary").text.gsub("$ ",""),
expiration_date: nil,
job_description: nil,
company_id: nil,
}
# Defind cities array
cities = []
item.css(".figure .figcaption .caption .location ul li").each do |city|
city = check_exist_or_create_city(city.text.strip)
cities << city
end
if item.css(".figure .image a @href").text != "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
unless html_company_detail.at_css(".jobsby-company").nil?
company_attributes = {
title: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content .name").text,
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
logo: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .img @src").text,
description: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.strip
}
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
end end
end # Set salary, min-salary, max-salary
# Defind industry ids array if item.at_css(".figure .figcaption .caption .salary").text.include? "USD"
industries = [] logger.warn "Another template #{url}"
next
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text))) end
unless html_job_detail.at_css(".search-result-list-detail").nil? salary = item.at_css(".figure .figcaption .caption .salary").text.gsub("$ ","")
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele| min_salary, max_salary = CrawlerService.convert_salary(salary)
type = ele.css("strong").text # Job attributes
job_attributes = {
title: item.at_css(".figure .figcaption .title a @title").text,
updated_date_job: item.at_css(".bottom-right-icon .time time").text,
salary: salary,
min_salary: min_salary,
max_salary: max_salary
}
html_job_detail.css(".job-detail-content .row .has-background ul li").each do |ele|
type = ele.at_css("strong").text
case type case type
when "Hết hạn nộp" when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("p").text.strip job_attributes[:expiration_date] = ele.at_css("p").text.squish
when "Cấp bậc" when "Cấp bậc"
job_attributes[:level] = ele.css("p").text.strip job_attributes[:level] = ele.at_css("p").text.squish
when "Kinh nghiệm" when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("p").text.strip job_attributes[:years_of_experience] = ele.at_css("p").text.squish
end end
end end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele|
industry = check_exist_or_create_industry(ele.text.gsub(",","").strip)
industries << industry
end
# Get description for job attributes
description = ""
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele| html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
description << ele.inner_html next if ele.at_css(".detail-title").nil?
end type = ele.at_css(".detail-title").text
# Set description for job attributes case type
job_attributes[:job_description] = description.strip when "Phúc lợi "
else job_attributes[:benefit] = ele.at_css("ul").inner_html.squish
skip_url_logger.info "another template #{item.css(".figure .figcaption .title .job_link @href").text}" when "Mô tả Công việc"
end job_attributes[:job_description] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Mô tả Công việc</h3>","")
# Create job when "Yêu Cầu Công Việc"
job = check_exist_or_create_job(job_attributes) job_attributes[:job_requirements] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Yêu Cầu Công Việc</h3>","")
# Create city_job when "Thông tin khác"
if cities.count > 0 job_attributes[:other_information] = ele.inner_html.squish.gsub("<h3 class=\"detail-title\">Thông tin khác</h3>","")
cities.each do |city| end
job.cities << city
end end
end next if item.at_css(".figure .image a @href").text == "javascript:void(0);"
# Create industry_job # Company attributes
if industries.count > 0 html_company_detail = Nokogiri::HTML.parse(URI.open(URI.encode(item.css(".figure .image a @href").text)))
industries.each do |industry| next if html_company_detail.at_css(".jobsby-company").nil?
job.industries << industry company_css = ".jobsby-company .company-introduction .company-info .info "
company_attributes = {
title: html_company_detail.at_css(company_css + ".content .name").text,
address: html_company_detail.css(company_css + ".content p")[1].text,
logo: html_company_detail.at_css(company_css + ".img @src").text,
description: html_company_detail.at_css(company_css + ".content ul").inner_html.squish
}
# Define cities array
cities = item.css(".figure .figcaption .caption .location ul li").map do |city|
city.text.squish
end end
end # Defind industries array
rescue industries = html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").map do |industry|
exception_logger.info "Error url: #{item.css(".figure .figcaption .title .job_link @href").text}" industry.text.tr(",","").squish
end
sleep rand
Mutex.new.synchronize {
result = CrawlerService.imports(job_attributes, company_attributes, cities, industries)
}
sleep rand
logger.info "Crawl success url : #{url}"
rescue => e
logger.error e
next next
end end
}
end end
end end
task city: :environment do task city: :environment do
# Fetch and parse HTML document # Fetch and parse HTML document
html_cities = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
unless html_cities.at_css(".find-jobsby-categories .main-jobs-by-location").nil? # Get city in country
# Defind cities array cities_in_country = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |title|
cities = [] {
# Get city in country title: title.text.gsub("Việc làm tại ","").squish,
html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").each do |title| foreign: false
city = { }
title: title.text.gsub("Việc làm tại ","").strip,
foreign: false
}
cities << city
end
# Get city foreign
html_cities.css(".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a").each do |title|
city = {
title: title.text.strip,
foreign: true
}
cities << city
end
if cities.count > 0
City.import cities
end
end end
end # Get city foreign
task industry: :environment do cities_foreign = html_cities.css(".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a").map do |title|
# Fetch and parse HTML document {
html_industries = Nokogiri::HTML.parse(open("https://careerbuilder.vn/tim-viec-lam.html")) title: title.text.squish,
unless html_industries.at_css(".find-jobsby-categories .list-of-working-positions").nil? foreign: true
# Defind industries array }
industries = []
# Get industry
html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").each do |title|
industry = {
title: title.text.strip
}
industries << industry
end
if industries.count > 0
Industry.import industries
end
end end
end cities = cities_in_country + cities_foreign
def check_exist_or_create_company(company_attributes) if cities.length > 0
find_company = Company.find_or_create_by(company_attributes) City.import cities
return find_company.id
end
def check_exist_or_create_industry(industry_title)
industries = Industry.where("title LIKE ?", industry_title)
if industries.count == 0
industry = Industry.create(title: industry_title)
else
industry = industries[0]
end end
return industry
end end
def check_exist_or_create_city(city_title) task industry: :environment do
cities = City.where("title LIKE ?", city_title) # Fetch and parse HTML document
if cities.count == 0 html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))
city = City.create(title: city_title) # Get industry
else industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |title|
city = cities[0] {
title: title.text.squish
}
end
if industries.length > 0
Industry.import industries
end end
return city
end
def check_exist_or_create_job(job_attributes)
job = Job.find_or_create_by(job_attributes)
return job
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment