Commit 2c0b2f55 by Hoang Phuc

Merge branch 'master' of gitlab.zigexn.vn:phucth/ven-job into feature/user_management_test

parents 275f83a2 096a9a9a
Pipeline #589 failed with stages
in 0 seconds
......@@ -27,6 +27,8 @@ gem 'devise'
gem 'carrierwave'
gem 'activerecord-import'
gem 'settingslogic'
# Use Active Storage variant
# gem 'image_processing', '~> 1.2'
......
......@@ -45,6 +45,8 @@ GEM
activerecord (6.0.2.2)
activemodel (= 6.0.2.2)
activesupport (= 6.0.2.2)
activerecord-import (1.0.4)
activerecord (>= 3.2)
activestorage (6.0.2.2)
actionpack (= 6.0.2.2)
activejob (= 6.0.2.2)
......@@ -282,6 +284,7 @@ PLATFORMS
x86-mswin32
DEPENDENCIES
activerecord-import
bcrypt (~> 3.1.7)
bootsnap (>= 1.4.2)
byebug
......
default: &default
adapter: mysql2
encoding: utf8
encoding: utf8mb4
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: dev
password: dev
......
......@@ -2,7 +2,6 @@ class CreateCities < ActiveRecord::Migration[6.0]
def change
create_table :cities do |t|
t.string :title
t.timestamps
end
end
......
# Migration that adds a boolean +foreign+ column to the +cities+ table.
class AddForeignToCities < ActiveRecord::Migration[6.0]
  # Adds +cities.foreign+ as a boolean column whose default is +false+.
  def change
    add_column(:cities, :foreign, :boolean, default: false)
  end
end
......@@ -10,9 +10,9 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_04_10_070154) do
ActiveRecord::Schema.define(version: 2020_04_23_044651) do
create_table "applies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "applies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "job_id", null: false
t.datetime "created_at", precision: 6, null: false
......@@ -21,20 +21,21 @@ ActiveRecord::Schema.define(version: 2020_04_10_070154) do
t.index ["user_id"], name: "index_applies_on_user_id"
end
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.boolean "foreign", default: false
end
create_table "cities_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "cities_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "city_id", null: false
t.bigint "job_id", null: false
t.index ["city_id", "job_id"], name: "index_cities_jobs_on_city_id_and_job_id"
t.index ["job_id", "city_id"], name: "index_cities_jobs_on_job_id_and_city_id"
end
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title"
t.string "address"
t.string "logo"
......@@ -43,7 +44,7 @@ ActiveRecord::Schema.define(version: 2020_04_10_070154) do
t.datetime "updated_at", precision: 6, null: false
end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "job_id", null: false
t.datetime "created_at", precision: 6, null: false
......@@ -52,20 +53,20 @@ ActiveRecord::Schema.define(version: 2020_04_10_070154) do
t.index ["user_id"], name: "index_favorites_on_user_id"
end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "industries_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "industry_id", null: false
t.bigint "job_id", null: false
t.index ["industry_id", "job_id"], name: "index_industries_jobs_on_industry_id_and_job_id"
t.index ["job_id", "industry_id"], name: "index_industries_jobs_on_job_id_and_industry_id"
end
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "title"
t.string "updated_date_job"
t.string "level"
......@@ -79,7 +80,7 @@ ActiveRecord::Schema.define(version: 2020_04_10_070154) do
t.index ["company_id"], name: "index_jobs_on_company_id"
end
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "email", default: "", null: false
t.string "encrypted_password", default: "", null: false
t.string "reset_password_token"
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -2,7 +2,7 @@ require "nokogiri"
require "open-uri"
namespace :crawler do
desc "TODO"
desc "Crawler Careerbuilder"
task job: :environment do
# Define exception logger
......@@ -12,240 +12,165 @@ namespace :crawler do
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page
(1..2).each do |page|
(10..12).each do |page|
# Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item
(0..html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length - 1).each do |i|
# Get href of a tag and open job detail page
job_detail_url = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href")[i].text
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(job_detail_url)))
html_jobs.css(".jobs-side-list .job-item").each do |item|
# Job attributes
job_attributes = {
title: nil,
updated_date_job: nil,
title: item.css(".figure .figcaption .title a @title").text,
updated_date_job: item.css(".bottom-right-icon .time time").text,
level: nil,
years_of_experience: nil,
salary: nil,
salary: item.css(".figure .figcaption .caption .salary").text.gsub("$ ",""),
expiration_date: nil,
job_description: nil,
company_id: nil,
}
# Define cities array
cities = []
item.css(".figure .figcaption .caption .location ul li").each do |city|
city = check_exist_or_create_city(city.text.strip)
cities << city
end
if item.css(".figure .image a @href").text != "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
unless html_company_detail.at_css(".jobsby-company").nil?
company_attributes = {
title: nil,
address: nil,
logo: nil,
description: nil
title: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content .name").text,
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
logo: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .img @src").text,
description: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.strip
}
# Define city ids array
cities = []
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
end
end
# Define industry ids array
industries = []
# Check what template job belongs to
if html_job_detail.at_css("#uni_container .MyJobDetail")
# CSS DOM
css_dom = "#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .datepost span").text
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew span").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew label label").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .box1Detail .align_center.logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .desc_company.content_fck #emp_collapse").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#{css_dom} .box2Detail .DetailJobNew li p").each_with_index do |ele, index|
type = ele.css("span").text
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text)))
unless html_job_detail.at_css(".search-result-list-detail").nil?
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele|
type = ele.css("strong").text
case type
when "Nơi làm việc: "
# Check exist or create city
ele.css("b a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text.gsub(",",""))
cities << city
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("p").text.strip
when "Cấp bậc"
job_attributes[:level] = ele.css("p").text.strip
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("p").text.strip
end
end
when "Cấp bậc: "
job_attributes[:level] = ele.css("label").text
when "Kinh nghiệm: "
job_attributes[:years_of_experience] = ele.text.gsub("Kinh nghiệm: ","")
when "Lương: "
job_attributes[:salary] = ele.text.gsub("Lương: ","")
when "Ngành nghề: "
# Check exist or create industry
ele.css("b a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text.gsub(",",""))
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele|
industry = check_exist_or_create_industry(ele.text.gsub(",","").strip)
industries << industry
end
else
job_attributes[:expiration_date] = ele.text.gsub("Hết hạn nộp: ","")
end
end
# Get description for job attributes
description = ""
html_job_detail.css("#{css_dom} .MarBot20").each_with_index do |ele, index|
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
description << ele.inner_html
end
# Set description for job attributes
job_attributes[:job_description] = description
elsif html_job_detail.at_css("#uni_container .job-template-2")
# CSS DOM
css_dom = "#uni_container .job-template-2 .content-job-detail"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css("span a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text.gsub(",",""))
cities << city
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css("span").text
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("span").text
when "Lương"
job_attributes[:salary] = ele.text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css("span a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text.gsub(",",""))
industries << industry
end
else
job_attributes[:expiration_date] = ele.css("span").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#{css_dom} #showScroll").inner_html
elsif html_job_detail.at_css("#uni_container .job-template-201")
# CSS DOM
css_dom = "#uni_container .job-template-201"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","")
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")
# Get value for job attributes
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css("span a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text)
cities << city
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css("span").text
when "Lương"
job_attributes[:salary] = ele.css("span").text
when "Ngành nghề"
# Check exist or create industry
ele.css("span a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text)
industries << industry
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("span").text
else
job_attributes[:years_of_experience] = ele.css("span").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#{css_dom} .left-col #showScroll").inner_html
job_attributes[:job_description] = description.strip
else
skip_url_logger.info "another template #{job_detail_url}"
skip_url_logger.info "another template #{item.css(".figure .figcaption .title .job_link @href").text}"
end
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
# Create job
job = check_exist_or_create_job(job_attributes)
# Create city_job
if cities.length > 0
if cities.count > 0
cities.each do |city|
job.cities << city
end
end
# Create industry_job
if industries.length > 0
if industries.count > 0
industries.each do |industry|
job.industries << industry
end
end
rescue
exception_logger.info "Error url: #{job_detail_url}"
exception_logger.info "Error url: #{item.css(".figure .figcaption .title .job_link @href").text}"
next
end
end
end
# Fetches the Careerbuilder job-search landing page and bulk-inserts the cities
# listed there: links in the in-country list are stored with foreign: false,
# links in the overseas list with foreign: true. Nothing is inserted when the
# location section is missing from the page or when no links are found.
task city: :environment do
  location_css = ".find-jobsby-categories .main-jobs-by-location"

  # URI.open is the explicit open-uri entry point. Passing a URL to Kernel#open
  # is deprecated since Ruby 2.7 and no longer supported on Ruby 3.0+.
  html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))

  if html_cities.at_css(location_css)
    # In-country cities: the link text carries a "Việc làm tại " prefix that is removed
    domestic = html_cities.css("#{location_css} .jobs-in-country .list-jobs-by-country li a").map do |link|
      { title: link.text.gsub("Việc làm tại ", "").strip, foreign: false }
    end

    # Overseas locations: the link text is used as the title after stripping whitespace
    overseas = html_cities.css("#{location_css} .overseas-jobs .list-overseas-jobs li a").map do |link|
      { title: link.text.strip, foreign: true }
    end

    cities = domestic + overseas
    # Single bulk insert; skipped entirely when the page yielded no links
    City.import(cities) if cities.any?
  end
end
# Fetches the Careerbuilder job-search landing page and bulk-inserts the
# industries listed in its working-positions section. Nothing is inserted when
# that section is missing from the page or when no links are found.
task industry: :environment do
  positions_css = ".find-jobsby-categories .list-of-working-positions"

  # URI.open is the explicit open-uri entry point. Passing a URL to Kernel#open
  # is deprecated since Ruby 2.7 and no longer supported on Ruby 3.0+.
  html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))

  if html_industries.at_css(positions_css)
    # One attribute hash per industry link, titled with the stripped link text
    industries = html_industries.css("#{positions_css} .list-jobs li a").map do |link|
      { title: link.text.strip }
    end

    # Single bulk insert; skipped entirely when the page yielded no links
    Industry.import(industries) if industries.any?
  end
end
# Finds the Company whose columns match every entry of +company_attributes+,
# creating it when no such record exists.
#
# @param company_attributes [Hash] attributes used both for the lookup and for creation
# @return the id of the found or newly created company
def check_exist_or_create_company(company_attributes)
  Company.find_or_create_by(company_attributes).id
end
def check_exist_or_create_city(city_title)
find_city = City.find_or_create_by(title: city_title)
return find_city
# Returns the Industry whose title matches +industry_title+, creating one when
# there is no match.
#
# The title is escaped with sanitize_sql_like before it is used as the LIKE
# pattern, so a "%" or "_" inside scraped text is compared literally instead of
# acting as a wildcard that could return an unrelated industry. The lookup
# fetches at most one row rather than counting and then loading every match.
#
# @param industry_title [String, nil] title to look up; nil is treated as an
#   empty pattern for the lookup and passed through unchanged on creation
# @return [Industry] the first matching record, or the newly created one
def check_exist_or_create_industry(industry_title)
  pattern = Industry.sanitize_sql_like(industry_title.to_s)
  Industry.where("title LIKE ?", pattern).first || Industry.create(title: industry_title)
end
def check_exist_or_create_industry(industry_title)
find_industry = Industry.find_or_create_by(title: industry_title)
return find_industry
# Returns the City whose title matches +city_title+, creating one when there is
# no match.
#
# The title is escaped with sanitize_sql_like before it is used as the LIKE
# pattern, so a "%" or "_" inside scraped text is compared literally instead of
# acting as a wildcard that could return an unrelated city. The lookup fetches
# at most one row rather than counting and then loading every match.
#
# @param city_title [String, nil] title to look up; nil is treated as an empty
#   pattern for the lookup and passed through unchanged on creation
# @return [City] the first matching record, or the newly created one
def check_exist_or_create_city(city_title)
  pattern = City.sanitize_sql_like(city_title.to_s)
  City.where("title LIKE ?", pattern).first || City.create(title: city_title)
end
def check_exist_or_create_job(job_attributes)
......
require 'csv'
namespace :import do
desc "Import CSV"
# Imports jobs from jobs.csv (resolved against the current working directory).
# Each data row yields a company and a job via the find-or-create helpers;
# columns are addressed by position. An industry (column 1) and a city
# (column 16) are attached only when the job has neither yet, so re-running
# the import does not add the same links twice.
task csv: :environment do
  # Stream the file row by row instead of reading and parsing it all at once
  CSV.foreach('jobs.csv', headers: true) do |row|
    # Company attributes
    company_attributes = {
      title: row[5],
      address: row[2],
      logo: "https://via.placeholder.com/66x38?text=Logo",
      description: row[14]
    }

    # Job attributes, linked to the found or newly created company
    job_attributes = {
      title: row[9],
      level: row[8],
      salary: row[11],
      job_description: row[7],
      company_id: check_exist_or_create_company(company_attributes)
    }

    job = check_exist_or_create_job(job_attributes)
    next unless job.cities.empty? && job.industries.empty?

    # Industry job
    job.industries << check_exist_or_create_industry(row[1])

    # City job: the cell looks like ["Name"], so the bracket/quote wrappers are
    # removed. An empty cell is nil and is skipped instead of raising
    # NoMethodError and aborting the whole import.
    city_title = row[16]&.gsub('["', "")&.gsub('"]', "")
    job.cities << check_exist_or_create_city(city_title) if city_title
  end
end
# Finds the Company whose columns match every entry of +company_attributes+,
# creating it when no such record exists.
#
# @param company_attributes [Hash] attributes used both for the lookup and for creation
# @return the id of the found or newly created company
def check_exist_or_create_company(company_attributes)
  Company.find_or_create_by(company_attributes).id
end
# Returns the Industry whose title matches +industry_title+, creating one when
# there is no match.
#
# The title is escaped with sanitize_sql_like before it is used as the LIKE
# pattern, so a "%" or "_" inside imported text is compared literally instead
# of acting as a wildcard that could return an unrelated industry. The lookup
# fetches at most one row rather than counting and then loading every match.
#
# @param industry_title [String, nil] title to look up; nil is treated as an
#   empty pattern for the lookup and passed through unchanged on creation
# @return [Industry] the first matching record, or the newly created one
def check_exist_or_create_industry(industry_title)
  pattern = Industry.sanitize_sql_like(industry_title.to_s)
  Industry.where("title LIKE ?", pattern).first || Industry.create(title: industry_title)
end
# Returns the City whose title matches +city_title+, creating one when there is
# no match.
#
# The title is escaped with sanitize_sql_like before it is used as the LIKE
# pattern, so a "%" or "_" inside imported text is compared literally instead
# of acting as a wildcard that could return an unrelated city. The lookup
# fetches at most one row rather than counting and then loading every match.
#
# @param city_title [String, nil] title to look up; nil is treated as an empty
#   pattern for the lookup and passed through unchanged on creation
# @return [City] the first matching record, or the newly created one
def check_exist_or_create_city(city_title)
  pattern = City.sanitize_sql_like(city_title.to_s)
  City.where("title LIKE ?", pattern).first || City.create(title: city_title)
end
# Finds the Job whose columns match every entry of +job_attributes+, creating
# it when no such record exists.
#
# @param job_attributes [Hash] attributes used both for the lookup and for creation
# @return the found or newly created job record
def check_exist_or_create_job(job_attributes)
  Job.find_or_create_by(job_attributes)
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment