Commit 459a3629 by Trịnh Hoàng Phúc

Merge branch 'feature/rake_task_crawler' into 'master'

Migrate database, rake task crawler

See merge request !1
parents 9281ef78 36ba17f0
Pipeline #543 failed with stages
in 0 seconds
......@@ -6,7 +6,8 @@ ruby '2.7.0'
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails'
gem 'rails', '~> 6.0.2', '>= 6.0.2.2'
# Use sqlite3 as the database for Active Record
gem 'sqlite3', '~> 1.4'
# gem 'sqlite3', '~> 1.4'
gem 'mysql2'
# Use Puma as the app server
gem 'puma', '~> 4.1'
# Use SCSS for stylesheets
......
......@@ -61,6 +61,8 @@ GEM
bindex (0.8.1)
bootsnap (1.4.6)
msgpack (~> 1.0)
bootsnap (1.4.6-java)
msgpack (~> 1.0)
builder (3.2.4)
byebug (11.1.1)
capybara (3.32.0)
......@@ -76,6 +78,9 @@ GEM
crass (1.0.6)
erubi (1.9.0)
ffi (1.12.2)
ffi (1.12.2-java)
ffi (1.12.2-x64-mingw32)
ffi (1.12.2-x86-mingw32)
globalid (0.4.2)
activesupport (>= 4.2.0)
i18n (1.8.2)
......@@ -99,12 +104,27 @@ GEM
mini_portile2 (2.4.0)
minitest (5.14.0)
msgpack (1.3.3)
msgpack (1.3.3-java)
msgpack (1.3.3-x64-mingw32)
msgpack (1.3.3-x86-mingw32)
mysql2 (0.5.3)
mysql2 (0.5.3-x64-mingw32)
mysql2 (0.5.3-x86-mingw32)
mysql2 (0.5.3-x86-mswin32-60)
nio4r (2.5.2)
nio4r (2.5.2-java)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-java)
nokogiri (1.10.9-x64-mingw32)
mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0)
public_suffix (4.0.3)
puma (4.3.3)
nio4r (~> 2.0)
puma (4.3.3-java)
nio4r (~> 2.0)
rack (2.2.2)
rack-proxy (0.6.5)
rack
......@@ -147,6 +167,10 @@ GEM
sassc-rails (~> 2.1, >= 2.1.1)
sassc (2.2.1)
ffi (~> 1.9)
sassc (2.2.1-x64-mingw32)
ffi (~> 1.9)
sassc (2.2.1-x86-mingw32)
ffi (~> 1.9)
sassc-rails (2.1.2)
railties (>= 4.0.0)
sassc (>= 2.0)
......@@ -167,15 +191,17 @@ GEM
actionpack (>= 4.0)
activesupport (>= 4.0)
sprockets (>= 3.0.0)
sqlite3 (1.4.2)
thor (1.0.1)
thread_safe (0.3.6)
thread_safe (0.3.6-java)
tilt (2.0.10)
turbolinks (5.2.1)
turbolinks-source (~> 5.2)
turbolinks-source (5.2.0)
tzinfo (1.2.6)
thread_safe (~> 0.1)
tzinfo-data (1.2019.3)
tzinfo (>= 1.0.0)
web-console (4.0.1)
actionview (>= 6.0.0)
activemodel (>= 6.0.0)
......@@ -191,13 +217,19 @@ GEM
railties (>= 4.2)
websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0)
websocket-driver (0.7.1-java)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.3.0)
PLATFORMS
java
ruby
x64-mingw32
x86-mingw32
x86-mswin32
DEPENDENCIES
bootsnap (>= 1.4.2)
......@@ -205,13 +237,13 @@ DEPENDENCIES
capybara (>= 2.15)
jbuilder (~> 2.7)
listen (>= 3.0.5, < 3.2)
mysql2
puma (~> 4.1)
rails (~> 6.0.2, >= 6.0.2.2)
sass-rails (>= 6)
selenium-webdriver
spring
spring-watcher-listen (~> 2.0.0)
sqlite3 (~> 1.4)
turbolinks (~> 5)
tzinfo-data
web-console (>= 3.3.0)
......
# Join model that links one user to one job.
# NOTE(review): presumably represents a job application (inferred from the
# model name) — confirm.
class Apply < ApplicationRecord
belongs_to :user
belongs_to :job
end
# A city that jobs can be associated with.
class City < ApplicationRecord
  # The join table in this project is `city_jobs` (backed by the CityJob
  # model). has_and_belongs_to_many would look for a table named
  # `cities_jobs`, which does not exist, so the association goes through the
  # join model instead.
  # dependent: :destroy removes the join rows with the city, so the foreign
  # key on city_jobs.city_id does not block the delete.
  has_many :city_jobs, dependent: :destroy
  has_many :jobs, through: :city_jobs
end
# Join model pairing one city with one job (rows of the city_jobs table).
class CityJob < ApplicationRecord
belongs_to :city
belongs_to :job
end
# A company that owns job postings.
# NOTE(review): no `dependent:` option is set on the association; since
# jobs.company_id carries a foreign key, destroying a company that still has
# jobs is expected to fail at the database level — confirm intended behavior.
class Company < ApplicationRecord
has_many :jobs
end
# Join model that links one user to one job.
# NOTE(review): presumably marks a job the user has favorited (inferred from
# the model name) — confirm.
class Favorite < ApplicationRecord
belongs_to :user
belongs_to :job
end
# An industry that jobs can be associated with.
class Industry < ApplicationRecord
  # The join table in this project is `industry_jobs` (backed by the
  # IndustryJob model). has_and_belongs_to_many would look for a table named
  # `industries_jobs`, which does not exist, so the association goes through
  # the join model instead.
  # dependent: :destroy removes the join rows with the industry, so the
  # foreign key on industry_jobs.industry_id does not block the delete.
  has_many :industry_jobs, dependent: :destroy
  has_many :jobs, through: :industry_jobs
end
# Join model pairing one industry with one job (rows of the industry_jobs table).
class IndustryJob < ApplicationRecord
belongs_to :industry
belongs_to :job
end
# A job posting owned by a company.
class Job < ApplicationRecord
  belongs_to :company

  has_many :applies
  has_many :favorites

  # Users reachable through each join model, under distinct names. The
  # original declared `has_many :users` twice; the later declaration replaced
  # the earlier one, so users who applied could not be reached.
  has_many :applied_users, through: :applies, source: :user
  has_many :favorited_users, through: :favorites, source: :user
  # Kept for backward compatibility: this is the definition of `users` that
  # was effectively in force (the last one declared).
  has_many :users, through: :favorites

  # The join tables are `industry_jobs` and `city_jobs`, not the
  # `industries_jobs` / `cities_jobs` names has_and_belongs_to_many expects,
  # so both associations go through their join models. dependent: :destroy
  # removes the join rows together with the job.
  has_many :industry_jobs, dependent: :destroy
  has_many :industries, through: :industry_jobs
  has_many :city_jobs, dependent: :destroy
  has_many :cities, through: :city_jobs
end
# A user who can be linked to jobs through Apply and Favorite records.
class User < ApplicationRecord
  has_many :applies
  has_many :favorites

  # Jobs reachable through each join model, under distinct names. The
  # original declared `has_many :jobs` twice; the later declaration replaced
  # the earlier one, so jobs the user applied to could not be reached.
  has_many :applied_jobs, through: :applies, source: :job
  has_many :favorite_jobs, through: :favorites, source: :job
  # Kept for backward compatibility: this is the definition of `jobs` that
  # was effectively in force (the last one declared).
  has_many :jobs, through: :favorites
end
# MySQL, accessed through the mysql2 adapter.
#   gem install mysql2
#
# Ensure the mysql2 gem is defined in your Gemfile
#   gem 'mysql2'
#
default: &default
  adapter: mysql2
  encoding: utf8
  pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
  username: dev
  password: dev
  flags:
    - -COMPRESS
    - FOUND_ROWS
    - MULTI_STATEMENTS
  secure_auth: false
  timeout: 5000
  # socket: /var/run/mysqld/mysqld.sock

development:
  <<: *default
  database: venjob_development

# Warning: The database defined as "test" will be erased and
# re-generated from your development database when you run "rake".
# Do not set this db to the same as development or production.
test:
  <<: *default
  database: venjob_test

production:
  <<: *default
  database: venjob_production
  username: venjob
  password: <%= ENV['VENJOB_DATABASE_PASSWORD'] %>
\ No newline at end of file
# Creates the jobs table: six string columns, one text column for the
# description, and the created_at/updated_at timestamps.
class CreateJobs < ActiveRecord::Migration[6.0]
  def change
    create_table :jobs do |table|
      # Column helpers accept several names; columns are added in this order.
      table.string :title, :updated_date_job, :level,
                   :years_of_experience, :salary, :expiration_date
      table.text :job_description
      table.timestamps
    end
  end
end
# Creates the cities table with a title and timestamps.
class CreateCities < ActiveRecord::Migration[6.0]
  def change
    create_table(:cities) do |table|
      table.string(:title)
      table.timestamps
    end
  end
end
# Creates the users table: four string columns plus timestamps.
class CreateUsers < ActiveRecord::Migration[6.0]
  def change
    create_table :users do |table|
      # Column helpers accept several names; columns are added in this order.
      table.string :full_name, :email, :password, :cv_of_user
      table.timestamps
    end
  end
end
# Creates the companies table: title, address and logo strings, a text
# description, and timestamps.
class CreateCompanies < ActiveRecord::Migration[6.0]
  def change
    create_table :companies do |table|
      # Column helpers accept several names; columns are added in this order.
      table.string :title, :address, :logo
      table.text :description
      table.timestamps
    end
  end
end
# Creates the industries table with a title and timestamps.
class CreateIndustries < ActiveRecord::Migration[6.0]
  def change
    create_table(:industries) do |table|
      table.string(:title)
      table.timestamps
    end
  end
end
# Adds a required company reference (with a foreign key) to the jobs table.
class AddReferencesCompanyToJobs < ActiveRecord::Migration[6.0]
def change
# null: false means this can only be applied while jobs has no rows lacking a
# company — NOTE(review): confirm the table is empty when this runs.
add_reference :jobs, :company, null: false, foreign_key: true
end
end
# Creates the applies table: required user and job references (each with a
# foreign key) plus timestamps.
class CreateApplies < ActiveRecord::Migration[6.0]
  def change
    create_table :applies do |table|
      # One call declares both references with identical options.
      table.references :user, :job, null: false, foreign_key: true
      table.timestamps
    end
  end
end
# Creates the favorites table: required user and job references (each with a
# foreign key) plus timestamps.
class CreateFavorites < ActiveRecord::Migration[6.0]
  def change
    create_table :favorites do |table|
      # One call declares both references with identical options.
      table.references :user, :job, null: false, foreign_key: true
      table.timestamps
    end
  end
end
# Creates the city_jobs join table: required city and job references, each
# with a foreign key. No timestamps are added.
class CreateCityJob < ActiveRecord::Migration[6.0]
  def change
    create_table :city_jobs do |table|
      # One call declares both references with identical options.
      table.references :city, :job, null: false, foreign_key: true
    end
  end
end
# Creates the industry_jobs join table: required industry and job references,
# each with a foreign key. No timestamps are added.
class CreateIndustryJob < ActiveRecord::Migration[6.0]
  def change
    create_table :industry_jobs do |table|
      # One call declares both references with identical options.
      table.references :industry, :job, null: false, foreign_key: true
    end
  end
end
# This file is auto-generated from the current state of the database. Instead
# of editing this file, please use the migrations feature of Active Record to
# incrementally modify your database, and then regenerate this schema definition.
#
# This file is the source Rails uses to define your schema when running `rails
# db:schema:load`. When creating a new database, `rails db:schema:load` tends to
# be faster and is potentially less error prone than running all of your
# migrations from scratch. Old migrations may fail to apply correctly if those
# migrations use external dependencies or application code.
#
# It's strongly recommended that you check this file into your version control system.
# Schema snapshot: nine tables (applies, cities, city_jobs, companies,
# favorites, industries, industry_jobs, jobs, users) and their foreign keys.
# NOTE(review): comments added here are lost when the file is regenerated.
ActiveRecord::Schema.define(version: 2020_03_20_101302) do
# Join rows between users and jobs.
create_table "applies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "job_id", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["job_id"], name: "index_applies_on_job_id"
t.index ["user_id"], name: "index_applies_on_user_id"
end
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "title"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
# Join rows between cities and jobs. NOTE(review): the table is named
# city_jobs, not the cities_jobs name has_and_belongs_to_many uses by default.
create_table "city_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "city_id", null: false
t.bigint "job_id", null: false
t.index ["city_id"], name: "index_city_jobs_on_city_id"
t.index ["job_id"], name: "index_city_jobs_on_job_id"
end
create_table "companies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "title"
t.string "address"
t.string "logo"
t.text "description"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
# Join rows between users and jobs.
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "user_id", null: false
t.bigint "job_id", null: false
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.index ["job_id"], name: "index_favorites_on_job_id"
t.index ["user_id"], name: "index_favorites_on_user_id"
end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "title"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
# Join rows between industries and jobs. NOTE(review): the table is named
# industry_jobs, not the industries_jobs name has_and_belongs_to_many uses by
# default.
create_table "industry_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "industry_id", null: false
t.bigint "job_id", null: false
t.index ["industry_id"], name: "index_industry_jobs_on_industry_id"
t.index ["job_id"], name: "index_industry_jobs_on_job_id"
end
# NOTE(review): updated_date_job and expiration_date are string columns, so
# they hold whatever text is written to them rather than date values.
create_table "jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "title"
t.string "updated_date_job"
t.string "level"
t.string "years_of_experience"
t.string "salary"
t.string "expiration_date"
t.text "job_description"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.bigint "company_id", null: false
t.index ["company_id"], name: "index_jobs_on_company_id"
end
# NOTE(review): "password" is a plain string column — confirm that values are
# hashed before being stored.
create_table "users", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "full_name"
t.string "email"
t.string "password"
t.string "cv_of_user"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
end
add_foreign_key "applies", "jobs"
add_foreign_key "applies", "users"
add_foreign_key "city_jobs", "cities"
add_foreign_key "city_jobs", "jobs"
add_foreign_key "favorites", "jobs"
add_foreign_key "favorites", "users"
add_foreign_key "industry_jobs", "industries"
add_foreign_key "industry_jobs", "jobs"
add_foreign_key "jobs", "companies"
end
require "nokogiri"
require "open-uri"
namespace :crawler do
# Crawls the first two listing pages of careerbuilder.vn, opens every job
# detail page linked from them, extracts job/company/city/industry data
# according to whichever of the three known page templates matches, and stores
# the result through the check_exist_or_create_* helpers.
#
# Detail pages matching no known template are written to
# log/skip_url_logger.log and skipped. Any error raised while handling a
# single detail page is written to log/exception_logger.log and the crawl
# continues with the next link. A failure to fetch a listing page is not
# rescued and aborts the task.
desc "Crawl job postings from careerbuilder.vn and store them in the database"
task job: :environment do
  # Receives one line per detail page that raised an error.
  exception_logger = ActiveSupport::Logger.new("log/exception_logger.log")
  # Receives one line per detail page with an unrecognised template.
  skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
  # Selects the href attribute of every job link on a listing page.
  job_link_selector = ".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href"

  (1..2).each do |page|
    # URI.open is used instead of Kernel#open: opening URLs through
    # Kernel#open is deprecated in Ruby 2.7 and removed in Ruby 3.0.
    html_jobs = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))

    # The selector is evaluated once per page instead of once per link.
    html_jobs.css(job_link_selector).each do |job_link|
      job_detail_url = job_link.text
      # URI.encode is obsolete (removed in Ruby 3.0); the default parser's
      # escape performs the same escaping.
      html_job_detail = Nokogiri::HTML.parse(URI.open(URI::DEFAULT_PARSER.escape(job_detail_url)))

      # Every key is present (nil by default) because the whole hash is used
      # as the lookup condition by check_exist_or_create_job.
      job_attributes = {
        title: nil,
        updated_date_job: nil,
        level: nil,
        years_of_experience: nil,
        salary: nil,
        expiration_date: nil,
        job_description: nil,
        company_id: nil
      }
      # Same reasoning as above, for check_exist_or_create_company.
      company_attributes = {
        title: nil,
        address: nil,
        logo: nil,
        description: nil
      }
      city_ids = []
      industry_ids = []

      # Decide which page template this job uses.
      if html_job_detail.at_css("#uni_container .MyJobDetail")
        css_dom = "#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"

        job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
        job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .datepost span").text

        company_attributes[:title] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew span").text
        company_attributes[:address] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew label label").text
        company_attributes[:logo] = html_job_detail.css("#{css_dom} .box1Detail .align_center.logocompany a img @src").text
        company_attributes[:description] = html_job_detail.css("#{css_dom} .desc_company.content_fck #emp_collapse").text.split("...")[0]

        # Each detail row is labelled by the text of its <span>.
        html_job_detail.css("#{css_dom} .box2Detail .DetailJobNew li p").each do |detail|
          case detail.css("span").text
          when "Nơi làm việc: "
            # The first link is skipped, as before.
            # NOTE(review): presumably it is not a city name — confirm.
            detail.css("b a").drop(1).each do |city_link|
              city_ids << check_exist_or_create_city(city_link.text.delete(","))
            end
          when "Cấp bậc: "
            job_attributes[:level] = detail.css("label").text
          when "Kinh nghiệm: "
            job_attributes[:years_of_experience] = detail.text.gsub("Kinh nghiệm: ", "")
          when "Lương: "
            job_attributes[:salary] = detail.text.gsub("Lương: ", "")
          when "Ngành nghề: "
            detail.css("b a").each do |industry_link|
              industry_ids << check_exist_or_create_industry(industry_link.text.delete(","))
            end
          else
            # Any other row is treated as the expiration date.
            job_attributes[:expiration_date] = detail.text.gsub("Hết hạn nộp: ", "")
          end
        end

        # The description is the concatenated inner HTML of all matching nodes.
        job_attributes[:job_description] = html_job_detail.css("#{css_dom} .MarBot20").map(&:inner_html).join
      elsif html_job_detail.at_css("#uni_container .job-template-2")
        css_dom = "#uni_container .job-template-2 .content-job-detail"

        job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
        job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")

        company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
        company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
        company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
        company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.split("...")[0]

        # Each detail row is labelled by the text of its <b>.
        html_job_detail.css("#{css_dom} .right-col .info-career .info li").each do |detail|
          case detail.css("b").text
          when "Nơi làm việc"
            # The first link is skipped, as before.
            # NOTE(review): presumably it is not a city name — confirm.
            detail.css("span a").drop(1).each do |city_link|
              city_ids << check_exist_or_create_city(city_link.text.delete(","))
            end
          when "Cấp bậc"
            job_attributes[:level] = detail.css("span").text
          when "Kinh nghiệm"
            job_attributes[:years_of_experience] = detail.css("span").text
          when "Lương"
            job_attributes[:salary] = detail.text.gsub("Lương: ", "")
          when "Ngành nghề"
            detail.css("span a").each do |industry_link|
              industry_ids << check_exist_or_create_industry(industry_link.text.delete(","))
            end
          else
            # Any other row is treated as the expiration date.
            job_attributes[:expiration_date] = detail.css("span").text
          end
        end

        job_attributes[:job_description] = html_job_detail.css("#{css_dom} #showScroll").inner_html
      elsif html_job_detail.at_css("#uni_container .job-template-201")
        css_dom = "#uni_container .job-template-201"

        job_attributes[:title] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info h1").text
        job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ", "")

        company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
        company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
        company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
        company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")

        # Each detail row is labelled by the text of its <b>.
        html_job_detail.css("#{css_dom} .right-col .info-career .info li").each do |detail|
          case detail.css("b").text
          when "Nơi làm việc"
            # The first link is skipped, as before.
            # NOTE(review): presumably it is not a city name — confirm.
            detail.css("span a").drop(1).each do |city_link|
              city_ids << check_exist_or_create_city(city_link.text)
            end
          when "Cấp bậc"
            job_attributes[:level] = detail.css("span").text
          when "Lương"
            job_attributes[:salary] = detail.css("span").text
          when "Ngành nghề"
            detail.css("span a").each do |industry_link|
              industry_ids << check_exist_or_create_industry(industry_link.text)
            end
          when "Hết hạn nộp"
            job_attributes[:expiration_date] = detail.css("span").text
          else
            # Any other row is treated as the years of experience.
            job_attributes[:years_of_experience] = detail.css("span").text
          end
        end

        job_attributes[:job_description] = html_job_detail.css("#{css_dom} .left-col #showScroll").inner_html
      else
        skip_url_logger.info "another template #{job_detail_url}"
        # Nothing was extracted, so skip persistence; previously an all-nil
        # company and job were still looked up or created at this point.
        next
      end

      # Persist the company first so that the job can reference it.
      job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
      job_id = check_exist_or_create_job(job_attributes)

      # Link the job to its cities and industries.
      city_ids.each { |city_id| check_exist_or_create_city_job(city_id, job_id) }
      industry_ids.each { |industry_id| check_exist_or_create_industry_job(industry_id, job_id) }
    rescue StandardError => e
      # Record what failed as well as where, then move on to the next link.
      exception_logger.error "Error url: #{job_detail_url} (#{e.class}: #{e.message})"
    end
  end
end
# Looks up a company matching every given attribute, creating it when no
# match exists, and returns its id.
#
# @param company_attributes [Hash] attributes passed to Company.find_or_create_by
# @return the id of the found or created company
def check_exist_or_create_company(company_attributes)
  Company.find_or_create_by(company_attributes).id
end
# Looks up a city by title, creating it when no match exists, and returns
# its id.
#
# @param city_title [String] title passed to City.find_or_create_by
# @return the id of the found or created city
def check_exist_or_create_city(city_title)
  City.find_or_create_by(title: city_title).id
end
# Looks up an industry by title, creating it when no match exists, and
# returns its id.
#
# @param industry_title [String] title passed to Industry.find_or_create_by
# @return the id of the found or created industry
def check_exist_or_create_industry(industry_title)
  Industry.find_or_create_by(title: industry_title).id
end
# Looks up a job matching every given attribute, creating it when no match
# exists, and returns its id.
#
# @param job_attributes [Hash] attributes passed to Job.find_or_create_by
# @return the id of the found or created job
def check_exist_or_create_job(job_attributes)
  Job.find_or_create_by(job_attributes).id
end
# Looks up the join row linking the given city and job, creating it when no
# match exists.
#
# @param city_id id of the city
# @param job_id id of the job
# @return the value returned by CityJob.find_or_create_by
def check_exist_or_create_city_job(city_id, job_id)
  CityJob.find_or_create_by(city_id: city_id, job_id: job_id)
end
# Looks up the join row linking the given industry and job, creating it when
# no match exists.
#
# @param industry_id id of the industry
# @param job_id id of the job
# @return the value returned by IndustryJob.find_or_create_by
def check_exist_or_create_industry_job(industry_id, job_id)
  IndustryJob.find_or_create_by(industry_id: industry_id, job_id: job_id)
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment