Commit 56fee0d7 by Thanh Hung Pham

Merge branch 'crawler' into 'master'

Create crawler

See merge request !2
parents 6a2e0164 2f3cbcc4
Pipeline #1364 failed with stages
in 0 seconds
...@@ -6,6 +6,9 @@ ruby '3.0.1' ...@@ -6,6 +6,9 @@ ruby '3.0.1'
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails', branch: 'main' # Bundle edge Rails instead: gem 'rails', github: 'rails/rails', branch: 'main'
gem 'rails', '~> 6.1.3', '>= 6.1.3.2' gem 'rails', '~> 6.1.3', '>= 6.1.3.2'
gem 'bootstrap', '~> 5.0.1' gem 'bootstrap', '~> 5.0.1'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
# Use mysql2 as the database for Active Record # Use mysql2 as the database for Active Record
gem 'mysql2', '~> 0.5.3' gem 'mysql2', '~> 0.5.3'
# Use Puma as the app server # Use Puma as the app server
...@@ -28,6 +31,7 @@ gem 'jbuilder', '~> 2.7' ...@@ -28,6 +31,7 @@ gem 'jbuilder', '~> 2.7'
# Reduces boot times through caching; required in config/boot.rb # Reduces boot times through caching; required in config/boot.rb
gem 'bootsnap', '>= 1.4.4', require: false gem 'bootsnap', '>= 1.4.4', require: false
gem 'whenever', require: false
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call 'byebug' anywhere in the code to stop execution and get a debugger console
......
...@@ -82,6 +82,7 @@ GEM ...@@ -82,6 +82,7 @@ GEM
regexp_parser (>= 1.5, < 3.0) regexp_parser (>= 1.5, < 3.0)
xpath (~> 3.2) xpath (~> 3.2)
childprocess (3.0.0) childprocess (3.0.0)
chronic (0.10.2)
concurrent-ruby (1.1.9) concurrent-ruby (1.1.9)
crass (1.0.6) crass (1.0.6)
erubi (1.10.0) erubi (1.10.0)
...@@ -200,6 +201,8 @@ GEM ...@@ -200,6 +201,8 @@ GEM
websocket-driver (0.7.5) websocket-driver (0.7.5)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5) websocket-extensions (0.1.5)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0) xpath (3.2.0)
nokogiri (~> 1.8) nokogiri (~> 1.8)
zeitwerk (2.4.2) zeitwerk (2.4.2)
...@@ -215,6 +218,7 @@ DEPENDENCIES ...@@ -215,6 +218,7 @@ DEPENDENCIES
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (~> 3.3) listen (~> 3.3)
mysql2 (~> 0.5.3) mysql2 (~> 0.5.3)
nokogiri (~> 1.11, >= 1.11.7)
puma (~> 5.0) puma (~> 5.0)
rack-mini-profiler (~> 2.0) rack-mini-profiler (~> 2.0)
rails (~> 6.1.3, >= 6.1.3.2) rails (~> 6.1.3, >= 6.1.3.2)
...@@ -226,6 +230,7 @@ DEPENDENCIES ...@@ -226,6 +230,7 @@ DEPENDENCIES
web-console (>= 4.1.0) web-console (>= 4.1.0)
webdrivers webdrivers
webpacker (~> 5.0) webpacker (~> 5.0)
whenever
RUBY VERSION RUBY VERSION
ruby 3.0.1p64 ruby 3.0.1p64
......
require_relative "boot" require_relative 'boot'
require "rails/all" require 'rails/all'
# Require the gems listed in Gemfile, including any gems # Require the gems listed in Gemfile, including any gems
# you've limited to :test, :development, or :production. # you've limited to :test, :development, or :production.
......
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__) ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__)
require "bundler/setup" # Set up gems listed in the Gemfile. require 'bundler/setup' # Set up gems listed in the Gemfile.
require "bootsnap/setup" # Speed up boot time by caching expensive operations. require 'bootsnap/setup' # Speed up boot time by caching expensive operations.
# Load the Rails application. # Load the Rails application.
require_relative "application" require_relative 'application'
# Initialize the Rails application. # Initialize the Rails application.
Rails.application.initialize! Rails.application.initialize!
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
Rails.application.configure do Rails.application.configure do
# Settings specified here will take precedence over those in config/application.rb. # Settings specified here will take precedence over those in config/application.rb.
......
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
Rails.application.configure do Rails.application.configure do
# Settings specified here will take precedence over those in config/application.rb. # Settings specified here will take precedence over those in config/application.rb.
...@@ -53,7 +53,7 @@ Rails.application.configure do ...@@ -53,7 +53,7 @@ Rails.application.configure do
config.log_level = :info config.log_level = :info
# Prepend all log lines with the following tags. # Prepend all log lines with the following tags.
config.log_tags = [ :request_id ] config.log_tags = [:request_id]
# Use a different cache store in production. # Use a different cache store in production.
# config.cache_store = :mem_cache_store # config.cache_store = :mem_cache_store
...@@ -88,7 +88,7 @@ Rails.application.configure do ...@@ -88,7 +88,7 @@ Rails.application.configure do
# require "syslog/logger" # require "syslog/logger"
# config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name') # config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name')
if ENV["RAILS_LOG_TO_STDOUT"].present? if ENV['RAILS_LOG_TO_STDOUT'].present?
logger = ActiveSupport::Logger.new(STDOUT) logger = ActiveSupport::Logger.new(STDOUT)
logger.formatter = config.log_formatter logger.formatter = config.log_formatter
config.logger = ActiveSupport::TaggedLogging.new(logger) config.logger = ActiveSupport::TaggedLogging.new(logger)
......
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
# The test environment is used exclusively to run your application's # The test environment is used exclusively to run your application's
# test suite. You never need to work with it otherwise. Remember that # test suite. You never need to work with it otherwise. Remember that
......
...@@ -5,4 +5,4 @@ ...@@ -5,4 +5,4 @@
# You can also remove all the silencers if you're trying to debug a problem that might stem from framework code # You can also remove all the silencers if you're trying to debug a problem that might stem from framework code
# by setting BACKTRACE=1 before calling your invocation, like "BACKTRACE=1 ./bin/rails runner 'MyClass.perform'". # by setting BACKTRACE=1 before calling your invocation, like "BACKTRACE=1 ./bin/rails runner 'MyClass.perform'".
Rails.backtrace_cleaner.remove_silencers! if ENV["BACKTRACE"] Rails.backtrace_cleaner.remove_silencers! if ENV['BACKTRACE']
# Be sure to restart your server when you modify this file. # Be sure to restart your server when you modify this file.
# Configure sensitive parameters which will be filtered from the log file. # Configure sensitive parameters which will be filtered from the log file.
Rails.application.config.filter_parameters += [ Rails.application.config.filter_parameters += %i[
:passw, :secret, :token, :_key, :crypt, :salt, :certificate, :otp, :ssn passw secret token _key crypt salt certificate otp ssn
] ]
...@@ -4,25 +4,25 @@ ...@@ -4,25 +4,25 @@
# the maximum value specified for Puma. Default is set to 5 threads for minimum # the maximum value specified for Puma. Default is set to 5 threads for minimum
# and maximum; this matches the default thread size of Active Record. # and maximum; this matches the default thread size of Active Record.
# #
max_threads_count = ENV.fetch("RAILS_MAX_THREADS") { 5 } max_threads_count = ENV.fetch('RAILS_MAX_THREADS') { 5 }
min_threads_count = ENV.fetch("RAILS_MIN_THREADS") { max_threads_count } min_threads_count = ENV.fetch('RAILS_MIN_THREADS') { max_threads_count }
threads min_threads_count, max_threads_count threads min_threads_count, max_threads_count
# Specifies the `worker_timeout` threshold that Puma will use to wait before # Specifies the `worker_timeout` threshold that Puma will use to wait before
# terminating a worker in development environments. # terminating a worker in development environments.
# #
worker_timeout 3600 if ENV.fetch("RAILS_ENV", "development") == "development" worker_timeout 3600 if ENV.fetch('RAILS_ENV', 'development') == 'development'
# Specifies the `port` that Puma will listen on to receive requests; default is 3000. # Specifies the `port` that Puma will listen on to receive requests; default is 3000.
# #
port ENV.fetch("PORT") { 3000 } port ENV.fetch('PORT') { 3000 }
# Specifies the `environment` that Puma will run in. # Specifies the `environment` that Puma will run in.
# #
environment ENV.fetch("RAILS_ENV") { "development" } environment ENV.fetch('RAILS_ENV') { 'development' }
# Specifies the `pidfile` that Puma will use. # Specifies the `pidfile` that Puma will use.
pidfile ENV.fetch("PIDFILE") { "tmp/pids/server.pid" } pidfile ENV.fetch('PIDFILE') { 'tmp/pids/server.pid' }
# Specifies the number of `workers` to boot in clustered mode. # Specifies the number of `workers` to boot in clustered mode.
# Workers are forked web server processes. If using threads and workers together # Workers are forked web server processes. If using threads and workers together
......
# Schedule the complete crawl (rake crawler:all) once per day at 08:00 am.
every(1.day, at: '08:00 am') { rake 'crawler:all' }
Spring.watch( Spring.watch(
".ruby-version", '.ruby-version',
".rbenv-vars", '.rbenv-vars',
"tmp/restart.txt", 'tmp/restart.txt',
"tmp/caching-dev.txt" 'tmp/caching-dev.txt'
) )
# Drops the string column cities.address.
class RemoveAddressFromCities < ActiveRecord::Migration[6.1]
  # Remove the column when migrating forward.
  def up
    remove_column :cities, :address
  end

  # Re-create the column with its previous type when rolling back.
  def down
    add_column :cities, :address, :string
  end
end
# Drops four columns from companies: message, benefits, type and total_employee.
class RemoveMessageFromCompanies < ActiveRecord::Migration[6.1]
  # Column name => column type. The type is passed to remove_column so the
  # migration stays reversible.
  DROPPED_COLUMNS = {
    message: :text,
    benefits: :string,
    type: :string,
    total_employee: :integer
  }.freeze

  def change
    DROPPED_COLUMNS.each do |column, column_type|
      remove_column :companies, column, column_type
    end
  end
end
# Adds companies.address (string) and companies.overview (text).
class AddAddressOverviewToCompanies < ActiveRecord::Migration[6.1]
  def change
    change_table :companies do |table|
      table.string :address
      table.text :overview
    end
  end
end
# Drops the string column jobs.degree.
class RemoveDegreeFromJobs < ActiveRecord::Migration[6.1]
  def change
    reversible do |direction|
      # Forward: remove the column.
      direction.up { remove_column :jobs, :degree }
      # Rollback: bring it back with its previous type.
      direction.down { add_column :jobs, :degree, :string }
    end
  end
end
# Turns jobs.experience and jobs.salary into string columns and renames
# jobs.type to jobs.job_type.
# NOTE(review): the rename presumably avoids Active Record treating `type` as
# the single-table-inheritance column - confirm with the author.
class ChangeJobs < ActiveRecord::Migration[6.1]
  def up
    change_column :jobs, :experience, :string
    change_column :jobs, :salary, :string
    rename_column :jobs, :type, :job_type
  end

  # Undoes every step of `up` in reverse order. The rename has to be reverted
  # too, otherwise a rollback leaves `job_type` behind while the schema at the
  # previous version expects `type`.
  # Converting the columns back to integer can fail or lose data if
  # non-numeric strings have been stored in the meantime.
  def down
    rename_column :jobs, :job_type, :type
    change_column :jobs, :salary, :integer
    change_column :jobs, :experience, :integer
  end
end
# Widens companies.address from string to text.
#
# change_column cannot be reversed automatically, so the migration spells out
# both directions instead of using `change` (which would raise
# ActiveRecord::IrreversibleMigration on rollback).
class ChangeCompanies < ActiveRecord::Migration[6.1]
  def up
    change_column :companies, :address, :text
  end

  # Restores the string type the column was created with in
  # AddAddressOverviewToCompanies. Values longer than the string limit may be
  # rejected or truncated by the database.
  def down
    change_column :companies, :address, :string
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2021_07_09_043354) do ActiveRecord::Schema.define(version: 2021_07_23_035105) do
create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false t.bigint "user_id", null: false
...@@ -25,7 +25,6 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -25,7 +25,6 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
create_table "cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "region_id", null: false t.bigint "region_id", null: false
t.string "name" t.string "name"
t.string "address"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.index ["region_id"], name: "index_cities_on_region_id" t.index ["region_id"], name: "index_cities_on_region_id"
...@@ -43,12 +42,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -43,12 +42,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
create_table "companies", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "companies", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "name" t.string "name"
t.text "description" t.text "description"
t.integer "total_employee"
t.string "type"
t.string "benefits"
t.text "message"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.text "address"
t.text "overview"
end end
create_table "companies_cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "companies_cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
...@@ -99,11 +96,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -99,11 +96,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
t.text "overview" t.text "overview"
t.text "requirement" t.text "requirement"
t.text "other_requirement" t.text "other_requirement"
t.integer "salary" t.string "salary"
t.string "type" t.string "job_type"
t.string "level" t.string "level"
t.integer "experience" t.string "experience"
t.string "degree"
t.string "benefits" t.string "benefits"
t.datetime "expired_at" t.datetime "expired_at"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
......
require 'open-uri'
namespace :crawler do
desc 'Crawl Jobs and Companies'
# Walks every page of the careerbuilder.vn job listing. For each job card it
# stores the employer (Company) and the posting (Job), then links the job to
# the Industry and City rows that already exist under the scraped names.
# A failure on a single job card is logged to log/crawler_jobs.log and that
# card is skipped; the crawl continues with the next one.
task jobs: :environment do
  logger = Logger.new("#{Rails.root}/log/crawler_jobs.log")
  logger.info "Start crawler job at: #{Time.current}"

  # The block form of URI.open closes the response handle after parsing.
  open_page = ->(url) { URI.open(url) { |io| Nokogiri::HTML(io) } }

  # Label shown in the job summary box => Job attribute it fills.
  summary_fields = {
    'Lương' => :salary,
    'Kinh nghiệm' => :experience,
    'Hình thức' => :job_type,
    'Cấp bậc' => :level,
    'Hết hạn nộp' => :expired_at
  }.freeze

  home = open_page.call('https://careerbuilder.vn/')
  job_page = home.css('div.menu div.dropdown-menu ul li a')[0].attributes['href'].value
  first_page = open_page.call(job_page)
  per_page = first_page.css('div.job-item').length
  total = first_page.css('div.job-found p').text.delete('^0-9').to_i
  # Round up so a partially filled last page is still visited. An empty first
  # page means there is nothing to paginate, and must not divide by zero.
  last_page = per_page.zero? ? 0 : (total.to_f / per_page).ceil
  logger.warn 'No job items found on the first listing page' if per_page.zero?

  (1..last_page).each do |page|
    listing = open_page.call("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")

    listing.css('div.job-item').each do |detail_jobs|
      # --- Company -----------------------------------------------------------
      company_url = detail_jobs.css('a.company-name').attribute('href').text
      next if company_url == 'javascript:void(0);'

      slug_company = CGI.escape(company_url.gsub('https://careerbuilder.vn/vi/nha-tuyen-dung/', '').strip)
      company_page = "https://careerbuilder.vn/vi/nha-tuyen-dung/#{slug_company}"
      company = Nokogiri::HTML(URI.open(company_page, &:read)).css('div.container')
      # css always returns a NodeSet (never nil), so test the extracted text.
      company_name = company.css('div.company-info div.content p.name').text
      next if company_name.blank?

      logger.info("Link company: #{company_page}")
      company_info = company.css('div.company-info div.content')
      # Look the company up by name only, so a changed description does not
      # create a second row. The bang variant raises on a failed save; the
      # rescue below logs it.
      company_record = Company.find_or_create_by!(name: company_name) do |record|
        record.address = company_info.css('p')[1].try(:text)
        record.description = company_info.css('ul li').text
        record.overview = company.css('div.row div.content p').text.squish.strip
      end

      # --- Job ---------------------------------------------------------------
      job_url = detail_jobs.css('a.job_link').attribute('href').text
      slug_job = CGI.escape(job_url.gsub('https://careerbuilder.vn/vi/tim-viec-lam/', '').strip)
      job_detail_page = "https://careerbuilder.vn/vi/tim-viec-lam/#{slug_job}"
      detail_job = open_page.call(job_detail_page).css('div.container')
      title = detail_job.css('div.job-desc h1.title').text
      next if title.blank?

      logger.info("Link job: #{job_detail_page}")

      # Only salary and benefits default to ''; the other fields default to
      # nil. These defaults are deliberately kept so lookups keep matching
      # rows stored by earlier crawls.
      attrs = {
        salary: '', experience: nil, job_type: nil, level: nil, expired_at: nil,
        benefits: '', overview: nil, requirement: nil, other_requirement: nil
      }

      detail_job.css('div.row div.detail-box.has-background ul li').each do |item|
        field = summary_fields[item.css('strong').text.squish]
        next if field.nil?

        value = item.css('p').text
        # Experience is wrapped in line breaks and padding on the page; squish
        # removes them.
        attrs[field] = field == :experience ? value.squish : value
      end

      detail_job.css('div.detail-row').each do |row|
        # squish makes the match independent of padding around the heading.
        case row.css('h3').text.squish
        when 'Phúc lợi' then attrs[:benefits] = row.css('ul li').text
        when 'Mô tả Công việc' then attrs[:overview] = row.css('p').text
        when 'Yêu Cầu Công Việc' then attrs[:requirement] = row.css('p').text
        when 'Thông tin khác' then attrs[:other_requirement] = row.css('div.content_fck ul li').text.squish
        end
      end

      job = Job.find_or_create_by!(attrs.merge(title: title, company_id: company_record.id))

      # --- Associations ------------------------------------------------------
      # Only attach records that are not linked yet, so re-crawling an existing
      # job does not pile up duplicate join rows.
      industries = detail_job.css('div.detail-box.has-background ul li p a')
                             .filter_map { |link| Industry.find_by(name: link.text.squish) }
      job.industries << (industries.uniq - job.industries.to_a)

      # City names are stored stripped by crawler:cities, so strip before lookup.
      cities = detail_job.css('div.map p a')
                         .filter_map { |link| City.find_by(name: link.text.strip) }
      job.cities << (cities.uniq - job.cities.to_a)
    rescue StandardError => error
      # Either URL may be nil if the failure happened before it was built.
      logger.error "The company url has error: #{company_page}"
      logger.error "The job url has error: #{job_detail_page}"
      logger.error error
    end
  end

  logger.info "End crawler job at: #{Time.current}"
end
desc 'Crawl Industries'
# Stores one Industry per link found in the industry list of the page
# returned by parse_base_url.
task industries: :environment do
  links = parse_base_url.css('div.col-md-6.col-lg-4.cus-col ul.list-jobs li a')
  links.each do |link|
    # crawler:jobs looks industries up by the squished link text, so store the
    # name in the same normalised form or those lookups will miss.
    industry_name = link.text.squish
    next if industry_name.empty?

    Industry.find_or_create_by(name: industry_name)
  end
end
desc 'Crawl Cities'
# Stores the domestic and overseas cities listed on the page returned by
# parse_base_url, attaching each to its Region ('Trong Nước' / 'Nước Ngoài').
# The regions must already exist (see crawler:regions).
task cities: :environment do
  # Download the page once; both city lists are read from it.
  page = parse_base_url

  domestic_links = page.css('div.container div.jobs-in-country li a')
  unless domestic_links.empty?
    # Look the region up once instead of once per city. find_by! reports a
    # missing region as RecordNotFound rather than a NoMethodError on nil.
    domestic_id = Region.find_by!(name: 'Trong Nước').id
    domestic_links.each do |link|
      city_name = link.text.gsub('Việc làm tại', '').strip
      City.find_or_create_by(name: city_name, region_id: domestic_id)
    end
  end

  foreign_links = page.css('div.container div.overseas-jobs li a')
  unless foreign_links.empty?
    foreign_id = Region.find_by!(name: 'Nước Ngoài').id
    foreign_links.each do |link|
      City.find_or_create_by(name: link.text.strip, region_id: foreign_id)
    end
  end
end
desc 'Crawl Regions'
# Stores one Region per location heading on the page returned by
# parse_base_url, with the 'Việc Làm' prefix removed from the heading text.
task regions: :environment do
  headings = parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location h3')
  headings
    .map { |heading| heading.text.gsub('Việc Làm', '').strip }
    .each { |region_name| Region.find_or_create_by(name: region_name) }
end
desc 'Crawl regions, cities, industries, jobs and companies'
# Order matters: cities look up their Region, and jobs look up existing
# Industry and City rows, so those prerequisites must be crawled first.
task all: %i[regions cities industries jobs]
# Downloads the careerbuilder.vn home page, follows the second link of the
# main dropdown menu and returns that page parsed by Nokogiri. The industries,
# cities and regions tasks all scrape the returned document.
#
# @return [Nokogiri::HTML::Document] the parsed target page
# @raise [RuntimeError] when the home page has no second dropdown-menu link
def parse_base_url
  # The block form of URI.open closes the response handle after parsing.
  home = URI.open('https://careerbuilder.vn/') { |io| Nokogiri::HTML(io) }
  menu_link = home.css('div.menu div.dropdown-menu ul li a')[1]
  raise 'careerbuilder.vn: second dropdown-menu link not found on the home page' if menu_link.nil?

  URI.open(menu_link['href']) { |io| Nokogiri::HTML(io) }
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment