Commit f5b0ef08 by Son Do Hong

Merge branch 'feature/crawler' into 'master'

Jobs Crawler

See merge request !2
parents 159d97f4 db5fb9fa
Pipeline #253 failed with stages
in 0 seconds
source 'https://rubygems.org' source "https://rubygems.org"
git_source(:github) { |repo| "https://github.com/#{repo}.git" } git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby '2.6.5' ruby "2.6.5"
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails' # Bundle edge Rails instead: gem "rails", github: "rails/rails"
gem 'rails', '~> 6.0.1' gem "rails", "~> 6.0.1"
# Use mysql2 as the database for Active Record # Use mysql2 as the database for Active Record
gem 'mysql2' gem "mysql2"
# Use Puma as the app server # Use Puma as the app server
gem 'puma', '~> 4.1' gem "puma", "~> 4.1"
# Use SCSS for stylesheets # Use SCSS for stylesheets
gem 'sass-rails', '>= 6' gem "sass-rails", ">= 6"
# Transpile app-like JavaScript. Read more: https://github.com/rails/webpacker # Transpile app-like JavaScript. Read more: https://github.com/rails/webpacker
gem 'webpacker', '~> 4.0' gem "webpacker", "~> 4.0"
# Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks # Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks
gem 'turbolinks', '~> 5' gem "turbolinks", "~> 5"
# Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder # Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder
gem 'jbuilder', '~> 2.7' gem "jbuilder", "~> 2.7"
# Use Redis adapter to run Action Cable in production # Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0' # gem "redis", "~> 4.0"
# Use Active Model has_secure_password # Use Active Model has_secure_password
# gem 'bcrypt', '~> 3.1.7' # gem "bcrypt", "~> 3.1.7"
# Use Active Storage variant # Use Active Storage variant
# gem 'image_processing', '~> 1.2' # gem "image_processing", "~> 1.2"
# Reduces boot times through caching; required in config/boot.rb # Reduces boot times through caching; required in config/boot.rb
gem 'bootsnap', '>= 1.4.2', require: false gem "bootsnap", ">= 1.4.2", require: false
gem "devise"
gem 'devise' gem "nokogiri"
gem "mechanize"
gem "rubysl-open-uri"
gem "whenever", require: false
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call "byebug" anywhere in the code to stop execution and get a debugger console
gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] gem "byebug", platforms: [:mri, :mingw, :x64_mingw]
gem "pry"
end end
group :development do group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code. # Access an interactive console on exception pages or by calling "console" anywhere in the code.
gem 'web-console', '>= 3.3.0' gem "web-console", ">= 3.3.0"
gem 'listen', '>= 3.0.5', '< 3.2' gem "listen", ">= 3.0.5", "< 3.2"
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring' gem "spring"
gem 'spring-watcher-listen', '~> 2.0.0' gem "spring-watcher-listen", "~> 2.0.0"
end end
group :test do group :test do
# Adds support for Capybara system testing and selenium driver # Adds support for Capybara system testing and selenium driver
gem 'capybara', '>= 2.15' gem "capybara", ">= 2.15"
gem 'selenium-webdriver' gem "selenium-webdriver"
# Easy installation and use of web drivers to run system tests with browsers # Easy installation and use of web drivers to run system tests with browsers
gem 'webdrivers' gem "webdrivers"
end end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem # Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby]
...@@ -58,15 +58,10 @@ GEM ...@@ -58,15 +58,10 @@ GEM
zeitwerk (~> 2.2) zeitwerk (~> 2.2)
addressable (2.7.0) addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0) public_suffix (>= 2.0.2, < 5.0)
autoprefixer-rails (9.7.2)
execjs
bcrypt (3.1.13) bcrypt (3.1.13)
bindex (0.8.1) bindex (0.8.1)
bootsnap (1.4.5) bootsnap (1.4.5)
msgpack (~> 1.0) msgpack (~> 1.0)
bootstrap-sass (3.3.7)
autoprefixer-rails (>= 5.2.1)
sass (>= 3.3.4)
builder (3.2.3) builder (3.2.3)
byebug (11.0.1) byebug (11.0.1)
capybara (3.29.0) capybara (3.29.0)
...@@ -78,7 +73,10 @@ GEM ...@@ -78,7 +73,10 @@ GEM
regexp_parser (~> 1.5) regexp_parser (~> 1.5)
xpath (~> 3.2) xpath (~> 3.2)
childprocess (3.0.0) childprocess (3.0.0)
chronic (0.10.2)
coderay (1.1.2)
concurrent-ruby (1.1.5) concurrent-ruby (1.1.5)
connection_pool (2.2.2)
crass (1.0.5) crass (1.0.5)
devise (4.7.1) devise (4.7.1)
bcrypt (~> 3.0) bcrypt (~> 3.0)
...@@ -86,11 +84,14 @@ GEM ...@@ -86,11 +84,14 @@ GEM
railties (>= 4.1.0) railties (>= 4.1.0)
responders responders
warden (~> 1.2.3) warden (~> 1.2.3)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.9.0) erubi (1.9.0)
execjs (2.7.0)
ffi (1.11.3) ffi (1.11.3)
globalid (0.4.2) globalid (0.4.2)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.7.0) i18n (1.7.0)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
jbuilder (2.9.1) jbuilder (2.9.1)
...@@ -106,17 +107,36 @@ GEM ...@@ -106,17 +107,36 @@ GEM
mini_mime (>= 0.1.1) mini_mime (>= 0.1.1)
marcel (0.3.3) marcel (0.3.3)
mimemagic (~> 0.3.2) mimemagic (~> 0.3.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (0.9.2) method_source (0.9.2)
mime-types (3.3)
mime-types-data (~> 3.2015)
mime-types-data (3.2019.1009)
mimemagic (0.3.3) mimemagic (0.3.3)
mini_mime (1.0.2) mini_mime (1.0.2)
mini_portile2 (2.4.0) mini_portile2 (2.4.0)
minitest (5.13.0) minitest (5.13.0)
msgpack (1.3.1) msgpack (1.3.1)
mysql2 (0.5.3) mysql2 (0.5.3)
net-http-digest_auth (1.4.1)
net-http-persistent (3.1.0)
connection_pool (~> 2.2)
nio4r (2.5.2) nio4r (2.5.2)
nokogiri (1.10.5) nokogiri (1.10.5)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
orm_adapter (0.5.0) orm_adapter (0.5.0)
pry (0.12.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
public_suffix (4.0.1) public_suffix (4.0.1)
puma (4.3.0) puma (4.3.0)
nio4r (~> 2.0) nio4r (~> 2.0)
...@@ -160,12 +180,8 @@ GEM ...@@ -160,12 +180,8 @@ GEM
actionpack (>= 5.0) actionpack (>= 5.0)
railties (>= 5.0) railties (>= 5.0)
ruby_dep (1.5.0) ruby_dep (1.5.0)
rubysl-open-uri (2.0.0)
rubyzip (2.0.0) rubyzip (2.0.0)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sass-rails (6.0.0) sass-rails (6.0.0)
sassc-rails (~> 2.1, >= 2.1.1) sassc-rails (~> 2.1, >= 2.1.1)
sassc (2.2.1) sassc (2.2.1)
...@@ -198,6 +214,9 @@ GEM ...@@ -198,6 +214,9 @@ GEM
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (1.2.5) tzinfo (1.2.5)
thread_safe (~> 0.1) thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.6)
warden (1.2.8) warden (1.2.8)
rack (>= 2.0.6) rack (>= 2.0.6)
web-console (4.0.1) web-console (4.0.1)
...@@ -213,9 +232,12 @@ GEM ...@@ -213,9 +232,12 @@ GEM
activesupport (>= 4.2) activesupport (>= 4.2)
rack-proxy (>= 0.6.1) rack-proxy (>= 0.6.1)
railties (>= 4.2) railties (>= 4.2)
webrobots (0.1.2)
websocket-driver (0.7.1) websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4) websocket-extensions (0.1.4)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0) xpath (3.2.0)
nokogiri (~> 1.8) nokogiri (~> 1.8)
zeitwerk (2.2.1) zeitwerk (2.2.1)
...@@ -225,15 +247,18 @@ PLATFORMS ...@@ -225,15 +247,18 @@ PLATFORMS
DEPENDENCIES DEPENDENCIES
bootsnap (>= 1.4.2) bootsnap (>= 1.4.2)
bootstrap-sass (= 3.3.7)
byebug byebug
capybara (>= 2.15) capybara (>= 2.15)
devise devise
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mechanize
mysql2 mysql2
nokogiri
pry
puma (~> 4.1) puma (~> 4.1)
rails (~> 6.0.1) rails (~> 6.0.1)
rubysl-open-uri
sass-rails (>= 6) sass-rails (>= 6)
selenium-webdriver selenium-webdriver
spring spring
...@@ -243,6 +268,7 @@ DEPENDENCIES ...@@ -243,6 +268,7 @@ DEPENDENCIES
web-console (>= 3.3.0) web-console (>= 3.3.0)
webdrivers webdrivers
webpacker (~> 4.0) webpacker (~> 4.0)
whenever
RUBY VERSION RUBY VERSION
ruby 2.6.5p114 ruby 2.6.5p114
......
# Load the Rails application. # Load the Rails application.
require_relative 'application' require_relative "application"
# Initialize the Rails application. # Initialize the Rails application.
Rails.application.initialize! Rails.application.initialize!
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Learn more: http://github.com/javan/whenever

# Environment handed to the generated cron commands.
set :environment, "development"
# File that collects the output of the scheduled commands.
set :output, "/crawler/config/import_log.log"

# Run the rake task once a day at 5:00 pm. The comma after the interval is
# required: without it the `at:` option is never passed to `every`.
# NOTE(review): this schedules `job:create`, while the crawler task is
# registered as `job:crawl` - confirm that a `job:create` task exists.
every 1.day, at: '5:00 pm' do
  rake "job:create"
end
...@@ -11,7 +11,7 @@ class CreateJobs < ActiveRecord::Migration[6.0] ...@@ -11,7 +11,7 @@ class CreateJobs < ActiveRecord::Migration[6.0]
t.string :position t.string :position
t.datetime :posted_at t.datetime :posted_at
t.datetime :expired_at t.datetime :expired_at
t.timestamps null: false t.timestamps null: false
end end
end end
......
# Adds a string column named `code` to the `companies` table.
class AddCodeToCompany < ActiveRecord::Migration[6.0]
  def change
    change_table :companies do |t|
      t.string :code
    end
  end
end
# Adds a string column named `code` to the `jobs` table.
class AddCodeToJob < ActiveRecord::Migration[6.0]
  def change
    change_table :jobs do |t|
      t.string :code
    end
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2019_11_27_075301) do ActiveRecord::Schema.define(version: 2019_12_05_082359) do
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "name" t.string "name"
...@@ -34,6 +34,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do ...@@ -34,6 +34,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do
t.string "name" t.string "name"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.string "code"
end end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
...@@ -64,6 +65,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do ...@@ -64,6 +65,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do
t.datetime "expired_at" t.datetime "expired_at"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.string "code"
end end
create_table "user_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "user_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......
require "rubygems"
require "open-uri"
require "nokogiri"
require "mechanize"
require "pry"
namespace :job do
  # Walks the careerbuilder.vn listing pages, follows every job posting link
  # and creates or updates one Job record per posting, keyed by the code taken
  # from the posting URL. Postings that cannot be downloaded or parsed, or
  # whose URL carries no code, are written to log/crawler.log and skipped so
  # that a single bad page does not abort the run.
  desc "crawl data"
  task crawl: :environment do
    agent = Mechanize.new
    crawler_logger = nil # opened lazily, on the first skipped posting

    # Downloads +url+ and parses it as HTML. The block form of URI.open closes
    # the response body once Nokogiri has read it. URI.open and
    # URI::DEFAULT_PARSER.escape replace Kernel#open and URI.escape, which are
    # deprecated and no longer available on Ruby 3.
    fetch_document = lambda do |url|
      URI.open(URI::DEFAULT_PARSER.escape(url)) { |io| Nokogiri::HTML(io) }
    end

    # Records a skipped posting together with the reason it was skipped.
    log_skip = lambda do |link, reason|
      crawler_logger ||= ActiveSupport::Logger.new("log/crawler.log")
      crawler_logger.info "Skip #{link}"
      crawler_logger.info "  reason: #{reason}"
    end

    main_page = fetch_document.call("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-vi.html")
    # NOTE(review): this takes the integer prefix of the stats header text and
    # uses it as the number of listing pages - confirm the header shows a page
    # count (not a result count) and has no thousands separator.
    total_page = main_page.css("div.ais-stats").css("h1.col-sm-10").css("span").text.to_i

    (1..total_page).each do |num|
      page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{num}-vi.html")
      # Anchors without an href yield nil, so drop them before filtering;
      # uniq avoids downloading a posting twice when a page links to it twice.
      links = page.links.map(&:href).compact
                  .select { |href| href.include?("careerbuilder.vn/vi/tim-viec-lam/") }
                  .uniq

      links.each do |link|
        puts link

        # The job code is the dot-delimited segment right before ".html".
        # The dots are escaped so that a slug containing "html" cannot match.
        post_code = link[/\.([^.\/]+)\.html/, 1]
        if post_code.nil?
          log_skip.call(link, "no job code found in URL")
          next
        end

        begin
          job_page = fetch_document.call(link)
          title = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB")
                          .css("div.top-job").css("div.top-job-info").css("h1").text

          left_job = job_page.css("div.LeftJobCB")
          job_info = job_page.css("div.content_fck.content_job_info")
          job_requirement = job_page.css("div.content_fck.job_requirement")
          content = job_page.css("div.content_fck")

          # The site serves several page layouts; the first one that is present
          # wins. A missing node raises NoMethodError on nil, which is rescued
          # below and skips the posting. Only the first and last layouts set
          # short_description; it stays nil for the others.
          short_description = nil
          if left_job.present?
            sections = left_job.css("div.MarBot20")
            description = sections[1].css("div.content_fck").css("p").text
            short_description = left_job.css('div.desc_company.content_fck').css('span#emp_collapse').text
            requirement = sections[2].css("div.content_fck").css("p").text
          elsif job_info.present?
            blocks = job_info.css("div.decs")
            description = blocks[0].text
            requirement = blocks[1].text
          elsif job_requirement.present?
            divs = job_requirement.css("div")
            description = divs[0].text
            requirement = divs[2].text
          elsif content.present?
            description = content[0].css("p").text
            short_description = "N/A"
            requirement = content[1].css("p").text
          end

          detail = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew")
          salary = detail.css("li.bgLine2").css("p.fl_right").text
          position = detail.css("li.bgLine1").css("p.fl_left").text
        rescue StandardError => e
          log_skip.call(link, "#{e.class}: #{e.message}")
          next
        end

        job = Job.find_or_initialize_by(code: post_code)
        job.update(
          title: title,
          description: description,
          short_description: short_description,
          salary: salary,
          requirement: requirement,
          position: position
        )
      end
    end
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment