import data into Job

parent 4bfb13bf
Pipeline #1354 failed with stages
in 0 seconds
...@@ -6,6 +6,8 @@ ruby '3.0.1' ...@@ -6,6 +6,8 @@ ruby '3.0.1'
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails', branch: 'main' # Bundle edge Rails instead: gem 'rails', github: 'rails/rails', branch: 'main'
gem 'rails', '~> 6.1.3', '>= 6.1.3.2' gem 'rails', '~> 6.1.3', '>= 6.1.3.2'
gem 'bootstrap', '~> 5.0.1' gem 'bootstrap', '~> 5.0.1'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
# Use sqlite3 as the database for Active Record # Use sqlite3 as the database for Active Record
gem 'mysql2', '~> 0.5.3' gem 'mysql2', '~> 0.5.3'
# Use Puma as the app server # Use Puma as the app server
......
...@@ -91,6 +91,7 @@ GEM ...@@ -91,6 +91,7 @@ GEM
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
i18n (1.8.10) i18n (1.8.10)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
io-wait (0.1.0)
jbuilder (2.11.2) jbuilder (2.11.2)
activesupport (>= 5.0.0) activesupport (>= 5.0.0)
listen (3.5.1) listen (3.5.1)
...@@ -107,6 +108,12 @@ GEM ...@@ -107,6 +108,12 @@ GEM
minitest (5.14.4) minitest (5.14.4)
msgpack (1.4.2) msgpack (1.4.2)
mysql2 (0.5.3) mysql2 (0.5.3)
net-http (0.1.1)
net-protocol
uri
net-protocol (0.1.1)
io-wait
timeout
nio4r (2.5.7) nio4r (2.5.7)
nokogiri (1.11.7-x86_64-linux) nokogiri (1.11.7-x86_64-linux)
racc (~> 1.4) racc (~> 1.4)
...@@ -178,11 +185,13 @@ GEM ...@@ -178,11 +185,13 @@ GEM
sprockets (>= 3.0.0) sprockets (>= 3.0.0)
thor (1.1.0) thor (1.1.0)
tilt (2.0.10) tilt (2.0.10)
timeout (0.1.1)
turbolinks (5.2.1) turbolinks (5.2.1)
turbolinks-source (~> 5.2) turbolinks-source (~> 5.2)
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (2.0.4) tzinfo (2.0.4)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
uri (0.10.1)
web-console (4.1.0) web-console (4.1.0)
actionview (>= 6.0.0) actionview (>= 6.0.0)
activemodel (>= 6.0.0) activemodel (>= 6.0.0)
...@@ -215,6 +224,8 @@ DEPENDENCIES ...@@ -215,6 +224,8 @@ DEPENDENCIES
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (~> 3.3) listen (~> 3.3)
mysql2 (~> 0.5.3) mysql2 (~> 0.5.3)
net-http
nokogiri (~> 1.11, >= 1.11.7)
puma (~> 5.0) puma (~> 5.0)
rack-mini-profiler (~> 2.0) rack-mini-profiler (~> 2.0)
rails (~> 6.1.3, >= 6.1.3.2) rails (~> 6.1.3, >= 6.1.3.2)
......
require_relative "boot" require_relative 'boot'
require "rails/all" require 'rails/all'
# Require the gems listed in Gemfile, including any gems # Require the gems listed in Gemfile, including any gems
# you've limited to :test, :development, or :production. # you've limited to :test, :development, or :production.
......
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__) ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __dir__)
require "bundler/setup" # Set up gems listed in the Gemfile. require 'bundler/setup' # Set up gems listed in the Gemfile.
require "bootsnap/setup" # Speed up boot time by caching expensive operations. require 'bootsnap/setup' # Speed up boot time by caching expensive operations.
# Load the Rails application. # Load the Rails application.
require_relative "application" require_relative 'application'
# Initialize the Rails application. # Initialize the Rails application.
Rails.application.initialize! Rails.application.initialize!
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
Rails.application.configure do Rails.application.configure do
# Settings specified here will take precedence over those in config/application.rb. # Settings specified here will take precedence over those in config/application.rb.
......
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
Rails.application.configure do Rails.application.configure do
# Settings specified here will take precedence over those in config/application.rb. # Settings specified here will take precedence over those in config/application.rb.
...@@ -53,7 +53,7 @@ Rails.application.configure do ...@@ -53,7 +53,7 @@ Rails.application.configure do
config.log_level = :info config.log_level = :info
# Prepend all log lines with the following tags. # Prepend all log lines with the following tags.
config.log_tags = [ :request_id ] config.log_tags = [:request_id]
# Use a different cache store in production. # Use a different cache store in production.
# config.cache_store = :mem_cache_store # config.cache_store = :mem_cache_store
...@@ -88,7 +88,7 @@ Rails.application.configure do ...@@ -88,7 +88,7 @@ Rails.application.configure do
# require "syslog/logger" # require "syslog/logger"
# config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name') # config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name')
if ENV["RAILS_LOG_TO_STDOUT"].present? if ENV['RAILS_LOG_TO_STDOUT'].present?
logger = ActiveSupport::Logger.new(STDOUT) logger = ActiveSupport::Logger.new(STDOUT)
logger.formatter = config.log_formatter logger.formatter = config.log_formatter
config.logger = ActiveSupport::TaggedLogging.new(logger) config.logger = ActiveSupport::TaggedLogging.new(logger)
......
require "active_support/core_ext/integer/time" require 'active_support/core_ext/integer/time'
# The test environment is used exclusively to run your application's # The test environment is used exclusively to run your application's
# test suite. You never need to work with it otherwise. Remember that # test suite. You never need to work with it otherwise. Remember that
......
...@@ -5,4 +5,4 @@ ...@@ -5,4 +5,4 @@
# You can also remove all the silencers if you're trying to debug a problem that might stem from framework code # You can also remove all the silencers if you're trying to debug a problem that might stem from framework code
# by setting BACKTRACE=1 before calling your invocation, like "BACKTRACE=1 ./bin/rails runner 'MyClass.perform'". # by setting BACKTRACE=1 before calling your invocation, like "BACKTRACE=1 ./bin/rails runner 'MyClass.perform'".
Rails.backtrace_cleaner.remove_silencers! if ENV["BACKTRACE"] Rails.backtrace_cleaner.remove_silencers! if ENV['BACKTRACE']
# Be sure to restart your server when you modify this file. # Be sure to restart your server when you modify this file.
# Configure sensitive parameters which will be filtered from the log file. # Configure sensitive parameters which will be filtered from the log file.
Rails.application.config.filter_parameters += [ Rails.application.config.filter_parameters += %i[
:passw, :secret, :token, :_key, :crypt, :salt, :certificate, :otp, :ssn passw secret token _key crypt salt certificate otp ssn
] ]
...@@ -4,25 +4,25 @@ ...@@ -4,25 +4,25 @@
# the maximum value specified for Puma. Default is set to 5 threads for minimum # the maximum value specified for Puma. Default is set to 5 threads for minimum
# and maximum; this matches the default thread size of Active Record. # and maximum; this matches the default thread size of Active Record.
# #
max_threads_count = ENV.fetch("RAILS_MAX_THREADS") { 5 } max_threads_count = ENV.fetch('RAILS_MAX_THREADS') { 5 }
min_threads_count = ENV.fetch("RAILS_MIN_THREADS") { max_threads_count } min_threads_count = ENV.fetch('RAILS_MIN_THREADS') { max_threads_count }
threads min_threads_count, max_threads_count threads min_threads_count, max_threads_count
# Specifies the `worker_timeout` threshold that Puma will use to wait before # Specifies the `worker_timeout` threshold that Puma will use to wait before
# terminating a worker in development environments. # terminating a worker in development environments.
# #
worker_timeout 3600 if ENV.fetch("RAILS_ENV", "development") == "development" worker_timeout 3600 if ENV.fetch('RAILS_ENV', 'development') == 'development'
# Specifies the `port` that Puma will listen on to receive requests; default is 3000. # Specifies the `port` that Puma will listen on to receive requests; default is 3000.
# #
port ENV.fetch("PORT") { 3000 } port ENV.fetch('PORT') { 3000 }
# Specifies the `environment` that Puma will run in. # Specifies the `environment` that Puma will run in.
# #
environment ENV.fetch("RAILS_ENV") { "development" } environment ENV.fetch('RAILS_ENV') { 'development' }
# Specifies the `pidfile` that Puma will use. # Specifies the `pidfile` that Puma will use.
pidfile ENV.fetch("PIDFILE") { "tmp/pids/server.pid" } pidfile ENV.fetch('PIDFILE') { 'tmp/pids/server.pid' }
# Specifies the number of `workers` to boot in clustered mode. # Specifies the number of `workers` to boot in clustered mode.
# Workers are forked web server processes. If using threads and workers together # Workers are forked web server processes. If using threads and workers together
......
Spring.watch( Spring.watch(
".ruby-version", '.ruby-version',
".rbenv-vars", '.rbenv-vars',
"tmp/restart.txt", 'tmp/restart.txt',
"tmp/caching-dev.txt" 'tmp/caching-dev.txt'
) )
require 'nokogiri'
require 'open-uri'
require 'byebug'
# Crawls careerbuilder.vn and prints, for every job-listing page, the details
# of each job on that page followed by the details of each job's company.
#
# Pages are fetched with URI.open and parsed with Nokogiri; everything is
# written to standard output with puts.
#
# NOTE(review): the CSS selectors and the paginated URL pattern are tied to the
# site's markup — confirm they still match before relying on the output.
def scraper
  home_page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
  # Read and parse the URL that lists all jobs (first link of the dropdown menu).
  list_url = home_page.css('div.menu div.dropdown-menu ul li a')[0].attributes['href'].value
  first_listing = Nokogiri::HTML(URI.open(list_url))

  per_page = first_listing.css('div.job-item').length
  total = first_listing.css('div.job-found p').text.split(' ')[0].to_s.gsub(',', '').to_i
  # Nothing to crawl; this also avoids a float division by zero below.
  return if per_page.zero?

  # ceil, not round: a final, partially filled page must still be visited.
  last_page = (total.to_f / per_page).ceil

  # The page number advances once per listing page, not once per job.
  (1..last_page).each do |page|
    page_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"
    job_items = Nokogiri::HTML(URI.open(page_url)).css('div.job-item')

    # First pass prints the jobs and remembers the employer name shown on each
    # job page, so the second pass can print it next to the company details.
    employer_names = job_items.map { |job_item| scraper_print_job(job_item) }
    job_items.zip(employer_names).each do |job_item, employer_name|
      scraper_print_company(job_item, employer_name)
    end
  end
end

# Collapses every run of whitespace in +text+ to one space and trims the ends.
def scraper_squish(text)
  text.gsub(/\s+/, ' ').strip
end

# Prints the title, summary fields and description sections of the job linked
# from +job_item+ (second anchor of the listing entry).
# Returns the employer name found on the job page, or nil when it is absent.
def scraper_print_job(job_item)
  job_url = job_item.css('a')[1].attributes['href'].value
  job_page = Nokogiri::HTML(URI.open(job_url)).css('div.container')
  puts job_page.css('div.job-desc h1.title')[0]&.text

  job_page.css('div.detail-box.has-background ul li').each do |field|
    case field.css('strong').text
    when 'Lương', 'Kinh nghiệm', 'Cấp bậc', 'Hết hạn nộp'
      puts scraper_squish(field.css('p').text)
    end
  end

  job_page.css('div.detail-row').each do |section|
    case section.css('h3').text
    when 'Mô tả Công việc', 'Yêu Cầu Công Việc'
      puts scraper_squish(section.css('p').text)
    when 'Thông tin khác'
      puts scraper_squish(section.css('div.content_fck ul li').text)
    end
  end

  job_page.css('div.job-desc a.employer.job-company-name')[0]&.text
end

# Prints +employer_name+ and the introduction/message sections of the company
# page linked from +job_item+ (first anchor of the listing entry).
def scraper_print_company(job_item, employer_name)
  company_url = job_item.css('a')[0].attributes['href'].value
  company_page = Nokogiri::HTML(URI.open(company_url))
  puts employer_name

  # NOTE(review): 'company-content' matches elements with that tag name; a
  # class selector ('.company-content') may have been intended — confirm
  # against the page markup.
  company_page.css('company-content').each do |section|
    case section.css('h3').text
    when 'Giới thiệu về công ty', 'Thông điệp từ CÔNG TY'
      puts scraper_squish(section.css('p').text)
    end
  end
end
scraper
\ No newline at end of file
# Drops the string column `address` from the `cities` table.
class RemoveAddressFromCities < ActiveRecord::Migration[6.1]
  # The column type is passed so the removal can be reverted on rollback.
  def change
    remove_column(:cities, :address, :string)
  end
end
# Drops the `message`, `benefits`, `type` and `total_employee` columns from
# the `companies` table.
class RemoveMessageFromCompanies < ActiveRecord::Migration[6.1]
  def change
    # Column name => column type, removed in this order; the type is passed
    # along so each removal can be reverted on rollback.
    dropped_columns = {
      message: :text,
      benefits: :string,
      type: :string,
      total_employee: :integer
    }

    dropped_columns.each do |column_name, column_type|
      remove_column(:companies, column_name, column_type)
    end
  end
end
# Adds a string `address` column and a text `overview` column to the
# `companies` table.
class AddAddressOverviewToCompanies < ActiveRecord::Migration[6.1]
  def change
    change_table :companies do |t|
      t.string :address
      t.text :overview
    end
  end
end
# Drops the string column `degree` from the `jobs` table.
class RemoveDegreeFromJobs < ActiveRecord::Migration[6.1]
  # The column type is passed so the removal can be reverted on rollback.
  def change
    remove_column(:jobs, :degree, :string)
  end
end
# Turns `jobs.salary` into a string column and drops the `industries_type`
# and `location` text columns from `jobs`.
class ChangeJobs < ActiveRecord::Migration[6.1]
  # Applies the schema change.
  def up
    change_column :jobs, :salary, :string
    remove_column :jobs, :industries_type, :text
    remove_column :jobs, :location, :text
  end

  # Rolls the schema change back.
  #
  # The two text columns were removed by #up, so they have to be re-created
  # with add_column; change_column would fail because the columns no longer
  # exist.
  #
  # NOTE(review): converting `salary` back to integer may fail or lose data if
  # non-numeric strings were stored in the meantime — confirm before rolling
  # back on real data.
  def down
    change_column :jobs, :salary, :integer
    add_column :jobs, :industries_type, :text
    add_column :jobs, :location, :text
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2021_07_09_043354) do ActiveRecord::Schema.define(version: 2021_07_20_145646) do
create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "apply_jobs", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false t.bigint "user_id", null: false
...@@ -25,7 +25,6 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -25,7 +25,6 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
create_table "cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "region_id", null: false t.bigint "region_id", null: false
t.string "name" t.string "name"
t.string "address"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.index ["region_id"], name: "index_cities_on_region_id" t.index ["region_id"], name: "index_cities_on_region_id"
...@@ -43,12 +42,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -43,12 +42,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
create_table "companies", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "companies", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
t.string "name" t.string "name"
t.text "description" t.text "description"
t.integer "total_employee"
t.string "type"
t.string "benefits"
t.text "message"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false t.datetime "updated_at", precision: 6, null: false
t.string "address"
t.text "overview"
end end
create_table "companies_cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t| create_table "companies_cities", charset: "utf8mb4", collation: "utf8mb4_0900_ai_ci", force: :cascade do |t|
...@@ -99,11 +96,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do ...@@ -99,11 +96,10 @@ ActiveRecord::Schema.define(version: 2021_07_09_043354) do
t.text "overview" t.text "overview"
t.text "requirement" t.text "requirement"
t.text "other_requirement" t.text "other_requirement"
t.integer "salary" t.string "salary"
t.string "type" t.string "type"
t.string "level" t.string "level"
t.integer "experience" t.integer "experience"
t.string "degree"
t.string "benefits" t.string "benefits"
t.datetime "expired_at" t.datetime "expired_at"
t.datetime "created_at", precision: 6, null: false t.datetime "created_at", precision: 6, null: false
......
require 'open-uri' require 'open-uri'
namespace :crawler do namespace :crawler do
desc "TODO" desc 'Crawl Jobs and Companies'
task jobs: :environment do task jobs: :environment do
base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/')) base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
list_url = base_url.css('div.menu div.dropdown-menu ul li a')[0].attributes["href"].value job_page = base_url.css('div.menu div.dropdown-menu ul li a')[0].attributes['href'].value
parse_list_url = Nokogiri::HTML(URI.open(list_url)) parse_job_page = Nokogiri::HTML(URI.open(job_page))
job_listing = parse_list_url.css('div.job-item') job_listing = parse_job_page.css('div.job-item')
page = 1 page = 1
per_page = job_listing.length per_page = job_listing.length
total = parse_list_url.css('div.job-found p').text.split(' ')[0].gsub(',','').to_i total = parse_job_page.css('div.job-found p').text.split(' ')[0].gsub(',', '').to_i
last_page = (total.to_f / per_page.to_f).round last_page = (total.to_f / per_page).round
while page <= last_page while page <= last_page
pagination_list_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html" pagination_page_job = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"
parse_list_url = Nokogiri::HTML(URI.open(pagination_list_url)) parse_pagination_page_job = Nokogiri::HTML(URI.open(pagination_page_job ))
pagination_job_listing = parse_list_url.css('div.job-item') pagination_job_listing = parse_pagination_page_job.css('div.job-item')
pagination_job_listing.each do |detail_jobs| pagination_job_listing.each do |detail_jobs|
pagination_url = detail_jobs.css('a')[1].attributes["href"].value company_page = detail_jobs.css('a.company-name').attribute('href').value
parse_pagination_url = Nokogiri::HTML(URI.open(pagination_url)) parse_company_url = Nokogiri::HTML(URI.open(company_page))
pagination_detail_job = parse_pagination_url.css('div.container') company = parse_company_url.css('div.container')
strong_element_value = pagination_detail_job.css('div.detail-box.has-background ul li') company_name = company.css('div.company-info div.content p.name').text
title = pagination_detail_job.css('div.job-desc h1.title')[0].text company_info = company.css('div.company-info div.content')
strong_element_value.each do |title_strong| address = company_info.css('p')[1].text
case title_strong.css('strong').text description = company_info.css('ul li').text
when "Lương" overview = company.css('div.row div.content p').text.gsub(/\s+/, '').strip
salary = title_strong.css('p').text.gsub(/\s+/, " ").strip Company.find_or_create_by(
when "Kinh nghiệm" name: company_name,
experience = title_strong.css('p').text.gsub(/\s+/, " ").strip address: address,
when "Cấp bậc" description: description,
level = title_strong.css('p').text.gsub(/\s+/, " ").strip overview: overview
when "Hết hạn nộp" )
expired_at = title_strong.css('p').text.gsub(/\s+/, " ").strip
end job_detail_page = detail_jobs.css('a.job_link').attribute('href').value
end parse_job_detail_page = Nokogiri::HTML(URI.open(job_detail_page))
h3_element_value = pagination_detail_job.css('div.detail-row') detail_job = parse_job_detail_page.css('div.container')
h3_element_value.each do |h3_element|
case h3_element.css('h3').text title = detail_job.css('div.job-desc h1.title').text
when "Mô tả Công việc"
overview = h3_element.css('p').text.gsub(/\s+/, " ").strip salary, experience, level, expired_at = ''
when "Yêu Cầu Công Việc" industry_type = []
requirement = h3_element.css('p').text.gsub(/\s+/, " ").strip detail_content = detail_job.css('div.detail-box.has-background ul li')
when "Thông tin khác" detail_content.each do |content|
other_requirement = h3_element.css('div.content_fck ul li').text.gsub(/\s+/, " ").strip case content.css('strong').text
when 'Lương'
salary = content.css('p').text.gsub(/\s+/, '').strip
when 'Kinh nghiệm'
experience = content.css('p').text.gsub(/\s+/, '').strip
when 'Cấp bậc'
level = content.css('p').text.gsub(/\s+/, '').strip
when 'Ngành nghề'
industry_type = content.css('p a')
when 'Hết hạn nộp'
expired_at = content.css('p').text.gsub(/\s+/, '').strip
end end
end end
company_url = detail_jobs.css('a')[0].attributes["href"].value benefits, overview, requirement, other_requirement = ''
parse_company_url = Nokogiri::HTML(URI.open(company_url)) detail_require = detail_job.css('div.detail-row')
company = parse_company_url.css('div.container') detail_require.each do |detail|
comapny = company.css('div.company-info div.info div.content p.name').text case detail.css('h3').text
company_info = company.css('div.company-info div.info div.content') when 'Phúc lợi '
address company_info.css('p')[1].text benefits = detail.css('ul li').text.strip
description = company_info.css('ul li').text when 'Mô tả Công việc'
overview = company.css('div.row div.content p').text.gsub(/\s+/, " ").strip overview = detail.css('p').text.strip
when 'Yêu Cầu Công Việc'
requirement = detail.css('p').text.strip
when 'Thông tin khác'
other_requirement = detail.css('div.content_fck ul li').text.gsub('\r\n', '').strip
end end
page +=1
end end
job = Job.find_or_create_by(
title: title,
salary: salary,
experience: experience,
level: level,
expired_at: expired_at,
benefits: benefits,
overview: overview,
requirement: requirement,
other_requirement: other_requirement,
company_id: Company.find_by(name: company_name).id
)
industry_type.each do |industry|
industry_name = industry.text.gsub(/\s+/, '').split('/')
industries = Industry.find_or_create_by(
name: industry_name
)
job.industries << industries
end end
desc "TODO" location = detail_job.css('div.map p a')
location.each do |city|
city_name = city.text
cities = City.find_or_create_by(
name: city_name
)
job.cities << cities
end
end
page += 1
end
end
desc 'Crawl Industries'
task industries: :environment do task industries: :environment do
industries_listing = parse_base_url.css('div.container div.list-of-working-positions div.col-md-6.col-lg-4.cus-col') industries_listing = parse_base_url.css('div.container div.list-of-working-positions div.col-md-6.col-lg-4.cus-col')
industries_listing.each do |industries| industries_listing.each do |industries|
industries_name = industries.css('ul.list-jobs li').text industries_type = industries.css('ul.list-jobs li')
puts 'Added: ' + (industries_name ? industries_name : '') industries_type.each do |industries_name|
name = industries_name.text
Industry.find_or_create_by(
name: name
)
end
end end
end end
desc "TODO" desc 'Crawl Cities'
task cities: :environment do task cities: :environment do
cities = parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location div.jobs-in-country li a') cities = parse_base_url.css('div.container div.jobs-in-country li a')
cities.each do |city| cities.each do |city|
city_name = city.text.gsub('Việc làm tại','') city_name = city.text.gsub('Việc làm tại', '').strip
City.find_or_create_by( City.find_or_create_by(
name: city_name name: city_name,
region_id: Region.find_by(name: 'Trong Nước').id
) )
Region.find_or_create_by(name: 'Trong nước').id
puts 'Added: ' + (city_name ? city_name : '')
end end
cities_foreign = parse_base_url.css('div.container div.overseas-jobs li a') cities_foreign = parse_base_url.css('div.container div.overseas-jobs li a')
cities_foreign.each do |city| cities_foreign.each do |city|
city_name = city.text city_name = city.text.strip
City.find_or_create_by( City.find_or_create_by(
name: city_name name: city_name,
region_id: Region.find_by(name: 'Nước Ngoài').id
) )
Region.find_or_create_by(name: 'Nước Ngoài').id
puts 'Added: ' + (city_name ? city_name : '')
end end
end end
desc 'Crawl Regions'
desc "TODO"
task regions: :environment do task regions: :environment do
regions = parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location h3') regions = parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location h3')
regions.each do |region| regions.each do |region|
region_name region.text.gsub('Việc Làm','') region_name = region.text.gsub('Việc Làm', '').strip
Region.find_or_create_by( Region.find_or_create_by(
name: region_name name: region_name
) )
...@@ -104,7 +151,7 @@ namespace :crawler do ...@@ -104,7 +151,7 @@ namespace :crawler do
def parse_base_url def parse_base_url
base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/')) base_url = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
industries_url = base_url.css('div.menu div.dropdown-menu ul li a')[1].attributes["href"].value industries_url = base_url.css('div.menu div.dropdown-menu ul li a')[1].attributes['href'].value
parse_industries_url = Nokogiri::HTML(URI.open(industries_url)) Nokogiri::HTML(URI.open(industries_url))
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment