Commit 6d1348ad by Đường Sỹ Hoàng

second commit

parent f24e4f4b
Pipeline #249 canceled with stages
in 0 seconds
source 'https://rubygems.org'
source "https://rubygems.org"
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby '2.6.5'
ruby "2.6.5"
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails'
gem 'rails', '~> 6.0.1'
# Bundle edge Rails instead: gem "rails", github: "rails/rails"
gem "rails", "~> 6.0.1"
# Use mysql2 as the database for Active Record
gem 'mysql2'
gem "mysql2"
# Use Puma as the app server
gem 'puma', '~> 4.1'
gem "puma", "~> 4.1"
# Use SCSS for stylesheets
gem 'sass-rails', '>= 6'
gem "sass-rails", ">= 6"
# Transpile app-like JavaScript. Read more: https://github.com/rails/webpacker
gem 'webpacker', '~> 4.0'
gem "webpacker", "~> 4.0"
# Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks
gem 'turbolinks', '~> 5'
gem "turbolinks", "~> 5"
# Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder
gem 'jbuilder', '~> 2.7'
gem "jbuilder", "~> 2.7"
# Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0'
# gem "redis", "~> 4.0"
# Use Active Model has_secure_password
# gem 'bcrypt', '~> 3.1.7'
# gem "bcrypt", "~> 3.1.7"
# Use Active Storage variant
# gem 'image_processing', '~> 1.2'
# gem "image_processing", "~> 1.2"
# Reduces boot times through caching; required in config/boot.rb
gem 'bootsnap', '>= 1.4.2', require: false
gem 'devise'
source "https://rubygems.org"
gem "bootsnap", ">= 1.4.2", require: false
gem "devise"
gem "nokogiri"
gem "mechanize"
gem "pry"
gem "rubysl-open-uri"
gem "activerecord-import"
gem "whenever", require: false
group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console
gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
# Call "byebug" anywhere in the code to stop execution and get a debugger console
gem "byebug", platforms: [:mri, :mingw, :x64_mingw]
gem "pry"
end
group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2'
# Access an interactive console on exception pages or by calling "console" anywhere in the code.
gem "web-console", ">= 3.3.0"
gem "listen", ">= 3.0.5", "< 3.2"
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring'
gem 'spring-watcher-listen', '~> 2.0.0'
gem "spring"
gem "spring-watcher-listen", "~> 2.0.0"
end
group :test do
# Adds support for Capybara system testing and selenium driver
gem 'capybara', '>= 2.15'
gem 'selenium-webdriver'
gem "capybara", ">= 2.15"
gem "selenium-webdriver"
# Easy installation and use of web drivers to run system tests with browsers
gem 'webdrivers'
gem "webdrivers"
end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby]
......@@ -73,6 +73,7 @@ GEM
regexp_parser (~> 1.5)
xpath (~> 3.2)
childprocess (3.0.0)
chronic (0.10.2)
coderay (1.1.2)
concurrent-ruby (1.1.5)
connection_pool (2.2.2)
......@@ -235,6 +236,8 @@ GEM
websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4)
whenever (1.0.0)
chronic (>= 0.6.3)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.2.1)
......@@ -265,6 +268,7 @@ DEPENDENCIES
web-console (>= 3.3.0)
webdrivers
webpacker (~> 4.0)
whenever
RUBY VERSION
ruby 2.6.5p114
......
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Learn more: http://github.com/javan/whenever
# Scheduled jobs run with the "development" environment setting, and their
# output is directed to the log file configured below.
set :environment, "development"
set :output, "/crawler/config/import_log.log"

# Run the `job:create` rake task once a day at 5:00 pm.
# The comma after `1.day` is required: without it Ruby parses the line as
# `every(1.day(at: ...))`, so the :at option is handed to `1.day` instead of
# to `every`.
every 1.day, at: "5:00 pm" do
  rake "job:create"
end
require "rubygems"
require "open-uri"
require "nokogiri"
require "mechanize"
require "csv"
require "pry"
# Walks the CareerBuilder job listing pages, follows every job-detail link
# found on them and prints selected fields of each posting to stdout.
# Nothing is persisted by this script.
agent = Mechanize.new

# The first listing page carries a counter that is used as the number of
# listing pages to walk. `to_i` yields 0 when the counter cannot be found, in
# which case the loop below does not run.
# URI.open is used instead of Kernel#open so that only URLs are ever opened.
main_page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-vi.html"))
total_page = main_page.css("div.ais-stats").css("h1.col-sm-10").css("span").text.to_i

(1..total_page).each do |num|
  page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{num}-vi.html")

  # Anchors without an href attribute report a nil href; drop them before
  # filtering so that `include?` is never called on nil.
  links = page.links.map(&:href).compact.select do |href|
    href.include?("careerbuilder.vn/vi/tim-viec-lam/")
  end

  links.each do |link|
    # URI::DEFAULT_PARSER.escape is what the obsolete URI.escape delegated to;
    # calling it directly keeps the same escaping without the removed method.
    job_page = Nokogiri::HTML(URI.open(URI::DEFAULT_PARSER.escape(link)))

    # Shared selector prefixes for the posting's summary box.
    top_job = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job")
    detail_list = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew")

    # title
    # `text` always returns a String (possibly empty), so the title is printed
    # unconditionally. NOTE(review): earlier commented-out code hinted at other
    # page templates (div.main_content_right / job-template-2) that are not
    # handled here.
    puts top_job.css("div.top-job-info").css("h1").text
    # company_name
    puts top_job.css("div.tit_company").text
    # updated date
    puts job_page.css("div.datepost").text
    # position
    puts detail_list.css("li.bgLine1").css("p.fl_left").text
    # experienced (looked up without the div#showScroll prefix, as before)
    puts job_page.css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_left").text
    # salary
    puts detail_list.css("li.bgLine2").css("p.fl_right").text
    # industry
    puts detail_list.css("li.bgLine1").css("p.fl_left").css("b").css("a").text
    # expired_date
    puts detail_list.css("li.bgLine1").css("p.fl_right").text
    # job_description (all job information related)
    puts job_page.css("div.LeftJobCB").css("div.MarBot20").text
    # detail company
    puts job_page.css("div.box1Detail").css("p.TitleDetailNew").css("label").text
  end
end
......@@ -11,7 +11,7 @@ class CreateJobs < ActiveRecord::Migration[6.0]
t.string :position
t.datetime :posted_at
t.datetime :expired_at
t.timestamps null: false
end
end
......
# Schema change: adds a string column named `code` to the companies table.
class AddCodeToCompany < ActiveRecord::Migration[6.0]
  # The whole migration is a single add_column call.
  def change
    add_column(:companies, :code, :string)
  end
end
# Schema change: adds a string column named `code` to the jobs table.
class AddCodeToJob < ActiveRecord::Migration[6.0]
  # The whole migration is a single add_column call.
  def change
    add_column(:jobs, :code, :string)
  end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2019_11_27_075301) do
ActiveRecord::Schema.define(version: 2019_12_05_082359) do
create_table "cities", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "name"
......@@ -34,6 +34,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do
t.string "name"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.string "code"
end
create_table "industries", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......@@ -64,6 +65,7 @@ ActiveRecord::Schema.define(version: 2019_11_27_075301) do
t.datetime "expired_at"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.string "code"
end
create_table "user_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......
require "csv"
require "activerecord-import/base"
# Reads app/lib/Venjob.csv (path relative to the current working directory),
# maps each data row to a hash of job attributes and hands the whole list to
# Job.import in a single call.
#
# CSV column -> attribute: name -> title, description -> description,
# "company id" -> company_id, salary -> salary, requirement -> requirement,
# level -> position.
#
# Returns whatever Job.import returns.
def import
  records = CSV.foreach("app/lib/Venjob.csv", headers: true).map do |line|
    {
      title: line["name"],
      description: line["description"],
      company_id: line["company id"],
      salary: line["salary"],
      requirement: line["requirement"],
      position: line["level"]
    }
  end

  Job.import records
end
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -4,71 +4,55 @@ require "nokogiri"
require "mechanize"
require "pry"
namespace :Job do
desc "Import data from crawler to database"
task :import do
puts "Starting import data to database"
end
namespace :job do
desc "crawl data"
task create: :environment do
agent = Mechanize.new
main_page = Nokogiri::HTML(open(URI.escape("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-vi.html")))
total_page = main_page.css("div.ais-stats").css("h1.col-sm-10").css("span").text.to_i
wheneverize
(1..total_page).each do |num|
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{num}-vi.html")
links = page.links.select { |link| link.href.include?("careerbuilder.vn/vi/tim-viec-lam/") }.map(&:href)
links.each do |link|
puts link
job_page = Nokogiri::HTML(open(URI.escape(link)))
if (title1 = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job").css("div.top-job-info").css("h1").text)
puts title1
# elseif
# (title2 = job_page.css("div.main_content_right").css("div.content_470").css("div.box_470").css("div.midle_tile").css("h1 p").text)
# puts title2
# else
# puts job_page.css("div.col-xs-12 job-template-2").css("div.box-shadow col-xs-12 content-job-detail").css("div.col-xs-12 top-job").css("div.top-job-info").css("p")[0].text
begin
title = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job").css("div.top-job-info").css("h1").text
if job_page.css("div.LeftJobCB").present?
description = job_page.css("div.LeftJobCB").css("div.MarBot20")[1].css("div.content_fck").css("p").text
short_description = job_page.css("div.LeftJobCB").css('div.desc_company.content_fck').css('span#emp_collapse').text
requirement = job_page.css("div.LeftJobCB").css("div.MarBot20")[2].css("div.content_fck").css("p").text
elsif job_page.css("div.content_fck.content_job_info").present?
description = job_page.css("div.content_fck.content_job_info").css("div.decs")[0].text
requirement = job_page.css("div.content_fck.content_job_info").css("div.decs")[1].text
elsif job_page.css("div.content_fck.job_requirement").present?
description = job_page.css("div.content_fck.job_requirement").css("div")[0].text
requirement = job_page.css("div.content_fck.job_requirement").css("div")[2].text
elsif job_page.css("div.content_fck").present?
description = job_page.css("div.content_fck")[0].css("p").text
short_description = "N/A"
requirement = job_page.css("div.content_fck")[1].css("p").text
end
salary = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_right").text
position = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine1").css("p.fl_left").text
rescue
puts "Skip #{link}"
next
end
#company_name
puts job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job").css("div.tit_company").text
#updated date
puts job_page.css("div.datepost").text
# puts job_page.css("div.datepost").css("span").text
# #city_job
# puts page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li")[0].css("b").css("a")[1].text
#position
puts job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine1").css("p.fl_left").text
# #experienced
puts job_page.css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_left").text
# #salary
puts job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_right").text
#industry
puts job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine1").css("p.fl_left").css("b").css("a").text
#expired_date
puts job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine1").css("p.fl_right").text
#All job information related
#job_description
puts job_page.css("div.LeftJobCB").css("div.MarBot20").text
# puts job_page.css("div.LeftJobCB").css("div.MarBot20").css("div.content_fck").css("p").text
# detail company
puts job_page.css("div.box1Detail").css("p.TitleDetailNew").css("label").text
end
Job.create!(
title: job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job").css("div.top-job-info").css("h1").text.to_s
description: job_page.css("div.LeftJobCB").css("div.MarBot20").css("h4.TitleJobNew").text
description: job_page.css("div.LeftJobCB").css("div.MarBot20").css("div.content_fck").css("p").text
short_description: job_page.css("div.LeftJobCB").css('div.desc_company.content_fck').css('span#emp_collapse').text
salary: job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_right").text
requirement: job_page.css("div.LeftJobCB").css("div.MarBot20").css("h4.TitleJobNew").text
requirement: job_page.css("div.LeftJobCB").css("div.MarBot20").css("div.content_fck").css("p").text
position: job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew").css("li.bgLine1").css("p.fl_left").text.to_s)
Job.save!
post_code = /.([^.]*).html/.match(link)
job = Job.find_or_initialize_by(code: post_code[1])
job.update(
title: title,
description: description,
short_description: short_description,
salary: salary,
requirement: requirement,
position: position)
end
end
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment