Commit f24e4f4b by Đường Sỹ Hoàng

First commit

parent 159d97f4
Pipeline #248 failed with stages
in 0 seconds
...@@ -6,7 +6,7 @@ ruby '2.6.5' ...@@ -6,7 +6,7 @@ ruby '2.6.5'
# Bundle edge Rails instead: gem 'rails', github: 'rails/rails' # Bundle edge Rails instead: gem 'rails', github: 'rails/rails'
gem 'rails', '~> 6.0.1' gem 'rails', '~> 6.0.1'
# Use mysql2 as the database for Active Record # Use mysql2 as the database for Active Record
gem 'mysql2' gem 'mysql2'
# Use Puma as the app server # Use Puma as the app server
gem 'puma', '~> 4.1' gem 'puma', '~> 4.1'
# Use SCSS for stylesheets # Use SCSS for stylesheets
...@@ -30,6 +30,18 @@ gem 'bootsnap', '>= 1.4.2', require: false ...@@ -30,6 +30,18 @@ gem 'bootsnap', '>= 1.4.2', require: false
gem 'devise' gem 'devise'
# Gems added for the careerbuilder.vn crawler and the CSV bulk import.
# NOTE(review): a `source` declaration in the middle of the Gemfile; presumably
# the file already declares its source at the top - confirm and drop if redundant.
source "https://rubygems.org"
# HTML parsing of the downloaded job pages.
gem "nokogiri"
# Fetches the listing pages and exposes their links.
gem "mechanize"
# Interactive debugging console (required by the crawler script).
gem "pry"
# NOTE(review): the crawler only does `require "open-uri"`, which looks like the
# standard-library module; this gem is presumably unnecessary on MRI - confirm.
gem "rubysl-open-uri"
# Provides the bulk `Job.import` call used by the CSV import helper.
gem "activerecord-import"
group :development, :test do group :development, :test do
# Call 'byebug' anywhere in the code to stop execution and get a debugger console # Call 'byebug' anywhere in the code to stop execution and get a debugger console
gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
......
...@@ -58,15 +58,10 @@ GEM ...@@ -58,15 +58,10 @@ GEM
zeitwerk (~> 2.2) zeitwerk (~> 2.2)
addressable (2.7.0) addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0) public_suffix (>= 2.0.2, < 5.0)
autoprefixer-rails (9.7.2)
execjs
bcrypt (3.1.13) bcrypt (3.1.13)
bindex (0.8.1) bindex (0.8.1)
bootsnap (1.4.5) bootsnap (1.4.5)
msgpack (~> 1.0) msgpack (~> 1.0)
bootstrap-sass (3.3.7)
autoprefixer-rails (>= 5.2.1)
sass (>= 3.3.4)
builder (3.2.3) builder (3.2.3)
byebug (11.0.1) byebug (11.0.1)
capybara (3.29.0) capybara (3.29.0)
...@@ -78,7 +73,9 @@ GEM ...@@ -78,7 +73,9 @@ GEM
regexp_parser (~> 1.5) regexp_parser (~> 1.5)
xpath (~> 3.2) xpath (~> 3.2)
childprocess (3.0.0) childprocess (3.0.0)
coderay (1.1.2)
concurrent-ruby (1.1.5) concurrent-ruby (1.1.5)
connection_pool (2.2.2)
crass (1.0.5) crass (1.0.5)
devise (4.7.1) devise (4.7.1)
bcrypt (~> 3.0) bcrypt (~> 3.0)
...@@ -86,11 +83,14 @@ GEM ...@@ -86,11 +83,14 @@ GEM
railties (>= 4.1.0) railties (>= 4.1.0)
responders responders
warden (~> 1.2.3) warden (~> 1.2.3)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.9.0) erubi (1.9.0)
execjs (2.7.0)
ffi (1.11.3) ffi (1.11.3)
globalid (0.4.2) globalid (0.4.2)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.7.0) i18n (1.7.0)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
jbuilder (2.9.1) jbuilder (2.9.1)
...@@ -106,17 +106,36 @@ GEM ...@@ -106,17 +106,36 @@ GEM
mini_mime (>= 0.1.1) mini_mime (>= 0.1.1)
marcel (0.3.3) marcel (0.3.3)
mimemagic (~> 0.3.2) mimemagic (~> 0.3.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (0.9.2) method_source (0.9.2)
mime-types (3.3)
mime-types-data (~> 3.2015)
mime-types-data (3.2019.1009)
mimemagic (0.3.3) mimemagic (0.3.3)
mini_mime (1.0.2) mini_mime (1.0.2)
mini_portile2 (2.4.0) mini_portile2 (2.4.0)
minitest (5.13.0) minitest (5.13.0)
msgpack (1.3.1) msgpack (1.3.1)
mysql2 (0.5.3) mysql2 (0.5.3)
net-http-digest_auth (1.4.1)
net-http-persistent (3.1.0)
connection_pool (~> 2.2)
nio4r (2.5.2) nio4r (2.5.2)
nokogiri (1.10.5) nokogiri (1.10.5)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
orm_adapter (0.5.0) orm_adapter (0.5.0)
pry (0.12.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
public_suffix (4.0.1) public_suffix (4.0.1)
puma (4.3.0) puma (4.3.0)
nio4r (~> 2.0) nio4r (~> 2.0)
...@@ -160,12 +179,8 @@ GEM ...@@ -160,12 +179,8 @@ GEM
actionpack (>= 5.0) actionpack (>= 5.0)
railties (>= 5.0) railties (>= 5.0)
ruby_dep (1.5.0) ruby_dep (1.5.0)
rubysl-open-uri (2.0.0)
rubyzip (2.0.0) rubyzip (2.0.0)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sass-rails (6.0.0) sass-rails (6.0.0)
sassc-rails (~> 2.1, >= 2.1.1) sassc-rails (~> 2.1, >= 2.1.1)
sassc (2.2.1) sassc (2.2.1)
...@@ -198,6 +213,9 @@ GEM ...@@ -198,6 +213,9 @@ GEM
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (1.2.5) tzinfo (1.2.5)
thread_safe (~> 0.1) thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.6)
warden (1.2.8) warden (1.2.8)
rack (>= 2.0.6) rack (>= 2.0.6)
web-console (4.0.1) web-console (4.0.1)
...@@ -213,6 +231,7 @@ GEM ...@@ -213,6 +231,7 @@ GEM
activesupport (>= 4.2) activesupport (>= 4.2)
rack-proxy (>= 0.6.1) rack-proxy (>= 0.6.1)
railties (>= 4.2) railties (>= 4.2)
webrobots (0.1.2)
websocket-driver (0.7.1) websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4) websocket-extensions (0.1.4)
...@@ -225,15 +244,18 @@ PLATFORMS ...@@ -225,15 +244,18 @@ PLATFORMS
DEPENDENCIES DEPENDENCIES
bootsnap (>= 1.4.2) bootsnap (>= 1.4.2)
bootstrap-sass (= 3.3.7)
byebug byebug
capybara (>= 2.15) capybara (>= 2.15)
devise devise
jbuilder (~> 2.7) jbuilder (~> 2.7)
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mechanize
mysql2 mysql2
nokogiri
pry
puma (~> 4.1) puma (~> 4.1)
rails (~> 6.0.1) rails (~> 6.0.1)
rubysl-open-uri
sass-rails (>= 6) sass-rails (>= 6)
selenium-webdriver selenium-webdriver
spring spring
......
require "rubygems"
require "open-uri"
require "nokogiri"
require "mechanize"
require "csv"
require "pry"
# Crawls the careerbuilder.vn job listings and prints the scraped fields of
# every job posting to stdout, one field per `puts`.
#
# Flow: read the page count from the landing page, walk every listing page
# with Mechanize, then download and parse each linked job page with Nokogiri.
agent = Mechanize.new

# Downloads +url+ and returns it parsed as a Nokogiri HTML document.
# The URL is escaped with the same rules the obsolete URI.escape applied, and
# it is opened through URI#open instead of Kernel#open: the hrefs come from a
# remote page, and Kernel#open would run a string starting with "|" as a
# shell command. The block form closes the downloaded IO once it is parsed.
fetch_document = lambda do |url|
  URI.parse(URI::DEFAULT_PARSER.escape(url)).open { |io| Nokogiri::HTML(io) }
end

main_page = fetch_document.call("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-vi.html")

# NOTE(review): `to_i` yields 0 when the selector matches nothing, in which
# case no listing page is crawled - confirm this span really holds the page count.
total_page = main_page.css("div.ais-stats").css("h1.col-sm-10").css("span").text.to_i

(1..total_page).each do |num|
  page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{num}-vi.html")

  # Anchors without an href report nil, so drop those before matching.
  # `uniq` keeps a posting that is linked several times on one listing page
  # from being downloaded and printed more than once.
  links = page.links
              .map(&:href)
              .compact
              .select { |href| href.include?("careerbuilder.vn/vi/tim-viec-lam/") }
              .uniq

  links.each do |link|
    job_page = fetch_document.call(link)

    # Shared ancestors of several fields below.
    top_job = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job")
    detail = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew")

    # title
    # TODO: only the default page template is handled; postings that use another
    # layout (e.g. one built around `div.main_content_right`) print an empty title.
    puts top_job.css("div.top-job-info").css("h1").text
    # company_name
    puts top_job.css("div.tit_company").text
    # updated date
    puts job_page.css("div.datepost").text
    # position
    puts detail.css("li.bgLine1").css("p.fl_left").text
    # experienced (searched from the document root, unlike the other detail fields)
    puts job_page.css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_left").text
    # salary
    puts detail.css("li.bgLine2").css("p.fl_right").text
    # industry
    puts detail.css("li.bgLine1").css("p.fl_left").css("b").css("a").text
    # expired_date
    puts detail.css("li.bgLine1").css("p.fl_right").text
    # job_description (all job information related)
    puts job_page.css("div.LeftJobCB").css("div.MarBot20").text
    # detail company
    puts job_page.css("div.box1Detail").css("p.TitleDetailNew").css("label").text
  end
end
require "csv"
require "activerecord-import/base"
# Bulk-loads jobs from a CSV file into the Job model.
#
# The file is read with its first line as headers, and each row is mapped
# onto Job attributes:
#   "name" -> title, "description" -> description, "company id" -> company_id,
#   "salary" -> salary, "requirement" -> requirement, "level" -> position
#
# @param path [String] CSV file to read; defaults to the location that was
#   previously hard-coded, so existing no-argument callers behave the same
# @return the result of Job.import, or nil when the file has no data rows
def import(path = "app/lib/Venjob.csv")
  jobs = CSV.foreach(path, headers: true).map do |row|
    {
      title: row["name"],
      description: row["description"],
      company_id: row["company id"],
      salary: row["salary"],
      requirement: row["requirement"],
      position: row["level"]
    }
  end

  # Nothing to insert: skip the bulk call rather than hand it an empty list.
  return if jobs.empty?

  Job.import jobs
end
This source diff could not be displayed because it is too large. You can view the blob instead.
require "rubygems"
require "open-uri"
require "nokogiri"
require "mechanize"
require "pry"
# Rake tasks that feed the Job model from the careerbuilder.vn crawler.
namespace :Job do
  desc "Import data from crawler to database"
  task :import do
    puts "Starting import data to database"
  end

  desc "Crawl careerbuilder.vn job pages and store each posting as a Job record"
  task create: :environment do
    agent = Mechanize.new

    # Downloads +url+ and returns it parsed as a Nokogiri HTML document.
    # The URL is escaped with the same rules the obsolete URI.escape applied,
    # and it is opened through URI#open instead of Kernel#open: the hrefs come
    # from a remote page, and Kernel#open would run a string starting with "|"
    # as a shell command. The block form closes the downloaded IO after parsing.
    fetch_document = lambda do |url|
      URI.parse(URI::DEFAULT_PARSER.escape(url)).open { |io| Nokogiri::HTML(io) }
    end

    main_page = fetch_document.call("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-vi.html")

    # NOTE(review): `to_i` yields 0 when the selector matches nothing, in which
    # case no listing page is crawled - confirm this span really holds the page count.
    total_page = main_page.css("div.ais-stats").css("h1.col-sm-10").css("span").text.to_i

    (1..total_page).each do |num|
      page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{num}-vi.html")

      # Anchors without an href report nil, so drop those before matching.
      # `uniq` keeps a posting linked several times on one listing page from
      # being stored as several Job rows.
      links = page.links
                  .map(&:href)
                  .compact
                  .select { |href| href.include?("careerbuilder.vn/vi/tim-viec-lam/") }
                  .uniq

      links.each do |link|
        job_page = fetch_document.call(link)

        # Shared ancestors of several fields below.
        top_job = job_page.css("div.MyJobDetail").css("div.MyJobLeft").css("div.LeftJobCB").css("div.top-job")
        detail = job_page.css("div#showScroll.box2Detail").css("ul.DetailJobNew")
        left_job = job_page.css("div.LeftJobCB")

        # TODO: only the default page template is handled; postings that use
        # another layout (e.g. one built around `div.main_content_right`) yield
        # an empty title.
        title = top_job.css("div.top-job-info").css("h1").text
        position = detail.css("li.bgLine1").css("p.fl_left").text
        salary = detail.css("li.bgLine2").css("p.fl_right").text
        content = left_job.css("div.MarBot20").css("div.content_fck").css("p").text

        # Progress/debug output, one scraped field per line.
        puts title
        # company_name
        puts top_job.css("div.tit_company").text
        # updated date
        puts job_page.css("div.datepost").text
        # position
        puts position
        # experienced (searched from the document root, unlike the other detail fields)
        puts job_page.css("ul.DetailJobNew").css("li.bgLine2").css("p.fl_left").text
        # salary
        puts salary
        # industry
        puts detail.css("li.bgLine1").css("p.fl_left").css("b").css("a").text
        # expired_date
        puts detail.css("li.bgLine1").css("p.fl_right").text
        # job_description (all job information related)
        puts left_job.css("div.MarBot20").text
        # detail company
        puts job_page.css("div.box1Detail").css("p.TitleDetailNew").css("label").text

        # Persist the posting. `create!` saves the record itself and raises on
        # validation failure, so no separate save call is needed.
        # NOTE(review): the earlier draft listed `description:` and `requirement:`
        # twice each; the later value (the content paragraphs) is kept for both,
        # so the two columns currently receive the same text - confirm the
        # intended selectors.
        Job.create!(
          title: title,
          description: content,
          short_description: left_job.css("div.desc_company.content_fck").css("span#emp_collapse").text,
          salary: salary,
          requirement: content,
          position: position
        )
      end
    end
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment