Commit c797139b by Hung0326 Committed by GitHub

Merge pull request #5 from Hung0326/dev

40% create crawler job
parents e529714f 1326d754
...@@ -20,7 +20,7 @@ gem 'coffee-rails', '~> 4.2' ...@@ -20,7 +20,7 @@ gem 'coffee-rails', '~> 4.2'
gem 'turbolinks', '~> 5' gem 'turbolinks', '~> 5'
# Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder # Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder
gem 'jbuilder', '~> 2.5' gem 'jbuilder', '~> 2.5'
gem 'mechanize', '2.7.6' gem 'nokogiri'
# Use Redis adapter to run Action Cable in production # Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0' # gem 'redis', '~> 4.0'
# Use ActiveModel has_secure_password # Use ActiveModel has_secure_password
......
...@@ -72,17 +72,12 @@ GEM ...@@ -72,17 +72,12 @@ GEM
execjs execjs
coffee-script-source (1.12.2) coffee-script-source (1.12.2)
concurrent-ruby (1.1.6) concurrent-ruby (1.1.6)
connection_pool (2.2.3)
crass (1.0.6) crass (1.0.6)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.9.0) erubi (1.9.0)
execjs (2.7.0) execjs (2.7.0)
ffi (1.13.1) ffi (1.13.1)
globalid (0.4.2) globalid (0.4.2)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.8.3) i18n (1.8.3)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
io-like (0.3.1) io-like (0.3.1)
...@@ -100,32 +95,16 @@ GEM ...@@ -100,32 +95,16 @@ GEM
mini_mime (>= 0.1.1) mini_mime (>= 0.1.1)
marcel (0.3.3) marcel (0.3.3)
mimemagic (~> 0.3.2) mimemagic (~> 0.3.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (1.0.0) method_source (1.0.0)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2020.0512)
mimemagic (0.3.5) mimemagic (0.3.5)
mini_mime (1.0.2) mini_mime (1.0.2)
mini_portile2 (2.4.0) mini_portile2 (2.4.0)
minitest (5.14.1) minitest (5.14.1)
msgpack (1.3.3) msgpack (1.3.3)
mysql2 (0.5.3) mysql2 (0.5.3)
net-http-digest_auth (1.4.1)
net-http-persistent (4.0.0)
connection_pool (~> 2.2)
nio4r (2.5.2) nio4r (2.5.2)
nokogiri (1.10.10) nokogiri (1.10.10)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
public_suffix (4.0.5) public_suffix (4.0.5)
puma (3.12.6) puma (3.12.6)
rack (2.2.3) rack (2.2.3)
...@@ -201,15 +180,11 @@ GEM ...@@ -201,15 +180,11 @@ GEM
thread_safe (~> 0.1) thread_safe (~> 0.1)
uglifier (4.2.0) uglifier (4.2.0)
execjs (>= 0.3.0, < 3) execjs (>= 0.3.0, < 3)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.7)
web-console (3.7.0) web-console (3.7.0)
actionview (>= 5.0) actionview (>= 5.0)
activemodel (>= 5.0) activemodel (>= 5.0)
bindex (>= 0.4.0) bindex (>= 0.4.0)
railties (>= 5.0) railties (>= 5.0)
webrobots (0.1.2)
websocket-driver (0.7.3) websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5) websocket-extensions (0.1.5)
...@@ -227,8 +202,8 @@ DEPENDENCIES ...@@ -227,8 +202,8 @@ DEPENDENCIES
coffee-rails (~> 4.2) coffee-rails (~> 4.2)
jbuilder (~> 2.5) jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mechanize (= 2.7.6)
mysql2 (= 0.5.3) mysql2 (= 0.5.3)
nokogiri
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0) sass-rails (~> 5.0)
......
class ApplicationController < ActionController::Base class ApplicationController < ActionController::Base
require 'nokogiri'
require 'open-uri'
include CrawlerHelper include CrawlerHelper
end end
class HomeController < ApplicationController class HomeController < ApplicationController
def index def index
craw_data_companies() # crawl_data_jobs_interface_1()
# val = "\r\n \r\n\tĐược thành lập vào năm 2002, Công ty Cổ phần Maison (Maison) đã nhanh chóng trở thành một trong những công ty phân phối thời trang lớn nhất trong việc giới thiệu các thương hiệu thời trang cao cấp và sang trọng tại Việt Nam. Sau thành công ngoài mong đợi của cửa hàng đầu tiên, hiện Maison đang là ngôi nhà chung của hơn 23 thương hiệu đình đám thế giới.\r\n\r\n\t \r\n\r\n\tNhân viên là tài sản giá trị nhất của chúng tôi bởi vai trò quan trọng họ mang lại trong quá trình đạt được mục tiêu kinh doanh của Maison. Chúng tôi luôn hỗ trợ phát triển kỹ năng và chuyên môn ở từng giai đoạn sự nghiệp của mọi người.\r\n\r\n\tLuôn có những cơ hội mở để phát triển bản thân và sự nghiệp, bởi chúng tôi hiểu rằng con người là tài sản cốt lõi để củng cố doanh nghiệp và danh tiếng thương hiệu.\r\n\r\n\tChúng tôi \"will be ahihi\" tin rằng áp dụng những giá trị này vào thực tế sẽ tạo nền tảng lợi ích lâu dài đối với con người.\r\n\r\n\t \r\n\r\n " # crawl_data_jobs_interface_2()
# val.to_s.delete!("[\n,\t,\r]") crawl_data_jobs_interface_3()
# val.lstrip!
# val.rstrip!
# @data = val
end end
end end
...@@ -14,7 +14,7 @@ default: &default ...@@ -14,7 +14,7 @@ default: &default
encoding: utf8 encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root username: root
password: '12345678' password: '1'
socket: /var/run/mysqld/mysqld.sock socket: /var/run/mysqld/mysqld.sock
......
...@@ -9,7 +9,7 @@ threads threads_count, threads_count ...@@ -9,7 +9,7 @@ threads threads_count, threads_count
# Specifies the `port` that Puma will listen on to receive requests; default is 3000. # Specifies the `port` that Puma will listen on to receive requests; default is 3000.
# #
port ENV.fetch("PORT") { 1111 } port ENV.fetch("PORT") { 3000 }
# Specifies the `environment` that Puma will run in. # Specifies the `environment` that Puma will run in.
# #
......
require 'open-uri'
namespace :db do namespace :db do
task populate: :environment do task populate: :environment do
make_cities
make_industries
make_companies
end end
# CRAWLER ALL CITIES # CRAWLER ALL CITIES
task make_cities: :environment do $page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
@data_list_cities = [] p1 = Nokogiri::HTML(URI.open('https://careerbuilder.vn/vi/tim-viec-lam/vinhomes-chuyen-vien-thu-tuc-bat-dong-san.35B449B5.html'))
agent = Mechanize.new def make_cities
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html") @data_list_cities = []
data = page.search("#location option") data = $page.search("#location option")
list_cities = data.to_s.split("</option>") list_cities = data.to_s.split("</option>")
list_cities.each do |x| list_cities.each do |x|
@data_list_cities << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip @data_list_cities << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end end
for i in 0..(@data_list_cities.length - 1) @data_list_cities.length.times do |i|
if i <= 69 if i <= 69
name = (@data_list_cities[i].to_s) name = (@data_list_cities[i].to_s)
City.create!(name: name, area: 1) City.create!(name: name, area: 1)
...@@ -25,72 +29,79 @@ namespace :db do ...@@ -25,72 +29,79 @@ namespace :db do
end end
# CRAWLER ALL INDUSTRIES # CRAWLER ALL INDUSTRIES
task make_industry: :environment do def make_industries
@data_list_industries = [] @data_list_industries = []
agent = Mechanize.new data = $page.search("#industry option")
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
data = page.search("#industry option")
list_industries = data.to_s.split("</option>") list_industries = data.to_s.split("</option>")
list_industries.each do |x| list_industries.each do |x|
@data_list_industries << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip @data_list_industries << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end end
for i in 0..(@data_list_industries.length - 1) @data_list_industries.length.times do |i|
name = (@data_list_industries[i].to_s) name = (@data_list_industries[i].to_s)
Industry.create!(name: name) Industry.create!(name: name)
end end
end end
# CRAWLER COMPANIES # CRAWLER LINK JOB & COMPANIES
task make_companies: :environment do def crawl_link_for_companies_jobs
def crawl_link_for_companies_data data = []
@website = [] website_companies = []
num_page_will_crawl = 1 website_jobs = []
agent = Mechanize.new num_page_will_crawl = 3
for i in 1..num_page_will_crawl do num_page_will_crawl.times do |i|
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i}-vi.html") page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i+1}-vi.html"))
@website << page.search(".figcaption .caption a/@href").text.to_s.split('https://careerbuilder.vn/') website_companies << page.search(".figcaption .caption a/@href").text.to_s.split('https://careerbuilder.vn/')
end website_jobs << page.search(".figcaption .title .job_link @href").text.to_s.split('https://careerbuilder.vn/')
@website = @website.join(",")
@website = @website.split(",").uniq!
@website = @website.select { |val| val != ''}
end end
website_companies = website_companies.join(",")
website_companies = website_companies.split(",").uniq!
website_companies = website_companies.select { |val| val != ''}
website_jobs = website_jobs.join(",")
website_jobs = website_jobs.split(",")
website_jobs = website_jobs.select { |val| val != ''}
data << website_companies << website_jobs
end
# CRAWLER COMPANIES
def make_companies
def craw_data_companies def craw_data_companies
link_crawl = crawl_link_for_companies_data() link_crawl = crawl_link_for_companies_jobs()
@data_companies = {} @data_companies = {}
@data_companies_name = [] @data_companies_name = []
@data_companies_address = [] @data_companies_address = []
@data_companies_description = [] @data_companies_description = []
agent = Mechanize.new link_crawl[0].each do |url|
link_crawl.each do |url| page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
page = agent.get("https://careerbuilder.vn/#{url}") name = ''
address = ''
desc = ''
if page.search(".company-info .info .content .name").text == "" if page.search(".company-info .info .content .name").text == ""
name = page.search(".section-page #cp_company_name").text name = page.search(".section-page #cp_company_name").text
@data_companies_name << name.to_s.rstrip address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text desc = page.search(".cp_aboutus_item .content_fck").text
@data_companies_address << address.to_s.rstrip
desc = page.search(".cp_aboutus_item .content_fck").text
@data_companies_description << desc
else else
name = page.search(".company-info .info .content .name").text name = page.search(".company-info .info .content .name").text
address = page.search(".company-info .info .content p:nth-child(3)").text
desc = page.search(".main-about-us .content").text
end
if (name != "" && address != "" && desc != "")
@data_companies_name << name.to_s.rstrip @data_companies_name << name.to_s.rstrip
address = page.search(".company-info .info .content p:nth-child(3)").text
@data_companies_address << address.to_s.rstrip @data_companies_address << address.to_s.rstrip
desc = page.search(".main-about-us .content").text
@data_companies_description << desc @data_companies_description << desc
end end
end end
@data_companies[:name] = @data_companies_name.select { |val| val != ''} @data_companies[:name] = @data_companies_name
@data_companies[:address] = @data_companies_address.select { |val| val != ''} @data_companies[:address] = @data_companies_address
@data_companies_description.each do |val| @data_companies_description.each do |val|
val.to_s.delete!("[\n,\t,\r]") val.to_s.delete!("[\n,\t,\r]")
val.lstrip! val.strip!
val.rstrip!
end end
@data_companies[:description] = @data_companies_description.select { |val| val != ''} @data_companies[:description] = @data_companies_description
@data_companies @data_companies
end end
...@@ -108,4 +119,7 @@ namespace :db do ...@@ -108,4 +119,7 @@ namespace :db do
end end
end end
def make_jobs
end
end end
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment