Commit 1326d754 by Ngô Trung Hưng

40% create crawler job

parent e529714f
......@@ -20,7 +20,7 @@ gem 'coffee-rails', '~> 4.2'
gem 'turbolinks', '~> 5'
# Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder
gem 'jbuilder', '~> 2.5'
gem 'mechanize', '2.7.6'
gem 'nokogiri'
# Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0'
# Use ActiveModel has_secure_password
......
......@@ -72,17 +72,12 @@ GEM
execjs
coffee-script-source (1.12.2)
concurrent-ruby (1.1.6)
connection_pool (2.2.3)
crass (1.0.6)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.9.0)
execjs (2.7.0)
ffi (1.13.1)
globalid (0.4.2)
activesupport (>= 4.2.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.8.3)
concurrent-ruby (~> 1.0)
io-like (0.3.1)
......@@ -100,32 +95,16 @@ GEM
mini_mime (>= 0.1.1)
marcel (0.3.3)
mimemagic (~> 0.3.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (1.0.0)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2020.0512)
mimemagic (0.3.5)
mini_mime (1.0.2)
mini_portile2 (2.4.0)
minitest (5.14.1)
msgpack (1.3.3)
mysql2 (0.5.3)
net-http-digest_auth (1.4.1)
net-http-persistent (4.0.0)
connection_pool (~> 2.2)
nio4r (2.5.2)
nokogiri (1.10.10)
mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
public_suffix (4.0.5)
puma (3.12.6)
rack (2.2.3)
......@@ -201,15 +180,11 @@ GEM
thread_safe (~> 0.1)
uglifier (4.2.0)
execjs (>= 0.3.0, < 3)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.7)
web-console (3.7.0)
actionview (>= 5.0)
activemodel (>= 5.0)
bindex (>= 0.4.0)
railties (>= 5.0)
webrobots (0.1.2)
websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
......@@ -227,8 +202,8 @@ DEPENDENCIES
coffee-rails (~> 4.2)
jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2)
mechanize (= 2.7.6)
mysql2 (= 0.5.3)
nokogiri
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0)
......
# Base controller of the application. It mixes CrawlerHelper into the
# controller hierarchy so subclasses such as HomeController can call the
# crawler methods directly.
class ApplicationController < ActionController::Base
  # Libraries used by the CrawlerHelper methods (URI.open and Nokogiri::HTML).
  require 'open-uri'
  require 'nokogiri'

  include CrawlerHelper
end
# Controller whose index action currently serves as a manual harness for the
# crawler helpers inherited through ApplicationController.
class HomeController < ApplicationController
  # Runs the company crawler followed by the "interface 3" job crawler.
  #
  # NOTE(review): both helpers end with `render plain:`; confirm that calling
  # them back to back does not raise a double-render error.
  def index
    craw_data_companies
    # Scratch: whitespace clean-up tried on a sample scraped company description.
    # val.to_s.delete!("[\n,\t,\r]")
    # val.lstrip!
    # val.rstrip!
    # @data = val
    # crawl_data_jobs_interface_1
    # crawl_data_jobs_interface_2
    crawl_data_jobs_interface_3
  end
end
module CrawlerHelper
def crawl_industries_data
data_list_industries = []
agent = Mechanize.new
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
data = page.search("#industry option")
list_industries = data.to_s.split("</option>")
list_industries.each do |x|
......@@ -14,8 +13,7 @@ module CrawlerHelper
def crawl_cities_data
data_list_cities = []
agent = Mechanize.new
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
data = page.search("#location option")
list_cities = data.to_s.split("</option>")
list_cities.each do |x|
......@@ -24,58 +22,226 @@ module CrawlerHelper
render plain: data_list_cities;
end
# Collects relative company and job links from the CareerBuilder listing pages.
#
# @param num_page_will_crawl [Integer] how many listing pages to fetch,
#   starting at page 1 (defaults to 1, the value previously hard-coded).
# @return [Array(Array<String>, Array<String>)] a two-element array:
#   index 0 holds the unique company paths, index 1 holds the job paths
#   (duplicates kept). Paths are relative to https://careerbuilder.vn/.
# @raise [OpenURI::HTTPError, SocketError] when a listing page cannot be fetched.
def crawl_link_for_companies_jobs(num_page_will_crawl = 1)
  base_url = 'https://careerbuilder.vn/'
  website_companies = []
  website_jobs = []

  (1..num_page_will_crawl).each do |page_number|
    page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"))
    # `.text` concatenates every matched href into one string; splitting on the
    # site root turns that back into individual relative paths.
    website_companies.concat(page.search(".figcaption .caption a/@href").text.split(base_url))
    website_jobs.concat(page.search(".figcaption .title .job_link @href").text.split(base_url))
  end

  # Non-bang `uniq` is used on purpose: `uniq!` returns nil when the list has
  # no duplicates, which would break any chained call.
  [website_companies.reject(&:empty?).uniq, website_jobs.reject(&:empty?)]
end
# Scrapes name, address and description for every company link returned by
# crawl_link_for_companies_jobs and renders the collected lists as plain text.
#
# A company is recorded only when all three fields are non-empty, so the three
# lists always stay aligned index by index.
#
# Side effects: sets @data_companies (Hash with :name, :address, :description)
# and the three @data_companies_* arrays, then calls `render plain:`.
def craw_data_companies
  company_paths = crawl_link_for_companies_jobs.first
  @data_companies_name = []
  @data_companies_address = []
  @data_companies_description = []

  company_paths.each do |path|
    # URI::DEFAULT_PARSER.escape is the supported replacement for URI.escape,
    # which was removed in Ruby 3.0.
    url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
    page = Nokogiri::HTML(URI.open(URI.parse(url)))

    # Company pages come in two layouts; an empty ".name" node means the
    # alternative ("section-page") layout is in use.
    if page.search(".company-info .info .content .name").text == ""
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      name = page.search(".company-info .info .content .name").text
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end

    next if name.empty? || address.empty? || desc.empty?

    @data_companies_name << name.rstrip
    @data_companies_address << address.rstrip
    # String#delete treats its argument as a character set, so this removes
    # newlines, tabs, carriage returns, commas and square brackets.
    @data_companies_description << desc.delete("[\n,\t,\r]").strip
  end

  @data_companies = {
    name: @data_companies_name,
    address: @data_companies_address,
    description: @data_companies_description
  }
  render plain: "#{@data_companies[:name]} -- #{@data_companies[:address]} -- #{@data_companies[:description]}"
end
# Crawler job
# Scrapes a single CareerBuilder job posting that uses page layout 1 and
# renders the extracted fields (a Hash stored in @data) as plain text.
#
# Uses non-bang String#delete throughout: `delete!` returns nil when nothing
# was removed, which would raise NoMethodError on the chained calls. Missing
# nodes yield empty strings instead of raising.
def crawl_data_jobs_interface_1
  # Fetched for the per-link loop sketched below; not consumed yet.
  link_crawl = crawl_link_for_companies_jobs()
  @data = {}
  # Accumulators that are initialised here but not populated by this method.
  @name = []
  @company_name = []
  @level = []
  @exprience = []
  @salary = []
  @create_date = []
  @expiration_date = []
  @description = []
  @industry_name = []
  @city_name = []
  # link_crawl[1].each do |url|
  #   page = Nokogiri::HTML(URI.open(URI.parse(URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}"))))
  # end

  # Character set for String#delete: newline, tab, carriage return, comma and
  # square brackets.
  junk = "[\n,\t,\r]"
  # URI::DEFAULT_PARSER.escape replaces URI.escape, removed in Ruby 3.0.
  url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/vi/tim-viec-lam/ky-su-dau-thau-mep.35B45617.html")
  page = Nokogiri::HTML(URI.open(URI.parse(url)))

  # The first <li> of the first detail box holds the posting date and salary.
  first_box_cells = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")
  # Cleaned text of the n-th <li> inside the last detail box.
  last_box_item = lambda do |index|
    page.search(".item-blue .detail-box:last ul li:nth-child(#{index})").text.delete(junk)
  end

  @data[:name] = page.search(".apply-now-content .job-desc .title").text
  @data[:company_name] = page.search(".apply-now-content .job-desc .job-company-name").text
  @data[:city_name] = page.search(".detail-box .map p a").text
  @data[:created_date] = first_box_cells[0]&.text.to_s
  # Key spelled :expiration_date to match the other job crawlers.
  @data[:expiration_date] =
    page.search(".item-blue .detail-box ul li:last")[1]&.text.to_s.delete(junk).split(' ').last
  @data[:salary] = first_box_cells[1]&.text.to_s

  industry_name = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a").text
  @data[:industry_name] = industry_name.delete(junk).split(' ').join(',')
  @data[:description] = page.search(".tabs .tab-content .detail-row:nth-child(n)").to_s.delete(junk)

  # The level label ("Cấp bậc") is looked up in the third item first, then in
  # the second item when the third one does not carry it.
  level = last_box_item.call(3).lstrip.split('Cấp bậc')[1].to_s.strip
  level = last_box_item.call(2).lstrip.split('Cấp bậc')[1].to_s.strip if level.empty?
  @data[:level] = level

  @data[:exprience] = last_box_item.call(2).split('Kinh nghiệm')[1].to_s.strip
  render plain: @data.to_s
end
# Scrapes a single CareerBuilder job posting that uses page layout 2 and
# renders the extracted fields (a Hash stored in @data) as plain text.
#
# The method renders exactly once, at the end. Non-bang String#delete is used
# because `delete!` returns nil when nothing was removed, which would break
# the chained calls.
def crawl_data_jobs_interface_2
  # Fetched for a future per-link loop; not consumed yet.
  link_crawl = crawl_link_for_companies_jobs()
  @data = {}
  # Accumulators that are initialised here but not populated by this method.
  @name = []
  @company_name = []
  @level = []
  @exprience = []
  @salary = []
  @create_date = []
  @expiration_date = []
  @description = []
  @industry_name = []
  @city_name = []

  # Character set for String#delete: newline, tab, carriage return, comma and
  # square brackets.
  junk = "[\n,\t,\r]"
  # URI::DEFAULT_PARSER.escape replaces URI.escape, removed in Ruby 3.0.
  url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html")
  page = Nokogiri::HTML(URI.open(URI.parse(url)))

  # interface 2
  @data[:name] = page.search(".apply-now-content .job-desc .title").text
  @data[:company_name] = page.search(".top-job .top-job-info .tit_company").text
  @data[:city_name] = page.search(".info-workplace .value a").text
  # This method does not extract a posting date; the field is left empty.
  @data[:created_date] = ""
  @data[:expiration_date] = page.search(".info li:nth-child(4)").text.delete(junk).split(' ').last
  # `to_s` guards against an empty node: "".split(...) is [], whose last is nil.
  @data[:salary] = page.search(".info li:nth-child(3)").text.split("Lương").last.to_s.strip
  @data[:industry_name] = page.search(".info li:nth-child(5) .value").text
  @data[:description] = page.search(".left-col").to_s.delete(junk)
  @data[:level] =
    page.search(".boxtp .info li:nth-child(2)").text.delete(junk).lstrip.split('Cấp bậc').last.to_s.strip
  @data[:exprience] =
    page.search(".info li:nth-child(6)").text.delete(junk).split('Kinh nghiệm').last.to_s.strip
  render plain: @data.to_s
end
# Scrapes a single CareerBuilder job posting that uses page layout 3 and
# renders the extracted fields (a Hash stored in @data) as plain text.
#
# Non-bang String#delete is used because `delete!` returns nil when nothing
# was removed, which would break the chained calls.
def crawl_data_jobs_interface_3
  # Fetched for a future per-link loop; not consumed yet.
  link_crawl = crawl_link_for_companies_jobs()
  @data = {}
  # Accumulators that are initialised here but not populated by this method.
  @name = []
  @company_name = []
  @level = []
  @exprience = []
  @salary = []
  @create_date = []
  @expiration_date = []
  @description = []
  @industry_name = []
  @city_name = []

  # Character set for String#delete: newline, tab, carriage return, comma and
  # square brackets.
  junk = "[\n,\t,\r]"
  # URI::DEFAULT_PARSER.escape replaces URI.escape, removed in Ruby 3.0.
  url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/vi/tim-viec-lam/tuyen-tai-xe-van-phong-cho-sep-han-quoc-tu-binh-thanh.35B45A41.html")
  page = Nokogiri::HTML(URI.open(URI.parse(url)))

  # interface 3
  @data[:name] = page.search(".info-company h1").text
  @data[:company_name] = page.search(".zone-company .text-job h2").text
  @data[:city_name] = page.search(".DetailJobNew ul li:nth-child(1) a").text
  # This method does not extract a posting date; the field is left empty.
  @data[:created_date] = ""
  @data[:expiration_date] = page.search(".DetailJobNew li:nth-child(7) span").text
  @data[:salary] = page.search(".DetailJobNew li:nth-child(6) span").text
  # Stored as an array of whitespace-separated tokens.
  @data[:industry_name] = page.search(".DetailJobNew li:nth-child(3) span a").text.delete(junk).split(' ')
  @data[:description] = page.search(".left-col").to_s.delete(junk)
  @data[:level] = page.search(".DetailJobNew ul li:nth-child(2) span").text
  # This method does not extract experience; the field is left empty.
  @data[:exprience] = ""
  render plain: @data.to_s
end
end
\ No newline at end of file
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password: '12345678'
password: '1'
socket: /var/run/mysqld/mysqld.sock
......
......@@ -9,7 +9,7 @@ threads threads_count, threads_count
# Specifies the `port` that Puma will listen on to receive requests; default is 3000.
#
port ENV.fetch("PORT") { 1111 }
port ENV.fetch("PORT") { 3000 }
# Specifies the `environment` that Puma will run in.
#
......
require 'open-uri'
namespace :db do
task populate: :environment do
task populate: :environment do
make_cities
make_industries
make_companies
end
# CRAWLER ALL CITIES
task make_cities: :environment do
@data_list_cities = []
agent = Mechanize.new
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
data = page.search("#location option")
$page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
p1 = Nokogiri::HTML(URI.open('https://careerbuilder.vn/vi/tim-viec-lam/vinhomes-chuyen-vien-thu-tuc-bat-dong-san.35B449B5.html'))
def make_cities
@data_list_cities = []
data = $page.search("#location option")
list_cities = data.to_s.split("</option>")
list_cities.each do |x|
@data_list_cities << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end
for i in 0..(@data_list_cities.length - 1)
@data_list_cities.length.times do |i|
if i <= 69
name = (@data_list_cities[i].to_s)
City.create!(name: name, area: 1)
......@@ -25,72 +29,79 @@ namespace :db do
end
# CRAWLER ALL INDUSTRIES
# Creates one Industry record for every <option> found under "#industry" on
# the listing page held in the global $page.
#
# Side effects: sets @data_list_industries and inserts rows through
# Industry.create!, which raises if a record is invalid.
def make_industries
  option_chunks = $page.search("#industry option").to_s.split("</option>")
  # Each chunk still starts with its opening <option ...> tag; strip the tag,
  # embedded newlines and trailing whitespace to keep only the label.
  @data_list_industries = option_chunks.map do |chunk|
    chunk.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/, '').rstrip
  end
  @data_list_industries.each { |name| Industry.create!(name: name) }
end
# CRAWLER LINK JOB & COMPANIES
#
# Collects relative company and job links from the CareerBuilder listing pages.
#
# @param num_page_will_crawl [Integer] how many listing pages to fetch,
#   starting at page 1 (defaults to 3, the value previously hard-coded).
# @return [Array(Array<String>, Array<String>)] a two-element array:
#   index 0 holds the unique company paths, index 1 holds the job paths
#   (duplicates kept). Paths are relative to https://careerbuilder.vn/.
def crawl_link_for_companies_jobs(num_page_will_crawl = 3)
  base_url = 'https://careerbuilder.vn/'
  website_companies = []
  website_jobs = []

  (1..num_page_will_crawl).each do |page_number|
    page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"))
    # `.text` concatenates every matched href into one string; splitting on the
    # site root turns that back into individual relative paths.
    website_companies.concat(page.search(".figcaption .caption a/@href").text.split(base_url))
    website_jobs.concat(page.search(".figcaption .title .job_link @href").text.split(base_url))
  end

  # Non-bang `uniq` is used on purpose: `uniq!` returns nil when the list has
  # no duplicates, which would break any chained call.
  [website_companies.reject(&:empty?).uniq, website_jobs.reject(&:empty?)]
end
# CRAWLER COMPANIES
def make_companies
# Scrapes name, address and description for every company link returned by
# crawl_link_for_companies_jobs.
#
# A company is recorded only when all three fields are non-empty, so the three
# lists always stay aligned index by index.
#
# @return [Hash] @data_companies with :name, :address and :description keys,
#   each an Array<String>.
def craw_data_companies
  company_paths = crawl_link_for_companies_jobs.first
  @data_companies_name = []
  @data_companies_address = []
  @data_companies_description = []

  company_paths.each do |path|
    # URI::DEFAULT_PARSER.escape is the supported replacement for URI.escape,
    # which was removed in Ruby 3.0.
    url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
    page = Nokogiri::HTML(URI.open(URI.parse(url)))

    # Company pages come in two layouts; an empty ".name" node means the
    # alternative ("section-page") layout is in use.
    if page.search(".company-info .info .content .name").text == ""
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      name = page.search(".company-info .info .content .name").text
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end

    next if name.empty? || address.empty? || desc.empty?

    @data_companies_name << name.rstrip
    @data_companies_address << address.rstrip
    # String#delete treats its argument as a character set, so this removes
    # newlines, tabs, carriage returns, commas and square brackets.
    @data_companies_description << desc.delete("[\n,\t,\r]").strip
  end

  @data_companies = {
    name: @data_companies_name,
    address: @data_companies_address,
    description: @data_companies_description
  }
  @data_companies
end
......@@ -108,4 +119,7 @@ namespace :db do
end
end
# Empty stub: the body has no statements yet, so calling it does nothing.
# NOTE(review): presumably meant to seed job records like the other make_*
# helpers — confirm before relying on it.
def make_jobs
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment