Commit 943b2278 by Ngo Trung Hung

done crawl data

parent 4a66e48a
...@@ -13,7 +13,7 @@ gem 'sass-rails', '~> 5.0' ...@@ -13,7 +13,7 @@ gem 'sass-rails', '~> 5.0'
gem 'uglifier', '>= 1.3.0' gem 'uglifier', '>= 1.3.0'
# See https://github.com/rails/execjs#readme for more supported runtimes # See https://github.com/rails/execjs#readme for more supported runtimes
# gem 'mini_racer', platforms: :ruby # gem 'mini_racer', platforms: :ruby
gem 'pry'
# Use CoffeeScript for .coffee assets and views # Use CoffeeScript for .coffee assets and views
gem 'coffee-rails', '~> 4.2' gem 'coffee-rails', '~> 4.2'
# Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks # Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks
......
...@@ -64,6 +64,7 @@ GEM ...@@ -64,6 +64,7 @@ GEM
chromedriver-helper (2.1.1) chromedriver-helper (2.1.1)
archive-zip (~> 0.10) archive-zip (~> 0.10)
nokogiri (~> 1.8) nokogiri (~> 1.8)
coderay (1.1.3)
coffee-rails (4.2.2) coffee-rails (4.2.2)
coffee-script (>= 2.2.0) coffee-script (>= 2.2.0)
railties (>= 4.0.0) railties (>= 4.0.0)
...@@ -105,6 +106,9 @@ GEM ...@@ -105,6 +106,9 @@ GEM
nio4r (2.5.2) nio4r (2.5.2)
nokogiri (1.10.10) nokogiri (1.10.10)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5) public_suffix (4.0.5)
puma (3.12.6) puma (3.12.6)
rack (2.2.3) rack (2.2.3)
...@@ -204,6 +208,7 @@ DEPENDENCIES ...@@ -204,6 +208,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mysql2 (= 0.5.3) mysql2 (= 0.5.3)
nokogiri nokogiri
pry
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0) sass-rails (~> 5.0)
......
class HomeController < ApplicationController class HomeController < ApplicationController
def index def index
# crawl_data_jobs_interface_1() @data = Job.all
# crawl_data_jobs_interface_2() @data2 = Company.all
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
make_data
# craw_data_companies
end end
end end
<% @data.each do |val| %>
<h1></h1> <%= val.name %>
<% end %>
<% @data2.each do |val| %>
<h1></h1> <%= val.name %>
<% end %>
\ No newline at end of file
...@@ -14,7 +14,7 @@ default: &default ...@@ -14,7 +14,7 @@ default: &default
encoding: utf8 encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root username: root
password: '1' password: '12345678'
socket: /var/run/mysqld/mysqld.sock socket: /var/run/mysqld/mysqld.sock
......
require 'open-uri' require 'open-uri'
require 'src/interface_web' require 'src/interface_web'
class Clawler class Clawler
@page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')) @page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# PILL DATA CITIES # PILL DATA CITIES
...@@ -87,15 +87,16 @@ class Clawler ...@@ -87,15 +87,16 @@ class Clawler
expiration_date: expiration_date, expiration_date: expiration_date,
description: description) description: description)
self.make_foreign_industries_table(@data_jobs[:industry_name][n],id_job.id) self.make_foreign_industries_table(@data_jobs[:industry_name][n],id_job.id)
self.make_cities_cities_table(@data_jobs[:city_name][n],id_job.id) self.make_foreign_cities_table(@data_jobs[:city_name][n],id_job.id)
end end
end end
def self.make_foreign_industries_table(data,id_job) def self.make_foreign_industries_table(data,id_job)
@content = data.split(',') @content = data.split(',')
length = @content.length length = @content.length
length.times do |n| length.times do |n|
id_industry = Industry.find_by name: @content[n].strip id_industry = Industry.find_by name: (@content[n].strip)
if !id_industry if !id_industry
id_industry = Industry.create!(name: @content[n].strip).id id_industry = Industry.create!(name: @content[n].strip).id
...@@ -107,18 +108,14 @@ class Clawler ...@@ -107,18 +108,14 @@ class Clawler
end end
end end
def self.make_cities_cities_table(data,id_job) def self.make_foreign_cities_table(data,id_job)
if data.include?(',') @list_city = data.split(',')
@content = data.split(',') length = @list_city.length
else
@content = data.split('|')
end
length = @content.length
length.times do |n| length.times do |n|
id_cities = City.find_by name: @content[n].strip id_cities = City.find_by name: @list_city[n].strip
if !id_cities if !id_cities
id_cities = City.create!(name: @content[n].strip).id id_cities = City.create!(name: @list_city[n].strip).id
else else
id_cities = id_cities.id id_cities = id_cities.id
end end
...@@ -127,4 +124,4 @@ class Clawler ...@@ -127,4 +124,4 @@ class Clawler
end end
end end
end end
\ No newline at end of file
class Interface_web class Interface_web
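# Collects company links and job links; returns [website_companies, website_jobs] (page presumably sets how many listing pages are read)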
def self.crawl_link_for_companies_jobs(page) def self.crawl_link_for_companies_jobs(page)
data = [] data = []
website_companies = [] website_companies = []
...@@ -21,13 +21,23 @@ class Interface_web ...@@ -21,13 +21,23 @@ class Interface_web
data << website_companies << website_jobs data << website_companies << website_jobs
end end
@crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(4)
# Returns the company/job link lists, running the crawl only when no result
# has been cached yet. The value of crawl_link_for_companies_jobs(4) is stored
# in @crawl_link_for_companies_jobs and handed back on every later call.
def self.get_link_job_and_companies
  cached = @crawl_link_for_companies_jobs
  return cached if cached

  @crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(4)
end
# Downloads a careerbuilder.vn page and parses it with Nokogiri.
#
# @param url [String] path appended to "https://careerbuilder.vn/"
# @return the document produced by Nokogiri::HTML for the fetched page
def self.base_link(url)
  # URI.escape is obsolete and was removed in Ruby 3.0; the default parser's
  # #escape applies the same escaping and exists on older Rubies as well.
  escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  # Block form closes the downloaded stream once Nokogiri has read it.
  URI.open(URI.parse(escaped)) { |io| Nokogiri::HTML(io) }
end
def self.craw_data_companies def self.craw_data_companies
link_crawl = crawl_link_for_companies_jobs(1) link_crawl = get_link_job_and_companies
@data_companies = {} @data_companies = {}
@data_companies_name = [] @data_companies_name = []
@data_companies_address = [] @data_companies_address = []
@data_companies_description = [] @data_companies_description = []
link_crawl[0].each do |url| link_crawl[0].each do |url|
page = base_link(url) page = base_link(url)
name = '' name = ''
...@@ -55,14 +65,10 @@ class Interface_web ...@@ -55,14 +65,10 @@ class Interface_web
val.to_s.delete!("[\n,\t,\r]") val.to_s.delete!("[\n,\t,\r]")
val.strip! val.strip!
end end
@data_companies[:description] = @data_companies_description @data_companies[:description] = @data_companies_description
@data_companies @data_companies
end end
# Downloads a careerbuilder.vn page and parses it with Nokogiri.
#
# @param url [String] path appended to "https://careerbuilder.vn/"
# @return the document produced by Nokogiri::HTML for the fetched page
def self.base_link(url)
  # URI.escape is obsolete and was removed in Ruby 3.0; the default parser's
  # #escape applies the same escaping and exists on older Rubies as well.
  escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  # Block form closes the downloaded stream once Nokogiri has read it.
  URI.open(URI.parse(escaped)) { |io| Nokogiri::HTML(io) }
end
def self.crawl_data_jobs_interface_1(url) def self.crawl_data_jobs_interface_1(url)
page = base_link(url) page = base_link(url)
...@@ -71,8 +77,14 @@ class Interface_web ...@@ -71,8 +77,14 @@ class Interface_web
@company_name << page.search(".apply-now-content .job-desc .job-company-name").text @company_name << page.search(".apply-now-content .job-desc .job-company-name").text
@data[:company_name] = @company_name @data[:company_name] = @company_name
@city_name << page.search(".detail-box .map p a").text location = []
length = page.search(".detail-box .map p a").size
length.times do |n|
location << page.search(".detail-box .map p a:nth-child(#{n+1})").text
end
@city_name << location.join(',')
location.clear
@data[:city_name] = @city_name @data[:city_name] = @city_name
@created_date << page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0].text @created_date << page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0].text
...@@ -112,16 +124,23 @@ class Interface_web ...@@ -112,16 +124,23 @@ class Interface_web
def self.crawl_data_jobs_interface_2(url) def self.crawl_data_jobs_interface_2(url)
page = base_link(url) page = base_link(url)
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html")))) # page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
#interface1
@name << page.search(".apply-now-content .job-desc .title").text @name << page.search(".apply-now-content .job-desc .title").text
@data[:name] = @name @data[:name] = @name
@company_name << page.search(".top-job .top-job-info .tit_company").text @company_name << page.search(".top-job .top-job-info .tit_company").text
@data[:company_name] = @company_name @data[:company_name] = @company_name
@city_name << page.search(".info-workplace .value a").text location = []
length = page.search(".info-workplace .value a").size
length.times do |n|
location << page.search(".info-workplace .value a:nth-child(#{n+1})").text
end
@city_name << location.join(',')
location.clear
@data[:city_name] = @city_name @data[:city_name] = @city_name
# @city_name << page.search(".info-workplace .value a").text
@created_date << "" @created_date << ""
@data[:created_date] =@created_date @data[:created_date] =@created_date
...@@ -187,7 +206,8 @@ class Interface_web ...@@ -187,7 +206,8 @@ class Interface_web
if page.search(".zone-company .text-job h2").text == "" if page.search(".zone-company .text-job h2").text == ""
@company_name << page.search(".info-company .text-job h2").text @company_name << page.search(".info-company .text-job h2").text
@industry_name << page.search(".DetailJobNew li:nth-child(3) span").text.strip industry_name = page.search(".DetailJobNew li:nth-child(3) span").text.strip
@industry_name << industry_name.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''}
else else
@company_name << page.search(".zone-company .text-job h2").text.strip @company_name << page.search(".zone-company .text-job h2").text.strip
industry_name = page.search(".DetailJobNew li:nth-child(3) span a").text industry_name = page.search(".DetailJobNew li:nth-child(3) span a").text
...@@ -265,7 +285,7 @@ class Interface_web ...@@ -265,7 +285,7 @@ class Interface_web
@industry_name = [] @industry_name = []
@city_name = [] @city_name = []
link_crawl = crawl_link_for_companies_jobs(1) link_crawl = get_link_job_and_companies
link_crawl[1].each do |path| link_crawl[1].each do |path|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{path}")))) page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{path}"))))
if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil
...@@ -274,8 +294,8 @@ class Interface_web ...@@ -274,8 +294,8 @@ class Interface_web
crawl_data_jobs_interface_2(path) crawl_data_jobs_interface_2(path)
elsif page.search(".DetailJobNew ul li").size == 10 elsif page.search(".DetailJobNew ul li").size == 10
crawl_data_jobs_interface_5(path) crawl_data_jobs_interface_5(path)
elsif page.search(".DetailJobNew ul li").size == 8 # elsif page.search(".DetailJobNew ul li").size == 8
crawl_data_jobs_interface_4(path) # crawl_data_jobs_interface_4(path)
else else
crawl_data_jobs_interface_3(path) crawl_data_jobs_interface_3(path)
end end
......
require 'src/crawler' require 'src/crawler'
namespace :db do namespace :db do
task populate: :environment do task populate: :environment do
# Clawler.make_industries
Clawler.make_cities Clawler.make_cities
Clawler.make_industries
Clawler.make_companies Clawler.make_companies
Clawler.make_jobs Clawler.make_jobs
end end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment