Commit 943b2278 by Ngo Trung Hung

done crawl data

parent 4a66e48a
......@@ -13,7 +13,7 @@ gem 'sass-rails', '~> 5.0'
gem 'uglifier', '>= 1.3.0'
# See https://github.com/rails/execjs#readme for more supported runtimes
# gem 'mini_racer', platforms: :ruby
gem 'pry'
# Use CoffeeScript for .coffee assets and views
gem 'coffee-rails', '~> 4.2'
# Turbolinks makes navigating your web application faster. Read more: https://github.com/turbolinks/turbolinks
......
......@@ -64,6 +64,7 @@ GEM
chromedriver-helper (2.1.1)
archive-zip (~> 0.10)
nokogiri (~> 1.8)
coderay (1.1.3)
coffee-rails (4.2.2)
coffee-script (>= 2.2.0)
railties (>= 4.0.0)
......@@ -105,6 +106,9 @@ GEM
nio4r (2.5.2)
nokogiri (1.10.10)
mini_portile2 (~> 2.4.0)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5)
puma (3.12.6)
rack (2.2.3)
......@@ -204,6 +208,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2)
mysql2 (= 0.5.3)
nokogiri
pry
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0)
......
# Controller whose index action prepares the job and company lists
# rendered by the view.
class HomeController < ApplicationController
# Calls make_data, then exposes every Job record as @data and every
# Company record as @data2.
def index
# The crawler calls below are left disabled.
# crawl_data_jobs_interface_1()
# crawl_data_jobs_interface_2()
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
# NOTE(review): make_data is not defined in this class; presumably it
# fills the tables and runs on every request to this action -- confirm.
make_data
# craw_data_companies
@data = Job.all
@data2 = Company.all
end
end
<%# Lists the name of every record in @data, then every record in @data2, one heading each. %>
<%# The name is rendered inside the <h1>; previously the heading was empty and the name was loose text after it. %>
<% @data.each do |job| %>
  <h1><%= job.name %></h1>
<% end %>
<% @data2.each do |company| %>
  <h1><%= company.name %></h1>
<% end %>
\ No newline at end of file
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password: '1'
password: '12345678'
socket: /var/run/mysqld/mysqld.sock
......
......@@ -87,15 +87,16 @@ class Clawler
expiration_date: expiration_date,
description: description)
self.make_foreign_industries_table(@data_jobs[:industry_name][n],id_job.id)
self.make_cities_cities_table(@data_jobs[:city_name][n],id_job.id)
self.make_foreign_cities_table(@data_jobs[:city_name][n],id_job.id)
end
end
def self.make_foreign_industries_table(data,id_job)
@content = data.split(',')
length = @content.length
length.times do |n|
id_industry = Industry.find_by name: @content[n].strip
id_industry = Industry.find_by name: (@content[n].strip)
if !id_industry
id_industry = Industry.create!(name: @content[n].strip).id
......@@ -107,18 +108,14 @@ class Clawler
end
end
def self.make_cities_cities_table(data,id_job)
if data.include?(',')
@content = data.split(',')
else
@content = data.split('|')
end
length = @content.length
def self.make_foreign_cities_table(data,id_job)
@list_city = data.split(',')
length = @list_city.length
length.times do |n|
id_cities = City.find_by name: @content[n].strip
id_cities = City.find_by name: @list_city[n].strip
if !id_cities
id_cities = City.create!(name: @content[n].strip).id
id_cities = City.create!(name: @list_city[n].strip).id
else
id_cities = id_cities.id
end
......
class Interface_web
# func get "n" link company & job
def self.crawl_link_for_companies_jobs(page)
data = []
website_companies = []
......@@ -21,8 +21,18 @@ class Interface_web
data << website_companies << website_jobs
end
@crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(4)
# Returns the company/job link lists, crawling them only when no cached
# value is available yet.
#
# The result of crawl_link_for_companies_jobs(4) is stored in
# @crawl_link_for_companies_jobs and handed back on later calls.
def self.get_link_job_and_companies
  return @crawl_link_for_companies_jobs if @crawl_link_for_companies_jobs

  @crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(4)
end
# Fetches a careerbuilder.vn page and parses it with Nokogiri.
#
# url - path appended to "https://careerbuilder.vn/".
#
# Returns whatever Nokogiri::HTML produces for the opened page.
def self.base_link(url)
  # URI.escape is obsolete and was removed in Ruby 3.0. It delegated to
  # URI::DEFAULT_PARSER.escape, so calling that directly keeps the same
  # percent-encoding while working on current Ruby versions.
  escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  Nokogiri::HTML(URI.open(URI.parse(escaped)))
end
def self.craw_data_companies
link_crawl = crawl_link_for_companies_jobs(1)
link_crawl = get_link_job_and_companies
@data_companies = {}
@data_companies_name = []
@data_companies_address = []
......@@ -56,13 +66,9 @@ class Interface_web
val.strip!
end
@data_companies[:description] = @data_companies_description
@data_companies
end
# Fetches a careerbuilder.vn page and parses it with Nokogiri.
#
# url - path appended to "https://careerbuilder.vn/".
#
# Returns whatever Nokogiri::HTML produces for the opened page.
def self.base_link(url)
  # URI.escape is obsolete and was removed in Ruby 3.0. It delegated to
  # URI::DEFAULT_PARSER.escape, so calling that directly keeps the same
  # percent-encoding while working on current Ruby versions.
  escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  Nokogiri::HTML(URI.open(URI.parse(escaped)))
end
def self.crawl_data_jobs_interface_1(url)
page = base_link(url)
......@@ -72,7 +78,13 @@ class Interface_web
@company_name << page.search(".apply-now-content .job-desc .job-company-name").text
@data[:company_name] = @company_name
@city_name << page.search(".detail-box .map p a").text
location = []
length = page.search(".detail-box .map p a").size
length.times do |n|
location << page.search(".detail-box .map p a:nth-child(#{n+1})").text
end
@city_name << location.join(',')
location.clear
@data[:city_name] = @city_name
@created_date << page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0].text
......@@ -112,16 +124,23 @@ class Interface_web
def self.crawl_data_jobs_interface_2(url)
page = base_link(url)
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html"))))
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
#interface1
@name << page.search(".apply-now-content .job-desc .title").text
@data[:name] = @name
@company_name << page.search(".top-job .top-job-info .tit_company").text
@data[:company_name] = @company_name
@city_name << page.search(".info-workplace .value a").text
location = []
length = page.search(".info-workplace .value a").size
length.times do |n|
location << page.search(".info-workplace .value a:nth-child(#{n+1})").text
end
@city_name << location.join(',')
location.clear
@data[:city_name] = @city_name
# @city_name << page.search(".info-workplace .value a").text
@created_date << ""
@data[:created_date] =@created_date
......@@ -187,7 +206,8 @@ class Interface_web
if page.search(".zone-company .text-job h2").text == ""
@company_name << page.search(".info-company .text-job h2").text
@industry_name << page.search(".DetailJobNew li:nth-child(3) span").text.strip
industry_name = page.search(".DetailJobNew li:nth-child(3) span").text.strip
@industry_name << industry_name.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''}
else
@company_name << page.search(".zone-company .text-job h2").text.strip
industry_name = page.search(".DetailJobNew li:nth-child(3) span a").text
......@@ -265,7 +285,7 @@ class Interface_web
@industry_name = []
@city_name = []
link_crawl = crawl_link_for_companies_jobs(1)
link_crawl = get_link_job_and_companies
link_crawl[1].each do |path|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{path}"))))
if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil
......@@ -274,8 +294,8 @@ class Interface_web
crawl_data_jobs_interface_2(path)
elsif page.search(".DetailJobNew ul li").size == 10
crawl_data_jobs_interface_5(path)
elsif page.search(".DetailJobNew ul li").size == 8
crawl_data_jobs_interface_4(path)
# elsif page.search(".DetailJobNew ul li").size == 8
# crawl_data_jobs_interface_4(path)
else
crawl_data_jobs_interface_3(path)
end
......
require 'src/crawler'
namespace :db do
task populate: :environment do
# Clawler.make_industries
Clawler.make_cities
Clawler.make_industries
Clawler.make_companies
Clawler.make_jobs
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment