Commit 4a66e48a by Ngô Trung Hưng

craw 90%

parent a1f70d89
......@@ -6,6 +6,7 @@ class HomeController < ApplicationController
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
make_data
make_data
# craw_data_companies
end
end
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password: '12345678'
password: '1'
socket: /var/run/mysqld/mysqld.sock
......
require 'open-uri'
class Clawler
@page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# FILL DATA CITIES
# Creates one City record per <option> found under "#location" on @page.
#
# The option list is serialised to a string and cut at every "</option>";
# the opening tag and any newlines are removed from each piece and trailing
# whitespace is trimmed. Entries at positions 0..69 are stored with area 1,
# every later entry with area 0.
# NOTE(review): the 69 cut-off presumably separates two groups of locations
# on the source page - confirm against the live markup.
#
# Side effects: fills @data_list_cities with the cleaned names and calls
# City.create! once per entry.
# Returns the number of entries processed.
def self.make_cities
  option_markup = @page.search("#location option").to_s
  @data_list_cities = option_markup.split("</option>").map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end

  @data_list_cities.each_with_index do |city_name, position|
    area = position <= 69 ? 1 : 0
    City.create!(name: city_name, area: area)
  end
  @data_list_cities.length
end
# FILL DATA INDUSTRIES
# Creates one Industry record per <option> found under "#industry" on @page.
#
# The option list is serialised to a string and cut at every "</option>";
# the opening tag and any newlines are removed from each piece. Names are
# stripped on both sides and the "&amp;" entity produced by the HTML
# serialisation is turned back into "&", so a name such as "A &amp; B" is
# stored as "A & B".
#
# Side effects: fills @data_list_industries with the cleaned names and calls
# Industry.create! once per entry.
def self.make_industries
  option_markup = @page.search("#industry option").to_s
  @data_list_industries = option_markup.split("</option>").map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/, '').strip.gsub('&amp;', '&')
  end

  @data_list_industries.each do |industry_name|
    Industry.create!(name: industry_name)
  end
end
# CRAWLER LINK JOB & COMPANIES
# Collects the relative company and job links from the job listing pages.
#
# For every listing page the href values matched by the two selectors are
# read as one concatenated string and cut at the site root, which leaves the
# paths relative to https://careerbuilder.vn/.
#
# @param num_page_will_crawl [Integer] number of listing pages to fetch,
#   starting at page 1 (defaults to 1, the previously hard-coded value)
# @return [Array(Array<String>, Array<String>)] element 0 holds the unique
#   company paths, element 1 the job paths (duplicates kept); empty strings
#   are dropped from both
def self.crawl_link_for_companies_jobs(num_page_will_crawl = 1)
  site_root = 'https://careerbuilder.vn/'
  website_companies = []
  website_jobs = []

  num_page_will_crawl.times do |i|
    page = Nokogiri::HTML(URI.open("#{site_root}viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
    website_companies.concat(page.search(".figcaption .caption a/@href").text.to_s.split(site_root))
    website_jobs.concat(page.search(".figcaption .title .job_link @href").text.to_s.split(site_root))
  end

  # Array#uniq (not uniq!) is required here: uniq! returns nil when the list
  # already has no duplicates, which used to crash the following filter.
  [website_companies.reject(&:empty?).uniq, website_jobs.reject(&:empty?)]
end
# CRAWLER DATA COMPANIES
# Fetches every company page returned by crawl_link_for_companies_jobs and
# extracts name, address and description.
#
# Two page layouts are handled: when the ".company-info" name selector yields
# no text, the "#cp_company_name" family of selectors is used instead.
# A company is kept only when all three fields are non-empty.
#
# Side effects: performs one HTTP request per company link and stores the
# collected values in @data_companies_name, @data_companies_address,
# @data_companies_description and @data_companies.
#
# @return [Hash{Symbol => Array<String>}] parallel arrays under :name,
#   :address and :description (same index = same company)
def self.craw_data_companies
  @data_companies_name = []
  @data_companies_address = []
  @data_companies_description = []

  crawl_link_for_companies_jobs.first.each do |path|
    # URI.escape no longer exists on Ruby 3.0+; the default parser's #escape
    # applies the same percent-encoding to spaces and non-ASCII characters.
    url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
    page = Nokogiri::HTML(URI.open(URI.parse(url)))

    name = page.search(".company-info .info .content .name").text
    if name == ""
      # Alternative page layout.
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end
    next if name == "" || address == "" || desc == ""

    @data_companies_name << name.to_s.rstrip
    @data_companies_address << address.to_s.rstrip
    # String#delete takes a character set, not a regex: list only the
    # control characters so commas and square brackets in the text survive.
    @data_companies_description << desc.to_s.delete("\n\t\r").strip
  end

  @data_companies = {
    name: @data_companies_name,
    address: @data_companies_address,
    description: @data_companies_description
  }
end
# FILL DATA COMPANIES
# Inserts one Company row per company returned by craw_data_companies.
#
# The crawl result is a hash of parallel arrays (:name, :address,
# :description); entries sharing an index describe the same company.
#
# Side effects: stores the crawl result in @data and calls Company.create!
# once per company.
def self.make_companies
  @data = craw_data_companies
  rows = @data[:name].zip(@data[:address], @data[:description])
  rows.each do |name, address, short_description|
    Company.create!(name: name,
                    address: address,
                    short_description: short_description)
  end
end
end
\ No newline at end of file
# Placeholder class: it currently defines no methods or state.
# NOTE(review): the inner comment suggests it is meant to host the job and
# company link crawling - confirm before relying on it.
class Linkcrawl
# CRAWLER LINK JOB & COMPANIES
end
\ No newline at end of file
require 'open-uri'
require 'src/interface_web'
class Clawler
@page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# FILL DATA CITIES
# Populates the cities table from the "#location" <option> list of @page.
#
# Each piece of the serialised option list (cut at "</option>") has its
# opening tag and newlines removed and its trailing whitespace trimmed, and
# is appended to @data_list_cities. The first 70 entries (index 0..69) are
# created with area 1, all remaining entries with area 0.
# NOTE(review): the meaning of the two area values is not visible here -
# confirm with the City model.
#
# Returns the number of entries processed.
def self.make_cities
  @data_list_cities = []
  @page.search("#location option").to_s.split("</option>").each do |piece|
    without_tag = piece.gsub(/(^<[\w\D]*>)/, '')
    @data_list_cities << without_tag.gsub(/\n/,'').rstrip
  end

  total = @data_list_cities.length
  total.times do |index|
    area_flag = index > 69 ? 0 : 1
    City.create!(name: @data_list_cities[index].to_s, area: area_flag)
  end
end
# FILL DATA INDUSTRIES
# Populates the industries table from the "#industry" <option> list of @page.
#
# Each piece of the serialised option list (cut at "</option>") has its
# opening tag and newlines removed and is stripped on both sides. Before a
# name is saved, every "&amp;" in it is replaced by "&" in place, so
# @data_list_industries ends up holding the decoded names as well.
#
# Returns the number of entries processed.
def self.make_industries
  fragments = @page.search("#industry option").to_s.split("</option>")
  @data_list_industries = fragments.map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').strip
  end

  @data_list_industries.each do |industry_name|
    # gsub! is a no-op (returning nil) when the entity is absent.
    industry_name.gsub!('&amp;','&')
    Industry.create!(name: industry_name)
  end
  @data_list_industries.length
end
# FILL DATA COMPANIES
# Inserts a placeholder company followed by every crawled company.
#
# The placeholder row is created before the crawl starts.
# NOTE(review): presumably this is the row make_jobs falls back to through
# the hard-coded company id 1 - confirm it is always the first insert.
#
# The crawl result from Interface_web.craw_data_companies is kept in @data
# and read as parallel arrays under :name, :address and :description.
# Returns the number of crawled companies inserted.
def self.make_companies
  Company.create!(
    name: "Company name",
    address: "Vui lòng trong mô tả công việc",
    short_description: "Vui lòng trong mô tả công việc"
  )

  @data = Interface_web.craw_data_companies()
  company_count = @data[:name].length
  0.upto(company_count - 1) do |n|
    attributes = {
      name: @data[:name][n],
      address: @data[:address][n],
      short_description: @data[:description][n]
    }
    Company.create!(**attributes)
  end
  company_count
end
# FILL DATA JOBS
# Inserts one Job per entry returned by Interface_web.make_data and links it
# to its industries and cities.
#
# The crawl result is kept in @data_jobs and read as parallel arrays; every
# scalar field is converted with to_s before saving. The owning company is
# looked up by its stripped name; when no company matches, company id 1 is
# used instead.
# NOTE(review): the experience values are read from the key :exprience -
# that spelling must match what Interface_web.make_data emits; confirm.
#
# Returns the number of jobs processed.
def self.make_jobs
  @data_jobs = Interface_web.make_data()
  total = @data_jobs[:name].length
  total.times do |n|
    text_at = ->(key) { @data_jobs[key][n].to_s }

    job_name = text_at.call(:name)
    employer = Company.find_by(name: text_at.call(:company_name).strip)
    company_id = employer.nil? ? 1 : employer.id

    job = Job.create!(
      name: job_name,
      company_id: company_id,
      level: text_at.call(:level),
      experience: text_at.call(:exprience),
      salary: text_at.call(:salary),
      create_date: text_at.call(:created_date),
      expiration_date: text_at.call(:expiration_date),
      description: text_at.call(:description)
    )

    make_foreign_industries_table(@data_jobs[:industry_name][n], job.id)
    make_cities_cities_table(@data_jobs[:city_name][n], job.id)
  end
end
# Links a job to every industry named in a comma-separated list.
#
# Each name is stripped and looked up with Industry.find_by; a missing
# industry is created on the spot. One IndustryJob row is then created per
# name. The split list is kept in @content.
#
# @param data [String] comma-separated industry names
# @param id_job [Object] id of the job to link (passed through as job_id)
# @return [Integer] number of names processed
def self.make_foreign_industries_table(data,id_job)
  @content = data.split(',')
  @content.each do |raw_name|
    existing = Industry.find_by(name: raw_name.strip)
    industry_id = existing ? existing.id : Industry.create!(name: raw_name.strip).id
    IndustryJob.create!(industry_id: industry_id, job_id: id_job)
  end
  @content.length
end
# Links a job to every city named in a delimited list.
#
# The list is split on "," when it contains a comma, otherwise on "|".
# Each name is stripped and looked up with City.find_by; a missing city is
# created on the spot (with a name only). One CityJob row is then created
# per name. The split list is kept in @content.
#
# @param data [String] city names separated by "," or "|"
# @param id_job [Object] id of the job to link (passed through as job_id)
# @return [Integer] number of names processed
def self.make_cities_cities_table(data,id_job)
  separator = data.include?(',') ? ',' : '|'
  @content = data.split(separator)
  @content.each do |raw_name|
    existing = City.find_by(name: raw_name.strip)
    city_id = existing ? existing.id : City.create!(name: raw_name.strip).id
    CityJob.create!(job_id: id_job, city_id: city_id)
  end
  @content.length
end
end
\ No newline at end of file
require 'helper/crawler'
require 'src/crawler'
namespace :db do
  # Runs the four Clawler import steps in sequence after the :environment
  # prerequisite. Companies are imported before jobs because make_jobs looks
  # companies up by name.
  desc 'Populate cities, industries, companies and jobs from crawled careerbuilder.vn data'
  task populate: :environment do
    Clawler.make_cities
    Clawler.make_industries
    Clawler.make_companies
    Clawler.make_jobs
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment