Commit a1f70d89 by Ngo Trung Hung

Finish 50% of the job crawler

parent c797139b
class ApplicationController < ActionController::Base
require 'nokogiri'
require 'nokogiri'
require 'open-uri'
include CrawlerHelper
......
......@@ -3,6 +3,9 @@ class HomeController < ApplicationController
# Home page action. As written it calls crawl_data_jobs_interface_3 and then
# make_data; the other crawl_data_jobs_interface_N calls are commented out.
# NOTE(review): the crawl_* helpers and make_data are not defined in this view
# (presumably they come from CrawlerHelper) - confirm.
# NOTE(review): the live call to interface_3 and the commented-out copy right
# below it look like the before/after lines of a diff hunk - confirm which one
# is meant to be active.
def index
# crawl_data_jobs_interface_1()
# crawl_data_jobs_interface_2()
crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
make_data
end
end
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password: '1'
password: '12345678'
socket: /var/run/mysqld/mysqld.sock
......
require 'open-uri'
# Scraper for careerbuilder.vn that seeds City, Industry and Company records.
#
# All entry points are class methods. The class name and the method name
# +craw_data_companies+ keep their original spelling so existing callers
# continue to work.
class Clawler
  BASE_URL = 'https://careerbuilder.vn/'
  FILTER_PAGE_URL = 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'

  # Options at positions 0..69 of the location dropdown are stored with
  # area 1, every later option with area 0.
  # NOTE(review): the meaning of the two area values is not visible here -
  # confirm against the City model.
  LAST_AREA_ONE_INDEX = 69

  # FILL DATA CITIES
  #
  # Creates one City per <option> of the "#location" dropdown.
  #
  # @return [Integer] number of City records created
  # @raise whatever City.create! raises when a record is rejected
  def self.make_cities
    names = option_names('#location option')
    names.each_with_index do |name, index|
      City.create!(name: name, area: index <= LAST_AREA_ONE_INDEX ? 1 : 0)
    end
    names.length
  end

  # FILL DATA INDUSTRIES
  #
  # Creates one Industry per <option> of the "#industry" dropdown.
  #
  # @return [Integer] number of Industry records created
  # @raise whatever Industry.create! raises when a record is rejected
  def self.make_industries
    names = option_names('#industry option')
    names.each { |name| Industry.create!(name: name) }
    names.length
  end

  # CRAWLER LINK JOB & COMPANIES
  #
  # Collects the company and job links found on the first +pages+ listing
  # pages. Links are returned relative to BASE_URL.
  #
  # @param pages [Integer] how many listing pages to visit (default 1)
  # @return [Array(Array<String>, Array<String>)] unique company links first,
  #   then job links (job links are not de-duplicated)
  def self.crawl_link_for_companies_jobs(pages: 1)
    company_links = []
    job_links = []
    (1..pages).each do |page_number|
      listing = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"))
      company_links.concat(split_links(listing.search('.figcaption .caption a/@href').text))
      job_links.concat(split_links(listing.search('.figcaption .title .job_link @href').text))
    end
    # Non-bang uniq: uniq! returns nil when there is nothing to remove, which
    # used to crash the next call in the chain.
    [company_links.uniq, job_links]
  end

  # CRAWLER DATA COMPANIES
  #
  # Visits every company link and extracts name, address and description.
  # A company is skipped when any of the three fields is empty.
  #
  # @return [Hash{Symbol => Array<String>}] parallel arrays under the keys
  #   :name, :address and :description
  def self.craw_data_companies
    company_paths, _job_paths = crawl_link_for_companies_jobs
    data = { name: [], address: [], description: [] }
    company_paths.each do |path|
      # URI.escape was removed in Ruby 3.0; the default parser still offers
      # the same percent-encoding.
      url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
      name, address, description = company_fields(Nokogiri::HTML(URI.open(URI.parse(url))))
      next if name.empty? || address.empty? || description.empty?

      data[:name] << name.rstrip
      data[:address] << address.rstrip
      # Only line breaks and tabs are removed. The previous character set
      # "[\n,\t,\r]" also deleted commas and square brackets from the text.
      data[:description] << description.delete("\n\t\r").strip
    end
    data
  end

  # FILL DATA COMPANIES
  #
  # Creates one Company per entry returned by craw_data_companies.
  #
  # @return [Integer] number of Company records created
  # @raise whatever Company.create! raises when a record is rejected
  def self.make_companies
    data = craw_data_companies
    rows = data[:name].zip(data[:address], data[:description])
    rows.each do |name, address, short_description|
      Company.create!(name: name, address: address, short_description: short_description)
    end
    rows.length
  end

  # Filter page holding the location and industry dropdowns. Fetched on first
  # use and memoized, so merely loading this class no longer hits the network.
  def self.page
    @page ||= Nokogiri::HTML(URI.open(FILTER_PAGE_URL))
  end

  # Visible text of every <option> matched by +selector+, with the opening
  # tag, newlines and trailing whitespace removed.
  def self.option_names(selector)
    page.search(selector).to_s.split('</option>').map do |fragment|
      fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/, '').rstrip
    end
  end

  # Splits a run of concatenated absolute links into paths relative to
  # BASE_URL, dropping the empty pieces produced by the split.
  def self.split_links(text)
    text.to_s.split(BASE_URL).reject(&:empty?)
  end

  # Reads [name, address, description] from a company page. The primary
  # selectors are tried first; when they yield no name, the alternative
  # "#cp_company_name" layout is used instead.
  def self.company_fields(doc)
    name = doc.search('.company-info .info .content .name').text
    if name == ''
      [doc.search('.section-page #cp_company_name').text,
       doc.search('.section-page .cp_basic_info_details ul li:nth-child(1)').text,
       doc.search('.cp_aboutus_item .content_fck').text]
    else
      [name,
       doc.search('.company-info .info .content p:nth-child(3)').text,
       doc.search('.main-about-us .content').text]
    end
  end

  private_class_method :page, :option_names, :split_links, :company_fields
end
\ No newline at end of file
require 'open-uri'
require 'helper/crawler'
namespace :db do
# db:populate - declared with an :environment prerequisite. Runs the three
# seeding steps below in order: cities, then industries, then companies.
# The make_* methods it calls are the ones defined further down in this
# namespace block; make_jobs is not invoked by this task.
task populate: :environment do
make_cities
make_industries
make_companies
end
# CRAWLER ALL CITIES
# Filter page read by make_cities and make_industries through the $page
# global. It is fetched as soon as this file is loaded.
# The second page that used to be fetched here into the local `p1` was never
# read (methods defined with `def` cannot see block locals), so that extra
# HTTP request has been dropped.
$page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
# Creates one City per <option> found under "#location" on $page.
# Each option's opening tag, newlines and trailing whitespace are removed to
# obtain the name. Options at positions 0..69 are stored with area 1, later
# ones with area 0.
# Returns the number of options processed.
def make_cities
  option_markup = $page.search("#location option").to_s
  @data_list_cities = option_markup.split("</option>").map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end
  @data_list_cities.each_with_index do |city_name, position|
    region = position > 69 ? 0 : 1
    City.create!(name: city_name.to_s, area: region)
  end
  @data_list_cities.length
end
# CRAWLER ALL INDUSTRIES
# Creates one Industry per <option> found under "#industry" on $page.
# Each option's opening tag, newlines and trailing whitespace are removed to
# obtain the name.
# Returns the number of options processed.
def make_industries
  option_markup = $page.search("#industry option").to_s
  @data_list_industries = option_markup.split("</option>").map do |fragment|
    fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
  end
  @data_list_industries.each do |industry_name|
    Industry.create!(name: industry_name.to_s)
  end
  @data_list_industries.length
end
# CRAWLER LINK JOB & COMPANIES
# Collects the company and job links found on the first +pages+ listing pages
# of careerbuilder.vn. Links are returned relative to the site root.
#
# @param pages [Integer] how many listing pages to visit (default 3)
# @return [Array(Array<String>, Array<String>)] unique company links first,
#   then job links (job links are not de-duplicated)
def crawl_link_for_companies_jobs(pages: 3)
  site_root = 'https://careerbuilder.vn/'
  # The hrefs of a page arrive as one concatenated string; splitting on the
  # site root separates them, and the empty pieces from the split are dropped.
  # Links are no longer joined and re-split on ",", so a comma inside a link
  # does not cut it in two.
  relative_links = ->(text) { text.to_s.split(site_root).reject(&:empty?) }

  company_links = []
  job_links = []
  (1..pages).each do |page_number|
    page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"))
    company_links.concat(relative_links.call(page.search(".figcaption .caption a/@href").text))
    job_links.concat(relative_links.call(page.search(".figcaption .title .job_link @href").text))
  end
  # Non-bang uniq: uniq! returns nil when there is nothing to remove, which
  # used to crash the following select.
  [company_links.uniq, job_links]
end
# CRAWLER COMPANIES
# Creates one Company per entry returned by craw_data_companies.
#
# @return [Integer] number of Company records created
# @raise whatever Company.create! raises when a record is rejected
def make_companies
  data = craw_data_companies
  rows = data[:name].zip(data[:address], data[:description])
  rows.each do |name, address, short_description|
    Company.create!(name: name, address: address, short_description: short_description)
  end
  rows.length
end

# Visits every company link returned by crawl_link_for_companies_jobs and
# extracts name, address and description. A company is skipped when any of the
# three fields is empty.
#
# Defined next to make_companies instead of inside it, so it is no longer
# redefined on every make_companies call.
#
# @return [Hash{Symbol => Array<String>}] parallel arrays under the keys
#   :name, :address and :description
def craw_data_companies
  company_paths = crawl_link_for_companies_jobs[0]
  data = { name: [], address: [], description: [] }
  company_paths.each do |path|
    # URI.escape was removed in Ruby 3.0; the default parser still offers the
    # same percent-encoding.
    url = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
    page = Nokogiri::HTML(URI.open(URI.parse(url)))

    name = page.search(".company-info .info .content .name").text
    if name == ""
      # No name under the primary selectors: read the alternative layout.
      name = page.search(".section-page #cp_company_name").text
      address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
      desc = page.search(".cp_aboutus_item .content_fck").text
    else
      address = page.search(".company-info .info .content p:nth-child(3)").text
      desc = page.search(".main-about-us .content").text
    end
    next if name.empty? || address.empty? || desc.empty?

    data[:name] << name.rstrip
    data[:address] << address.rstrip
    # Only line breaks and tabs are removed. The previous character set
    # "[\n,\t,\r]" also deleted commas and square brackets from the text.
    data[:description] << desc.delete("\n\t\r").strip
  end
  data
end
# Runs the Clawler seeding steps in order - cities, industries, companies -
# and returns whatever the last step (Clawler.make_companies) returns.
def make_jobs
  steps = %i[make_cities make_industries make_companies]
  steps.map { |step| Clawler.public_send(step) }.last
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment