Commit c797139b by Hung0326 Committed by GitHub

Merge pull request #5 from Hung0326/dev

40% create crawler job
parents e529714f 1326d754
......@@ -20,7 +20,7 @@ gem 'coffee-rails', '~> 4.2'
gem 'turbolinks', '~> 5'
# Build JSON APIs with ease. Read more: https://github.com/rails/jbuilder
gem 'jbuilder', '~> 2.5'
gem 'mechanize', '2.7.6'
gem 'nokogiri'
# Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0'
# Use ActiveModel has_secure_password
......
......@@ -72,17 +72,12 @@ GEM
execjs
coffee-script-source (1.12.2)
concurrent-ruby (1.1.6)
connection_pool (2.2.3)
crass (1.0.6)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
erubi (1.9.0)
execjs (2.7.0)
ffi (1.13.1)
globalid (0.4.2)
activesupport (>= 4.2.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.8.3)
concurrent-ruby (~> 1.0)
io-like (0.3.1)
......@@ -100,32 +95,16 @@ GEM
mini_mime (>= 0.1.1)
marcel (0.3.3)
mimemagic (~> 0.3.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (1.0.0)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2020.0512)
mimemagic (0.3.5)
mini_mime (1.0.2)
mini_portile2 (2.4.0)
minitest (5.14.1)
msgpack (1.3.3)
mysql2 (0.5.3)
net-http-digest_auth (1.4.1)
net-http-persistent (4.0.0)
connection_pool (~> 2.2)
nio4r (2.5.2)
nokogiri (1.10.10)
mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
public_suffix (4.0.5)
puma (3.12.6)
rack (2.2.3)
......@@ -201,15 +180,11 @@ GEM
thread_safe (~> 0.1)
uglifier (4.2.0)
execjs (>= 0.3.0, < 3)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.7)
web-console (3.7.0)
actionview (>= 5.0)
activemodel (>= 5.0)
bindex (>= 0.4.0)
railties (>= 5.0)
webrobots (0.1.2)
websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
......@@ -227,8 +202,8 @@ DEPENDENCIES
coffee-rails (~> 4.2)
jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2)
mechanize (= 2.7.6)
mysql2 (= 0.5.3)
nokogiri
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0)
......
# Base controller for the application.
#
# Loads the HTML-parsing and remote-open libraries and mixes CrawlerHelper
# into this class, so subclasses can call the helper's methods directly.
class ApplicationController < ActionController::Base
  # Libraries are required here, in the class body, before the helper is mixed in.
  require 'nokogiri'
  require 'open-uri'

  include CrawlerHelper
end
# Controller for the home page; its only action triggers crawler routines.
class HomeController < ApplicationController
  # Runs the company crawler and then the third job-interface crawler.
  def index
    craw_data_companies

    # Scratch code kept disabled: it took a sample company description,
    # removed "\n", "\t", "\r" and comma characters, trimmed both ends and
    # assigned the result to @data.

    # Disabled alternative job crawlers:
    # crawl_data_jobs_interface_1
    # crawl_data_jobs_interface_2
    crawl_data_jobs_interface_3
  end
end
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password: '12345678'
password: '1'
socket: /var/run/mysqld/mysqld.sock
......
......@@ -9,7 +9,7 @@ threads threads_count, threads_count
# Specifies the `port` that Puma will listen on to receive requests; default is 3000.
#
port ENV.fetch("PORT") { 1111 }
port ENV.fetch("PORT") { 3000 }
# Specifies the `environment` that Puma will run in.
#
......
require 'open-uri'
namespace :db do
task populate: :environment do
task populate: :environment do
make_cities
make_industries
make_companies
end
# CRAWLER ALL CITIES
task make_cities: :environment do
@data_list_cities = []
agent = Mechanize.new
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
data = page.search("#location option")
$page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
p1 = Nokogiri::HTML(URI.open('https://careerbuilder.vn/vi/tim-viec-lam/vinhomes-chuyen-vien-thu-tuc-bat-dong-san.35B449B5.html'))
def make_cities
@data_list_cities = []
data = $page.search("#location option")
list_cities = data.to_s.split("</option>")
list_cities.each do |x|
@data_list_cities << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end
for i in 0..(@data_list_cities.length - 1)
@data_list_cities.length.times do |i|
if i <= 69
name = (@data_list_cities[i].to_s)
City.create!(name: name, area: 1)
......@@ -25,72 +29,79 @@ namespace :db do
end
# CRAWLER ALL INDUSTRIES
task make_industry: :environment do
@data_list_industries = []
agent = Mechanize.new
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")
data = page.search("#industry option")
def make_industries
@data_list_industries = []
data = $page.search("#industry option")
list_industries = data.to_s.split("</option>")
list_industries.each do |x|
@data_list_industries << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end
for i in 0..(@data_list_industries.length - 1)
@data_list_industries.length.times do |i|
name = (@data_list_industries[i].to_s)
Industry.create!(name: name)
end
end
# CRAWLER COMPANIES
task make_companies: :environment do
def crawl_link_for_companies_data
@website = []
num_page_will_crawl = 1
agent = Mechanize.new
for i in 1..num_page_will_crawl do
page = agent.get("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i}-vi.html")
@website << page.search(".figcaption .caption a/@href").text.to_s.split('https://careerbuilder.vn/')
end
@website = @website.join(",")
@website = @website.split(",").uniq!
@website = @website.select { |val| val != ''}
# CRAWLER LINK JOB & COMPANIES
#
# Collects company and job links from the first listing pages of
# careerbuilder.vn.
#
# Returns a two-element array `[company_links, job_links]`. Each link is the
# text left after splitting the concatenated href values on the
# 'https://careerbuilder.vn/' prefix. Company links are de-duplicated, job
# links are not, and blank entries are dropped from both lists.
#
# Raises whatever URI.open raises when a listing page cannot be fetched.
def crawl_link_for_companies_jobs
  base_url = 'https://careerbuilder.vn/'
  num_page_will_crawl = 3
  website_companies = []
  website_jobs = []

  (1..num_page_will_crawl).each do |page_number|
    page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"))
    # NodeSet#text concatenates every href, so splitting on the site prefix
    # separates the individual links again.
    website_companies.concat(page.search(".figcaption .caption a/@href").text.to_s.split(base_url))
    website_jobs.concat(page.search(".figcaption .title .job_link @href").text.to_s.split(base_url))
  end

  # Use non-destructive `uniq`: `uniq!` returns nil when nothing was removed,
  # which made the following filter raise NoMethodError on a duplicate-free
  # list. Accumulating with `concat` also replaces the join(",")/split(",")
  # round trip, which broke any link containing a comma.
  [website_companies.uniq.reject(&:empty?), website_jobs.reject(&:empty?)]
end
# CRAWLER COMPANIES
def make_companies
def craw_data_companies
link_crawl = crawl_link_for_companies_data()
link_crawl = crawl_link_for_companies_jobs()
@data_companies = {}
@data_companies_name = []
@data_companies_address = []
@data_companies_description = []
agent = Mechanize.new
link_crawl.each do |url|
page = agent.get("https://careerbuilder.vn/#{url}")
link_crawl[0].each do |url|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
name = ''
address = ''
desc = ''
if page.search(".company-info .info .content .name").text == ""
name = page.search(".section-page #cp_company_name").text
@data_companies_name << name.to_s.rstrip
address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
@data_companies_address << address.to_s.rstrip
desc = page.search(".cp_aboutus_item .content_fck").text
@data_companies_description << desc
name = page.search(".section-page #cp_company_name").text
address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
desc = page.search(".cp_aboutus_item .content_fck").text
else
name = page.search(".company-info .info .content .name").text
name = page.search(".company-info .info .content .name").text
address = page.search(".company-info .info .content p:nth-child(3)").text
desc = page.search(".main-about-us .content").text
end
if (name != "" && address != "" && desc != "")
@data_companies_name << name.to_s.rstrip
address = page.search(".company-info .info .content p:nth-child(3)").text
@data_companies_address << address.to_s.rstrip
desc = page.search(".main-about-us .content").text
@data_companies_description << desc
end
end
@data_companies[:name] = @data_companies_name.select { |val| val != ''}
@data_companies[:address] = @data_companies_address.select { |val| val != ''}
@data_companies[:name] = @data_companies_name
@data_companies[:address] = @data_companies_address
@data_companies_description.each do |val|
val.to_s.delete!("[\n,\t,\r]")
val.lstrip!
val.rstrip!
val.strip!
end
@data_companies[:description] = @data_companies_description.select { |val| val != ''}
@data_companies[:description] = @data_companies_description
@data_companies
end
......@@ -108,4 +119,7 @@ namespace :db do
end
end
# Stub for the job-population step: the body is empty, so a call returns nil.
def make_jobs; end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment