Commit ad336f02 by Hung0326 Committed by GitHub

Merge pull request #8 from Hung0326/dev

fix crawler
parents 81cfe475 f5d71986
......@@ -687,7 +687,6 @@ $main-color: #221f20;
.box_info_salary {
font-size: 13px;
color: #008563;
font-weight: 600;
transform: translateY(-2px);
}
.box_btn_favotite {
......@@ -818,7 +817,7 @@ $main-color: #221f20;
}
.box_text_five_jobs.box_padding_city {
background-color: #da6d2e;
background-image: linear-gradient(to right, #86cb49, #169b74, #86cb49);
color: white;
font-weight: 600;
margin-bottom: 0px !important;
......@@ -849,12 +848,13 @@ $main-color: #221f20;
cursor: pointer;
color: white;
text-align: center;
font-size: 18px;
font-size: 17px;
font-weight: bold;
}
.ct_jobs_count {
cursor: pointer;
font-size: 16px;
text-align: center;
color: #999;
}
......
......@@ -6,7 +6,7 @@ class City < ApplicationRecord
hash = {}
data_cities = City.all
data_cities.each do |val|
hash[val.id] = val.jobs.count
hash[val.name] = val.jobs.count
end
hash.sort_by { |k,v| v }.reverse
end
......
<div class="box_info_city">
<div class="ct_name">
<% data = City.find(k) %>
<%= link_to data.name, '#' ,class: 'link_ct'%>
<%= link_to name_city, '#' ,class: 'link_ct'%>
</div>
<div class="ct_jobs_count">
<%= link_to "#{v} công việc", '#' ,class: 'link_ct'%>
<%= link_to "#{jobs_count} công việc", '#' ,class: 'link_ct'%>
</div>
</div>
\ No newline at end of file
......@@ -5,12 +5,10 @@
<% i = 0%>
<% @five_jobs.each do |val| %>
<div class="box_jobs">
<div class="col-sm-12 d-block d-sm-none">
<div class="col-sm-12 d-block d-sm-none">
<button type="submit" class="btn_favorite_outline xs">
<i class="far fa-heart"></i>
</button>
</button>
</div>
<div class="row">
<div class="col-sm-10 col-md-9 col-lg-10">
......@@ -25,13 +23,13 @@
<div class="loc">
<h5 class="box_info_location"><i class="fas fa-map-marker-alt"></i>
<% dt = [] %>
<% val.cities.each do |x| %>
<% dt << (x.name << ' | ') %>
<% end %>
<% val.cities.each do |x| %>
<% dt << (x.name << ' | ') %>
<% end %>
<%= dt.join('').chomp('| ')%>
</h5>
</div>
<h5 class="box_info_salary"><i class="fas fa-dollar-sign"></i> Lương: <%= val.salary %></h5>
<h5 class="box_info_salary"><i class="fas fa-dollar-sign"></i>&nbsp; Lương: <%= val.salary %></h5>
<div class="coc">
<h5 class="box_info_des"><%= strip_tags(val.description) %></h5>
</div>
......
......@@ -6,7 +6,7 @@
<div class="row no-gutters">
<% @top_city.each do |k,v| %>
<div class="col-lg-4 col-md-4 col-sm-6 col-xs-12">
<%= render 'shared/block_cities_hot',k: k, v: v %>
<%= render 'shared/block_cities_hot',name_city: k, jobs_count: v %>
</div>
<% end %>
<div class="col-lg-4 col-md-4 col-sm-6 col-xs-12">
......
......@@ -12,7 +12,7 @@ class Clawler
list_cities.each do |x|
data_list_cities << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').rstrip
end
puts "Save data to database... \n"
puts "Save data to database... \n------------------------"
data_list_cities.length.times do |i|
area = i > 69 ? 0 : 1
name = (data_list_cities[i].to_s)
......@@ -29,7 +29,7 @@ class Clawler
list_industries.each do |x|
data_list_industries << x.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/,'').strip
end
puts "Save data to database... \n"
puts "Save data to database... \n------------------------"
data_list_industries.length.times do |i|
name = data_list_industries[i].to_s
if name.include?('&amp;')
......@@ -39,23 +39,27 @@ class Clawler
end
end
# FILL DATA COMPANIES
# Seeds the companies table: makes sure the 'Bảo mật' company record exists,
# then stores every crawled company whose name is not already in the database.
# NOTE(review): this span comes from a rendered diff, so the pre-commit and
# post-commit versions of several statements appear back to back — confirm
# against the real file before relying on the exact statement sequence.
def self.make_companies
Company.create!(name: "Bảo mật",
address: "Vui lòng xem trong mô tả công việc",
short_description: "Vui lòng xem trong mô tả công việc")
# Company.create!(name: "Bảo mật",
# address: "Vui lòng xem trong mô tả công việc",
# short_description: "Vui lòng xem trong mô tả công việc")
# Looks the record up by name and address; the block only runs when no match
# exists and a new record is being built.
Company.find_or_create_by(name: 'Bảo mật', address: 'Vui lòng xem trong mô tả công việc') do |company|
company.name = 'Bảo mật'
company.address = 'Vui lòng xem trong mô tả công việc'
company.short_description = 'Vui lòng xem trong mô tả công việc'
end
# Hash with :name, :address and :description arrays, as read below.
@data = Interface_web.craw_data_companies()
puts 'Save info companies to database . . .'
i = 0
@data[:name].each do |n|
if Company.find_by(name: n).blank?
address = @data[:address][i]
short_description = @data[:description][i]
Company.create!(name: n,
puts 'Save info companies to database . . .'
# Address and description are fetched by position, so the three arrays in
# @data are expected to be index-aligned with :name.
@data[:name].each_with_index do |name, index|
# Skip companies that already have a row with the same name.
if Company.find_by(name: name).blank?
address = @data[:address][index]
short_description = @data[:description][index]
Company.create!(name: name,
address: address,
short_description: short_description)
end
i += 1
end
end
# FILL DATA JOBS
......
class Interface_web
# func get "n" link company & job
debugger
def self.crawl_link_for_companies_jobs(page)
puts "Crawling link on page...\nPLease wait...\n"
data = []
......@@ -20,14 +19,13 @@ class Interface_web
website_jobs = website_jobs.join(",")
website_jobs = website_jobs.split(",")
website_jobs = website_jobs.select { |val| val != ''}
puts "Result:\nCompany: #{website_companies.length} link\nJob : #{website_jobs} link"
puts "Result:\nCompany: #{website_companies.length} link\nJob : #{website_jobs.length} link\n------------------------"
data << website_companies << website_jobs
end
@crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(1)
@crawl_link_for_companies_jobs = crawl_link_for_companies_jobs(15)
# Returns the cached result of crawl_link_for_companies_jobs, crawling only
# when @crawl_link_for_companies_jobs is still nil/false (||=).
# NOTE(review): the two assignments below are the old (1) and new (15) argument
# of the same line in a rendered diff; as written the second one can only run
# if the first call returned nil/false — confirm which one the real file keeps.
def self.get_link_job_and_companies
@crawl_link_for_companies_jobs ||= crawl_link_for_companies_jobs(1)
@crawl_link_for_companies_jobs ||= crawl_link_for_companies_jobs(15)
end
def self.base_link(url)
......@@ -36,13 +34,12 @@ class Interface_web
def self.craw_data_companies
link_crawl = get_link_job_and_companies
@data_companies = {}
@data_companies_name = []
@data_companies_address = []
@data_companies_description = []
data_companies = {}
data_companies_name = []
data_companies_address = []
data_companies_description = []
puts 'Crawl data companies'
@current_company = 0
link_crawl[0].each do |url|
link_crawl[0].each_with_index do |url,i|
page = base_link(url)
name = ''
address = ''
......@@ -57,40 +54,37 @@ class Interface_web
desc = page.search(".main-about-us .content").text
end
if (name != "" && address != "" && desc != "")
@data_companies_name << name.to_s.strip
@data_companies_address << address.to_s.strip
@data_companies_description << desc
@current_company += 1
if (name.present? && address.present? && desc.present?)
data_companies_name << name.to_s.strip
data_companies_address << address.to_s.strip
data_companies_description << desc
end
puts "Crawling #{@current_company}"
puts "Process company #{i+1}. . .\n------------------------"
end
@data_companies[:name] = @data_companies_name
@data_companies[:address] = @data_companies_address
@data_companies_description.each do |val|
data_companies[:name] = data_companies_name
data_companies[:address] = data_companies_address
data_companies_description.each do |val|
val.to_s.delete!("[\n,\t,\r]")
val.strip!
end
@data_companies[:description] = @data_companies_description
@data_companies
data_companies[:description] = data_companies_description
data_companies
end
# Copies the ten per-field collections into the @data hash under the keys
# :name, :company_name, :city_name, :created_date, :expiration_date, :salary,
# :industry_name, :description, :level and :exprience.
# NOTE(review): two versions are interleaved here (rendered diff): the first
# header takes no arguments and reads instance variables, the second receives
# the same ten values as parameters. Only one `end` is present — confirm
# against the real file.
def self.add_data
@data[:name] = @name
@data[:company_name] = @company_name
@data[:city_name] = @city_name
@data[:created_date] = @created_date
@data[:expiration_date] = @expiration_date
@data[:salary] = @salary
@data[:industry_name] = @industry_name
@data[:description] = @description
@data[:level] = @level
@data[:exprience] = @exprience
def self.add_data(name, company_name, city_name, created_date, expiration_date, salary, industry_name, description, level, exprience)
@data[:name] = name
@data[:company_name] = company_name
@data[:city_name] = city_name
@data[:created_date] = created_date
@data[:expiration_date] = expiration_date
@data[:salary] = salary
@data[:industry_name] = industry_name
@data[:description] = description
@data[:level] = level
@data[:exprience] = exprience
end
def self.crawl_data_jobs_interface_1(url)
page = base_link(url)
def self.crawl_data_jobs_interface_1(page)
@name << page.search(".apply-now-content .job-desc .title").text
@company_name << page.search(".apply-now-content .job-desc .job-company-name").text
......@@ -109,31 +103,30 @@ class Interface_web
@salary << page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[1].text
industry_name = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a").text
industry_name = industry_name.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''}
@industry_name << industry_name.join(',')
industries = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a").text
industries = industries.delete!("[\n,\t,\r]").split(' ').select { |v| v != ''}
@industry_name << industries.join(',')
@description << page.search(".tabs .tab-content .detail-row:nth-child(n)").to_s.delete!("[\n,\t,\r]")
get_level = page.search(".item-blue .detail-box:last ul li:nth-child(3)").text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
get_level = get_level[1].to_s.strip
if get_level == ""
level = page.search(".item-blue .detail-box:last ul li:nth-child(2)").text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
@level << level[1].to_s.strip
g_level = page.search(".item-blue .detail-box:last ul li:nth-child(2)").text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
@level << g_level[1].to_s.strip
else
level = page.search(".item-blue .detail-box:last ul li:nth-child(3)").text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
@level << level[1].to_s.strip
g_level = page.search(".item-blue .detail-box:last ul li:nth-child(3)").text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
@level << g_level[1].to_s.strip
end
exprience = page.search(".item-blue .detail-box:last ul li:nth-child(2)").text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
exprience = exprience[1].to_s.strip
@exprience << exprience
exp = page.search(".item-blue .detail-box:last ul li:nth-child(2)").text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
exp = exp[1].to_s.strip
@exprience << exp
add_data()
add_data(@name, @company_name, @city_name, @created_date, @expiration_date, @salary, @industry_name, @description, @level, @exprience)
end
def self.crawl_data_jobs_interface_2(url)
page = base_link(url)
def self.crawl_data_jobs_interface_2(page)
@name << page.search(".apply-now-content .job-desc .title").text
......@@ -168,12 +161,7 @@ class Interface_web
else
@level << lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
end
# if exp == ""
# @exprience << ""
# else
# @exprience << exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# end
exp = page.search(".info li:nth-child(6)").text
if exp.blank?
@exprience << ""
......@@ -181,68 +169,11 @@ class Interface_web
@exprience << exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
end
add_data()
add_data(@name, @company_name, @city_name, @created_date, @expiration_date, @salary, @industry_name, @description, @level, @exprience)
end
# Scrapes one job posting that uses the ".intro_job" / ".DetailJobNew" layout
# and appends exactly one value per field to the collector arrays (@name,
# @company_name, @city_name, @created_date, @expiration_date, @salary,
# @industry_name, @description, @level, @exprience).
#
# url - handed unchanged to base_link; the object it returns must respond to
#       #search(css_selector).
#
# Ends by calling add_data, whose return value becomes this method's result.
def self.crawl_data_jobs_interface_3(url)
  page = base_link(url)

  @name << page.search(".intro_job h1").text
  @company_name << page.search(".info-company .text-job h2").text
  @city_name << page.search(".DetailJobNew ul li:nth-child(1) a").text
  # This layout exposes no creation date, so an empty placeholder keeps the
  # arrays index-aligned.
  @created_date << ""
  @expiration_date << page.search(".DetailJobNew .info ul li:nth-child(3) p").text.strip
  @salary << page.search(".DetailJobNew .salary ul li:nth-child(3) p").text.strip
  @industry_name << page.search(".DetailJobNew .salary ul li:nth-child(2) p").text.strip
  # Non-bang delete: String#delete! returns nil when no character was removed,
  # which used to push nil instead of the markup. The argument is a character
  # set (not a regex), so besides \n, \t and \r it also strips the literal
  # '[', ',' and ']' characters — kept as-is to preserve the stored data.
  @description << page.search(".content_job .detail-row").to_s.delete("[\n,\t,\r]")
  @level << page.search(".DetailJobNew .info ul li:nth-child(2) p").text.strip
  @exprience << page.search(".DetailJobNew .info ul li:nth-child(1) p").text.strip

  add_data()
end
# Scrapes one job posting that uses the ".info-company" / ".DetailJobNew"
# layout and appends exactly one value per field to the collector arrays
# (@name, @company_name, @city_name, @created_date, @expiration_date, @salary,
# @industry_name, @description, @level, @exprience).
#
# url - handed unchanged to base_link; the object it returns must respond to
#       #search(css_selector).
#
# Ends by calling add_data, whose return value becomes this method's result.
def self.crawl_data_jobs_interface_4(url)
  page = base_link(url)
  # Character set (not a regex): removes \n, \t, \r and also the literal
  # '[', ',' and ']' characters, exactly like the original argument.
  strip_set = "[\n,\t,\r]"

  @name << page.search(".info-company h1").text

  # Two variants of this layout exist: without a ".zone-company" header the
  # company name and the industries live under different selectors.
  if page.search(".zone-company .text-job h2").text == ""
    @company_name << page.search(".info-company .text-job h2").text
    industry_text = page.search(".DetailJobNew li:nth-child(3) span").text.strip
  else
    @company_name << page.search(".zone-company .text-job h2").text.strip
    industry_text = page.search(".DetailJobNew li:nth-child(3) span a").text
  end
  # Non-bang delete: String#delete! returns nil when nothing was removed, which
  # made the chained split raise NoMethodError. split(' ') never yields empty
  # strings, so no extra filtering is needed.
  # NOTE(review): this pushes an Array of words, not a joined String — confirm
  # that consumers of @industry_name expect that shape.
  @industry_name << industry_text.delete(strip_set).split(' ')

  @city_name << page.search(".DetailJobNew ul li:nth-child(1) a").text
  # No creation date / experience on this layout; placeholders keep the arrays
  # index-aligned.
  @created_date << ""
  @expiration_date << page.search(".DetailJobNew li:nth-child(7) span").text
  @salary << page.search(".DetailJobNew li:nth-child(6) span").text
  @description << page.search(".left-col").to_s.delete(strip_set)
  @level << page.search(".DetailJobNew ul li:nth-child(2) span").text
  @exprience << ""

  add_data()
end
def self.crawl_data_jobs_interface_5(url)
page = base_link(url)
def self.crawl_data_jobs_interface_5(page)
# page = base_link(url)
@name << page.search(".info-company h1").text
@company_name << page.search(".info-company .text-job h2").text
......@@ -263,7 +194,7 @@ class Interface_web
@exprience << page.search(".DetailJobNew li:nth-child(5) span").text.strip
add_data()
add_data(@name, @company_name, @city_name, @created_date, @expiration_date, @salary, @industry_name, @description, @level, @exprience)
end
def self.make_data
......@@ -281,24 +212,25 @@ class Interface_web
@city_name = []
link_crawl = get_link_job_and_companies
total_jobs = link_crawl[1].length
current_job = 1
link_crawl[1].each do |path|
link_crawl[1].each_with_index do |path,i|
page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{path}"))))
if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0] != nil
crawl_data_jobs_interface_1(path)
crawl_data_jobs_interface_1(page)
elsif page.search("section .template-200").text != ""
crawl_data_jobs_interface_2(path)
crawl_data_jobs_interface_2(page)
elsif (page.search(".DetailJobNew ul li").size == 10 && !page.search('.right-col ul li').text.include?('Độ tuổi'))
crawl_data_jobs_interface_5(path)
crawl_data_jobs_interface_5(page)
end
puts "Process: #{current_job}/#{total_jobs}"
current_job += 1
puts "Process: #{i+1}/#{link_crawl[1].length}"
end
@data
end
end
# else # insert "page.search(".DetailJobNew ul li").size == 8" (if want catch interface 4)
# crawl_data_jobs_interface_3(path)
\ No newline at end of file
require 'src/crawler'
# Rake task db:populate (depends on :environment): runs the Clawler seeders in
# order — industries, cities, companies, jobs.
# NOTE(review): the commented-out calls are the pre-commit versions of the live
# calls that follow them (rendered diff) — confirm against the real file.
namespace :db do
task populate: :environment do
# Clawler.make_industries
# Clawler.make_cities
Clawler.make_industries
Clawler.make_cities
Clawler.make_companies
# Clawler.make_jobs
Clawler.make_jobs
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment