Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
venjob_nth
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ngô Trung Hưng
venjob_nth
Commits
0ac0989b
Commit
0ac0989b
authored
Jul 29, 2020
by
Ngô Trung Hưng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix code
parent
8fecd429
Pipeline
#736
failed with stages
in 0 seconds
Changes
5
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
176 additions
and
188 deletions
+176
-188
app/models/city.rb
+1
-0
db/migrate/20200729064551_change_column_table_city.rb
+6
-0
db/schema.rb
+2
-2
lib/src/interface_web.rb
+167
-185
lib/tasks/crawler.rake
+0
-1
No files found.
app/models/city.rb
View file @
0ac0989b
...
...
@@ -4,4 +4,5 @@
class
City
<
ApplicationRecord
has_many
:city_jobs
has_many
:jobs
,
through: :city_jobs
enum
area:
{
international:
0
,
domestic
:
1
,
range:
69
}
end
db/migrate/20200729064551_change_column_table_city.rb
0 → 100644
View file @
0ac0989b
class
ChangeColumnTableCity
<
ActiveRecord
::
Migration
[
5.2
]
def
change
change_column
:cities
,
:area
,
:integer
#Ex:- change_column("admin_users", "email", :string, :limit =>25)
end
end
db/schema.rb
View file @
0ac0989b
...
...
@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord
::
Schema
.
define
(
version:
2020_07_2
8_021412
)
do
ActiveRecord
::
Schema
.
define
(
version:
2020_07_2
9_064551
)
do
create_table
"applied_jobs"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8"
,
force: :cascade
do
|
t
|
t
.
bigint
"user_id"
...
...
@@ -26,7 +26,7 @@ ActiveRecord::Schema.define(version: 2020_07_28_021412) do
create_table
"cities"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8"
,
force: :cascade
do
|
t
|
t
.
string
"name"
t
.
boolean
"area"
t
.
integer
"area"
t
.
datetime
"created_at"
,
null:
false
t
.
datetime
"updated_at"
,
null:
false
end
...
...
lib/src/interface_web.rb
View file @
0ac0989b
...
...
@@ -5,32 +5,45 @@ require 'open-uri'
# Crawler data
class
Crawler
COMPANY_SECURITY
=
1
NUMBER_LINK
=
1
NUMBER_LINK
=
2
SIZE_LI_INTERFACE_5
=
10
INTERNATIONAL
=
0
DOMESTIC
=
1
RANGE
=
69
def
path_to_first_link
Rails
.
root
.
join
(
'tmp'
,
'link.txt'
)
end
def
logger
@logger
||=
Logger
.
new
(
Rails
.
root
.
join
(
'log'
,
'crawler.log'
))
end
def
stop_crawler
file
=
File
.
readlines
(
path_to_first_link
,
'r'
)
if
File
.
exist?
(
path_to_first_link
)
file
.
blank?
?
''
:
file
.
join
end
def
safe_link
(
url
)
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
url
))))
end
def
crawl_link
(
page
)
puts
"Crawling link on page...
\n
PLease wait...
\n
"
data
=
[]
website_companies
=
[]
website_jobs
=
[]
file
=
File
.
readlines
(
'tmp/link.txt'
,
'r'
)
if
File
.
exist?
(
'tmp/link.txt'
)
@@stop_crawl
=
file
.
blank?
?
''
:
file
.
join
begin
page
.
times
do
|
i
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
i
+
1
}
-vi.html"
))
link_companies
=
page
.
search
(
'.figcaption .caption @href'
)
website_companies
+=
link_companies
.
map
(
&
:value
).
uniq
link_jobs
=
page
.
search
(
'.figcaption .title .job_link @href'
)
website_jobs
+=
link_jobs
.
map
(
&
:value
)
break
if
website_jobs
.
include?
(
@@stop_crawl
)
break
if
website_jobs
.
include?
(
stop_crawler
)
end
rescue
StandardError
=>
e
logger
.
error
"Crawler link on page have error
#{
e
}
"
end
website_companies
=
website_companies
.
select
{
|
val
|
val
.
present?
&&
val
!=
'javascript:void(0);'
}
website_companies
=
website_companies
.
select
(
&
:present?
)
website_jobs
=
website_jobs
.
select
(
&
:present?
)
puts
"Result:
\n
Company:
#{
website_companies
.
length
}
link
\n
Job :
#{
website_jobs
.
length
}
link
\n
--------------"
File
.
write
(
'tmp/link.txt'
,
website_jobs
[
0
])
File
.
write
(
path_to_first_link
,
website_jobs
[
0
])
data
<<
website_companies
<<
website_jobs
end
...
...
@@ -38,189 +51,158 @@ class Crawler
@link_job_and_companies
||=
crawl_link
(
NUMBER_LINK
)
end
def
self
.
safe_link
(
url
)
Nokogiri
::
HTML
(
URI
.
parse
(
URI
.
escape
(
url
)))
end
def
craw_data_cities
page
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
puts
"Crawling data location...
\n
.
\n
.
\n
."
data_list_cities
=
[]
data
=
page
.
search
(
'#location option'
)
list_cities
=
data
.
to_s
.
split
(
'</option>'
)
list_cities
.
each
do
|
x
|
data_list_cities
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
rstrip
end
puts
'Save data to database...'
data_list_cities
.
each_with_index
do
|
val
,
index
|
area
=
index
>
RANGE
?
INTERNATIONAL
:
DOMESTIC
City
.
find_or_create_by
(
name:
val
)
do
|
city
|
city
.
area
=
area
end
locations
=
page
.
search
(
'#location option'
).
map
(
&
:text
)
locations
.
each_with_index
do
|
val
,
index
|
area
=
index
>
City
.
areas
[
'range'
]
?
City
.
areas
[
'international'
]
:
City
.
areas
[
'domestic'
]
City
.
find_or_create_by
(
name:
val
)
{
|
city
|
city
.
area
=
area
}
end
end
def
craw_data_companies
puts
'Crawl data companies'
link_crawl
=
link_job_and_companies
link_crawl
[
0
].
each
do
|
url
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
url
))))
name
=
''
address
=
''
desc
=
''
company_name
=
page
.
search
(
'.company-info .info .content .name'
).
text
if
company_name
.
blank?
name
=
page
.
search
(
'.section-page #cp_company_name'
).
text
.
strip
address
=
page
.
search
(
'.section-page .cp_basic_info_details ul li:nth-child(1)'
).
text
desc
=
page
.
search
(
'.cp_aboutus_item .content_fck'
).
text
else
name
=
company_name
.
strip
address
=
page
.
search
(
'.company-info .info .content p:nth-child(3)'
).
text
desc
=
page
.
search
(
'.main-about-us .content'
).
text
end
begin
if
name
.
present?
&&
address
.
present?
&&
desc
.
present?
Company
.
find_or_create_by
(
name:
name
.
strip
)
do
|
company
|
company
.
address
=
address
company
.
short_description
=
desc
end
puts
name
end
rescue
StandardError
=>
e
puts
e
end
end
page
=
safe_link
(
url
)
company_name
=
page
.
search
(
'.company-info .content .name'
).
text
Company
.
find_or_create_by
(
name:
company_name
)
do
|
company
|
company
.
address
=
page
.
search
(
'.company-info .info .content p:nth-child(3)'
).
text
company
.
short_description
=
page
.
search
(
'.main-about-us .content'
).
text
end
def
make_data
puts
'Please wait for crawl jobs data! . . .'
link_crawl
=
link_job_and_companies
arr_link
=
[]
link_crawl
[
1
].
each
do
|
val
|
break
if
@@stop_crawl
==
val
arr_link
<<
val
end
arr_link
.
reverse!
.
each_with_index
do
|
path
,
i
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
path
))))
if
page
.
search
(
'.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p'
)[
0
].
present?
crawl_data_jobs_interface_1
(
page
)
elsif
page
.
search
(
'section .template-200'
).
text
.
present?
crawl_data_jobs_interface_2
(
page
)
elsif
page
.
search
(
'.DetailJobNew ul li'
).
size
==
SIZE_LI_INTERFACE_5
&&
!
page
.
search
(
'.right-col ul li'
).
text
.
include?
(
'Độ tuổi'
)
crawl_data_jobs_interface_5
(
page
)
end
puts
"
#{
i
}
-
#{
path
}
"
end
puts
'Crawler data jobs success!'
end
private
def
add_data
(
data
)
id_company
=
Company
.
find_by
name:
data
[
:company_name
]
id_company
=
id_company
.
present?
?
id_company
.
id
:
COMPANY_SECURITY
id_job
=
Job
.
create!
(
name:
data
[
:name
],
company_id:
id_company
,
level:
data
[
:level
],
experience:
data
[
:exprience
],
salary:
data
[
:salary
],
create_date:
data
[
:created_date
],
expiration_date:
data
[
:expiration_date
],
description:
data
[
:description
])
make_foreign_industries_table
(
data
[
:industry_name
],
id_job
.
id
)
make_foreign_cities_table
(
data
[
:city_name
],
id_job
.
id
)
rescue
StandardError
=>
e
puts
e
end
def
crawl_data_jobs_interface_1
(
page
)
data
=
{}
data
[
:name
]
=
page
.
search
(
'.apply-now-content .job-desc .title'
).
text
data
[
:company_name
]
=
page
.
search
(
'.apply-now-content .job-desc .job-company-name'
).
text
location
=
[]
length
=
page
.
search
(
'.detail-box .map p a'
).
size
length
.
times
do
|
n
|
location
<<
page
.
search
(
".detail-box .map p a:nth-child(
#{
n
+
1
}
)"
).
text
end
data
[
:city_name
]
=
location
.
join
(
','
)
data
[
:created_date
]
=
page
.
search
(
'.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p'
)[
0
].
text
data
[
:expiration_date
]
=
page
.
search
(
'.item-blue .detail-box ul li:last'
)[
1
].
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
last
data
[
:salary
]
=
page
.
search
(
'.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p'
)[
1
].
text
industries
=
page
.
search
(
'.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a'
).
text
industries
=
industries
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
(
&
:present?
)
data
[
:industry_name
]
=
industries
.
join
(
','
)
data
[
:description
]
=
page
.
search
(
'.tabs .tab-content .detail-row:nth-child(n)'
).
to_s
get_level
=
page
.
search
(
'.item-blue .detail-box:last ul li:nth-child(3)'
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
get_level
=
get_level
[
1
].
to_s
.
strip
if
get_level
.
blank?
g_level
=
page
.
search
(
'.item-blue .detail-box:last ul li:nth-child(2)'
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
data
[
:level
]
=
g_level
[
1
].
to_s
.
strip
else
data
[
:level
]
=
get_level
end
exp
=
page
.
search
(
'.item-blue .detail-box:last ul li:nth-child(2)'
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
)
exp
=
exp
[
1
].
to_s
.
strip
data
[
:exprience
]
=
exp
add_data
(
data
)
end
def
crawl_data_jobs_interface_2
(
page
)
data
=
{}
data
[
:name
]
=
page
.
search
(
'.apply-now-content .job-desc .title'
).
text
data
[
:company_name
]
=
page
.
search
(
'.top-job .top-job-info .tit_company'
).
text
locations
=
[]
length
=
page
.
search
(
'.info-workplace .value a'
).
size
length
.
times
do
|
n
|
locations
<<
page
.
search
(
".info-workplace .value a:nth-child(
#{
n
+
1
}
)"
).
text
end
data
[
:city_name
]
=
locations
.
join
(
','
)
data
[
:created_date
]
=
''
expiration_date
=
page
.
search
(
'.info li:nth-child(4)'
).
text
data
[
:expiration_date
]
=
expiration_date
.
blank?
?
''
:
expiration_date
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
last
data
[
:salary
]
=
page
.
search
(
'.info li:nth-child(3)'
).
text
.
split
(
'Lương'
).
last
.
strip
data
[
:industry_name
]
=
page
.
search
(
'.info li:nth-child(5) .value'
).
text
data
[
:description
]
=
page
.
search
(
'.left-col'
).
to_s
lv
=
page
.
search
(
'.boxtp .info li:nth-child(2)'
).
text
data
[
:level
]
=
lv
.
blank?
?
''
:
lv
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
strip
.
split
(
'Cấp bậc'
).
last
.
strip
exp
=
page
.
search
(
'.info li:nth-child(6)'
).
text
data
[
:exprience
]
=
exp
.
blank?
?
''
:
exp
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
).
last
.
strip
add_data
(
data
)
end
def
crawl_data_jobs_interface_5
(
page
)
data
=
{}
data
[
:name
]
=
page
.
search
(
'.info-company h1'
).
text
data
[
:company_name
]
=
page
.
search
(
'.info-company .text-job h2'
).
text
data
[
:city_name
]
=
page
.
search
(
'.DetailJobNew ul li:nth-child(1) a'
).
text
data
[
:created_date
]
=
''
data
[
:expiration_date
]
=
page
.
search
(
'.DetailJobNew li:nth-child(9) span'
).
text
.
strip
data
[
:salary
]
=
page
.
search
(
'.DetailJobNew li:nth-child(3) span'
).
text
.
strip
data
[
:industry_name
]
=
page
.
search
(
'.DetailJobNew li:nth-child(2) span'
).
text
.
strip
data
[
:description
]
=
page
.
search
(
'.left-col .detail-row'
)
data
[
:level
]
=
page
.
search
(
'.DetailJobNew ul li:nth-child(6) span'
).
text
.
strip
data
[
:exprience
]
=
page
.
search
(
'.DetailJobNew li:nth-child(5) span'
).
text
.
strip
add_data
(
data
)
end
def
make_foreign_industries_table
(
data
,
id_job
)
unless
data
.
blank?
&&
id_job
.
blank?
content
=
data
.
split
(
','
)
content
.
each
do
|
val
|
val
.
gsub!
(
'&'
,
'&'
)
if
val
.
include?
(
'&'
)
data_industry
=
Industry
.
find_by
name:
val
.
strip
id_industry
=
data_industry
.
blank?
?
Industry
.
create!
(
name:
val
.
strip
).
id
:
data_industry
.
id
IndustryJob
.
create!
(
industry_id:
id_industry
,
job_id:
id_job
)
end
end
end
def
make_foreign_cities_table
(
data
,
id_job
)
return
if
data
.
blank?
&&
id_job
.
blank?
cities
=
data
.
split
(
','
)
cities
.
each
do
|
city
|
data_city
=
City
.
find_by
name:
city
.
strip
id_cities
=
data_city
.
blank?
?
City
.
create!
(
name:
city
.
strip
,
area:
DOMESTIC
).
id
:
data_city
.
id
CityJob
.
create!
(
job_id:
id_job
,
city_id:
id_cities
)
end
logger
.
error
"Crawler data companies has error:
#{
e
}
"
end
end
# def make_data
# puts 'Please wait for crawl jobs data! . . .'
# link_crawl = link_job_and_companies
# arr_link = []
# link_crawl[1].each do |val|
# break if stop_crawler == val
# arr_link << val
# end
# arr_link.reverse!.each_with_index do |path, i|
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape(path))))
# if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
# crawl_data_jobs_interface_1(page)
# elsif page.search('section .template-200').text.present?
# crawl_data_jobs_interface_2(page)
# elsif page.search('.DetailJobNew ul li').size == SIZE_LI_INTERFACE_5 && !page.search('.right-col ul li').text.include?('Độ tuổi')
# crawl_data_jobs_interface_5(page)
# end
# puts "#{i} - #{path}"
# end
# puts 'Crawler data jobs success!'
# end
# private
# def add_data(data)
# id_company = Company.find_by name: data[:company_name]
# id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
# id_job = Job.create!(name: data[:name],
# company_id: id_company,
# level: data[:level],
# experience: data[:exprience],
# salary: data[:salary],
# create_date: data[:created_date],
# expiration_date: data[:expiration_date],
# description: data[:description])
# make_foreign_industries_table(data[:industry_name], id_job.id)
# make_foreign_cities_table(data[:city_name], id_job.id)
# rescue StandardError => e
# puts e
# end
# def crawl_data_jobs_interface_1(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.apply-now-content .job-desc .job-company-name').text
# location = []
# length = page.search('.detail-box .map p a').size
# length.times do |n|
# location << page.search(".detail-box .map p a:nth-child(#{n + 1})").text
# end
# data[:city_name] = location.join(',')
# data[:created_date] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].text
# data[:expiration_date] = page.search('.item-blue .detail-box ul li:last')[1].text.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[1].text
# industries = page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(2) a').text
# industries = industries.delete!("[\n,\t,\r]").split(' ').select(&:present?)
# data[:industry_name] = industries.join(',')
# data[:description] = page.search('.tabs .tab-content .detail-row:nth-child(n)').to_s
# get_level = page.search('.item-blue .detail-box:last ul li:nth-child(3)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# get_level = get_level[1].to_s.strip
# if get_level.blank?
# g_level = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").lstrip.split('Cấp bậc')
# data[:level] = g_level[1].to_s.strip
# else
# data[:level] = get_level
# end
# exp = page.search('.item-blue .detail-box:last ul li:nth-child(2)').text.delete!("[\n,\t,\r]").split('Kinh nghiệm')
# exp = exp[1].to_s.strip
# data[:exprience] = exp
# add_data(data)
# end
# def crawl_data_jobs_interface_2(page)
# data = {}
# data[:name] = page.search('.apply-now-content .job-desc .title').text
# data[:company_name] = page.search('.top-job .top-job-info .tit_company').text
# locations = []
# length = page.search('.info-workplace .value a').size
# length.times do |n|
# locations << page.search(".info-workplace .value a:nth-child(#{n + 1})").text
# end
# data[:city_name] = locations.join(',')
# data[:created_date] = ''
# expiration_date = page.search('.info li:nth-child(4)').text
# data[:expiration_date] = expiration_date.blank? ? '' : expiration_date.delete!("[\n,\t,\r]").split(' ').last
# data[:salary] = page.search('.info li:nth-child(3)').text.split('Lương').last.strip
# data[:industry_name] = page.search('.info li:nth-child(5) .value').text
# data[:description] = page.search('.left-col').to_s
# lv = page.search('.boxtp .info li:nth-child(2)').text
# data[:level] = lv.blank? ? '' : lv.delete!("[\n,\t,\r]").strip.split('Cấp bậc').last.strip
# exp = page.search('.info li:nth-child(6)').text
# data[:exprience] = exp.blank? ? '' : exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# add_data(data)
# end
# def crawl_data_jobs_interface_5(page)
# data = {}
# data[:name] = page.search('.info-company h1').text
# data[:company_name] = page.search('.info-company .text-job h2').text
# data[:city_name] = page.search('.DetailJobNew ul li:nth-child(1) a').text
# data[:created_date] = ''
# data[:expiration_date] = page.search('.DetailJobNew li:nth-child(9) span').text.strip
# data[:salary] = page.search('.DetailJobNew li:nth-child(3) span').text.strip
# data[:industry_name] = page.search('.DetailJobNew li:nth-child(2) span').text.strip
# data[:description] = page.search('.left-col .detail-row')
# data[:level] = page.search('.DetailJobNew ul li:nth-child(6) span').text.strip
# data[:exprience] = page.search('.DetailJobNew li:nth-child(5) span').text.strip
# add_data(data)
# end
# def make_foreign_industries_table(data, id_job)
# unless data.blank? && id_job.blank?
# content = data.split(',')
# content.each do |val|
# val.gsub!('&', '&') if val.include?('&')
# data_industry = Industry.find_by name: val.strip
# id_industry = data_industry.blank? ? Industry.create!(name: val.strip).id : data_industry.id
# IndustryJob.create!(industry_id: id_industry, job_id: id_job)
# end
# end
# end
# def make_foreign_cities_table(data, id_job)
# return if data.blank? && id_job.blank?
# cities = data.split(',')
# cities.each do |city|
# data_city = City.find_by name: city.strip
# id_cities = data_city.blank? ? City.create!(name: city.strip, area: DOMESTIC).id : data_city.id
# CityJob.create!(job_id: id_job, city_id: id_cities)
# end
# end
lib/tasks/crawler.rake
View file @
0ac0989b
...
...
@@ -13,6 +13,5 @@ namespace :crawler do
cw
=
Crawler
.
new
cw
.
craw_data_cities
cw
.
craw_data_companies
cw
.
make_data
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment