Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
Venjob_HungNT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ngô Trung Hưng
Venjob_HungNT
Commits
a1f70d89
Commit
a1f70d89
authored
Jul 16, 2020
by
Ngo Trung Hung
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
finished crawler 50% jobs
parent
c797139b
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
303 additions
and
180 deletions
+303
-180
app/controllers/application_controller.rb
+1
-1
app/controllers/home_controller.rb
+4
-1
app/helpers/crawler_helper.rb
+176
-56
config/database.yml
+1
-1
lib/helper/crawler.rb
+117
-0
lib/tasks/crawler.rake
+4
-121
No files found.
app/controllers/application_controller.rb
View file @
a1f70d89
class
ApplicationController
<
ActionController
::
Base
require
'nokogiri'
require
'nokogiri'
require
'open-uri'
include
CrawlerHelper
...
...
app/controllers/home_controller.rb
View file @
a1f70d89
...
...
@@ -3,6 +3,9 @@ class HomeController < ApplicationController
def
index
# crawl_data_jobs_interface_1()
# crawl_data_jobs_interface_2()
crawl_data_jobs_interface_3
()
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
make_data
end
end
app/helpers/crawler_helper.rb
View file @
a1f70d89
require
'open-uri'
module
CrawlerHelper
def
crawl_industries_data
data_list_industries
=
[]
page
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
...
...
@@ -86,9 +87,96 @@ module CrawlerHelper
# render plain: @data_companies
end
# Downloads and parses a page of careerbuilder.vn.
#
# @param url [String] path relative to https://careerbuilder.vn/ (it may
#   contain non-ASCII characters; they are percent-escaped before the request)
# @return [Nokogiri::HTML::Document] the parsed page
# @raise [OpenURI::HTTPError, SocketError] when the request fails
def base_link(url)
  # URI.escape is obsolete and was removed in Ruby 3.0; the default parser's
  # #escape applies the same escaping and exists on older Rubies as well.
  escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{url}")
  Nokogiri::HTML(URI.open(URI.parse(escaped)))
end
# Crawler job
def
crawl_data_jobs_interface_1
# Scrapes one job-detail page that uses the ".apply-now-content" / ".item-blue"
# layout and appends each extracted field to the matching accumulator
# (@name, @company_name, @city_name, @created_date, @expiration_date, @salary,
# @industry_name, @description, @level, @exprience). Every accumulator is also
# (re)registered in @data under its symbol key.
#
# The accumulators and @data must already be initialised by the caller
# (make_data does this before iterating over the job links).
#
# Example of a page with this layout:
#   vi/tim-viec-lam/ky-su-dau-thau-mep.35B45617.html
#
# @param url [String] job path relative to https://careerbuilder.vn/
# @return [Array<String>] the @exprience accumulator (value of the last assignment)
def crawl_data_jobs_interface_1(url)
  page = base_link(url)

  # Characters stripped from scraped text. Non-bang String#delete is used on
  # purpose: delete! returns nil when nothing was removed, which used to break
  # the method chains below with NoMethodError.
  junk = "[\n,\t,\r]"

  @name << page.search(".apply-now-content .job-desc .title").text
  @data[:name] = @name

  @company_name << page.search(".apply-now-content .job-desc .job-company-name").text
  @data[:company_name] = @company_name

  @city_name << page.search(".detail-box .map p a").text
  @data[:city_name] = @city_name

  # The first summary list item holds two <p> nodes: posting date and salary.
  summary = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")

  # &. keeps a missing node from aborting the whole crawl; "" is stored instead
  # so all accumulators stay index-aligned.
  @created_date << summary[0]&.text.to_s
  @data[:created_date] = @created_date

  # Only the last whitespace-separated token (the date itself) is kept.
  @expiration_date << page.search(".item-blue .detail-box ul li:last")[1]&.text.to_s.delete(junk).split(' ').last
  @data[:expiration_date] = @expiration_date

  @salary << summary[1]&.text.to_s
  @data[:salary] = @salary

  industries = page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a").text
  @industry_name << industries.delete(junk).split(' ').reject { |v| v == '' }.join(',')
  @data[:industry_name] = @industry_name

  @description << page.search(".tabs .tab-content .detail-row:nth-child(n)").to_s.delete(junk)
  @data[:description] = @description

  # The level ("Cấp bậc") is read from the 3rd item of the last detail box;
  # when that yields nothing, the 2nd item is tried instead.
  level_from = lambda do |selector|
    page.search(selector).text.delete(junk).lstrip.split('Cấp bậc')[1].to_s.strip
  end
  level = level_from.call(".item-blue .detail-box:last ul li:nth-child(3)")
  level = level_from.call(".item-blue .detail-box:last ul li:nth-child(2)") if level == ""
  @level << level
  @data[:level] = @level

  exprience = page.search(".item-blue .detail-box:last ul li:nth-child(2)").text.delete(junk).split('Kinh nghiệm')
  @exprience << exprience[1].to_s.strip
  @data[:exprience] = @exprience
end
# Scrapes one job-detail page that uses the ".top-job" / ".info" layout and
# appends each extracted field to the matching accumulator (@name,
# @company_name, @city_name, @created_date, @expiration_date, @salary,
# @industry_name, @description, @level, @exprience). Every accumulator is also
# (re)registered in @data under its symbol key.
#
# The accumulators and @data must already be initialised by the caller
# (make_data does this before iterating over the job links).
#
# Example of a page with this layout:
#   vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-binh-tp-dong-hoi.35B4572F.html
#
# @param url [String] job path relative to https://careerbuilder.vn/
# @return [Array<String>] the @exprience accumulator (value of the last assignment)
def crawl_data_jobs_interface_2(url)
  page = base_link(url)

  # Characters stripped from scraped text. Non-bang String#delete is used on
  # purpose: delete! returns nil when nothing was removed, which used to break
  # the method chains below with NoMethodError.
  junk = "[\n,\t,\r]"

  @name << page.search(".apply-now-content .job-desc .title").text
  @data[:name] = @name

  @company_name << page.search(".top-job .top-job-info .tit_company").text
  @data[:company_name] = @company_name

  @city_name << page.search(".info-workplace .value a").text
  @data[:city_name] = @city_name

  # No posting date is extracted for this layout; an empty placeholder keeps
  # the accumulators index-aligned.
  @created_date << ""
  @data[:created_date] = @created_date

  # Only the last whitespace-separated token (the date itself) is kept.
  @expiration_date << page.search(".info li:nth-child(4)").text.to_s.delete(junk).split(' ').last
  @data[:expiration_date] = @expiration_date

  # .to_s guards against an empty list item, where split(...).last is nil.
  @salary << page.search(".info li:nth-child(3)").text.split("Lương").last.to_s.strip
  @data[:salary] = @salary

  @industry_name << page.search(".info li:nth-child(5) .value").text
  @data[:industry_name] = @industry_name

  @description << page.search(".left-col").to_s.delete(junk)
  @data[:description] = @description

  @level << page.search(".boxtp .info li:nth-child(2)").text.delete(junk).lstrip.split('Cấp bậc').last.to_s.strip
  @data[:level] = @level

  @exprience << page.search(".info li:nth-child(6)").text.delete(junk).split('Kinh nghiệm').last.to_s.strip
  @data[:exprience] = @exprience
end
def
crawl_data_jobs_interface_3
(
url
)
link_crawl
=
crawl_link_for_companies_jobs
()
@data
=
{}
@name
=
[]
...
...
@@ -101,55 +189,43 @@ module CrawlerHelper
@description
=
[]
@industry_name
=
[]
@city_name
=
[]
# link_crawl[1].each do |url|
# page = Nokogiri::HTML(URI.open(URI.parse(URI.escape("https://careerbuilder.vn/#{url}"))))
# end
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/
ky-su-dau-thau-mep.35B45617
.html"
))))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/
tuyen-tai-xe-van-phong-cho-sep-han-quoc-tu-binh-thanh.35B45A41
.html"
))))
#interface1
name
=
page
.
search
(
".
apply-now-content .job-desc .title
"
).
text
name
=
page
.
search
(
".
info-company h1
"
).
text
@data
[
:name
]
=
name
company_name
=
page
.
search
(
".
apply-now-content .job-desc .job-company-name
"
).
text
company_name
=
page
.
search
(
".
zone-company .text-job h2
"
).
text
@data
[
:company_name
]
=
company_name
city_name
=
page
.
search
(
".
detail-box .map p
a"
).
text
city_name
=
page
.
search
(
".
DetailJobNew ul li:nth-child(1)
a"
).
text
@data
[
:city_name
]
=
city_name
created_date
=
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
0
].
text
@data
[
:created_date
]
=
created_date
@data
[
:created_date
]
=
""
expiratio
m_date
=
page
.
search
(
".item-blue .detail-box ul li:last"
)[
1
].
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
last
@data
[
:expiratio
m_date
]
=
expiratiom
_date
expiratio
n_date
=
page
.
search
(
".DetailJobNew li:nth-child(7) span"
).
text
@data
[
:expiratio
n_date
]
=
expiration
_date
salary
=
page
.
search
(
".
item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
1
]
.
text
salary
=
page
.
search
(
".
DetailJobNew li:nth-child(6) span"
)
.
text
@data
[
:salary
]
=
salary
industry_name
=
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a"
).
text
industry_name
=
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
@data
[
:industry_name
]
=
industry_name
.
join
(
','
)
industry_name
=
page
.
search
(
".DetailJobNew li:nth-child(3) span a"
).
text
@data
[
:industry_name
]
=
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
description
=
page
.
search
(
".
tabs .tab-content .detail-row:nth-child(n)
"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
description
=
page
.
search
(
".
left-col
"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@data
[
:description
]
=
description
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(3)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
level
=
level
[
1
].
to_s
.
strip
if
level
==
""
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
level
=
level
[
1
].
to_s
.
strip
end
level
=
page
.
search
(
".DetailJobNew ul li:nth-child(2) span"
).
text
@data
[
:level
]
=
level
exprience
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
)
exprience
=
exprience
[
1
].
to_s
.
strip
@data
[
:exprience
]
=
exprience
@data
[
:exprience
]
=
""
render
plain:
"
#{
@data
}
"
end
def
crawl_data_jobs_interface_
2
link_crawl
=
crawl_link_for_companies_jobs
()
def
crawl_data_jobs_interface_
4
(
url
)
#
link_crawl = crawl_link_for_companies_jobs()
@data
=
{}
@name
=
[]
@company_name
=
[]
...
...
@@ -162,43 +238,42 @@ module CrawlerHelper
@industry_name
=
[]
@city_name
=
[]
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-
binh-tp-dong-hoi.35B4572F
.html"
))))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/dai-dien-tieu-thu-sales-representative-quang-
nam-phuoc-son-hiep-duc-thang-binh.35B4572D
.html"
))))
#interface1
name
=
page
.
search
(
".
apply-now-content .job-desc .title
"
).
text
name
=
page
.
search
(
".
info-company h1
"
).
text
@data
[
:name
]
=
name
company_name
=
page
.
search
(
".
top-job .top-job-info .tit_company
"
).
text
company_name
=
page
.
search
(
".
info-company .text-job h2
"
).
text
@data
[
:company_name
]
=
company_name
city_name
=
page
.
search
(
".
info-workplace .value
a"
).
text
city_name
=
page
.
search
(
".
DetailJobNew ul li:nth-child(1)
a"
).
text
@data
[
:city_name
]
=
city_name
@data
[
:created_date
]
=
""
expiration_date
=
page
.
search
(
".
info li:nth-child(4)"
).
text
@data
[
:expiration_date
]
=
expiration_date
.
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
last
expiration_date
=
page
.
search
(
".
DetailJobNew li:nth-child(7) span"
).
text
@data
[
:expiration_date
]
=
expiration_date
salary
=
page
.
search
(
".
info li:nth-child(3)"
).
text
.
split
(
"Lương"
)
@data
[
:salary
]
=
salary
.
last
.
strip
salary
=
page
.
search
(
".
DetailJobNew li:nth-child(6) span"
).
text
@data
[
:salary
]
=
salary
industry_name
=
page
.
search
(
".
info li:nth-child(5) .value"
).
text
industry_name
=
page
.
search
(
".
DetailJobNew li:nth-child(3) span"
).
text
.
strip
@data
[
:industry_name
]
=
industry_name
description
=
page
.
search
(
".left-col"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@data
[
:description
]
=
description
level
=
page
.
search
(
".boxtp .info li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
).
last
.
strip
level
=
page
.
search
(
".DetailJobNew ul li:nth-child(2) span"
).
text
@data
[
:level
]
=
level
exprience
=
page
.
search
(
".info li:nth-child(6)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
).
last
.
strip
@data
[
:exprience
]
=
exprience
@data
[
:exprience
]
=
""
render
plain:
"
#{
@data
}
"
end
def
crawl_data_jobs_interface_
3
link_crawl
=
crawl_link_for_companies_jobs
()
def
crawl_data_jobs_interface_
5
(
url
)
#
link_crawl = crawl_link_for_companies_jobs()
@data
=
{}
@name
=
[]
@company_name
=
[]
...
...
@@ -211,12 +286,12 @@ module CrawlerHelper
@industry_name
=
[]
@city_name
=
[]
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/
tuyen-tai-xe-van-phong-cho-sep-han-quoc-tu-binh-thanh.35B45A41
.html"
))))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/vi/tim-viec-lam/
program-management-executive.35B428B5
.html"
))))
#interface1
name
=
page
.
search
(
".info-company h1"
).
text
@data
[
:name
]
=
name
company_name
=
page
.
search
(
".
zone
-company .text-job h2"
).
text
company_name
=
page
.
search
(
".
info
-company .text-job h2"
).
text
@data
[
:company_name
]
=
company_name
city_name
=
page
.
search
(
".DetailJobNew ul li:nth-child(1) a"
).
text
...
...
@@ -225,23 +300,67 @@ module CrawlerHelper
@data
[
:created_date
]
=
""
expiration_date
=
page
.
search
(
".DetailJobNew li:nth-child(
7) span"
).
text
expiration_date
=
page
.
search
(
".DetailJobNew li:nth-child(
9) span"
).
text
.
strip
@data
[
:expiration_date
]
=
expiration_date
salary
=
page
.
search
(
".DetailJobNew li:nth-child(
6) span"
).
text
salary
=
page
.
search
(
".DetailJobNew li:nth-child(
3) span"
).
text
.
strip
@data
[
:salary
]
=
salary
industry_name
=
page
.
search
(
".DetailJobNew li:nth-child(
3) span a"
).
text
@data
[
:industry_name
]
=
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
industry_name
=
page
.
search
(
".DetailJobNew li:nth-child(
2) span"
).
text
.
strip
@data
[
:industry_name
]
=
industry_name
description
=
page
.
search
(
".left-col"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
description
=
page
.
search
(
".left-col
.detail-row
"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@data
[
:description
]
=
description
level
=
page
.
search
(
".DetailJobNew ul li:nth-child(
2) span"
).
text
level
=
page
.
search
(
".DetailJobNew ul li:nth-child(
6) span"
).
text
.
strip
@data
[
:level
]
=
level
@data
[
:exprience
]
=
""
exprience
=
page
.
search
(
".DetailJobNew li:nth-child(5) span"
).
text
.
strip
@data
[
:exprience
]
=
exprience
render
plain:
"
#{
@data
}
"
end
# Crawls every job link returned by crawl_link_for_companies_jobs, dispatches
# each page to the scraper that matches its layout, and renders a plain-text
# summary: the whole @data hash followed by the first collected value of each
# field.
#
# Resets @data and all per-field accumulators before crawling.
def make_data
  @data = {}
  @name = []
  @company_name = []
  @level = []
  @exprience = []
  @salary = []
  @created_date = []
  @expiration_date = []
  @description = []
  @industry_name = []
  @city_name = []

  link_crawl = crawl_link_for_companies_jobs
  # link_crawl[1] holds the job paths (index 0 holds the company paths).
  link_crawl[1].each do |path|
    # The page is fetched here only to detect its layout; the interface
    # methods download it again themselves.
    page = base_link(path)

    if page.search(".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p")[0]
      crawl_data_jobs_interface_1(path)
    elsif page.search(".DetailJobNew li:nth-child(5) span").to_s.empty?
      crawl_data_jobs_interface_2(path)
    end
    # Pages with a non-empty ".DetailJobNew li:nth-child(5) span" are skipped
    # for now (crawl_data_jobs_interface_5 is not wired in yet).
  end

  # Hash#dig returns nil (rendered as "") when a field was never collected,
  # instead of raising NoMethodError on nil as @data[:name][0] did.
  fields = %i[
    name company_name level industry_name exprience
    expiration_date created_date city_name description salary
  ]
  first_values = fields.map { |key| @data.dig(key, 0) }
  render plain: [@data, *first_values].map(&:to_s).join(' => ')

  # Sample pages used while developing:
  #   vi/tim-viec-lam/truong-tram-y-te-cong-ty.35B44FDF.html
  #   vi/tim-viec-lam/hr-admin-executive.35B45B43.html
end
end
\ No newline at end of file
config/database.yml
View file @
a1f70d89
...
...
@@ -14,7 +14,7 @@ default: &default
encoding
:
utf8
pool
:
<%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username
:
root
password
:
'
1'
password
:
'
1
2345678
'
socket
:
/var/run/mysqld/mysqld.sock
...
...
lib/helper/crawler.rb
0 → 100644
View file @
a1f70d89
require
'open-uri'
# Scrapes careerbuilder.vn and seeds the City, Industry and Company tables.
#
# All entry points are class methods; the listing page used for the city and
# industry drop-downs is kept in the class-level instance variable @page.
class Clawler
  # NOTE(review): this request runs as soon as the class body is evaluated
  # (i.e. when the file is required), not when a method is called.
  @page = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))

  # FILL DATA CITIES
  #
  # Creates one City per <option> of the "#location" select. The first 70
  # options (index 0..69) get area 1, all following ones area 0.
  #
  # @return [Integer] number of options processed
  # @raise [ActiveRecord::RecordInvalid] presumably, when create! fails validation
  def self.make_cities
    @data_list_cities = option_labels("#location option")
    @data_list_cities.each_with_index do |name, i|
      City.create!(name: name, area: i <= 69 ? 1 : 0)
    end
    @data_list_cities.length
  end

  # FILL DATA INDUSTRIES
  #
  # Creates one Industry per <option> of the "#industry" select.
  #
  # @return [Integer] number of options processed
  def self.make_industries
    @data_list_industries = option_labels("#industry option")
    @data_list_industries.each { |name| Industry.create!(name: name) }
    @data_list_industries.length
  end

  # CRAWLER LINK JOB & COMPANIES
  #
  # Collects company and job paths (relative to https://careerbuilder.vn/)
  # from the paginated job listing.
  #
  # @param num_page_will_crawl [Integer] number of listing pages to read,
  #   starting at page 1
  # @return [Array(Array<String>, Array<String>)] [company_paths, job_paths];
  #   company paths are de-duplicated, job paths are not
  def self.crawl_link_for_companies_jobs(num_page_will_crawl = 1)
    website_companies = []
    website_jobs = []

    num_page_will_crawl.times do |i|
      page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
      # .text concatenates all hrefs without a separator; every href starts
      # with the site root, so splitting on it recovers the individual paths.
      website_companies.concat(page.search(".figcaption .caption a/@href").text.to_s.split('https://careerbuilder.vn/'))
      website_jobs.concat(page.search(".figcaption .title .job_link @href").text.to_s.split('https://careerbuilder.vn/'))
    end

    # Non-bang uniq: uniq! returns nil when there are no duplicates, which
    # made the following filter raise NoMethodError.
    website_companies = website_companies.uniq.reject { |val| val == '' }
    website_jobs = website_jobs.reject { |val| val == '' }

    [website_companies, website_jobs]
  end

  # CRAWLER DATA COMPANIES
  #
  # Visits every company page and extracts name, address and description,
  # trying the ".section-page" selectors when the ".company-info" name is
  # empty. Companies with any empty field are skipped.
  #
  # @return [Hash{Symbol => Array<String>}] parallel arrays under :name,
  #   :address and :description
  def self.craw_data_companies
    link_crawl = crawl_link_for_companies_jobs
    @data_companies = {}
    @data_companies_name = []
    @data_companies_address = []
    @data_companies_description = []

    link_crawl[0].each do |url|
      page = fetch_document(url)

      if page.search(".company-info .info .content .name").text == ""
        name = page.search(".section-page #cp_company_name").text
        address = page.search(".section-page .cp_basic_info_details ul li:nth-child(1)").text
        desc = page.search(".cp_aboutus_item .content_fck").text
      else
        name = page.search(".company-info .info .content .name").text
        address = page.search(".company-info .info .content p:nth-child(3)").text
        desc = page.search(".main-about-us .content").text
      end

      next if name == "" || address == "" || desc == ""

      @data_companies_name << name.to_s.rstrip
      @data_companies_address << address.to_s.rstrip
      # Layout characters are removed before trimming, as before.
      @data_companies_description << desc.delete("[\n,\t,\r]").strip
    end

    @data_companies[:name] = @data_companies_name
    @data_companies[:address] = @data_companies_address
    @data_companies[:description] = @data_companies_description
    @data_companies
  end

  # FILL DATA COMPANIES
  #
  # Creates one Company per entry returned by craw_data_companies.
  #
  # @return [Integer] number of companies processed
  def self.make_companies
    @data = craw_data_companies
    names = @data[:name]
    names.zip(@data[:address], @data[:description]).each do |name, address, short_description|
      Company.create!(name: name, address: address, short_description: short_description)
    end
    names.length
  end

  # Returns the visible label of every <option> matched by +selector+ on @page:
  # the markup is split on "</option>", then the leading tag and newlines are removed.
  def self.option_labels(selector)
    @page.search(selector).to_s.split("</option>").map do |fragment|
      fragment.gsub(/(^<[\w\D]*>)/, '').gsub(/\n/, '').rstrip
    end
  end
  private_class_method :option_labels

  # Downloads and parses a site-relative path. URI.escape is obsolete and was
  # removed in Ruby 3.0; the default parser's #escape does the same escaping.
  def self.fetch_document(path)
    escaped = URI::DEFAULT_PARSER.escape("https://careerbuilder.vn/#{path}")
    Nokogiri::HTML(URI.open(URI.parse(escaped)))
  end
  private_class_method :fetch_document
end
\ No newline at end of file
lib/tasks/crawler.rake
View file @
a1f70d89
require
'
open-uri
'
require
'
helper/crawler
'
namespace
:db
do
task
populate: :environment
do
make_cities
make_industries
make_companies
end
# CRAWLER ALL CITIES
$page
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
p1
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/vi/tim-viec-lam/vinhomes-chuyen-vien-thu-tuc-bat-dong-san.35B449B5.html'
))
def
make_cities
@data_list_cities
=
[]
data
=
$page
.
search
(
"#location option"
)
list_cities
=
data
.
to_s
.
split
(
"</option>"
)
list_cities
.
each
do
|
x
|
@data_list_cities
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
rstrip
end
@data_list_cities
.
length
.
times
do
|
i
|
if
i
<=
69
name
=
(
@data_list_cities
[
i
].
to_s
)
City
.
create!
(
name:
name
,
area:
1
)
elsif
i
>
69
name
=
(
@data_list_cities
[
i
].
to_s
)
City
.
create!
(
name:
name
,
area:
0
)
end
end
end
# CRAWLER ALL INDUSTRIES
def
make_industries
@data_list_industries
=
[]
data
=
$page
.
search
(
"#industry option"
)
list_industries
=
data
.
to_s
.
split
(
"</option>"
)
list_industries
.
each
do
|
x
|
@data_list_industries
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
rstrip
end
@data_list_industries
.
length
.
times
do
|
i
|
name
=
(
@data_list_industries
[
i
].
to_s
)
Industry
.
create!
(
name:
name
)
end
end
# CRAWLER LINK JOB & COMPANIES
def
crawl_link_for_companies_jobs
data
=
[]
website_companies
=
[]
website_jobs
=
[]
num_page_will_crawl
=
3
num_page_will_crawl
.
times
do
|
i
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
i
+
1
}
-vi.html"
))
website_companies
<<
page
.
search
(
".figcaption .caption a/@href"
).
text
.
to_s
.
split
(
'https://careerbuilder.vn/'
)
website_jobs
<<
page
.
search
(
".figcaption .title .job_link @href"
).
text
.
to_s
.
split
(
'https://careerbuilder.vn/'
)
end
website_companies
=
website_companies
.
join
(
","
)
website_companies
=
website_companies
.
split
(
","
).
uniq!
website_companies
=
website_companies
.
select
{
|
val
|
val
!=
''
}
website_jobs
=
website_jobs
.
join
(
","
)
website_jobs
=
website_jobs
.
split
(
","
)
website_jobs
=
website_jobs
.
select
{
|
val
|
val
!=
''
}
data
<<
website_companies
<<
website_jobs
end
# CRAWLER COMPANIES
def
make_companies
def
craw_data_companies
link_crawl
=
crawl_link_for_companies_jobs
()
@data_companies
=
{}
@data_companies_name
=
[]
@data_companies_address
=
[]
@data_companies_description
=
[]
link_crawl
[
0
].
each
do
|
url
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/
#{
url
}
"
))))
name
=
''
address
=
''
desc
=
''
if
page
.
search
(
".company-info .info .content .name"
).
text
==
""
name
=
page
.
search
(
".section-page #cp_company_name"
).
text
address
=
page
.
search
(
".section-page .cp_basic_info_details ul li:nth-child(1)"
).
text
desc
=
page
.
search
(
".cp_aboutus_item .content_fck"
).
text
else
name
=
page
.
search
(
".company-info .info .content .name"
).
text
address
=
page
.
search
(
".company-info .info .content p:nth-child(3)"
).
text
desc
=
page
.
search
(
".main-about-us .content"
).
text
end
if
(
name
!=
""
&&
address
!=
""
&&
desc
!=
""
)
@data_companies_name
<<
name
.
to_s
.
rstrip
@data_companies_address
<<
address
.
to_s
.
rstrip
@data_companies_description
<<
desc
end
end
@data_companies
[
:name
]
=
@data_companies_name
@data_companies
[
:address
]
=
@data_companies_address
@data_companies_description
.
each
do
|
val
|
val
.
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
val
.
strip!
end
@data_companies
[
:description
]
=
@data_companies_description
@data_companies
end
@data
=
craw_data_companies
()
i
=
@data
[
:name
].
length
i
.
times
do
|
n
|
name
=
@data
[
:name
][
n
]
address
=
@data
[
:address
][
n
]
short_description
=
@data
[
:description
][
n
]
password
=
"password"
Company
.
create!
(
name:
name
,
address:
address
,
short_description:
short_description
)
end
end
def
make_jobs
Clawler
.
make_cities
Clawler
.
make_industries
Clawler
.
make_companies
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment