Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJOB
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Nguyen Hoang Mai Phuong
VeNJOB
Commits
20dff5d8
Commit
20dff5d8
authored
Jul 19, 2021
by
Nguyen Hoang Mai Phuong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
create crawler with rake task
parent
fef9193c
Pipeline
#1352
failed with stages
in 0 seconds
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
127 additions
and
22 deletions
+127
-22
craw.rb
+43
-22
lib/tasks/crawler.rake
+84
-0
No files found.
craw.rb
View file @
20dff5d8
...
@@ -10,7 +10,7 @@ def scraper
...
@@ -10,7 +10,7 @@ def scraper
job_listing
=
list_url_job
.
css
(
'div.job-item'
)
job_listing
=
list_url_job
.
css
(
'div.job-item'
)
page
=
1
page
=
1
per_page
=
job_listing
.
count
per_page
=
job_listing
.
length
total
=
list_url_job
.
css
(
'div.job-found p'
).
text
.
split
(
' '
)[
0
].
gsub
(
','
,
''
).
to_i
total
=
list_url_job
.
css
(
'div.job-found p'
).
text
.
split
(
' '
)[
0
].
gsub
(
','
,
''
).
to_i
last_page
=
(
total
.
to_f
/
per_page
.
to_f
).
round
last_page
=
(
total
.
to_f
/
per_page
.
to_f
).
round
...
@@ -18,33 +18,53 @@ def scraper
...
@@ -18,33 +18,53 @@ def scraper
pagination_list_url
=
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
pagination_list_url
=
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
pagination_list_url_job
=
Nokogiri
::
HTML
(
URI
.
open
(
pagination_list_url
))
pagination_list_url_job
=
Nokogiri
::
HTML
(
URI
.
open
(
pagination_list_url
))
pagination_job_listing
=
pagination_list_url_job
.
css
(
'div.job-item'
)
pagination_job_listing
=
pagination_list_url_job
.
css
(
'div.job-item'
)
pagination_url
=
pagination_job_listing
.
css
(
'a'
)[
1
].
attributes
[
"href"
].
value
pagination_job_listing
.
each
do
|
detail_jobs
|
pagination_url
=
detail_jobs
.
css
(
'a'
)[
1
].
attributes
[
"href"
].
value
pagination_detail_url
=
Nokogiri
::
HTML
(
URI
.
open
(
pagination_url
))
pagination_detail_url
=
Nokogiri
::
HTML
(
URI
.
open
(
pagination_url
))
pagination_detail_job
=
pagination_detail_url
.
css
(
'div.container'
)
pagination_detail_job
=
pagination_detail_url
.
css
(
'div.container'
)
strong_element_value
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li'
)
title
=
pagination_detail_job
.
css
(
'div.job-desc h1.title'
)[
0
].
text
,
puts
pagination_detail_job
.
css
(
'div.job-desc h1.title'
)[
0
].
text
company
=
pagination_detail_job
.
css
(
'div.job-desc a.employer.job-company-name'
)[
0
].
text
,
strong_element_value
.
each
do
|
title_strong
|
class_value
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li'
).
children
case
title_strong
.
css
(
'strong'
).
text
when
"Lương"
class_value
.
each
do
|
title
|
puts
title_strong
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
if
title
.
attributes
[
"class"
].
value
==
"fa fa-usd"
when
"Kinh nghiệm"
salary
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li p'
).
text
.
gsub
(
/\s+/
,
" "
)
puts
title_strong
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
elsif
title
.
attributes
[
"class"
].
value
==
"fa fa-briefcase"
when
"Cấp bậc"
experience
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li p'
).
text
.
gsub
(
"
\r\n
"
,
""
).
strip
puts
title_strong
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
elsif
title
.
attributes
[
"class"
].
value
==
"mdi mdi-account"
when
"Hết hạn nộp"
type
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li p'
).
text
.
gsub
(
/\s+/
,
" "
)
puts
title_strong
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
elsif
title
.
attributes
[
"class"
].
value
==
"mdi mdi-calendar-check"
end
expired_at
=
pagination_detail_job
.
css
(
'div.detail-box.has-background ul li p'
).
text
.
gsub
(
/\s+/
,
" "
)
end
h3_element_value
=
pagination_detail_job
.
css
(
'div.detail-row'
)
h3_element_value
.
each
do
|
h3_element
|
case
h3_element
.
css
(
'h3'
).
text
when
"Mô tả Công việc"
puts
h3_element
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
when
"Yêu Cầu Công Việc"
puts
h3_element
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
when
"Thông tin khác"
puts
h3_element
.
css
(
'div.content_fck ul li'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
end
end
end
end
end
benefits
=
pagination_detail_job
.
css
(
'div.detail-row'
)[
0
].
text
.
gsub
(
"
\r\n
"
,
""
).
strip
,
pagination_job_listing
.
each
do
|
detail_company
|
overview
=
pagination_detail_job
.
css
(
'div.detail-row'
)[
1
].
text
.
gsub
(
"
\r\n
"
,
""
).
strip
,
company_url
=
detail_company
.
css
(
'a'
)[
0
].
attributes
[
"href"
].
value
requirement
=
pagination_detail_job
.
css
(
'div.detail-row'
)[
2
].
text
.
gsub
(
"
\r\n
"
,
""
).
strip
,
parse_company_url
=
Nokogiri
::
HTML
(
URI
.
open
(
pagination_url
))
other_requirement
=
pagination_detail_job
.
css
(
'div.detail-row'
)[
2
].
text
.
gsub
(
/\s+/
,
" "
)
company
=
parse_company_url
.
css
(
'company-content'
)
puts
pagination_detail_job
.
css
(
'div.job-desc a.employer.job-company-name'
)[
0
].
text
company
.
each
do
|
info_company
|
case
info_company
.
css
(
'h3'
).
text
when
"Giới thiệu về công ty"
puts
info_company
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
when
"Thông điệp từ CÔNG TY"
puts
info_company
.
css
(
'p'
).
text
.
gsub
(
/\s+/
,
" "
).
strip
end
end
page
+=
1
page
+=
1
end
end
byebug
end
end
scraper
scraper
\ No newline at end of file
lib/tasks/crawler.rake
0 → 100644
View file @
20dff5d8
# URI.open on an http(s) URL is provided by open-uri; require it explicitly
# instead of relying on another library having loaded it.
require 'open-uri'

# Rake tasks that scrape careerbuilder.vn with Nokogiri and print what they
# find to stdout. Nothing is persisted by these tasks.
namespace :crawler do
  # Downloads +url+ and parses the response body as an HTML document.
  fetch_page = ->(url) { Nokogiri::HTML(URI.open(url)) }

  # Collapses every run of whitespace to a single space and trims both ends.
  squish = ->(text) { text.gsub(/\s+/, ' ').strip }

  # Returns the href of +node+, or nil when the node itself is missing.
  href_of = ->(node) { node && node['href'] }

  # <strong> labels of the summary-box entries whose <p> text is printed.
  summary_labels = ['Lương', 'Kinh nghiệm', 'Cấp bậc', 'Hết hạn nộp'].freeze

  # <h3> headings of the detail rows whose <p> text is printed.
  paragraph_sections = ['Mô tả Công việc', 'Yêu Cầu Công Việc'].freeze

  desc 'Print title, summary fields, detail sections and company info of every listed job'
  task jobs: :environment do
    home = fetch_page.call('https://careerbuilder.vn/')
    list_url = href_of.call(home.css('div.menu div.dropdown-menu ul li a')[0])
    abort 'crawler:jobs: job listing link not found on the home page' unless list_url

    # The first listing page is only used to work out how many pages exist.
    first_listing = fetch_page.call(list_url)
    per_page = first_listing.css('div.job-item').length
    total = first_listing.css('div.job-found p').text.split(' ')[0].to_s.delete(',').to_i

    if per_page.zero?
      # Dividing by zero below would raise FloatDomainError on rounding.
      warn 'crawler:jobs: no job cards found on the first listing page; nothing to crawl'
      next
    end

    # ceil, not round: a final, partially filled page must still be visited.
    last_page = (total.to_f / per_page).ceil

    (1..last_page).each do |page|
      listing = fetch_page.call("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")

      listing.css('div.job-item').each do |job_item|
        links = job_item.css('a')
        # Second link of a job card leads to the job detail page.
        job_url = href_of.call(links[1])
        next unless job_url

        job = fetch_page.call(job_url).css('div.container')
        puts job.at_css('div.job-desc h1.title')&.text

        job.css('div.detail-box.has-background ul li').each do |entry|
          next unless summary_labels.include?(entry.css('strong').text)

          puts squish.call(entry.css('p').text)
        end

        job.css('div.detail-row').each do |row|
          case row.css('h3').text
          when *paragraph_sections
            puts squish.call(row.css('p').text)
          when 'Thông tin khác'
            puts squish.call(row.css('div.content_fck ul li').text)
          end
        end

        # First link of a job card leads to the company page.
        company_url = href_of.call(links[0])
        next unless company_url

        company = fetch_page.call(company_url).css('div.container')
        company_info = company.css('div.company-info div.info div.content')
        puts company.css('div.company-info div.info div.content p.name').text
        puts company_info.css('p')[1]&.text
        puts company_info.css('ul li').text
        puts squish.call(company.css('div.row div.content p').text)
      rescue OpenURI::HTTPError => e
        # A single broken detail/company page should not abort the whole crawl.
        warn "crawler:jobs: skipping #{job_url} (#{e.message})"
      end
    end
  end

  desc 'Print the industry list items found on the industries page'
  task industries: :environment do
    groups = parse_base_url.css('div.container div.list-of-working-positions div.col-md-6.col-lg-4.cus-col')
    puts groups.css('ul.list-jobs li').text
  end

  desc 'Print the city names found in the jobs-by-location list'
  task cities: :environment do
    parse_base_url
      .css('div.container div.col-xl-3 div.main-jobs-by-location div.jobs-in-country li a')
      .each { |city| puts city.text.gsub('Việc làm tại', '') }
  end

  desc 'Print the headings of the jobs-by-location section'
  task regions: :environment do
    puts parse_base_url.css('div.container div.col-xl-3 div.main-jobs-by-location h3').text
  end

  # Fetches the home page, follows the second entry of its dropdown menu and
  # returns that page parsed as a Nokogiri HTML document.
  #
  # NOTE(review): despite the name, the returned document is the page behind
  # the second menu link, not the home page itself.
  # NOTE(review): a `def` inside a namespace block is not scoped to the
  # namespace; it is kept as a method so existing callers keep working.
  #
  # @return [Nokogiri::HTML::Document]
  # @raise [RuntimeError] when the menu link is missing from the home page
  def parse_base_url
    home = Nokogiri::HTML(URI.open('https://careerbuilder.vn/'))
    link = home.css('div.menu div.dropdown-menu ul li a')[1]
    raise 'crawler: industries link not found on the home page' unless link

    Nokogiri::HTML(URI.open(link['href']))
  end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment