Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
ven-job
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Trịnh Hoàng Phúc
ven-job
Commits
475317f1
Commit
475317f1
authored
Apr 17, 2020
by
Hoang Phuc
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix get attributes crawler because careerbuild has changed UI
parent
11c5feb9
Pipeline
#580
failed with stages
in 0 seconds
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
166 deletions
+37
-166
lib/tasks/crawler.rake
+37
-166
No files found.
lib/tasks/crawler.rake
View file @
475317f1
...
@@ -16,216 +16,87 @@ namespace :crawler do
...
@@ -16,216 +16,87 @@ namespace :crawler do
# Fetch and parse HTML document
# Fetch and parse HTML document
html_jobs
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
))
html_jobs
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
))
# Loop item
# Loop item
(
0
..
html_jobs
.
css
(
".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href"
).
length
-
1
).
each
do
|
i
|
html_jobs
.
css
(
".jobs-side-list .job-item"
).
each_with_index
do
|
item
,
index
|
# Get href of a tag and open job detail page
job_detail_url
=
html_jobs
.
css
(
".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href"
)[
i
].
text
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_detail_url
)))
# Job attributes
# Job attributes
job_attributes
=
{
job_attributes
=
{
title:
nil
,
title:
item
.
css
(
".figure .figcaption .title a @title"
).
text
,
updated_date_job:
nil
,
updated_date_job:
item
.
css
(
".bottom-right-icon .time time"
).
text
,
level:
nil
,
level:
nil
,
years_of_experience:
nil
,
years_of_experience:
nil
,
salary:
nil
,
salary:
item
.
css
(
".figure .figcaption .caption .salary"
).
text
.
gsub
(
"$ "
,
""
)
,
expiration_date:
nil
,
expiration_date:
nil
,
job_description:
nil
,
job_description:
nil
,
company_id:
nil
,
company_id:
nil
,
}
}
# Defind cities array
cities
=
[]
item
.
css
(
".figure .figcaption .caption .location ul li"
).
each_with_index
do
|
city
|
city
=
check_exist_or_create_city
(
city
.
text
)
cities
<<
city
end
if
item
.
css
(
".figure .image a @href"
).
text
!=
"javascript:void(0);"
# Company attributes
# Company attributes
html_company_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .image a @href"
).
text
)))
if
html_company_detail
.
at_css
(
".jobsby-company"
)
company_attributes
=
{
company_attributes
=
{
title:
nil
,
title:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content .name"
).
text
,
address:
nil
,
address:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content p"
)[
1
].
text
,
logo:
nil
,
logo:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .img @src"
).
text
,
description:
nil
description:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content ul"
).
inner_html
.
strip
}
}
# Check exist or create company
# Defind city ids array
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
cities
=
[]
end
end
# Defind industry ids array
# Defind industry ids array
industries
=
[]
industries
=
[]
# Check what template job belongs to
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
)))
if
html_job_detail
.
at_css
(
"#uni_container .MyJobDetail"
)
if
html_job_detail
.
at_css
(
".search-result-list-detail"
)
# CSS DOM
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li"
).
each
do
|
ele
|
css_dom
=
"#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
type
=
ele
.
css
(
"strong"
).
text
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.datepost span"
).
text
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .TitleDetailNew span"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .TitleDetailNew label label"
).
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .align_center.logocompany a img @src"
).
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.desc_company.content_fck #emp_collapse"
).
text
.
split
(
"..."
)[
0
]
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.box2Detail .DetailJobNew li p"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"span"
).
text
case
type
case
type
when
"
Nơi làm việc:
"
when
"
Hết hạn nộp
"
# Check exist or create city
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"p"
).
text
.
strip
ele
.
css
(
"b a"
).
each_with_index
do
|
ele
,
index
|
when
"Cấp bậc"
if
index
>
0
job_attributes
[
:level
]
=
ele
.
css
(
"p"
).
text
.
strip
city
=
check_exist_or_create_city
(
ele
.
text
.
gsub
(
","
,
""
))
when
"Kinh nghiệm"
cities
<<
city
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"p"
).
text
.
strip
end
end
end
end
when
"Cấp bậc: "
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a"
).
each
do
|
ele
|
job_attributes
[
:level
]
=
ele
.
css
(
"label"
).
text
industry
=
check_exist_or_create_industry
(
ele
.
text
.
strip
.
gsub
(
","
,
""
))
when
"Kinh nghiệm: "
job_attributes
[
:years_of_experience
]
=
ele
.
text
.
gsub
(
"Kinh nghiệm: "
,
""
)
when
"Lương: "
job_attributes
[
:salary
]
=
ele
.
text
.
gsub
(
"Lương: "
,
""
)
when
"Ngành nghề: "
# Check exist or create industry
ele
.
css
(
"b a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
))
industries
<<
industry
industries
<<
industry
end
end
else
job_attributes
[
:expiration_date
]
=
ele
.
text
.
gsub
(
"Hết hạn nộp: "
,
""
)
end
end
# Get description for job attributes
# Get description for job attributes
description
=
""
description
=
""
html_job_detail
.
css
(
"
#{
css_dom
}
.MarBot20"
).
each_with_index
do
|
ele
,
index
|
html_job_detail
.
css
(
"
.search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row"
).
each
do
|
ele
|
description
<<
ele
.
inner_html
description
<<
ele
.
inner_html
end
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
description
elsif
html_job_detail
.
at_css
(
"#uni_container .job-template-2"
)
# CSS DOM
css_dom
=
"#uni_container .job-template-2 .content-job-detail"
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info p"
)[
1
].
text
.
gsub
(
"Ngày cập nhật:"
,
""
)
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .top-job-info .tit_company"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info p"
)[
0
].
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .logocompany a img @src"
).
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .desc"
).
text
.
split
(
"..."
)[
0
]
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .info-career .info li"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"b"
).
text
case
type
when
"Nơi làm việc"
# Check exist or create city
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
if
index
>
0
city
=
check_exist_or_create_city
(
ele
.
text
.
gsub
(
","
,
""
))
cities
<<
city
end
end
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"span"
).
text
when
"Kinh nghiệm"
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"span"
).
text
when
"Lương"
job_attributes
[
:salary
]
=
ele
.
text
.
gsub
(
"Lương: "
,
""
)
when
"Ngành nghề"
# Check exist or create industry
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
))
industries
<<
industry
end
else
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"span"
).
text
end
end
# Set description for job attributes
# Set description for job attributes
job_attributes
[
:job_description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
#showScroll"
).
inner_html
job_attributes
[
:job_description
]
=
description
.
strip
elsif
html_job_detail
.
at_css
(
"#uni_container .job-template-201"
)
# CSS DOM
css_dom
=
"#uni_container .job-template-201"
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.content-job-detail .top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.content-job-detail .top-job .top-job-info p"
)[
1
].
text
.
gsub
(
"Ngày cập nhật: "
,
""
)
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .top-job-info .tit_company"
).
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .logocompany a img @src"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info p"
)[
0
].
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .desc"
).
text
.
gsub
(
" Xem thêm"
,
""
)
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .info-career .info li"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"b"
).
text
case
type
when
"Nơi làm việc"
# Check exist or create city
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
if
index
>
0
city
=
check_exist_or_create_city
(
ele
.
text
)
cities
<<
city
end
end
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"span"
).
text
when
"Lương"
job_attributes
[
:salary
]
=
ele
.
css
(
"span"
).
text
when
"Ngành nghề"
# Check exist or create industry
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
)
industries
<<
industry
end
when
"Hết hạn nộp"
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"span"
).
text
else
else
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"span"
).
text
skip_url_logger
.
info
"another template
#{
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
end
end
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.left-col #showScroll"
).
inner_html
else
skip_url_logger
.
info
"another template
#{
job_detail_url
}
"
end
# Check exist or create company
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
# Create job
# Create job
job
=
check_exist_or_create_job
(
job_attributes
)
job
=
check_exist_or_create_job
(
job_attributes
)
# Create city_job
# Create city_job
if
cities
.
length
>
0
if
cities
.
length
>
0
cities
.
each
do
|
city
|
cities
.
each
do
|
city
|
job
.
cities
<<
city
job
.
cities
<<
city
end
end
end
end
# Create industry_job
# Create industry_job
if
industries
.
length
>
0
if
industries
.
length
>
0
industries
.
each
do
|
industry
|
industries
.
each
do
|
industry
|
job
.
industries
<<
industry
job
.
industries
<<
industry
end
end
end
end
rescue
rescue
exception_logger
.
info
"Error url:
#{
job_detail_url
}
"
exception_logger
.
info
"Error url:
#{
job_detail_url
}
"
next
next
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment