Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
ven-job
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Trịnh Hoàng Phúc
ven-job
Commits
475317f1
Commit
475317f1
authored
Apr 17, 2020
by
Hoang Phuc
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix get attributes crawler because careerbuild has changed UI
parent
11c5feb9
Pipeline
#580
failed with stages
in 0 seconds
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
41 additions
and
170 deletions
+41
-170
lib/tasks/crawler.rake
+41
-170
No files found.
lib/tasks/crawler.rake
View file @
475317f1
...
...
@@ -15,217 +15,88 @@ namespace :crawler do
(
1
..
2
).
each
do
|
page
|
# Fetch and parse HTML document
html_jobs
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
))
# Loop item
(
0
..
html_jobs
.
css
(
".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href"
).
length
-
1
).
each
do
|
i
|
# Get href of a tag and open job detail page
job_detail_url
=
html_jobs
.
css
(
".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href"
)[
i
].
text
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_detail_url
)))
html_jobs
.
css
(
".jobs-side-list .job-item"
).
each_with_index
do
|
item
,
index
|
# Job attributes
job_attributes
=
{
title:
nil
,
updated_date_job:
nil
,
title:
item
.
css
(
".figure .figcaption .title a @title"
).
text
,
updated_date_job:
item
.
css
(
".bottom-right-icon .time time"
).
text
,
level:
nil
,
years_of_experience:
nil
,
salary:
nil
,
salary:
item
.
css
(
".figure .figcaption .caption .salary"
).
text
.
gsub
(
"$ "
,
""
)
,
expiration_date:
nil
,
job_description:
nil
,
company_id:
nil
,
}
# Company attributes
company_attributes
=
{
title:
nil
,
address:
nil
,
logo:
nil
,
description:
nil
}
# Defind city ids array
# Defind cities array
cities
=
[]
item
.
css
(
".figure .figcaption .caption .location ul li"
).
each_with_index
do
|
city
|
city
=
check_exist_or_create_city
(
city
.
text
)
cities
<<
city
end
if
item
.
css
(
".figure .image a @href"
).
text
!=
"javascript:void(0);"
# Company attributes
html_company_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .image a @href"
).
text
)))
if
html_company_detail
.
at_css
(
".jobsby-company"
)
company_attributes
=
{
title:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content .name"
).
text
,
address:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content p"
)[
1
].
text
,
logo:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .img @src"
).
text
,
description:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content ul"
).
inner_html
.
strip
}
# Check exist or create company
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
end
end
# Defind industry ids array
industries
=
[]
# Check what template job belongs to
if
html_job_detail
.
at_css
(
"#uni_container .MyJobDetail"
)
# CSS DOM
css_dom
=
"#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.datepost span"
).
text
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .TitleDetailNew span"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .TitleDetailNew label label"
).
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.box1Detail .align_center.logocompany a img @src"
).
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.desc_company.content_fck #emp_collapse"
).
text
.
split
(
"..."
)[
0
]
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.box2Detail .DetailJobNew li p"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"span"
).
text
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
)))
if
html_job_detail
.
at_css
(
".search-result-list-detail"
)
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li"
).
each
do
|
ele
|
type
=
ele
.
css
(
"strong"
).
text
case
type
when
"Nơi làm việc: "
# Check exist or create city
ele
.
css
(
"b a"
).
each_with_index
do
|
ele
,
index
|
if
index
>
0
city
=
check_exist_or_create_city
(
ele
.
text
.
gsub
(
","
,
""
))
cities
<<
city
end
end
when
"Cấp bậc: "
job_attributes
[
:level
]
=
ele
.
css
(
"label"
).
text
when
"Kinh nghiệm: "
job_attributes
[
:years_of_experience
]
=
ele
.
text
.
gsub
(
"Kinh nghiệm: "
,
""
)
when
"Lương: "
job_attributes
[
:salary
]
=
ele
.
text
.
gsub
(
"Lương: "
,
""
)
when
"Ngành nghề: "
# Check exist or create industry
ele
.
css
(
"b a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
))
industries
<<
industry
end
else
job_attributes
[
:expiration_date
]
=
ele
.
text
.
gsub
(
"Hết hạn nộp: "
,
""
)
when
"Hết hạn nộp"
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"p"
).
text
.
strip
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"p"
).
text
.
strip
when
"Kinh nghiệm"
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"p"
).
text
.
strip
end
end
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a"
).
each
do
|
ele
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
strip
.
gsub
(
","
,
""
))
industries
<<
industry
end
# Get description for job attributes
description
=
""
html_job_detail
.
css
(
"
#{
css_dom
}
.MarBot20"
).
each_with_index
do
|
ele
,
index
|
html_job_detail
.
css
(
"
.search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row"
).
each
do
|
ele
|
description
<<
ele
.
inner_html
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
description
elsif
html_job_detail
.
at_css
(
"#uni_container .job-template-2"
)
# CSS DOM
css_dom
=
"#uni_container .job-template-2 .content-job-detail"
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.top-job .top-job-info p"
)[
1
].
text
.
gsub
(
"Ngày cập nhật:"
,
""
)
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .top-job-info .tit_company"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info p"
)[
0
].
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .logocompany a img @src"
).
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .desc"
).
text
.
split
(
"..."
)[
0
]
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .info-career .info li"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"b"
).
text
case
type
when
"Nơi làm việc"
# Check exist or create city
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
if
index
>
0
city
=
check_exist_or_create_city
(
ele
.
text
.
gsub
(
","
,
""
))
cities
<<
city
end
end
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"span"
).
text
when
"Kinh nghiệm"
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"span"
).
text
when
"Lương"
job_attributes
[
:salary
]
=
ele
.
text
.
gsub
(
"Lương: "
,
""
)
when
"Ngành nghề"
# Check exist or create industry
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
))
industries
<<
industry
end
else
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"span"
).
text
end
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
#showScroll"
).
inner_html
elsif
html_job_detail
.
at_css
(
"#uni_container .job-template-201"
)
# CSS DOM
css_dom
=
"#uni_container .job-template-201"
# Title
job_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.content-job-detail .top-job .top-job-info h1"
).
text
# Updated date job
job_attributes
[
:updated_date_job
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.content-job-detail .top-job .top-job-info p"
)[
1
].
text
.
gsub
(
"Ngày cập nhật: "
,
""
)
# Hash company
company_attributes
[
:title
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .top-job-info .tit_company"
).
text
company_attributes
[
:logo
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .top-job .logocompany a img @src"
).
text
company_attributes
[
:address
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info p"
)[
0
].
text
company_attributes
[
:description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .aboutustp .info .desc"
).
text
.
gsub
(
" Xem thêm"
,
""
)
# Get value for job attributes
html_job_detail
.
css
(
"
#{
css_dom
}
.right-col .info-career .info li"
).
each_with_index
do
|
ele
,
index
|
type
=
ele
.
css
(
"b"
).
text
case
type
when
"Nơi làm việc"
# Check exist or create city
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
if
index
>
0
city
=
check_exist_or_create_city
(
ele
.
text
)
cities
<<
city
end
end
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"span"
).
text
when
"Lương"
job_attributes
[
:salary
]
=
ele
.
css
(
"span"
).
text
when
"Ngành nghề"
# Check exist or create industry
ele
.
css
(
"span a"
).
each_with_index
do
|
ele
,
index
|
industry
=
check_exist_or_create_industry
(
ele
.
text
)
industries
<<
industry
end
when
"Hết hạn nộp"
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"span"
).
text
else
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"span"
).
text
end
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
html_job_detail
.
css
(
"
#{
css_dom
}
.left-col #showScroll"
).
inner_html
job_attributes
[
:job_description
]
=
description
.
strip
else
skip_url_logger
.
info
"another template
#{
job_detail_url
}
"
skip_url_logger
.
info
"another template
#{
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
end
# Check exist or create company
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
# Create job
job
=
check_exist_or_create_job
(
job_attributes
)
# Create city_job
if
cities
.
length
>
0
cities
.
each
do
|
city
|
job
.
cities
<<
city
end
end
# Create industry_job
if
industries
.
length
>
0
industries
.
each
do
|
industry
|
job
.
industries
<<
industry
end
end
rescue
exception_logger
.
info
"Error url:
#{
job_detail_url
}
"
next
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment