Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJOB
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Nguyen Ngoc Nghia
VeNJOB
Commits
cb600a23
Commit
cb600a23
authored
Feb 28, 2020
by
nnnghia98
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactoring job crawler code
parent
2d0503f4
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
96 additions
and
63 deletions
+96
-63
app/services/crawl_data.rb
+17
-63
app/services/job_html.rb
+77
-0
config/settings.yml
+2
-0
No files found.
app/services/crawl_data.rb
View file @
cb600a23
...
@@ -5,89 +5,43 @@ require "openssl"
...
@@ -5,89 +5,43 @@ require "openssl"
OpenSSL
::
SSL
::
VERIFY_PEER
=
OpenSSL
::
SSL
::
VERIFY_NONE
OpenSSL
::
SSL
::
VERIFY_PEER
=
OpenSSL
::
SSL
::
VERIFY_NONE
class
CrawlData
class
CrawlData
def
crawl_web
def
crawl_web
page
=
Nokogiri
::
HTML
.
parse
(
open
(
Settings
.
crawl
.
base_url
))
page
=
Nokogiri
::
HTML
.
parse
(
open
(
Settings
.
crawl
.
base_url
,
ssl_verify_mode:
nil
))
total_job
=
page
.
css
(
"div.ais-stats h1.col-sm-10 span"
).
text
.
gsub
(
","
,
""
).
to_f
total_job
=
page
.
css
(
"div.ais-stats h1.col-sm-10 span"
).
text
.
gsub
(
","
,
""
).
to_f
total_page
=
(
total_job
/
50
).
floor
total_page
=
(
total_job
/
Settings
.
crawl
.
jobs_per_page
).
floor
fixed_total_page
=
20
crawl_job_title_logger
=
ActiveSupport
::
Logger
.
new
(
"log/crawl_data.log"
)
crawl_job_title_logger
=
ActiveSupport
::
Logger
.
new
(
"log/crawl_data.log"
)
crawl_job_title_logger
.
info
"Crawl at
#{
Time
.
current
}
"
crawl_job_title_logger
.
info
"Crawl at
#{
Time
.
current
}
"
(
1
..
fixed_total_page
).
each
do
|
each_page
|
(
1
..
Settings
.
crawl
.
fixed_total_page
).
each
do
|
each_page
|
page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
each_page
}
-vi.html"
)))
page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
each_page
}
-vi.html"
)))
(
0
..
49
).
each
do
|
j
|
(
0
..
49
).
each
do
|
j
|
job_url
=
page
.
css
(
".jobtitle h3 a @href"
)[
j
].
text
job_url
=
page
.
css
(
".jobtitle h3 a @href"
)[
j
].
text
job_page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_url
)))
job_page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_url
)))
# Job code
# Job code
job_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"."
)[
-
2
]
job_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"."
)[
-
2
]
next
if
job_page
.
css
(
".LeftJobCB"
).
nil?
# Company code
company_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"-"
).
last
.
split
(
"."
)[
-
2
].
strip
# Job title
job_title
=
job_page
.
css
(
".top-job-info h1"
).
text
.
strip
crawl_job_title_logger
.
info
"
#{
job_title
}
"
# Job post date
job_post_date
=
job_page
.
css
(
".datepost span"
).
text
job_salary
,
job_position
,
job_expiration_date
,
job_industries
,
job_level
=
""
job_workplace
=
[]
detail_job_new
=
job_page
.
css
(
".DetailJobNew li p"
)
(
0
..
detail_job_new
.
count
-
1
).
each
do
|
detail_part
|
detail
=
detail_job_new
[
detail_part
].
text
if
detail
.
include?
(
"Nơi làm việc"
)
job_workplace
=
detail
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
elsif
detail
.
include?
(
"Lương"
)
job_salary
=
detail
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
elsif
detail
.
include?
(
"Cấp bậc"
)
job_level
=
detail
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
elsif
detail
.
include?
(
"Hết hạn nộp"
)
job_expiration_date
=
detail
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
elsif
detail
.
include?
(
"Ngành nghề"
)
job_industries
=
detail
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
end
end
job_description
,
job_requirement
=
""
next
if
job_page
.
css
(
".LeftJobCB"
).
nil?
job_container_detail
=
job_page
.
css
(
"div.MarBot20"
)
(
0
..
job_container_detail
.
count
-
1
).
each
do
|
detail_part
|
job
=
JobHtml
.
new
(
job_page
).
parse_job
detail
=
job_container_detail
[
detail_part
].
text
if
detail
.
include?
(
"Mô tả Công việc"
)
job_description
=
detail
.
partition
(
"Mô tả Công việc"
).
last
elsif
detail
.
include?
(
"Yêu Cầu Công Việc"
)
job_requirement
=
detail
.
partition
(
"Yêu Cầu Công Việc"
).
last
end
end
company_name
,
company_email
,
company_address
,
company_desc
,
company_code
=
""
crawl_job_title_logger
.
info
"
#{
job
[
:title
]
}
"
# Company full name
unless
job_page
.
css
(
".tit_company"
).
nil?
company_name
=
job_page
.
css
(
"div.tit_company"
).
text
.
strip
end
# Company code
company_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"-"
).
last
.
split
(
"."
)[
-
2
].
strip
# Company address
next
if
job
[
:workplace
].
nil?
unless
job_page
.
css
(
".TitleDetailNew label"
)[
0
].
nil?
company_address
=
job_page
.
css
(
"p.TitleDetailNew label"
)[
0
].
text
.
strip
end
# Company description
company_desc
=
job_page
.
css
(
"#emp_more p"
).
text
.
strip
job
_workplace
.
each
do
|
city_name
|
job
[
:workplace
]
.
each
do
|
city_name
|
city_id
=
city_id
(
city_name
)
city_id
=
city_id
(
city_name
)
company_id
=
company_id
(
company_code
,
company_name
,
company_address
,
company_desc
)
company_id
=
company_id
(
company_code
,
job
[
:company_name
],
job
[
:company_address
],
job
[
:company_description
]
)
job_id
=
job_id
(
job_code
,
job
_title
,
job_salary
,
job_id
=
job_id
(
job_code
,
job
[
:title
],
job
[
:salary
]
,
job
_description
,
job_requirement
,
job
[
:description
],
job
[
:requirement
]
,
job
_level
,
job_post_date
,
job
[
:level
],
job
[
:post_date
]
,
job
_expiration_date
,
company_id
)
job
[
:expiration_date
]
,
company_id
)
CityJob
.
find_or_create_by!
(
job_id:
job_id
,
city_id:
city_id
)
CityJob
.
find_or_create_by!
(
job_id:
job_id
,
city_id:
city_id
)
job
_industries
.
each
do
|
job_industry
|
job
[
:industries
]
.
each
do
|
job_industry
|
industry_id
=
industry_id
(
job_industry
.
strip
)
industry_id
=
industry_id
(
job_industry
.
strip
)
IndustryJob
.
find_or_create_by!
(
industry_id:
industry_id
,
job_id:
job_id
)
IndustryJob
.
find_or_create_by!
(
industry_id:
industry_id
,
job_id:
job_id
)
end
end
...
...
app/services/job_html.rb
0 → 100644
View file @
cb600a23
# Extracts job and company fields from a parsed job-detail page.
#
# Wraps a document object that responds to `css(selector)` and returns node
# collections whose members respond to `text` (for example a Nokogiri HTML
# document), and exposes the scraped values as a Hash via #parse_job.
class JobHtml
  # Runs of carriage returns / line feeds removed from an info row before
  # its value is extracted. This must be a Regexp: a String pattern would
  # only match that literal text and leave the line breaks in place.
  LINE_BREAKS = /[\r\n]+/.freeze

  # @param html_data [#css] the parsed job page.
  #   NOTE(review): the default `{}` does not respond to `css`, so the
  #   reader methods need a real document to be passed in.
  def initialize(html_data = {})
    @html_data = html_data
  end

  # Scrapes every supported field from the page.
  #
  # @return [Hash] keys :title, :salary, :level, :post_date, :description,
  #   :requirement, :expiration_date, :workplace, :industries,
  #   :company_name, :company_address, :company_description.
  #   :workplace and :industries are Arrays of the comma-separated entries
  #   (not stripped). Values taken from labelled rows are nil when the page
  #   has no row with that label.
  def parse_job
    get_job_info
    get_job_detail

    {
      title: get_title,
      salary: @job_salary,
      level: @job_level,
      post_date: get_post_date,
      description: @job_description,
      requirement: @job_requirement,
      expiration_date: @job_expiration_date,
      workplace: @job_workplace,
      industries: @job_industries,
      company_name: get_company_name,
      company_address: get_company_address,
      company_description: get_company_description
    }
  end

  # @return [String] the stripped text of the job heading.
  def get_title
    @job_title = @html_data.css(".top-job-info h1").text.strip
  end

  # @return [String] the raw text of the post-date element.
  def get_post_date
    @job_post_date = @html_data.css(".datepost span").text
  end

  # Reads the "label: value" rows and stores the recognised ones in instance
  # variables. Labels are tested in a fixed order and the first one found in
  # a row decides which field that row fills.
  #
  # @return [Array] the value assigned for each row (nil for unknown rows).
  def get_job_info
    @html_data.css(".DetailJobNew li p").map do |row|
      info = row.text

      if info.include?("Nơi làm việc")
        @job_workplace = row_value(info).split(",")
      elsif info.include?("Lương")
        @job_salary = row_value(info).strip
      elsif info.include?("Cấp bậc")
        @job_level = row_value(info).strip
      elsif info.include?("Hết hạn nộp")
        @job_expiration_date = row_value(info).strip
      elsif info.include?("Ngành nghề")
        @job_industries = row_value(info).split(",")
      end
    end
  end

  # Reads the description and requirement sections; each is the text that
  # follows its heading inside the matching "div.MarBot20" container.
  #
  # @return [Array] the value assigned for each container (nil otherwise).
  def get_job_detail
    @html_data.css("div.MarBot20").map do |section|
      detail = section.text

      if detail.include?("Mô tả Công việc")
        @job_description = detail.partition("Mô tả Công việc").last
      elsif detail.include?("Yêu Cầu Công Việc")
        @job_requirement = detail.partition("Yêu Cầu Công Việc").last
      end
    end
  end

  # @return [String] the stripped company name, or "" when nothing matches.
  def get_company_name
    # The text of an empty node collection is "", so no presence check is needed.
    @html_data.css("div.tit_company").text.strip
  end

  # @return [String] the stripped company description ("" when absent).
  def get_company_description
    @html_data.css("#emp_more p").text.strip
  end

  # @return [String] the stripped text of the first address label, or ""
  #   when the page has none.
  def get_company_address
    # Test and read the same selector so a missing node cannot raise.
    label = @html_data.css("p.TitleDetailNew label").first
    label ? label.text.strip : ""
  end

  private

  # Value part of a "label: value" row: everything after the first colon,
  # with line breaks removed.
  def row_value(info)
    info.gsub(LINE_BREAKS, "").partition(":").last
  end
end
config/settings.yml
View file @
cb600a23
...
@@ -20,3 +20,5 @@ solr:
...
@@ -20,3 +20,5 @@ solr:
crawl
:
crawl
:
base_url
:
"
https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
base_url
:
"
https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
jobs_per_page
:
50
fixed_total_page
:
20
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment