Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJOB
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Nguyen Ngoc Nghia
VeNJOB
Commits
37105061
Commit
37105061
authored
Mar 02, 2020
by
nnnghia98
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactoring code
parent
6e007fe3
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
51 additions
and
47 deletions
+51
-47
app/services/crawl_data.rb
+24
-26
app/services/job_html.rb
+27
-21
No files found.
app/services/crawl_data.rb
View file @
37105061
...
@@ -7,6 +7,7 @@ class CrawlData
...
@@ -7,6 +7,7 @@ class CrawlData
def
crawl_web
def
crawl_web
page
=
Nokogiri
::
HTML
.
parse
(
open
(
Settings
.
crawl
.
base_url
,
ssl_verify_mode:
nil
))
page
=
Nokogiri
::
HTML
.
parse
(
open
(
Settings
.
crawl
.
base_url
,
ssl_verify_mode:
nil
))
total_job
=
page
.
css
(
"div.ais-stats h1.col-sm-10 span"
).
text
.
gsub
(
","
,
""
).
to_f
total_job
=
page
.
css
(
"div.ais-stats h1.col-sm-10 span"
).
text
.
gsub
(
","
,
""
).
to_f
return
if
total_job
==
0
total_page
=
(
total_job
/
Settings
.
crawl
.
jobs_per_page
).
floor
total_page
=
(
total_job
/
Settings
.
crawl
.
jobs_per_page
).
floor
crawl_job_title_logger
=
ActiveSupport
::
Logger
.
new
(
"log/crawl_data.log"
)
crawl_job_title_logger
=
ActiveSupport
::
Logger
.
new
(
"log/crawl_data.log"
)
crawl_job_title_logger
.
info
"Crawl at
#{
Time
.
current
}
"
crawl_job_title_logger
.
info
"Crawl at
#{
Time
.
current
}
"
...
@@ -18,31 +19,31 @@ class CrawlData
...
@@ -18,31 +19,31 @@ class CrawlData
job_page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_url
)))
job_page
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
job_url
)))
# Job code
job
=
JobHtml
.
new
(
job_page
).
parse_job
job_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"."
)[
-
2
]
# Company code
next
if
job_page
.
css
(
".LeftJobCB"
).
nil?
||
job
[
:workplace
].
blank?
company_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"-"
).
last
.
split
(
"."
)[
-
2
].
strip
next
if
job_page
.
css
(
".LeftJobCB"
).
nil?
# Job code
job_code
=
job_url
.
split
(
"/"
).
last
.
split
(
"."
)[
-
2
]
||
""
job
=
JobHtml
.
new
(
job_page
).
parse_job
# Company code
company_code
=
job_page
.
css
(
".viewmorejob a @href"
).
present?
?
job_page
.
css
(
".viewmorejob a @href"
).
text
.
split
(
"/"
).
last
.
split
(
"-"
)[
-
2
].
strip
:
""
crawl_job_title_logger
.
info
"
#{
job
[
:title
]
}
"
crawl_job_title_logger
.
info
"
#{
job
[
:title
]
}
"
next
if
job
[
:workplace
].
nil?
job
[
:workplace
].
each
do
|
city_name
|
job
[
:workplace
].
each
do
|
city_name
|
city_id
=
city_id
(
city_name
)
city_id
=
get_city
(
city_name
).
id
company_id
=
company_id
(
company_code
,
job
[
:company_name
],
job
[
:company_address
],
job
[
:company_description
])
company_id
=
get_company
(
company_code
,
job
[
:company_name
],
job
[
:company_address
],
job
[
:company_description
]).
id
job_id
=
job_id
(
job_code
,
job
[
:title
],
job
[
:salary
],
job_id
=
get_job
(
job_code
,
job
[
:title
],
job
[
:salary
],
job
[
:description
],
job
[
:requirement
],
job
[
:description
],
job
[
:requirement
],
job
[
:level
],
job
[
:post_date
],
job
[
:level
],
job
[
:post_date
],
job
[
:expiration_date
],
company_id
)
job
[
:expiration_date
],
company_id
)
.
id
CityJob
.
find_or_create_by!
(
job_id:
job_id
,
city_id:
city_id
)
CityJob
.
find_or_create_by!
(
job_id:
job_id
,
city_id:
city_id
)
job
[
:industries
].
each
do
|
job_industry
|
job
[
:industries
].
each
do
|
job_industry
|
industry_id
=
industry_id
(
job_industry
.
strip
)
job_industry
=
job_industry
.
strip
industry_id
=
get_industry
(
job_industry
).
id
IndustryJob
.
find_or_create_by!
(
industry_id:
industry_id
,
job_id:
job_id
)
IndustryJob
.
find_or_create_by!
(
industry_id:
industry_id
,
job_id:
job_id
)
end
end
end
end
...
@@ -50,28 +51,25 @@ class CrawlData
...
@@ -50,28 +51,25 @@ class CrawlData
end
end
end
end
def
company_id
(
code
,
name
,
address
,
description
)
def
get_company
(
code
,
name
,
address
,
description
)
company
=
Company
.
find_or_initialize_by
(
code:
code
)
company
=
Company
.
find_or_initialize_by
(
code:
code
)
company
.
update
(
name:
name
,
address:
address
,
description:
description
)
company
.
update
(
name:
name
,
address:
address
,
description:
description
)
company
.
id
company
end
end
def
industry_id
(
name
)
def
get_industry
(
name
)
industry
=
Industry
.
find_or_create_by!
(
name:
name
)
industry
=
Industry
.
find_or_create_by!
(
name:
name
)
industry
.
id
industry
end
end
def
city_id
(
name
)
def
get_city
(
name
)
name
=
name
.
strip
name
=
name
.
strip
City
.
find_or_create_by
(
name:
name
,
region:
"Việt Nam"
)
.
id
City
.
find_or_create_by
(
name:
name
,
region:
"Việt Nam"
)
end
end
def
job_id
(
code
=
nil
,
title
,
salary
,
description
,
requirement
,
level
,
post_date
,
expiration_date
,
company_id
)
def
get_job
(
code
=
nil
,
title
,
salary
,
description
,
requirement
,
level
,
post_date
,
expiration_date
,
company_id
)
if
expiration_date
.
nil?
attrs
=
expiration_date
.
nil?
?
{
title:
job_title
,
company_id:
company_id
}
:
{
code:
code
}
job
=
Job
.
find_or_initialize_by
(
title:
job_title
,
company_id:
company_id
)
job
=
Job
.
find_or_initialize_by
attrs
else
job
=
Job
.
find_or_initialize_by
(
code:
code
)
end
job
.
update
(
code:
code
,
job
.
update
(
code:
code
,
title:
title
,
title:
title
,
...
@@ -82,6 +80,6 @@ class CrawlData
...
@@ -82,6 +80,6 @@ class CrawlData
expiration_date:
expiration_date
,
expiration_date:
expiration_date
,
level:
level
,
level:
level
,
company_id:
company_id
)
company_id:
company_id
)
job
.
id
job
end
end
end
end
app/services/job_html.rb
View file @
37105061
...
@@ -6,61 +6,67 @@ class JobHtml
...
@@ -6,61 +6,67 @@ class JobHtml
def
parse_job
def
parse_job
get_job_info
get_job_info
get_job_detail
get_job_detail
job
=
{
title:
get_title
,
{
title:
get_title
,
salary:
@job_salary
,
salary:
get_job_info
[
:salary
]
,
level:
@job_level
,
level:
get_job_info
[
:level
]
,
post_date:
get_post_date
,
post_date:
get_post_date
,
description:
@job_description
,
description:
get_job_detail
[
:description
],
requirement:
@job_requirement
,
requirement:
get_job_detail
[
:requirement
],
expiration_date:
@job_expiration_date
,
expiration_date:
get_job_info
[
:expiration_date
],
workplace:
@job_workplace
,
workplace:
get_job_info
[
:workplace
],
level:
@job_level
,
industries:
get_job_info
[
:industries
],
industries:
@job_industries
,
company_name:
get_company_name
,
company_name:
get_company_name
,
company_address:
get_company_address
,
company_address:
get_company_address
,
company_description:
get_company_description
}
company_description:
get_company_description
}
return
job
end
end
private
def
get_title
def
get_title
@
job_title
=
@
html_data
.
css
(
".top-job-info h1"
).
text
.
strip
@html_data
.
css
(
".top-job-info h1"
).
text
.
strip
end
end
def
get_post_date
def
get_post_date
@
job_post_date
=
@
html_data
.
css
(
".datepost span"
).
text
@html_data
.
css
(
".datepost span"
).
text
end
end
def
get_job_info
def
get_job_info
info_container
=
@html_data
.
css
(
".DetailJobNew li p"
)
info_container
=
@html_data
.
css
(
".DetailJobNew li p"
)
job_info
=
{}
job_info
=
(
0
..
info_container
.
count
-
1
).
map
do
|
info_part
|
(
0
..
info_container
.
count
-
1
).
each
do
|
info_part
|
info
=
info_container
[
info_part
].
text
info
=
info_container
[
info_part
].
text
case
case
when
info
.
include?
(
"Nơi làm việc"
)
when
info
.
include?
(
"Nơi làm việc"
)
@job_workplace
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
job_info
[
:workplace
]
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
||
[]
when
info
.
include?
(
"Lương"
)
when
info
.
include?
(
"Lương"
)
@job_salary
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
job_info
[
:salary
]
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
when
info
.
include?
(
"Cấp bậc"
)
when
info
.
include?
(
"Cấp bậc"
)
@job_level
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
job_info
[
:level
]
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
when
info
.
include?
(
"Hết hạn nộp"
)
when
info
.
include?
(
"Hết hạn nộp"
)
@job_expiration_date
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
job_info
[
:expiration_date
]
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
strip
when
info
.
include?
(
"Ngành nghề"
)
when
info
.
include?
(
"Ngành nghề"
)
@job_industries
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
job_info
[
:industries
]
=
info
.
gsub
(
"/[
\r\n
]+/"
,
""
).
partition
(
":"
).
last
.
split
(
","
)
end
end
end
end
return
job_info
end
end
def
get_job_detail
def
get_job_detail
detail_container
=
@html_data
.
css
(
"div.MarBot20"
)
detail_container
=
@html_data
.
css
(
"div.MarBot20"
)
job_detail
=
{}
job_detail
=
(
0
..
detail_container
.
count
-
1
).
map
do
|
detail_part
|
(
0
..
detail_container
.
count
-
1
).
map
do
|
detail_part
|
detail
=
detail_container
[
detail_part
].
text
detail
=
detail_container
[
detail_part
].
text
if
detail
.
include?
(
"Mô tả Công việc"
)
if
detail
.
include?
(
"Mô tả Công việc"
)
@job_description
=
detail
.
partition
(
"Mô tả Công việc"
).
last
job_detail
[
:description
]
=
detail
.
partition
(
"Mô tả Công việc"
).
last
elsif
detail
.
include?
(
"Yêu Cầu Công Việc"
)
elsif
detail
.
include?
(
"Yêu Cầu Công Việc"
)
@job_requirement
=
detail
.
partition
(
"Yêu Cầu Công Việc"
).
last
job_detail
[
:requirement
]
=
detail
.
partition
(
"Yêu Cầu Công Việc"
).
last
end
end
end
end
return
job_detail
end
end
def
get_company_name
def
get_company_name
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment