Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
Venjob_HungNT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ngô Trung Hưng
Venjob_HungNT
Commits
ad336f02
Unverified
Commit
ad336f02
authored
Jul 22, 2020
by
Hung0326
Committed by
GitHub
Jul 22, 2020
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #8 from Hung0326/dev
fix crawler
parents
81cfe475
f5d71986
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
84 additions
and
149 deletions
+84
-149
app/assets/stylesheets/custom.scss
+3
-3
app/models/city.rb
+1
-1
app/views/shared/_block_cities_hot.html.erb
+3
-3
app/views/shared/_box_five_job.html.erb
+1
-3
app/views/shared/_box_nine_city.html.erb
+1
-1
lib/src/crawler.rb
+16
-12
lib/src/interface_web.rb
+56
-123
lib/tasks/crawler.rake
+3
-3
No files found.
app/assets/stylesheets/custom.scss
View file @
ad336f02
...
@@ -687,7 +687,6 @@ $main-color: #221f20;
...
@@ -687,7 +687,6 @@ $main-color: #221f20;
.box_info_salary
{
.box_info_salary
{
font-size
:
13px
;
font-size
:
13px
;
color
:
#008563
;
color
:
#008563
;
font-weight
:
600
;
transform
:
translateY
(
-2px
);
transform
:
translateY
(
-2px
);
}
}
.box_btn_favotite
{
.box_btn_favotite
{
...
@@ -818,7 +817,7 @@ $main-color: #221f20;
...
@@ -818,7 +817,7 @@ $main-color: #221f20;
}
}
.box_text_five_jobs.box_padding_city
{
.box_text_five_jobs.box_padding_city
{
background-
color
:
#da6d2e
;
background-
image
:
linear-gradient
(
to
right
,
#86cb49
,
#169b74
,
#86cb49
);
color
:
white
;
color
:
white
;
font-weight
:
600
;
font-weight
:
600
;
margin-bottom
:
0px
!
important
;
margin-bottom
:
0px
!
important
;
...
@@ -849,12 +848,13 @@ $main-color: #221f20;
...
@@ -849,12 +848,13 @@ $main-color: #221f20;
cursor
:
pointer
;
cursor
:
pointer
;
color
:
white
;
color
:
white
;
text-align
:
center
;
text-align
:
center
;
font-size
:
1
8
px
;
font-size
:
1
7
px
;
font-weight
:
bold
;
font-weight
:
bold
;
}
}
.ct_jobs_count
{
.ct_jobs_count
{
cursor
:
pointer
;
cursor
:
pointer
;
font-size
:
16px
;
text-align
:
center
;
text-align
:
center
;
color
:
#999
;
color
:
#999
;
}
}
...
...
app/models/city.rb
View file @
ad336f02
...
@@ -6,7 +6,7 @@ class City < ApplicationRecord
...
@@ -6,7 +6,7 @@ class City < ApplicationRecord
hash
=
{}
hash
=
{}
data_cities
=
City
.
all
data_cities
=
City
.
all
data_cities
.
each
do
|
val
|
data_cities
.
each
do
|
val
|
hash
[
val
.
id
]
=
val
.
jobs
.
count
hash
[
val
.
name
]
=
val
.
jobs
.
count
end
end
hash
.
sort_by
{
|
k
,
v
|
v
}.
reverse
hash
.
sort_by
{
|
k
,
v
|
v
}.
reverse
end
end
...
...
app/views/shared/_block_cities_hot.html.erb
View file @
ad336f02
<div
class=
"box_info_city"
>
<div
class=
"box_info_city"
>
<div
class=
"ct_name"
>
<div
class=
"ct_name"
>
<%
data
=
City
.
find
(
k
)
%>
<%=
link_to
name_city
,
'#'
,
class:
'link_ct'
%>
<%=
link_to
data
.
name
,
'#'
,
class:
'link_ct'
%>
</div>
</div>
<div
class=
"ct_jobs_count"
>
<div
class=
"ct_jobs_count"
>
<%=
link_to
"
#{
v
}
công việc"
,
'#'
,
class:
'link_ct'
%>
<%=
link_to
"
#{
jobs_count
}
công việc"
,
'#'
,
class:
'link_ct'
%>
</div>
</div>
</div>
</div>
\ No newline at end of file
app/views/shared/_box_five_job.html.erb
View file @
ad336f02
...
@@ -6,11 +6,9 @@
...
@@ -6,11 +6,9 @@
<%
@five_jobs
.
each
do
|
val
|
%>
<%
@five_jobs
.
each
do
|
val
|
%>
<div
class=
"box_jobs"
>
<div
class=
"box_jobs"
>
<div
class=
"col-sm-12 d-block d-sm-none"
>
<div
class=
"col-sm-12 d-block d-sm-none"
>
<button
type=
"submit"
class=
"btn_favorite_outline xs"
>
<button
type=
"submit"
class=
"btn_favorite_outline xs"
>
<i
class=
"far fa-heart"
></i>
<i
class=
"far fa-heart"
></i>
</button>
</button>
</div>
</div>
<div
class=
"row"
>
<div
class=
"row"
>
<div
class=
"col-sm-10 col-md-9 col-lg-10"
>
<div
class=
"col-sm-10 col-md-9 col-lg-10"
>
...
@@ -31,7 +29,7 @@
...
@@ -31,7 +29,7 @@
<%=
dt
.
join
(
''
).
chomp
(
'| '
)
%>
<%=
dt
.
join
(
''
).
chomp
(
'| '
)
%>
</h5>
</h5>
</div>
</div>
<h5
class=
"box_info_salary"
><i
class=
"fas fa-dollar-sign"
></i>
Lương:
<%=
val
.
salary
%>
</h5>
<h5
class=
"box_info_salary"
><i
class=
"fas fa-dollar-sign"
></i>
Lương:
<%=
val
.
salary
%>
</h5>
<div
class=
"coc"
>
<div
class=
"coc"
>
<h5
class=
"box_info_des"
>
<%=
strip_tags
(
val
.
description
)
%>
</h5>
<h5
class=
"box_info_des"
>
<%=
strip_tags
(
val
.
description
)
%>
</h5>
</div>
</div>
...
...
app/views/shared/_box_nine_city.html.erb
View file @
ad336f02
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
<div
class=
"row no-gutters"
>
<div
class=
"row no-gutters"
>
<%
@top_city
.
each
do
|
k
,
v
|
%>
<%
@top_city
.
each
do
|
k
,
v
|
%>
<div
class=
"col-lg-4 col-md-4 col-sm-6 col-xs-12"
>
<div
class=
"col-lg-4 col-md-4 col-sm-6 col-xs-12"
>
<%=
render
'shared/block_cities_hot'
,
k:
k
,
v
:
v
%>
<%=
render
'shared/block_cities_hot'
,
name_city:
k
,
jobs_count
:
v
%>
</div>
</div>
<%
end
%>
<%
end
%>
<div
class=
"col-lg-4 col-md-4 col-sm-6 col-xs-12"
>
<div
class=
"col-lg-4 col-md-4 col-sm-6 col-xs-12"
>
...
...
lib/src/crawler.rb
View file @
ad336f02
...
@@ -12,7 +12,7 @@ class Clawler
...
@@ -12,7 +12,7 @@ class Clawler
list_cities
.
each
do
|
x
|
list_cities
.
each
do
|
x
|
data_list_cities
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
rstrip
data_list_cities
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
rstrip
end
end
puts
"Save data to database...
\n
"
puts
"Save data to database...
\n
------------------------
"
data_list_cities
.
length
.
times
do
|
i
|
data_list_cities
.
length
.
times
do
|
i
|
area
=
i
>
69
?
0
:
1
area
=
i
>
69
?
0
:
1
name
=
(
data_list_cities
[
i
].
to_s
)
name
=
(
data_list_cities
[
i
].
to_s
)
...
@@ -29,7 +29,7 @@ class Clawler
...
@@ -29,7 +29,7 @@ class Clawler
list_industries
.
each
do
|
x
|
list_industries
.
each
do
|
x
|
data_list_industries
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
strip
data_list_industries
<<
x
.
gsub
(
/(^<[\w\D]*>)/
,
''
).
gsub
(
/\n/
,
''
).
strip
end
end
puts
"Save data to database...
\n
"
puts
"Save data to database...
\n
------------------------
"
data_list_industries
.
length
.
times
do
|
i
|
data_list_industries
.
length
.
times
do
|
i
|
name
=
data_list_industries
[
i
].
to_s
name
=
data_list_industries
[
i
].
to_s
if
name
.
include?
(
'&'
)
if
name
.
include?
(
'&'
)
...
@@ -39,23 +39,27 @@ class Clawler
...
@@ -39,23 +39,27 @@ class Clawler
end
end
end
end
# FILL DATA COMPANIES
# FILL DATA COMPANIES
def
self
.
make_companies
def
self
.
make_companies
Company
.
create!
(
name:
"Bảo mật"
,
# Company.create!(name: "Bảo mật",
address:
"Vui lòng xem trong mô tả công việc"
,
# address: "Vui lòng xem trong mô tả công việc",
short_description:
"Vui lòng xem trong mô tả công việc"
)
# short_description: "Vui lòng xem trong mô tả công việc")
Company
.
find_or_create_by
(
name:
'Bảo mật'
,
address:
'Vui lòng xem trong mô tả công việc'
)
do
|
company
|
company
.
name
=
'Bảo mật'
company
.
address
=
'Vui lòng xem trong mô tả công việc'
company
.
short_description
=
'Vui lòng xem trong mô tả công việc'
end
@data
=
Interface_web
.
craw_data_companies
()
@data
=
Interface_web
.
craw_data_companies
()
puts
'Save info companies to database . . .'
puts
'Save info companies to database . . .'
i
=
0
@data
[
:name
].
each_with_index
do
|
name
,
index
|
@data
[
:name
].
each
do
|
n
|
if
Company
.
find_by
(
name:
name
).
blank?
if
Company
.
find_by
(
name:
n
).
blank?
address
=
@data
[
:address
][
index
]
address
=
@data
[
:address
][
i
]
short_description
=
@data
[
:description
][
index
]
short_description
=
@data
[
:description
][
i
]
Company
.
create!
(
name:
name
,
Company
.
create!
(
name:
n
,
address:
address
,
address:
address
,
short_description:
short_description
)
short_description:
short_description
)
end
end
i
+=
1
end
end
end
end
# FILL DATA JOBS
# FILL DATA JOBS
...
...
lib/src/interface_web.rb
View file @
ad336f02
class
Interface_web
class
Interface_web
# func get "n" link company & job
# func get "n" link company & job
debugger
def
self
.
crawl_link_for_companies_jobs
(
page
)
def
self
.
crawl_link_for_companies_jobs
(
page
)
puts
"Crawling link on page...
\n
PLease wait...
\n
"
puts
"Crawling link on page...
\n
PLease wait...
\n
"
data
=
[]
data
=
[]
...
@@ -20,14 +19,13 @@ class Interface_web
...
@@ -20,14 +19,13 @@ class Interface_web
website_jobs
=
website_jobs
.
join
(
","
)
website_jobs
=
website_jobs
.
join
(
","
)
website_jobs
=
website_jobs
.
split
(
","
)
website_jobs
=
website_jobs
.
split
(
","
)
website_jobs
=
website_jobs
.
select
{
|
val
|
val
!=
''
}
website_jobs
=
website_jobs
.
select
{
|
val
|
val
!=
''
}
puts
"Result:
\n
Company:
#{
website_companies
.
length
}
link
\n
Job :
#{
website_jobs
}
link
"
puts
"Result:
\n
Company:
#{
website_companies
.
length
}
link
\n
Job :
#{
website_jobs
.
length
}
link
\n
------------------------
"
data
<<
website_companies
<<
website_jobs
data
<<
website_companies
<<
website_jobs
end
end
@crawl_link_for_companies_jobs
=
crawl_link_for_companies_jobs
(
15
)
@crawl_link_for_companies_jobs
=
crawl_link_for_companies_jobs
(
1
)
def
self
.
get_link_job_and_companies
def
self
.
get_link_job_and_companies
@crawl_link_for_companies_jobs
||=
crawl_link_for_companies_jobs
(
1
)
@crawl_link_for_companies_jobs
||=
crawl_link_for_companies_jobs
(
1
5
)
end
end
def
self
.
base_link
(
url
)
def
self
.
base_link
(
url
)
...
@@ -36,13 +34,12 @@ class Interface_web
...
@@ -36,13 +34,12 @@ class Interface_web
def
self
.
craw_data_companies
def
self
.
craw_data_companies
link_crawl
=
get_link_job_and_companies
link_crawl
=
get_link_job_and_companies
@
data_companies
=
{}
data_companies
=
{}
@
data_companies_name
=
[]
data_companies_name
=
[]
@
data_companies_address
=
[]
data_companies_address
=
[]
@
data_companies_description
=
[]
data_companies_description
=
[]
puts
'Crawl data companies'
puts
'Crawl data companies'
@current_company
=
0
link_crawl
[
0
].
each_with_index
do
|
url
,
i
|
link_crawl
[
0
].
each
do
|
url
|
page
=
base_link
(
url
)
page
=
base_link
(
url
)
name
=
''
name
=
''
address
=
''
address
=
''
...
@@ -57,40 +54,37 @@ class Interface_web
...
@@ -57,40 +54,37 @@ class Interface_web
desc
=
page
.
search
(
".main-about-us .content"
).
text
desc
=
page
.
search
(
".main-about-us .content"
).
text
end
end
if
(
name
!=
""
&&
address
!=
""
&&
desc
!=
""
)
if
(
name
.
present?
&&
address
.
present?
&&
desc
.
present?
)
@data_companies_name
<<
name
.
to_s
.
strip
data_companies_name
<<
name
.
to_s
.
strip
@data_companies_address
<<
address
.
to_s
.
strip
data_companies_address
<<
address
.
to_s
.
strip
@data_companies_description
<<
desc
data_companies_description
<<
desc
@current_company
+=
1
end
end
puts
"
Crawling
#{
@current_company
}
"
puts
"
Process company
#{
i
+
1
}
. . .
\n
------------------------
"
end
end
@data_companies
[
:name
]
=
@
data_companies_name
data_companies
[
:name
]
=
data_companies_name
@data_companies
[
:address
]
=
@
data_companies_address
data_companies
[
:address
]
=
data_companies_address
@
data_companies_description
.
each
do
|
val
|
data_companies_description
.
each
do
|
val
|
val
.
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
val
.
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
val
.
strip!
val
.
strip!
end
end
@data_companies
[
:description
]
=
@
data_companies_description
data_companies
[
:description
]
=
data_companies_description
@
data_companies
data_companies
end
end
def
self
.
add_data
def
self
.
add_data
(
name
,
company_name
,
city_name
,
created_date
,
expiration_date
,
salary
,
industry_name
,
description
,
level
,
exprience
)
@data
[
:name
]
=
@
name
@data
[
:name
]
=
name
@data
[
:company_name
]
=
@
company_name
@data
[
:company_name
]
=
company_name
@data
[
:city_name
]
=
@
city_name
@data
[
:city_name
]
=
city_name
@data
[
:created_date
]
=
@
created_date
@data
[
:created_date
]
=
created_date
@data
[
:expiration_date
]
=
@
expiration_date
@data
[
:expiration_date
]
=
expiration_date
@data
[
:salary
]
=
@
salary
@data
[
:salary
]
=
salary
@data
[
:industry_name
]
=
@
industry_name
@data
[
:industry_name
]
=
industry_name
@data
[
:description
]
=
@
description
@data
[
:description
]
=
description
@data
[
:level
]
=
@
level
@data
[
:level
]
=
level
@data
[
:exprience
]
=
@
exprience
@data
[
:exprience
]
=
exprience
end
end
def
self
.
crawl_data_jobs_interface_1
(
url
)
def
self
.
crawl_data_jobs_interface_1
(
page
)
page
=
base_link
(
url
)
@name
<<
page
.
search
(
".apply-now-content .job-desc .title"
).
text
@name
<<
page
.
search
(
".apply-now-content .job-desc .title"
).
text
@company_name
<<
page
.
search
(
".apply-now-content .job-desc .job-company-name"
).
text
@company_name
<<
page
.
search
(
".apply-now-content .job-desc .job-company-name"
).
text
...
@@ -109,31 +103,30 @@ class Interface_web
...
@@ -109,31 +103,30 @@ class Interface_web
@salary
<<
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
1
].
text
@salary
<<
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
1
].
text
industr
y_name
=
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a"
).
text
industr
ies
=
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(2) a"
).
text
industr
y_name
=
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
industr
ies
=
industries
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
@industry_name
<<
industr
y_name
.
join
(
','
)
@industry_name
<<
industr
ies
.
join
(
','
)
@description
<<
page
.
search
(
".tabs .tab-content .detail-row:nth-child(n)"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@description
<<
page
.
search
(
".tabs .tab-content .detail-row:nth-child(n)"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
get_level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(3)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
get_level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(3)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
get_level
=
get_level
[
1
].
to_s
.
strip
get_level
=
get_level
[
1
].
to_s
.
strip
if
get_level
==
""
if
get_level
==
""
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
g_
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
@level
<<
level
[
1
].
to_s
.
strip
@level
<<
g_
level
[
1
].
to_s
.
strip
else
else
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(3)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
g_
level
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(3)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
lstrip
.
split
(
'Cấp bậc'
)
@level
<<
level
[
1
].
to_s
.
strip
@level
<<
g_
level
[
1
].
to_s
.
strip
end
end
exp
rience
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
)
exp
=
page
.
search
(
".item-blue .detail-box:last ul li:nth-child(2)"
).
text
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
)
exp
rience
=
exprience
[
1
].
to_s
.
strip
exp
=
exp
[
1
].
to_s
.
strip
@exprience
<<
exp
rience
@exprience
<<
exp
add_data
()
add_data
(
@name
,
@company_name
,
@city_name
,
@created_date
,
@expiration_date
,
@salary
,
@industry_name
,
@description
,
@level
,
@exprience
)
end
end
def
self
.
crawl_data_jobs_interface_2
(
url
)
def
self
.
crawl_data_jobs_interface_2
(
page
)
page
=
base_link
(
url
)
@name
<<
page
.
search
(
".apply-now-content .job-desc .title"
).
text
@name
<<
page
.
search
(
".apply-now-content .job-desc .title"
).
text
...
@@ -169,11 +162,6 @@ class Interface_web
...
@@ -169,11 +162,6 @@ class Interface_web
@level
<<
lv
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
strip
.
split
(
'Cấp bậc'
).
last
.
strip
@level
<<
lv
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
strip
.
split
(
'Cấp bậc'
).
last
.
strip
end
end
# if exp == ""
# @exprience << ""
# else
# @exprience << exp.delete!("[\n,\t,\r]").split('Kinh nghiệm').last.strip
# end
exp
=
page
.
search
(
".info li:nth-child(6)"
).
text
exp
=
page
.
search
(
".info li:nth-child(6)"
).
text
if
exp
.
blank?
if
exp
.
blank?
@exprience
<<
""
@exprience
<<
""
...
@@ -181,68 +169,11 @@ class Interface_web
...
@@ -181,68 +169,11 @@ class Interface_web
@exprience
<<
exp
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
).
last
.
strip
@exprience
<<
exp
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
'Kinh nghiệm'
).
last
.
strip
end
end
add_data
()
add_data
(
@name
,
@company_name
,
@city_name
,
@created_date
,
@expiration_date
,
@salary
,
@industry_name
,
@description
,
@level
,
@exprience
)
end
def
self
.
crawl_data_jobs_interface_3
(
url
)
page
=
base_link
(
url
)
@name
<<
page
.
search
(
".intro_job h1"
).
text
@company_name
<<
page
.
search
(
".info-company .text-job h2"
).
text
@city_name
<<
page
.
search
(
".DetailJobNew ul li:nth-child(1) a"
).
text
@created_date
<<
""
@expiration_date
<<
page
.
search
(
".DetailJobNew .info ul li:nth-child(3) p"
).
text
.
strip
@salary
<<
page
.
search
(
".DetailJobNew .salary ul li:nth-child(3) p"
).
text
.
strip
@industry_name
<<
page
.
search
(
".DetailJobNew .salary ul li:nth-child(2) p"
).
text
.
strip
@description
<<
page
.
search
(
".content_job .detail-row"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@level
<<
page
.
search
(
".DetailJobNew .info ul li:nth-child(2) p"
).
text
.
strip
@exprience
<<
page
.
search
(
".DetailJobNew .info ul li:nth-child(1) p"
).
text
.
strip
add_data
()
end
def
self
.
crawl_data_jobs_interface_4
(
url
)
page
=
base_link
(
url
)
@name
<<
page
.
search
(
".info-company h1"
).
text
if
page
.
search
(
".zone-company .text-job h2"
).
text
==
""
@company_name
<<
page
.
search
(
".info-company .text-job h2"
).
text
industry_name
=
page
.
search
(
".DetailJobNew li:nth-child(3) span"
).
text
.
strip
@industry_name
<<
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
else
@company_name
<<
page
.
search
(
".zone-company .text-job h2"
).
text
.
strip
industry_name
=
page
.
search
(
".DetailJobNew li:nth-child(3) span a"
).
text
@industry_name
<<
industry_name
.
delete!
(
"[
\n
,
\t
,
\r
]"
).
split
(
' '
).
select
{
|
v
|
v
!=
''
}
end
@city_name
<<
page
.
search
(
".DetailJobNew ul li:nth-child(1) a"
).
text
@created_date
<<
""
@expiration_date
<<
page
.
search
(
".DetailJobNew li:nth-child(7) span"
).
text
@salary
<<
page
.
search
(
".DetailJobNew li:nth-child(6) span"
).
text
@description
<<
page
.
search
(
".left-col"
).
to_s
.
delete!
(
"[
\n
,
\t
,
\r
]"
)
@level
<<
page
.
search
(
".DetailJobNew ul li:nth-child(2) span"
).
text
@exprience
<<
""
add_data
()
end
end
def
self
.
crawl_data_jobs_interface_5
(
url
)
def
self
.
crawl_data_jobs_interface_5
(
page
)
page
=
base_link
(
url
)
#
page = base_link(url)
@name
<<
page
.
search
(
".info-company h1"
).
text
@name
<<
page
.
search
(
".info-company h1"
).
text
@company_name
<<
page
.
search
(
".info-company .text-job h2"
).
text
@company_name
<<
page
.
search
(
".info-company .text-job h2"
).
text
...
@@ -263,7 +194,7 @@ class Interface_web
...
@@ -263,7 +194,7 @@ class Interface_web
@exprience
<<
page
.
search
(
".DetailJobNew li:nth-child(5) span"
).
text
.
strip
@exprience
<<
page
.
search
(
".DetailJobNew li:nth-child(5) span"
).
text
.
strip
add_data
()
add_data
(
@name
,
@company_name
,
@city_name
,
@created_date
,
@expiration_date
,
@salary
,
@industry_name
,
@description
,
@level
,
@exprience
)
end
end
def
self
.
make_data
def
self
.
make_data
...
@@ -281,24 +212,25 @@ class Interface_web
...
@@ -281,24 +212,25 @@ class Interface_web
@city_name
=
[]
@city_name
=
[]
link_crawl
=
get_link_job_and_companies
link_crawl
=
get_link_job_and_companies
total_jobs
=
link_crawl
[
1
].
length
link_crawl
[
1
].
each_with_index
do
|
path
,
i
|
current_job
=
1
link_crawl
[
1
].
each
do
|
path
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/
#{
path
}
"
))))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
"https://careerbuilder.vn/
#{
path
}
"
))))
if
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
0
]
!=
nil
if
page
.
search
(
".item-blue .detail-box:nth-child(1) ul li:nth-child(1) p"
)[
0
]
!=
nil
crawl_data_jobs_interface_1
(
pa
th
)
crawl_data_jobs_interface_1
(
pa
ge
)
elsif
page
.
search
(
"section .template-200"
).
text
!=
""
elsif
page
.
search
(
"section .template-200"
).
text
!=
""
crawl_data_jobs_interface_2
(
pa
th
)
crawl_data_jobs_interface_2
(
pa
ge
)
elsif
(
page
.
search
(
".DetailJobNew ul li"
).
size
==
10
&&
!
page
.
search
(
'.right-col ul li'
).
text
.
include?
(
'Độ tuổi'
))
elsif
(
page
.
search
(
".DetailJobNew ul li"
).
size
==
10
&&
!
page
.
search
(
'.right-col ul li'
).
text
.
include?
(
'Độ tuổi'
))
crawl_data_jobs_interface_5
(
pa
th
)
crawl_data_jobs_interface_5
(
pa
ge
)
end
end
puts
"Process:
#{
current_job
}
/
#{
total_jobs
}
"
puts
"Process:
#{
i
+
1
}
/
#{
link_crawl
[
1
].
length
}
"
current_job
+=
1
end
end
@data
@data
end
end
end
end
# else # insert "page.search(".DetailJobNew ul li").size == 8" (if want catch interface 4)
# else # insert "page.search(".DetailJobNew ul li").size == 8" (if want catch interface 4)
# crawl_data_jobs_interface_3(path)
# crawl_data_jobs_interface_3(path)
\ No newline at end of file
lib/tasks/crawler.rake
View file @
ad336f02
require
'src/crawler'
require
'src/crawler'
namespace
:db
do
namespace
:db
do
task
populate: :environment
do
task
populate: :environment
do
#
Clawler.make_industries
Clawler
.
make_industries
#
Clawler.make_cities
Clawler
.
make_cities
Clawler
.
make_companies
Clawler
.
make_companies
#
Clawler.make_jobs
Clawler
.
make_jobs
end
end
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment