Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
venjob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Huỳnh Thiên Phước
venjob
Commits
6ab472c3
Commit
6ab472c3
authored
Jul 28, 2020
by
Huỳnh Thiên Phước
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix code
parent
5204717b
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
67 additions
and
71 deletions
+67
-71
app/views/top_pages/show.html.erb
+2
-2
lib/src/crawler.rb
+40
-39
lib/src/crontab.rb
+21
-26
lib/tasks/crawler_import.rake
+4
-4
No files found.
app/views/top_pages/show.html.erb
View file @
6ab472c3
...
...
@@ -2,8 +2,8 @@
<%
@job
.
each
do
|
job
|
%>
<%
if
!
job
.
cities
.
blank?
%>
<ul>
<div
class=
"title"
><strong>
<%=
(
@company
.
find_by
(
id:
job
.
company_id
)).
nam
e
%>
</strong></div>
<
%=
job
.
title
%
>
<div
class=
"title"
><strong>
<%=
job
.
titl
e
%>
</strong></div>
<
div>
<%=
(
@company
.
find_by
(
id:
job
.
company_id
)).
name
%>
</div
>
<div
class=
"salary"
><i
class=
"fas fa-dollar-sign"
></i>
Lương:
<%=
job
.
salary
%>
</div>
<div><i
class=
"fas fa-map-marker"
></i>
<%
job
.
cities
.
each
do
|
location
|
%>
...
...
lib/src/crawler.rb
View file @
6ab472c3
class
Crawler
class
Crawler
def
crawl_city
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
get_name
=
page
.
search
(
'select#location'
)
...
...
@@ -14,15 +14,17 @@
end
end
end
def
crawl_industry
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
get_name
=
page
.
search
(
'select#industry'
)
data_industry
=
get_name
.
search
(
'option'
).
map
{
|
p
|
p
.
text
.
strip
}
data_industry
=
get_name
.
search
(
'option'
).
map
{
|
p
|
p
.
text
.
strip
}
data_industry
.
each
do
|
name_industry
|
industry
=
Industry
.
create!
(
name:
name_industry
)
end
end
def
crawl_company
for
n
in
1
..
10
company_info
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
n
}
-vi.html"
))
...
...
@@ -53,29 +55,28 @@
end
end
end
def
crawl_job_relationships
for
n
in
1
..
10
page_access
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
n
}
-vi.html"
))
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
get_link
.
each
do
|
link
|
if
link
.
include?
(
'\u2013'
)
link
.
gsub!
(
'\u2013'
,
'–'
)
end
page_job
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
get_row
=
page_job
.
search
(
'div.bg-blue div.row'
)
if
get_row
!=
""
get_name_company
=
page_job
.
search
(
'div.job-desc a.job-company-name'
).
text
.
strip
company_table
=
Company
.
find_by
(
name:
"
#{
get_name_company
}
"
)
company_table
=
Company
.
find_by
(
name:
get_name_company
)
title_job
=
page_job
.
search
(
'div.job-desc p'
).
text
description
=
page_job
.
search
(
'div.detail-row'
)
arr_column
=
get_row
.
css
(
'div.has-background'
).
map
{
|
data
|
data
.
text
.
split
(
' '
).
join
(
' '
)
}
arr_column
.
each_with_index
do
|
val
,
key
|
if
!
company_table
.
nil?
arr_column
=
get_row
.
css
(
'div.has-background'
).
map
{
|
data
|
data
.
text
.
split
(
' '
).
join
(
' '
)
}
arr_column
.
each_with_index
do
|
val
,
key
|
unless
company_table
.
nil?
job_check
=
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
if
val
.
include?
(
'Ngày cập nhật'
)
arr_data
=
val
.
gsub
(
'Ngày cập nhật '
,
''
).
split
(
' '
)
arr_data
=
val
.
gsub
(
'Ngày cập nhật '
,
''
).
split
(
' '
)
date
=
arr_data
.
first
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
true
&&
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
==
nil
arr_sub
=
((((
val
.
gsub
(
'Lương '
,
''
)).
gsub
(
' Kinh nghiệm '
,
'*'
)).
gsub
(
' Cấp bậc '
,
'*'
)).
gsub
(
' Hết hạn nộp '
,
'*'
)
).
split
(
'*'
)
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
true
&&
job_check
.
nil?
arr_sub
=
val
.
gsub
(
'Lương '
,
''
).
gsub
(
' Kinh nghiệm '
,
'*'
).
gsub
(
' Cấp bậc '
,
'*'
).
gsub
(
' Hết hạn nộp '
,
'*'
).
split
(
'*'
)
salary
=
arr_sub
[
0
]
experience
=
arr_sub
[
1
]
level
=
arr_sub
[
2
]
...
...
@@ -87,8 +88,8 @@
expiration_date:
expiration_date
,
description:
description
,
company_id:
company_table
.
id
)
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
false
&&
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
==
nil
arr_sub
=
(((
val
.
gsub
(
'Lương '
,
''
)).
gsub
(
' Cấp bậc '
,
'*'
)).
gsub
(
' Hết hạn nộp '
,
'*'
)
).
split
(
'*'
)
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
false
&&
job_check
.
nil?
arr_sub
=
val
.
gsub
(
'Lương '
,
''
).
gsub
(
' Cấp bậc '
,
'*'
).
gsub
(
' Hết hạn nộp '
,
'*'
).
split
(
'*'
)
salary
=
arr_sub
[
0
]
level
=
arr_sub
[
1
]
expiration_date
=
arr_sub
[
2
]
...
...
@@ -101,21 +102,21 @@
company_id:
company_table
.
id
)
end
end
if
!
company_table
.
nil?
next
if
!
company_table
.
nil?
job_table
=
Job
.
find_by
(
title:
title_job
)
if
!
job_table
.
nil?
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
unless
job_table
.
nil?
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
location_rel
.
each
do
|
loc
|
city_table
=
City
.
find_by
(
name:
"
#{
loc
}
"
)
if
CityJob
.
find_by
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
)
==
nil
city_table
=
City
.
find_by
(
name:
loc
)
if
CityJob
.
find_by
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
).
nil?
puts
"Created City:
#{
job_table
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
city_jobs
=
CityJob
.
create!
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
)
end
end
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
industry_rel
.
each
do
|
ind
|
industry_table
=
Industry
.
find_by
(
name:
"
#{
ind
}
"
)
if
IndustryJob
.
find_by
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
)
==
nil
industry_table
=
Industry
.
find_by
(
name:
ind
)
if
IndustryJob
.
find_by
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
).
nil?
puts
"Created Industry:
#{
job_table
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
industry_jobs
=
IndustryJob
.
create!
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
)
end
...
...
@@ -126,7 +127,6 @@
end
end
end
end
def
get_file_csv
Net
::
FTP
.
open
(
'192.168.1.156'
,
'training'
,
'training'
)
do
|
ftp
|
...
...
@@ -151,40 +151,40 @@
file
=
"jobs.csv"
CSV
.
foreach
(
file
,
headers:
true
)
do
|
row
|
begin
company_name
=
row
[
"company name"
].
strip
company_name
=
row
[
"company name"
]
company_address
=
row
[
"company address"
]
company_introduction
=
row
[
"benefit"
]
company_table
=
Company
.
find_by
(
name:
"
#{
company_name
}
"
)
if
company_table
==
nil
company_introduction
=
row
[
:benefit
]
company_table
=
Company
.
find_by
(
name:
company_name
)
if
company_table
.
nil?
company_table
=
Company
.
create!
(
name:
company_name
,
address:
company_address
,
introduction:
company_introduction
)
end
title_job
=
row
[
"name"
].
strip
description_job
=
row
[
"description"
]
level
=
row
[
"level"
]
salary
=
row
[
"salary"
]
if
company_table
!=
nil
&&
Job
.
find_by
(
title:
title_job
,
level:
level
,
salary:
salary
,
company_id:
company_table
.
id
)
==
nil
title_job
=
row
[
:name
]
description_job
=
row
[
:description
]
level
=
row
[
:level
]
salary
=
row
[
:salary
]
unless
company_table
.
nil?
job_table
=
Job
.
create!
(
title:
title_job
,
description:
description_job
,
level:
level
,
salary:
salary
,
company_id:
company_table
.
id
)
puts
job_table
.
id
end
industry
=
row
[
"category"
].
strip
industry
=
row
[
:category
]
industry_find
=
Industry
.
find_by
(
name:
industry
)
if
industry_find
==
nil
if
industry_find
.
nil?
industry_table
=
Industry
.
create!
(
name:
industry
)
industry_job_table
=
IndustryJob
.
create!
(
job_id:
job_table
.
id
,
industry_id:
industry_find
.
id
)
else
industry_job_table
=
IndustryJob
.
create!
(
job_id:
job_table
.
id
,
industry_id:
industry_find
.
id
)
end
puts
"========================================="
puts
job_table
.
id
,
title_job
,
industry
,
salary
location_data
=
row
[
"work place"
].
strip
location
=
(
location_data
.
gsub
(
'["'
,
''
)).
gsub
(
'"]'
,
''
).
strip
location_data
=
row
[
"work place"
]
location
=
location_data
.
gsub
(
'["'
,
''
).
gsub
(
'"]'
,
''
)
location_find
=
City
.
find_by
(
name:
location
)
if
location_find
==
nil
if
location_find
.
nil?
city_table
=
City
.
create!
(
name:
location
)
city_job_table
=
CityJob
.
create!
(
job_id:
job_table
.
id
,
city_id:
location_find
.
id
)
else
...
...
@@ -196,6 +196,7 @@
end
end
end
def
logger
# config.log_level = :info
Rails
.
logger
=
Logger
.
new
(
STDOUT
)
...
...
lib/src/crontab.rb
View file @
6ab472c3
class
Crontab
def
find_company
company_info
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html"
))
company_link
=
company_info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
company_link
=
company_info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
company_link
.
each
do
|
link
|
if
link
.
include?
(
'\u2019'
)
link
.
gsub!
(
'\u2019'
,
"'"
)
end
next
if
link
==
'javascript:void(0);'
if
link
!=
'https://careerbuilder.vn/vi/nha-tuyen-dung/hr-vietnam\xE2\x80\x99s-ess-client.35A4EFBA.html'
company_page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
if
!
(
company_page
.
search
(
'p.name'
).
text
).
nil?
unless
(
company_page
.
search
(
'p.name'
).
text
).
nil?
begin
name_company
=
company_page
.
search
(
'p.name'
).
text
address_company
=
company_page
.
css
(
'div.content p'
).
children
[
1
].
text
...
...
@@ -29,11 +26,8 @@ class Crontab
end
def
find_job
page_access
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
get_link
.
each
do
|
link
|
if
link
.
include?
(
'\u2013'
)
link
.
gsub!
(
'\u2013'
,
'–'
)
end
page_job
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
get_row
=
page_job
.
search
(
'div.bg-blue div.row'
)
if
get_row
!=
""
...
...
@@ -41,15 +35,16 @@ class Crontab
company_table
=
Company
.
find_by
(
name:
get_name_company
)
title_job
=
page_job
.
search
(
'div.job-desc p'
).
text
description
=
page_job
.
search
(
'div.detail-row'
)
arr_column
=
get_row
.
css
(
'div.has-background'
).
map
{
|
data
|
data
.
text
.
split
(
' '
).
join
(
' '
)
}
arr_column
=
get_row
.
css
(
'div.has-background'
).
map
{
|
data
|
data
.
text
.
split
(
' '
).
join
(
' '
)
}
job_table
=
Job
.
find_by
(
title:
title_job
)
arr_column
.
each_with_index
do
|
val
,
key
|
if
!
company_table
.
nil?
arr_column
.
each
do
|
val
|
unless
company_table
.
nil?
job_check
=
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
if
val
.
include?
(
'Ngày cập nhật'
)
arr_data
=
val
.
gsub
(
'Ngày cập nhật '
,
''
).
split
(
' '
)
date
=
arr_data
.
first
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
true
&&
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
==
nil
arr_sub
=
((((
val
.
gsub
(
'Lương '
,
''
)).
gsub
(
' Kinh nghiệm '
,
'*'
)).
gsub
(
' Cấp bậc '
,
'*'
)).
gsub
(
' Hết hạn nộp '
,
'*'
)
).
split
(
'*'
)
arr_data
=
val
.
gsub
(
'Ngày cập nhật '
,
''
).
split
(
' '
)
date
_update
=
arr_data
.
first
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
true
&&
job_check
.
nil?
arr_sub
=
val
.
gsub
(
'Lương '
,
''
).
gsub
(
' Kinh nghiệm '
,
'*'
).
gsub
(
' Cấp bậc '
,
'*'
).
gsub
(
' Hết hạn nộp '
,
'*'
).
split
(
'*'
)
salary
=
arr_sub
[
0
]
experience
=
arr_sub
[
1
]
level
=
arr_sub
[
2
]
...
...
@@ -61,8 +56,8 @@ class Crontab
expiration_date:
expiration_date
,
description:
description
,
company_id:
company_table
.
id
)
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
false
&&
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
==
nil
arr_sub
=
(((
val
.
gsub
(
'Lương '
,
''
)).
gsub
(
' Cấp bậc '
,
'*'
)).
gsub
(
' Hết hạn nộp '
,
'*'
)
).
split
(
'*'
)
elsif
val
.
include?
(
'Lương'
)
&&
val
.
include?
(
'Kinh nghiệm'
)
==
false
&&
job_check
.
nil?
arr_sub
=
val
.
gsub
(
'Lương '
,
''
).
gsub
(
' Cấp bậc '
,
'*'
).
gsub
(
' Hết hạn nộp '
,
'*'
).
split
(
'*'
)
salary
=
arr_sub
[
0
]
level
=
arr_sub
[
1
]
expiration_date
=
arr_sub
[
2
]
...
...
@@ -77,19 +72,19 @@ class Crontab
end
end
if
!
job_table
.
nil?
&&
!
company_table
.
nil?
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
location_rel
.
each
do
|
loc
|
city_table
=
City
.
find_by
(
name:
"
#{
loc
}
"
)
if
CityJob
.
find_by
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
)
==
nil
puts
"Created
#{
job_table
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
city_table
=
City
.
find_by
(
name:
loc
)
if
CityJob
.
find_by
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
)
.
nil?
puts
"Created City
#{
city_table
.
id
}
=>
#{
loc
}
"
city_jobs
=
CityJob
.
create!
(
job_id:
job_table
.
id
,
city_id:
city_table
.
id
)
end
end
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
industry_rel
.
each
do
|
ind
|
industry_table
=
Industry
.
find_by
(
name:
"
#{
ind
}
"
)
if
IndustryJob
.
find_by
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
)
==
nil
puts
"
#{
job_table
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
industry_table
=
Industry
.
find_by
(
name:
ind
)
if
IndustryJob
.
find_by
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
)
.
nil?
puts
"Created Industry
#{
job_table
.
id
}
-
#{
industry_table
.
id
}
=>
#{
ind
}
"
industry_jobs
=
IndustryJob
.
create!
(
job_id:
job_table
.
id
,
industry_id:
industry_table
.
id
)
end
end
...
...
lib/tasks/crawler_import.rake
View file @
6ab472c3
...
...
@@ -6,23 +6,23 @@ require 'zip'
action
=
Crawler
.
new
crontab
=
Crontab
.
new
namespace
:import
do
desc
"crawler data"
desc
'crawler data'
task
crawler: :environment
do
action
.
crawl_city
action
.
crawl_industry
action
.
crawl_company
action
.
crawl_job_relationships
end
desc
"get file CSV from server"
desc
'get file CSV from Server'
task
csv_get: :environment
do
action
.
get_file_csv
action
.
extract_zip
(
'./jobs.zip'
,
'.'
)
end
desc
"Import data from CSV"
desc
'Import data from CSV'
task
data_csv: :environment
do
action
.
import_file_csv
end
desc
"Crontab"
desc
'Crontab'
task
auto: :environment
do
crontab
.
find_company
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment