Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tô Ngọc Ánh
VeNJob
Commits
c99884c7
Commit
c99884c7
authored
Jul 28, 2020
by
Tô Ngọc Ánh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separating def into class
parent
ee6a23cd
Pipeline
#721
canceled with stages
in 0 seconds
Changes
4
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
72 additions
and
46 deletions
+72
-46
lib/common/crawler.rb
+18
-18
lib/common/csv.rb
+12
-28
lib/common/extract_zip.rb
+14
-0
lib/tasks/import_data.rake
+28
-0
No files found.
lib/
tasks/crawler.rake
→
lib/
common/crawler.rb
View file @
c99884c7
require
'open-uri'
require
'open-uri'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
Crawler
def
initialize
(
logger
)
@logger
=
logger
end
namespace
:crawl
do
def
crawl_data
(
page
,
base_link
)
desc
'crawl industries locations jobs'
task
:crawl_industries_locations_jobs
,
%i[page link]
=>
[
:environment
]
do
|
_
,
args
|
args
.
with_defaults
(
link:
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
)
crawl_industries_locations
crawl_industries_locations
job_links
=
get_job_links
(
args
[
:page
].
to_i
,
args
[
:link
]
)
job_links
=
get_job_links
(
page
,
base_link
)
job_links
.
each
do
|
link
|
job_links
.
each
do
|
link
|
next
if
link
.
empty?
next
if
link
.
empty?
crawl_job
(
link
)
crawl_job
(
link
)
end
end
end
end
end
def
get_job_links
(
page
,
link
)
def
get_job_links
(
page
,
link
)
job_links
=
[]
job_links
=
[]
page
.
times
do
page
.
times
do
document
=
Nokogiri
::
HTML
(
URI
.
open
(
link
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
link
))
...
@@ -28,9 +27,9 @@ def get_job_links(page, link)
...
@@ -28,9 +27,9 @@ def get_job_links(page, link)
link
=
next_page
[
:href
]
link
=
next_page
[
:href
]
end
end
job_links
job_links
end
end
def
crawl_company
(
company_link
)
def
crawl_company
(
company_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
company_link
))
# fix error: uri must be ascii only
uri
=
URI
.
parse
(
URI
.
escape
(
company_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
company_name
=
document
.
css
(
'.content .name'
).
text
company_name
=
document
.
css
(
'.content .name'
).
text
...
@@ -43,11 +42,11 @@ def crawl_company(company_link)
...
@@ -43,11 +42,11 @@ def crawl_company(company_link)
company
.
address
=
company_address
company
.
address
=
company_address
company
.
description
=
company_description
company
.
description
=
company_description
end
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Company link:
#{
uri
}
"
@logger
.
error
"
#{
e
.
message
}
- Company link:
#{
uri
}
"
end
end
def
crawl_job
(
job_link
)
def
crawl_job
(
job_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
job_link
))
# fix error: uri must be ascii only
uri
=
URI
.
parse
(
URI
.
escape
(
job_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
job_title
=
document
.
at_css
(
'.job-desc p.title'
).
text
job_title
=
document
.
at_css
(
'.job-desc p.title'
).
text
...
@@ -66,7 +65,7 @@ def crawl_job(job_link)
...
@@ -66,7 +65,7 @@ def crawl_job(job_link)
job_salary
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_salary
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_level
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_level
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_experience
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_experience
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_expiration
_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_exp
_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_description
=
document
.
css
(
'.job-detail-content .detail-row'
).
to_s
job_description
=
document
.
css
(
'.job-detail-content .detail-row'
).
to_s
...
@@ -75,16 +74,16 @@ def crawl_job(job_link)
...
@@ -75,16 +74,16 @@ def crawl_job(job_link)
level:
job_level
,
level:
job_level
,
experience:
job_experience
,
experience:
job_experience
,
salary:
job_salary
,
salary:
job_salary
,
expiration_date:
job_expiration
_date
)
do
|
job
|
expiration_date:
job_exp
_date
)
do
|
job
|
job
.
description
=
job_description
job
.
description
=
job_description
job
.
industries
<<
job_industries
job
.
industries
<<
job_industries
job
.
locations
<<
job_locations
job
.
locations
<<
job_locations
end
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Job link:
#{
uri
}
"
@logger
.
error
"
#{
e
.
message
}
- Job link:
#{
uri
}
"
end
end
def
crawl_industries_locations
def
crawl_industries_locations
document
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
industries
=
document
.
css
(
'#industry option'
).
map
(
&
:text
)
industries
=
document
.
css
(
'#industry option'
).
map
(
&
:text
)
locations
=
document
.
css
(
'#location option'
).
map
(
&
:text
)
locations
=
document
.
css
(
'#location option'
).
map
(
&
:text
)
...
@@ -104,4 +103,5 @@ def crawl_industries_locations
...
@@ -104,4 +103,5 @@ def crawl_industries_locations
location
.
oversea
=
true
location
.
oversea
=
true
end
end
end
end
end
end
end
lib/
tasks/csv_import.rake
→
lib/
common/csv.rb
View file @
c99884c7
require
'csv'
require
'csv'
require
'zip'
require
'./lib/common/extract_zip'
require_relative
'../common/ftp'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
CsvImport
include
ExtractZip
namespace
:csv_import
do
def
initialize
(
logger
)
desc
'Download csv file from FTP and import'
@logger
=
logger
task
csv: :environment
do
destination_dir
=
'./lib/data'
ftp
=
Ftp
.
new
(
'192.168.1.156'
,
'training'
,
'training'
)
ftp
.
download_file
(
'jobs.zip'
,
destination_dir
)
ftp
.
close
extract_zip
(
"
#{
destination_dir
}
/jobs.zip"
,
destination_dir
)
import_job
(
destination_dir
)
end
end
end
def
extract_zip
(
file
,
destination
)
FileUtils
.
mkdir_p
(
destination
)
Zip
::
File
.
open
(
file
)
do
|
zip_file
|
zip_file
.
each
do
|
f
|
fpath
=
File
.
join
(
destination
,
f
.
name
)
zip_file
.
extract
(
f
,
fpath
)
unless
File
.
exist?
(
fpath
)
end
end
end
def
import_job
(
direction
)
def
import_job
(
direction
)
index
=
0
index
=
0
CSV
.
foreach
(
"
#{
direction
}
/jobs.csv"
,
headers:
true
)
do
|
row
|
CSV
.
foreach
(
"
#{
direction
}
/jobs.csv"
,
headers:
true
)
do
|
row
|
index
+=
1
index
+=
1
...
@@ -57,11 +38,14 @@ def import_job(direction)
...
@@ -57,11 +38,14 @@ def import_job(direction)
end
end
puts
title
puts
title
end
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
puts
e
puts
e
@logger
.
error
"Job
#{
index
}
:
#{
e
.
message
}
"
@logger
.
error
"Job
#{
index
}
:
#{
e
.
message
}
"
end
end
def
integer?
(
str
)
private
def
integer?
(
str
)
str
.
to_i
.
to_s
==
str
str
.
to_i
.
to_s
==
str
end
end
end
lib/common/extract_zip.rb
0 → 100644
View file @
c99884c7
require
'zip'
# Mixin that extracts a zip archive into a destination directory.
# Relies on the rubyzip gem (Zip::File) being required by the including file.
module ExtractZip
  # Extract every entry of the archive at +file+ into +destination+.
  #
  # * Creates +destination+ (and parents) if it does not exist.
  # * Skips entries whose target path already exists (same behaviour as
  #   the original implementation — existing files are never overwritten).
  # * Rejects entries whose name would resolve outside +destination+
  #   (zip-slip / path-traversal protection), raising ArgumentError.
  #
  # file        - String path to the .zip archive.
  # destination - String path of the directory to extract into.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    base = File.expand_path(destination)

    Zip::File.open(file) do |zip_file|
      zip_file.each do |entry|
        target = File.expand_path(File.join(destination, entry.name))
        # Guard against malicious archives with "../" in entry names:
        # the resolved target must stay inside the destination directory.
        unless target == base || target.start_with?(base + File::SEPARATOR)
          raise ArgumentError, "zip entry #{entry.name} escapes #{destination}"
        end

        zip_file.extract(entry, target) unless File.exist?(target)
      end
    end
  end
end
lib/tasks/import_data.rake
0 → 100644
View file @
c99884c7
require
'./lib/common/ftp'
require
'./lib/common/csv'
require
'./lib/common/crawler'
# Rake tasks for importing job data, either by crawling CareerBuilder
# or by downloading a zipped CSV over FTP and importing it.
namespace :import_data do
  # Shared logger for every task in this namespace.
  # NOTE: plain assignment — the original `logger ||= ...` was a no-op
  # conditional on a fresh local that is always nil at this point.
  logger = Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page link] => [:environment] do |_, args|
    # Default to one page of the all-jobs listing when no args are given.
    args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'

    # Fetch the archive from the internal FTP server.
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    ftp.download_file('jobs.zip', destination_dir)
    ftp.close

    # Unpack the archive and import each CSV row.
    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment