Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tô Ngọc Ánh
VeNJob
Commits
c99884c7
Commit
c99884c7
authored
Jul 28, 2020
by
Tô Ngọc Ánh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Separating method definitions into classes
parent
ee6a23cd
Pipeline
#721
canceled with stages
in 0 seconds
Changes
4
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
72 additions
and
46 deletions
+72
-46
lib/common/crawler.rb
+18
-18
lib/common/csv.rb
+12
-28
lib/common/extract_zip.rb
+14
-0
lib/tasks/import_data.rake
+28
-0
No files found.
lib/
tasks/crawler.rake
→
lib/
common/crawler.rb
View file @
c99884c7
require
'open-uri'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
Crawler
# Stores the injected logger so the crawl_* methods can report errors.
#
# logger - a Logger instance (e.g. Logger.new("./log/import_data.log")).
def initialize(logger)
  @logger = logger
end
namespace
:crawl
do
desc
'crawl industries locations jobs'
task
:crawl_industries_locations_jobs
,
%i[page link]
=>
[
:environment
]
do
|
_
,
args
|
args
.
with_defaults
(
link:
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
)
# Entry point for the crawl: refreshes the industry/location lists, then
# visits every job link collected from +page+ listing pages starting at
# +base_link+, skipping blank links.
#
# page      - Integer number of listing pages to walk.
# base_link - String URL of the first listing page.
def crawl_data(page, base_link)
  crawl_industries_locations
  job_links = get_job_links(page, base_link)
  job_links.each do |link|
    next if link.empty?

    crawl_job(link)
  end
end
def
get_job_links
(
page
,
link
)
def
get_job_links
(
page
,
link
)
job_links
=
[]
page
.
times
do
document
=
Nokogiri
::
HTML
(
URI
.
open
(
link
))
...
...
@@ -28,9 +27,9 @@ def get_job_links(page, link)
link
=
next_page
[
:href
]
end
job_links
end
end
def
crawl_company
(
company_link
)
def
crawl_company
(
company_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
company_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
company_name
=
document
.
css
(
'.content .name'
).
text
...
...
@@ -43,11 +42,11 @@ def crawl_company(company_link)
company
.
address
=
company_address
company
.
description
=
company_description
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Company link:
#{
uri
}
"
end
end
def
crawl_job
(
job_link
)
def
crawl_job
(
job_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
job_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
job_title
=
document
.
at_css
(
'.job-desc p.title'
).
text
...
...
@@ -66,7 +65,7 @@ def crawl_job(job_link)
job_salary
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_level
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_experience
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_expiration
_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_exp
_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_description
=
document
.
css
(
'.job-detail-content .detail-row'
).
to_s
...
...
@@ -75,16 +74,16 @@ def crawl_job(job_link)
level:
job_level
,
experience:
job_experience
,
salary:
job_salary
,
expiration_date:
job_expiration
_date
)
do
|
job
|
expiration_date:
job_exp
_date
)
do
|
job
|
job
.
description
=
job_description
job
.
industries
<<
job_industries
job
.
locations
<<
job_locations
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Job link:
#{
uri
}
"
end
end
def
crawl_industries_locations
def
crawl_industries_locations
document
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
industries
=
document
.
css
(
'#industry option'
).
map
(
&
:text
)
locations
=
document
.
css
(
'#location option'
).
map
(
&
:text
)
...
...
@@ -104,4 +103,5 @@ def crawl_industries_locations
location
.
oversea
=
true
end
end
end
end
lib/
tasks/csv_import.rake
→
lib/
common/csv.rb
View file @
c99884c7
require
'csv'
require
'zip'
require_relative
'../common/ftp'
require
'./lib/common/extract_zip'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
CsvImport
include
ExtractZip
namespace
:csv_import
do
desc
'Download csv file from FTP and import'
task
csv: :environment
do
destination_dir
=
'./lib/data'
ftp
=
Ftp
.
new
(
'192.168.1.156'
,
'training'
,
'training'
)
ftp
.
download_file
(
'jobs.zip'
,
destination_dir
)
ftp
.
close
extract_zip
(
"
#{
destination_dir
}
/jobs.zip"
,
destination_dir
)
import_job
(
destination_dir
)
# Keeps the shared logger; import_job uses it to record per-row failures.
#
# logger - a Logger instance supplied by the caller (see import_data.rake).
def initialize(logger)
  @logger = logger
end
end
def
extract_zip
(
file
,
destination
)
FileUtils
.
mkdir_p
(
destination
)
Zip
::
File
.
open
(
file
)
do
|
zip_file
|
zip_file
.
each
do
|
f
|
fpath
=
File
.
join
(
destination
,
f
.
name
)
zip_file
.
extract
(
f
,
fpath
)
unless
File
.
exist?
(
fpath
)
end
end
end
def
import_job
(
direction
)
def
import_job
(
direction
)
index
=
0
CSV
.
foreach
(
"
#{
direction
}
/jobs.csv"
,
headers:
true
)
do
|
row
|
index
+=
1
...
...
@@ -57,11 +38,14 @@ def import_job(direction)
end
puts
title
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
puts
e
@logger
.
error
"Job
#{
index
}
:
#{
e
.
message
}
"
end
end
private

# Whether +str+ is exactly the canonical decimal representation of an
# integer: "42" and "-7" qualify; "007", "4.2", "" and "abc" do not
# (round-tripping through to_i must reproduce the original string).
def integer?(str)
  canonical = str.to_i.to_s
  canonical == str
end
end
lib/common/extract_zip.rb
0 → 100644
View file @
c99884c7
require 'zip'
require 'fileutils' # FileUtils is used below; don't rely on other files loading it

# Mixin that extracts every entry of a .zip archive into a directory.
module ExtractZip
  # Extracts the archive at +file+ into +destination+, creating the
  # directory first if needed. Entries whose target file already exists
  # are skipped (existing files are never overwritten).
  #
  # Raises Zip::Error if an entry name would escape +destination+
  # ("zip slip" — e.g. an entry named "../../etc/passwd").
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    Zip::File.open(file) do |zip_file|
      zip_file.each do |entry|
        fpath = File.join(destination, entry.name)
        # Guard against path traversal via crafted entry names.
        unless File.expand_path(fpath).start_with?(File.expand_path(destination) + File::SEPARATOR)
          raise Zip::Error, "entry #{entry.name} escapes destination directory"
        end
        zip_file.extract(entry, fpath) unless File.exist?(fpath)
      end
    end
  end
end
lib/tasks/import_data.rake
0 → 100644
View file @
c99884c7
require './lib/common/ftp'
require './lib/common/csv'
require './lib/common/crawler'

namespace :import_data do
  # Shared logger handed to both Crawler and CsvImport.
  logger ||= Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page link] => [:environment] do |_, args|
    args.with_defaults(page: 1,
                       link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Always release the FTP connection, even if the download fails.
      ftp.close
    end
    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment