Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tô Ngọc Ánh
VeNJob
Commits
c99884c7
Commit
c99884c7
authored
Jul 28, 2020
by
Tô Ngọc Ánh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separating def into class
parent
ee6a23cd
Pipeline
#721
canceled with stages
in 0 seconds
Changes
4
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
162 additions
and
136 deletions
+162
-136
lib/common/crawler.rb
+80
-80
lib/common/csv.rb
+40
-56
lib/common/extract_zip.rb
+14
-0
lib/tasks/import_data.rake
+28
-0
No files found.
lib/
tasks/crawler.rake
→
lib/
common/crawler.rb
View file @
c99884c7
require
'open-uri'
require
'open-uri'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
Crawler
def
initialize
(
logger
)
@logger
=
logger
end
namespace
:crawl
do
def
crawl_data
(
page
,
base_link
)
desc
'crawl industries locations jobs'
task
:crawl_industries_locations_jobs
,
%i[page link]
=>
[
:environment
]
do
|
_
,
args
|
args
.
with_defaults
(
link:
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
)
crawl_industries_locations
crawl_industries_locations
job_links
=
get_job_links
(
args
[
:page
].
to_i
,
args
[
:link
]
)
job_links
=
get_job_links
(
page
,
base_link
)
job_links
.
each
do
|
link
|
job_links
.
each
do
|
link
|
next
if
link
.
empty?
next
if
link
.
empty?
crawl_job
(
link
)
crawl_job
(
link
)
end
end
end
end
end
def
get_job_links
(
page
,
link
)
def
get_job_links
(
page
,
link
)
job_links
=
[]
job_links
=
[]
page
.
times
do
page
.
times
do
document
=
Nokogiri
::
HTML
(
URI
.
open
(
link
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
link
))
jobs_xml
=
document
.
xpath
(
'//div/a[@class="job_link"]/@href'
)
jobs_xml
=
document
.
xpath
(
'//div/a[@class="job_link"]/@href'
)
jobs_xml
.
each
{
|
item
|
job_links
<<
item
.
value
}
jobs_xml
.
each
{
|
item
|
job_links
<<
item
.
value
}
next_page
=
document
.
at_css
(
'.next-page a'
)
next_page
=
document
.
at_css
(
'.next-page a'
)
break
if
next_page
.
nil?
break
if
next_page
.
nil?
link
=
next_page
[
:href
]
link
=
next_page
[
:href
]
end
job_links
end
end
job_links
end
def
crawl_company
(
company_link
)
def
crawl_company
(
company_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
company_link
))
# fix error: uri must be ascii only
uri
=
URI
.
parse
(
URI
.
escape
(
company_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
company_name
=
document
.
css
(
'.content .name'
).
text
company_name
=
document
.
css
(
'.content .name'
).
text
return
if
company_name
.
empty?
return
if
company_name
.
empty?
company_address
=
document
.
css
(
'.content p'
)[
1
].
text
company_address
=
document
.
css
(
'.content p'
)[
1
].
text
company_description
=
document
.
css
(
'.main-about-us'
).
css
(
'.content'
).
text
company_description
=
document
.
css
(
'.main-about-us'
).
css
(
'.content'
).
text
Company
.
find_or_create_by
(
name:
company_name
)
do
|
company
|
Company
.
find_or_create_by
(
name:
company_name
)
do
|
company
|
company
.
address
=
company_address
company
.
address
=
company_address
company
.
description
=
company_description
company
.
description
=
company_description
end
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Company link:
#{
uri
}
"
end
end
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Company link:
#{
uri
}
"
end
def
crawl_job
(
job_link
)
def
crawl_job
(
job_link
)
uri
=
URI
.
parse
(
URI
.
escape
(
job_link
))
# fix error: uri must be ascii only
uri
=
URI
.
parse
(
URI
.
escape
(
job_link
))
# fix error: uri must be ascii only
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
uri
))
job_title
=
document
.
at_css
(
'.job-desc p.title'
).
text
job_title
=
document
.
at_css
(
'.job-desc p.title'
).
text
return
if
job_title
.
empty?
return
if
job_title
.
empty?
job_company_link
=
document
.
at_css
(
'.job-desc a.job-company-name'
)[
:href
]
job_company_link
=
document
.
at_css
(
'.job-desc a.job-company-name'
)[
:href
]
job_company
=
crawl_company
(
job_company_link
)
job_company
=
crawl_company
(
job_company_link
)
return
if
job_company
.
nil?
return
if
job_company
.
nil?
job_location_name
=
document
.
css
(
'.map p a'
).
map
{
|
val
|
val
.
text
.
strip
}
job_location_name
=
document
.
css
(
'.map p a'
).
map
{
|
val
|
val
.
text
.
strip
}
job_locations
=
Location
.
where
(
city:
job_location_name
)
job_locations
=
Location
.
where
(
city:
job_location_name
)
job_industry_names
=
document
.
at_xpath
(
'//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]'
).
css
(
'p a'
).
map
{
|
val
|
val
.
text
.
strip
}
job_industry_names
=
document
.
at_xpath
(
'//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]'
).
css
(
'p a'
).
map
{
|
val
|
val
.
text
.
strip
}
job_industries
=
Industry
.
where
(
name:
job_industry_names
)
job_industries
=
Industry
.
where
(
name:
job_industry_names
)
job_salary
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_salary
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_level
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_level
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_experience
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_experience
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_expiration_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_exp_date
=
document
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
try
(
:text
).
try
(
:strip
)
job_description
=
document
.
css
(
'.job-detail-content .detail-row'
).
to_s
job_description
=
document
.
css
(
'.job-detail-content .detail-row'
).
to_s
Job
.
find_or_create_by
(
title:
job_title
,
Job
.
find_or_create_by
(
title:
job_title
,
company_id:
job_company
.
id
,
company_id:
job_company
.
id
,
level:
job_level
,
level:
job_level
,
experience:
job_experience
,
experience:
job_experience
,
salary:
job_salary
,
salary:
job_salary
,
expiration_date:
job_expiration_date
)
do
|
job
|
expiration_date:
job_exp_date
)
do
|
job
|
job
.
description
=
job_description
job
.
description
=
job_description
job
.
industries
<<
job_industries
job
.
industries
<<
job_industries
job
.
locations
<<
job_locations
job
.
locations
<<
job_locations
end
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Job link:
#{
uri
}
"
end
end
rescue
StandardError
=>
e
@logger
.
error
"
#{
e
.
message
}
- Job link:
#{
uri
}
"
end
def
crawl_industries_locations
def
crawl_industries_locations
document
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
document
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
))
industries
=
document
.
css
(
'#industry option'
).
map
(
&
:text
)
industries
=
document
.
css
(
'#industry option'
).
map
(
&
:text
)
locations
=
document
.
css
(
'#location option'
).
map
(
&
:text
)
locations
=
document
.
css
(
'#location option'
).
map
(
&
:text
)
industries
.
each
do
|
val
|
industries
.
each
do
|
val
|
Industry
.
find_or_create_by
(
name:
val
)
Industry
.
find_or_create_by
(
name:
val
)
end
end
locations
.
take
(
Location
::
CITY_VIETNAM_NUMBER
).
each
do
|
val
|
locations
.
take
(
Location
::
CITY_VIETNAM_NUMBER
).
each
do
|
val
|
Location
.
find_or_create_by
(
city:
val
)
do
|
location
|
Location
.
find_or_create_by
(
city:
val
)
do
|
location
|
location
.
oversea
=
false
location
.
oversea
=
false
end
end
end
end
locations
.
last
(
locations
.
count
-
Location
::
CITY_VIETNAM_NUMBER
).
each
do
|
val
|
locations
.
last
(
locations
.
count
-
Location
::
CITY_VIETNAM_NUMBER
).
each
do
|
val
|
Location
.
find_or_create_by
(
city:
val
)
do
|
location
|
Location
.
find_or_create_by
(
city:
val
)
do
|
location
|
location
.
oversea
=
true
location
.
oversea
=
true
end
end
end
end
end
end
end
lib/
tasks/csv_import.rake
→
lib/
common/csv.rb
View file @
c99884c7
require
'csv'
require
'csv'
require
'zip'
require
'./lib/common/extract_zip'
require_relative
'../common/ftp'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
class
CsvImport
include
ExtractZip
namespace
:csv_import
do
def
initialize
(
logger
)
desc
'Download csv file from FTP and import'
@logger
=
logger
task
csv: :environment
do
destination_dir
=
'./lib/data'
ftp
=
Ftp
.
new
(
'192.168.1.156'
,
'training'
,
'training'
)
ftp
.
download_file
(
'jobs.zip'
,
destination_dir
)
ftp
.
close
extract_zip
(
"
#{
destination_dir
}
/jobs.zip"
,
destination_dir
)
import_job
(
destination_dir
)
end
end
end
def
extract_zip
(
file
,
destination
)
FileUtils
.
mkdir_p
(
destination
)
Zip
::
File
.
open
(
file
)
do
|
zip_file
|
def
import_job
(
direction
)
zip_file
.
each
do
|
f
|
index
=
0
fpath
=
File
.
join
(
destination
,
f
.
name
)
CSV
.
foreach
(
"
#{
direction
}
/jobs.csv"
,
headers:
true
)
do
|
row
|
zip_file
.
extract
(
f
,
fpath
)
unless
File
.
exist?
(
fpath
)
index
+=
1
next
if
integer?
(
row
[
'category'
])
title
=
row
[
'name'
].
strip
company
=
Company
.
find_or_create_by
(
name:
row
[
'company name'
])
do
|
c
|
c
.
description
=
"Contact email:
#{
row
[
'contact email'
]
}
\n
"
\
"Contact name:
#{
row
[
'contact name'
]
}
\n
"
\
"Contact phone:
#{
row
[
'contact phone'
]
}
"
c
.
address
=
"
#{
row
[
'company address'
]
}
,
#{
row
[
'company province'
]
}
"
end
industry
=
Industry
.
find_or_create_by
(
name:
row
[
'category'
].
strip
)
level
=
row
[
'level'
].
try
(
:strip
)
salary
=
row
[
'salary'
].
try
(
:strip
)
locations_name
=
row
[
'work place'
].
tr
(
'"[]'
,
''
).
split
(
','
)
locations
=
Location
.
where
(
city:
locations_name
)
locations
=
locations_name
.
map
{
|
city
|
Location
.
create
(
oversea:
false
,
city:
city
)
}
if
locations
.
empty?
description
=
"Benefits:
\n
#{
row
[
'benefit'
]
}
\n
"
\
"Descriptions:
\n
#{
row
[
'description'
]
}
\n
"
\
"Requirements:
\n
#{
row
[
'requirement'
]
}
"
Job
.
find_or_create_by
(
title:
title
,
company_id:
company
.
id
,
level:
level
,
salary:
salary
)
do
|
job
|
job
.
industries
<<
industry
job
.
locations
<<
locations
job
.
description
=
description
end
puts
title
end
end
rescue
StandardError
=>
e
puts
e
@logger
.
error
"Job
#{
index
}
:
#{
e
.
message
}
"
end
end
end
def
import_job
(
direction
)
index
=
0
CSV
.
foreach
(
"
#{
direction
}
/jobs.csv"
,
headers:
true
)
do
|
row
|
index
+=
1
next
if
integer?
(
row
[
'category'
])
title
=
row
[
'name'
].
strip
private
company
=
Company
.
find_or_create_by
(
name:
row
[
'company name'
])
do
|
c
|
c
.
description
=
"Contact email:
#{
row
[
'contact email'
]
}
\n
"
\
"Contact name:
#{
row
[
'contact name'
]
}
\n
"
\
"Contact phone:
#{
row
[
'contact phone'
]
}
"
c
.
address
=
"
#{
row
[
'company address'
]
}
,
#{
row
[
'company province'
]
}
"
end
industry
=
Industry
.
find_or_create_by
(
name:
row
[
'category'
].
strip
)
level
=
row
[
'level'
].
try
(
:strip
)
salary
=
row
[
'salary'
].
try
(
:strip
)
locations_name
=
row
[
'work place'
].
tr
(
'"[]'
,
''
).
split
(
','
)
locations
=
Location
.
where
(
city:
locations_name
)
locations
=
locations_name
.
map
{
|
city
|
Location
.
create
(
oversea:
false
,
city:
city
)
}
if
locations
.
empty?
description
=
"Benefits:
\n
#{
row
[
'benefit'
]
}
\n
"
\
"Descriptions:
\n
#{
row
[
'description'
]
}
\n
"
\
"Requirements:
\n
#{
row
[
'requirement'
]
}
"
Job
.
find_or_create_by
(
title:
title
,
company_id:
company
.
id
,
level:
level
,
salary:
salary
)
do
|
job
|
def
integer?
(
str
)
job
.
industries
<<
industry
str
.
to_i
.
to_s
==
str
job
.
locations
<<
locations
job
.
description
=
description
end
puts
title
end
end
rescue
StandardError
=>
e
puts
e
@logger
.
error
"Job
#{
index
}
:
#{
e
.
message
}
"
end
def
integer?
(
str
)
str
.
to_i
.
to_s
==
str
end
end
lib/common/extract_zip.rb
0 → 100644
View file @
c99884c7
require
'zip'
# Mixin that unpacks a zip archive onto disk using the `zip` library's
# Zip::File API.
module ExtractZip
  # Extracts every entry of the archive +file+ into the directory
  # +destination+, creating +destination+ first if it does not exist.
  #
  # Entries whose target path already exists are left untouched, so
  # re-running the extraction never overwrites files on disk.
  #
  # @param file [String] path of the zip archive to read
  # @param destination [String] directory the entries are written into
  # @raise [ArgumentError] if an entry name would resolve to a path outside
  #   +destination+ (for example "../../somewhere/else")
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)

    root = File.expand_path(destination)
    # Trailing separator so that "/data" does not accept "/database/x".
    root_prefix = root.end_with?(File::SEPARATOR) ? root : "#{root}#{File::SEPARATOR}"

    Zip::File.open(file) do |zip_file|
      zip_file.each do |entry|
        fpath = File.join(destination, entry.name)

        # The archive is external input: refuse names that climb out of the
        # destination directory instead of writing wherever they point.
        unless File.expand_path(fpath).start_with?(root_prefix)
          raise ArgumentError, "zip entry escapes destination: #{entry.name}"
        end

        next if File.exist?(fpath)

        # An archive may list "dir/file" without a separate "dir/" entry, so
        # make sure the parent directory is there before extracting.
        FileUtils.mkdir_p(File.dirname(fpath))
        zip_file.extract(entry, fpath)
      end
    end
  end
end
lib/tasks/import_data.rake
0 → 100644
View file @
c99884c7
require
'./lib/common/ftp'
require
'./lib/common/csv'
require
'./lib/common/crawler'
# Rake tasks that load job data into the application: one crawls a job
# listing site, one imports a CSV archive downloaded over FTP, and `all`
# runs both.
namespace :import_data do
  # Build the shared logger on first use instead of while the Rakefile is
  # being loaded, so unrelated rake commands never open the log file.
  import_logger = nil
  logger = -> { import_logger ||= Logger.new('./log/import_data.log') }

  # Arguments: page (number of listing pages to walk, default 1) and
  # link (URL of the first listing page).
  desc 'crawl industries locations jobs'
  task :crawler, %i[page link] => [:environment] do |_, args|
    args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')

    crawler = Crawler.new(logger.call)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'

    # NOTE(review): host and credentials are hard-coded; consider moving
    # them to configuration before this leaves the training environment.
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Release the connection even when the download raises.
      ftp.close
    end

    csv = CsvImport.new(logger.call)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment