Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tô Ngọc Ánh
VeNJob
Commits
c99884c7
Commit
c99884c7
authored
Jul 28, 2020
by
Tô Ngọc Ánh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separating def into class
parent
ee6a23cd
Pipeline
#721
canceled with stages
in 0 seconds
Changes
4
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
162 additions
and
136 deletions
+162
-136
lib/common/crawler.rb
+80
-80
lib/common/csv.rb
+40
-56
lib/common/extract_zip.rb
+14
-0
lib/tasks/import_data.rake
+28
-0
No files found.
lib/
tasks/crawler.rake
→
lib/
common/crawler.rb
View file @
c99884c7
require
'open-uri'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
# Crawls careerbuilder.vn for industries, locations, companies and jobs.
class Crawler
  # Stores the logger used to report per-page scraping failures.
  def initialize(logger)
    @logger = logger
  end
end
# Entry point: seeds industries/locations, then walks up to +page+ listing
# pages starting from +base_link+ and crawls every job found.
def crawl_data(page, base_link)
  crawl_industries_locations
  get_job_links(page, base_link).each do |job_link|
    crawl_job(job_link) unless job_link.empty?
  end
end
# Collects job-detail URLs from up to +page+ consecutive listing pages,
# following the ".next-page" link; stops early when there is no next page.
def get_job_links(page, link)
  collected = []
  page.times do
    doc = Nokogiri::HTML(URI.open(link))
    doc.xpath('//div/a[@class="job_link"]/@href').each do |href|
      collected << href.value
    end
    next_page = doc.at_css('.next-page a')
    break unless next_page

    link = next_page[:href]
  end
  collected
end
# Scrapes one company page and upserts a Company record.
#
# Returns the Company (existing or freshly created), or nil when the page
# carries no company name or when scraping fails (failures are logged, not
# raised, so one bad company does not abort the job crawl).
def crawl_company(company_link)
  # URI.escape was deprecated and removed in Ruby 3.0; DEFAULT_PARSER.escape
  # performs the same percent-encoding (fix error: uri must be ascii only).
  uri = URI.parse(URI::DEFAULT_PARSER.escape(company_link))
  document = Nokogiri::HTML(URI.open(uri))
  company_name = document.css('.content .name').text
  return if company_name.empty?

  company_address = document.css('.content p')[1].text
  company_description = document.css('.main-about-us').css('.content').text
  Company.find_or_create_by(name: company_name) do |company|
    company.address = company_address
    company.description = company_description
  end
rescue StandardError => e
  @logger.error "#{e.message} - Company link: #{uri}"
end
# Scrapes one job-detail page and upserts a Job record together with its
# company, industries and locations. Failures are logged and swallowed so
# the surrounding crawl keeps going.
def crawl_job(job_link)
  # URI.escape was deprecated and removed in Ruby 3.0; DEFAULT_PARSER.escape
  # performs the same percent-encoding (fix error: uri must be ascii only).
  uri = URI.parse(URI::DEFAULT_PARSER.escape(job_link))
  document = Nokogiri::HTML(URI.open(uri))
  job_title = document.at_css('.job-desc p.title').text
  return if job_title.empty?

  job_company_link = document.at_css('.job-desc a.job-company-name')[:href]
  job_company = crawl_company(job_company_link)
  return if job_company.nil?

  job_location_name = document.css('.map p a').map { |val| val.text.strip }
  job_locations = Location.where(city: job_location_name)
  job_industry_names = document.at_xpath('//li[./strong/em[contains(@class, "mdi mdi-briefcase")]]')
                               .css('p a').map { |val| val.text.strip }
  job_industries = Industry.where(name: job_industry_names)

  Job.find_or_create_by(
    title: job_title,
    company_id: job_company.id,
    level: detail_text(document, 'mdi mdi-account'),
    experience: detail_text(document, 'fa fa-briefcase'),
    salary: detail_text(document, 'fa fa-usd'),
    expiration_date: detail_text(document, 'mdi mdi-calendar-check')
  ) do |job|
    job.description = document.css('.job-detail-content .detail-row').to_s
    job.industries << job_industries
    job.locations << job_locations
  end
rescue StandardError => e
  @logger.error "#{e.message} - Job link: #{uri}"
end

# Reads the stripped text of the <p> inside the job-detail list item whose
# <i> icon carries +icon_class+; returns nil when that item is absent.
def detail_text(document, icon_class)
  document.at_xpath("//li[./strong/i[contains(@class, \"#{icon_class}\")]]/p").try(:text).try(:strip)
end
# Seeds the Industry and Location tables from the site's search filters.
# The first Location::CITY_VIETNAM_NUMBER location options are domestic
# cities; everything after them is marked oversea.
def crawl_industries_locations
  document = Nokogiri::HTML(URI.open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))
  industries = document.css('#industry option').map(&:text)
  locations = document.css('#location option').map(&:text)

  industries.each { |name| Industry.find_or_create_by(name: name) }

  # drop(N) replaces last(count - N): identical when enough options exist,
  # but it returns [] instead of raising ArgumentError (negative argument)
  # when the site lists fewer than CITY_VIETNAM_NUMBER locations.
  domestic = locations.take(Location::CITY_VIETNAM_NUMBER)
  overseas = locations.drop(Location::CITY_VIETNAM_NUMBER)

  domestic.each do |city|
    Location.find_or_create_by(city: city) { |location| location.oversea = false }
  end
  overseas.each do |city|
    Location.find_or_create_by(city: city) { |location| location.oversea = true }
  end
end
lib/
tasks/csv_import.rake
→
lib/
common/csv.rb
View file @
c99884c7
require
'csv'
require
'zip'
require_relative
'../common/ftp'
require
'./lib/common/extract_zip'
@logger
||=
Logger
.
new
(
"./log/import_data.log"
)
# Imports job postings from a CSV archive downloaded over FTP; the zip
# extraction behaviour is mixed in from ExtractZip.
class CsvImport
  include ExtractZip

  # Stores the logger used to report per-row import failures.
  def initialize(logger)
    @logger = logger
  end
end
# Unpacks every entry of the zip archive +file+ into +destination+,
# creating the directory first and leaving already-extracted files alone.
def extract_zip(file, destination)
  FileUtils.mkdir_p(destination)
  Zip::File.open(file) do |archive|
    archive.each do |entry|
      target = File.join(destination, entry.name)
      archive.extract(entry, target) unless File.exist?(target)
    end
  end
end
# Imports jobs from "<direction>/jobs.csv". Each row creates/finds the
# company, industry and locations it references, then upserts the Job.
# A failing row is logged with its 1-based index and skipped so the rest
# of the file still imports.
def import_job(direction)
  index = 0
  CSV.foreach("#{direction}/jobs.csv", headers: true) do |row|
    index += 1
    # Rows whose category is a bare number are junk/summary rows; skip them.
    next if integer?(row['category'])

    title = row['name'].strip
    company = Company.find_or_create_by(name: row['company name']) do |c|
      c.description = "Contact email: #{row['contact email']}\n" \
                      "Contact name: #{row['contact name']}\n" \
                      "Contact phone: #{row['contact phone']}"
      c.address = "#{row['company address']}, #{row['company province']}"
    end
    industry = Industry.find_or_create_by(name: row['category'].strip)

    # "work place" arrives as a JSON-ish list, e.g. "[\"Hanoi\",\"Hue\"]".
    locations_name = row['work place'].tr('"[]', '').split(',')
    locations = Location.where(city: locations_name)
    locations = locations_name.map { |city| Location.create(oversea: false, city: city) } if locations.empty?

    description = "Benefits:\n#{row['benefit']}\n" \
                  "Descriptions:\n#{row['description']}\n" \
                  "Requirements:\n#{row['requirement']}"

    Job.find_or_create_by(title: title,
                          company_id: company.id,
                          level: row['level'].try(:strip),
                          salary: row['salary'].try(:strip)) do |job|
      job.industries << industry
      job.locations << locations
      job.description = description
    end
    puts title
  rescue StandardError => e
    puts e
    @logger.error "Job #{index}: #{e.message}"
  end
end
# True when +str+ is exactly the canonical decimal rendering of an integer
# (so "42" and "-7" qualify, but "007", "4.2" and "abc" do not).
def integer?(str)
  canonical = str.to_i.to_s
  canonical == str
end
lib/common/extract_zip.rb
0 → 100644
View file @
c99884c7
require
'zip'
# Mixin for unpacking zip archives to a target directory.
module ExtractZip
  # Extracts every entry of +file+ into +destination+, creating the
  # directory first and skipping entries that already exist on disk.
  #
  # Raises StandardError for malicious "zip slip" entries (names such as
  # "../x" that would escape +destination+) instead of writing them.
  def extract_zip(file, destination)
    FileUtils.mkdir_p(destination)
    base = File.expand_path(destination)
    Zip::File.open(file) do |archive|
      archive.each do |entry|
        target = File.join(destination, entry.name)
        unless File.expand_path(target).start_with?(base + File::SEPARATOR)
          raise StandardError, "zip entry escapes destination: #{entry.name}"
        end

        archive.extract(entry, target) unless File.exist?(target)
      end
    end
  end
end
lib/tasks/import_data.rake
0 → 100644
View file @
c99884c7
require
'./lib/common/ftp'
require
'./lib/common/csv'
require
'./lib/common/crawler'
namespace :import_data do
  logger ||= Logger.new('./log/import_data.log')

  desc 'crawl industries locations jobs'
  task :crawler, %i[page link] => [:environment] do |_, args|
    args.with_defaults(page: 1, link: 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html')
    crawler = Crawler.new(logger)
    crawler.crawl_data(args[:page].to_i, args[:link])
  end

  desc 'Download csv file from FTP and import'
  task csv: :environment do
    destination_dir = './lib/data'
    ftp = Ftp.new('192.168.1.156', 'training', 'training')
    begin
      ftp.download_file('jobs.zip', destination_dir)
    ensure
      # Always release the FTP session, even when the download raises;
      # previously a failed download leaked the open connection.
      ftp.close
    end

    csv = CsvImport.new(logger)
    csv.extract_zip("#{destination_dir}/jobs.zip", destination_dir)
    csv.import_job(destination_dir)
  end

  desc 'Import data from crawler and csv file'
  task all: %i[crawler csv]
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment