Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
venjob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Huỳnh Thiên Phước
venjob
Commits
7b024ad9
Commit
7b024ad9
authored
Aug 03, 2020
by
Huỳnh Thiên Phước
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Set name variable
parent
53de3765
Pipeline
#759
failed with stages
in 0 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
94 additions
and
85 deletions
+94
-85
lib/src/crawler.rb
+9
-0
lib/src/crontab.rb
+75
-69
lib/tasks/crawler_import.rake
+10
-16
No files found.
lib/src/crawler.rb
View file @
7b024ad9
require
'net/ftp'
require
'csv'
require
'zip'
class
Crawler
class
Crawler
def
initialize
(
logger
)
def
initialize
(
logger
)
...
@@ -7,6 +11,11 @@ class Crawler
...
@@ -7,6 +11,11 @@ class Crawler
@PASSWORD_FTP
=
'training'
@PASSWORD_FTP
=
'training'
end
end
def
crawl_city_industry
crawl_city
crawl_industry
end
def
crawl_city
def
crawl_city
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
get_name
=
page
.
search
(
'select#location'
)
get_name
=
page
.
search
(
'select#location'
)
...
...
lib/src/crontab.rb
View file @
7b024ad9
class
Crontab
require
'net/ftp'
def
initialize
(
logger
)
require
'csv'
require
'zip'
class
InforJob
def
initialize
(
logger
,
url
)
@mylogger
=
logger
@mylogger
=
logger
@url
=
url
end
def
crawl_all
find_company
find_job
end
end
def
find_company
(
url
)
def
find_company
company_info
=
Nokogiri
::
HTML
(
URI
.
open
(
url
))
info
=
Nokogiri
::
HTML
(
URI
.
open
(
@
url
))
company_links
=
company_
info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
links
=
info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
company_
links
.
each
do
|
link
|
links
.
each
do
|
link
|
next
if
link
==
'javascript:void(0);'
next
if
link
==
'javascript:void(0);'
company_page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
escape
(
link
)))
name_company
=
company_page
.
search
(
'p.name'
)
&
.
text
name
=
page
.
search
(
'p.name'
)
&
.
text
address_company
=
company_page
.
css
(
'div.content p'
).
children
[
1
]
&
.
text
return
if
name
.
blank?
introduction_company
=
company_page
.
css
(
'div.main-about-us'
).
text
next
if
name_company
.
blank?
address
=
page
.
css
(
'div.content p'
).
children
[
1
]
&
.
text
introduction
=
page
.
css
(
'div.main-about-us'
).
text
begin
begin
get_name_company
=
Company
.
find_by
(
name:
name_company
)
puts
name
if
get_name_company
.
nil?
Company
.
find_or_create_by!
(
name:
name
,
company
=
Company
.
create!
(
name:
name_company
,
address:
address
,
address:
address_company
,
introduction:
introduction
)
introduction:
introduction_company
)
end
rescue
StandardError
=>
e
rescue
StandardError
=>
e
@mylogger
.
error
"
#{
e
.
message
}
"
@mylogger
.
error
e
.
message
end
end
end
end
end
end
def
create_job
(
title_job
,
level
,
salary
,
experience
,
expiration_date
,
description
,
company_id
)
Job
.
create!
(
title:
title_job
,
level:
level
,
salary:
salary
,
experience:
experience
,
expiration_date:
expiration_date
,
description:
description
,
company_id:
company_id
)
end
def
create_city_rel
(
get_row
,
job_find
)
def
create_city_rel
(
row
,
info_job
)
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
location_rel
=
row
.
css
(
'div.map p a'
).
children
.
map
(
&
:text
).
map
(
&
:strip
)
location_rel
.
each
do
|
loc
|
location_rel
.
each
do
|
loc
|
city_table
=
City
.
find_by
(
name:
loc
)
city_table
=
City
.
find_by
(
name:
loc
)
next
if
city_table
.
nil?
puts
"Created City:
#{
info_job
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
unless
CityJob
.
exists?
(
job_id:
job_find
.
id
,
city_id:
city_table
.
id
)
CityJob
.
find_or_create_by!
(
job_id:
info_job
.
id
,
city_id:
city_table
.
id
)
puts
"Created City:
#{
job_find
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
city_jobs
=
CityJob
.
create!
(
job_id:
job_find
.
id
,
city_id:
city_table
.
id
)
end
end
end
end
end
def
create_industry_rel
(
get_row
,
job_find
)
def
create_industry_rel
(
row
,
info_job
)
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
industry_rel
=
row
.
css
(
'li a'
).
children
.
map
(
&
:text
).
map
(
&
:strip
)
industry_rel
.
each
do
|
ind
|
industry_rel
.
each
do
|
ind
|
industry_table
=
Industry
.
find_by
(
name:
ind
)
industry_table
=
Industry
.
find_by
(
name:
ind
)
next
if
industry_table
.
nil?
puts
"Created Industry:
#{
info_job
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
unless
IndustryJob
.
exists?
(
job_id:
job_find
.
id
,
industry_id:
industry_table
.
id
)
IndustryJob
.
find_or_create_by!
(
job_id:
info_job
.
id
,
industry_id:
industry_table
.
id
)
puts
"Created Industry:
#{
job_find
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
industry_jobs
=
IndustryJob
.
create!
(
job_id:
job_find
.
id
,
industry_id:
industry_table
.
id
)
end
end
end
end
end
def
find_job
(
url
)
def
create_job
(
title
,
link_page
,
row
,
company_table
)
page_access
=
Nokogiri
::
HTML
(
URI
.
open
(
url
))
description
=
link_page
.
search
(
'div.detail-row'
).
to_s
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
salary
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
text
.
strip
get_link
.
each
do
|
link
|
experience
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
)
&
.
text
&
.
strip
link_page_job
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
level
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
text
.
strip
get_row
=
link_page_job
.
search
(
'div.bg-blue div.row'
)
expiration_date
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
text
.
strip
if
get_row
.
present?
begin
info_job
=
Job
.
find_or_create_by!
(
title:
title
,
get_name_company
=
link_page_job
.
search
(
'div.job-desc a.job-company-name'
).
text
.
strip
level:
level
,
title_job
=
link_page_job
.
search
(
'div.job-desc p'
).
text
.
strip
salary:
salary
,
description
=
link_page_job
.
search
(
'div.detail-row'
)
experience:
experience
,
salary
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
text
.
strip
expiration_date:
expiration_date
,
experience
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
)
&
.
text
&
.
strip
description:
description
,
level
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
text
.
strip
company_id:
company_table
.
id
)
expiration_date
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
text
.
strip
company_table
=
Company
.
find_by
(
name:
get_name_company
)
create_city_rel
(
row
,
info_job
)
next
if
company_table
.
nil?
create_industry_rel
(
row
,
info_job
)
job_check
=
Job
.
exists?
(
title:
title_job
,
company_id:
company_table
.
id
)
end
if
job_check
==
false
create_job
(
title_job
,
level
,
salary
,
experience
,
expiration_date
,
description
,
company_table
.
id
)
def
find_job
end
info
=
Nokogiri
::
HTML
(
URI
.
open
(
@url
))
job_find
=
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
link
=
info
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
create_city_rel
(
get_row
,
job_find
)
link
.
each
do
|
link
|
create_industry_rel
(
get_row
,
job_find
)
link_page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
escape
(
link
)))
rescue
StandardError
=>
e
row
=
link_page
.
search
(
'div.bg-blue div.row'
)
@mylogger
.
error
"
#{
e
.
message
}
"
next
if
row
.
blank?
end
begin
name_company
=
link_page
.
search
(
'div.job-desc a.job-company-name'
).
text
.
strip
company_table
=
Company
.
find_by
(
name:
name_company
)
next
if
company_table
.
blank?
title
=
link_page
.
search
(
'div.job-desc p'
).
text
.
strip
next
if
title
.
blank?
create_job
(
title
,
link_page
,
row
,
company_table
)
rescue
StandardError
=>
e
puts
e
# @mylogger.error e.message
end
end
end
end
end
end
end
end
lib/tasks/crawler_import.rake
View file @
7b024ad9
require
'src/crawler.rb'
require
'src/crawler.rb'
require
'src/crontab.rb'
require
'src/crontab.rb'
require
'net/ftp'
require
'csv'
require
'zip'
namespace
:import
do
namespace
:import
do
logger
||=
Logger
.
new
(
Rails
.
root
.
join
(
'log'
,
'my.log'
))
url
=
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html'
desc
'crawler data'
desc
'crawler data'
task
crawler: :environment
do
task
crawler: :environment
do
action
=
Crawler
.
new
(
logger
)
action
=
Crawler
.
new
(
logger
)
action
.
crawl_city
action
.
crawl_city_industry
action
.
crawl_industry
action
.
crawl_company
action
.
crawl_job_relationships
end
end
desc
'Crontab'
desc
'Crontab'
task
auto: :environment
do
task
auto: :environment
do
action
=
Crawler
.
new
(
logger
)
action
=
Crawler
.
new
(
logger
)
crontab
=
Crontab
.
new
(
logger
)
crontab
=
InforJob
.
new
(
logger
,
url
)
crontab
.
find_company
(
url
)
crontab
.
crawl_all
crontab
.
find_job
(
url
)
action
.
get_file_csv
action
.
get_file_csv
action
.
extract_zip
(
'./jobs.zip'
,
'lib/csv'
)
action
.
extract_zip
(
'./jobs.zip'
,
'lib/csv'
)
action
.
import_file_csv
(
Rails
.
root
.
join
(
'lib'
,
'csv'
,
'jobs.csv'
))
action
.
import_file_csv
(
Rails
.
root
.
join
(
'lib'
,
'csv'
,
'jobs.csv'
))
end
end
task
find_job: :environment
do
crontab
=
Crontab
.
new
(
logger
)
def
logger
crontab
.
find_company
(
url
)
Logger
.
new
(
Rails
.
root
.
join
(
'log'
,
'my.log'
))
crontab
.
find_job
(
url
)
end
def
url
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html'
end
end
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment