Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
venjob
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Huỳnh Thiên Phước
venjob
Commits
7b024ad9
Commit
7b024ad9
authored
Aug 03, 2020
by
Huỳnh Thiên Phước
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Set name variable
parent
53de3765
Pipeline
#759
failed with stages
in 0 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
94 additions
and
85 deletions
+94
-85
lib/src/crawler.rb
+9
-0
lib/src/crontab.rb
+75
-69
lib/tasks/crawler_import.rake
+10
-16
No files found.
lib/src/crawler.rb
View file @
7b024ad9
require
'net/ftp'
require
'csv'
require
'zip'
class
Crawler
def
initialize
(
logger
)
...
...
@@ -7,6 +11,11 @@ class Crawler
@PASSWORD_FTP
=
'training'
end
def
crawl_city_industry
crawl_city
crawl_industry
end
def
crawl_city
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
))
get_name
=
page
.
search
(
'select#location'
)
...
...
lib/src/crontab.rb
View file @
7b024ad9
class
Crontab
def
initialize
(
logger
)
require
'net/ftp'
require
'csv'
require
'zip'
class
InforJob
def
initialize
(
logger
,
url
)
@mylogger
=
logger
@url
=
url
end
def
crawl_all
find_company
find_job
end
def
find_company
(
url
)
company_info
=
Nokogiri
::
HTML
(
URI
.
open
(
url
))
company_links
=
company_
info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
company_
links
.
each
do
|
link
|
def
find_company
info
=
Nokogiri
::
HTML
(
URI
.
open
(
@
url
))
links
=
info
.
css
(
'div.caption a.company-name'
).
map
{
|
link
|
link
[
'href'
]
}
links
.
each
do
|
link
|
next
if
link
==
'javascript:void(0);'
company_page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
name_company
=
company_page
.
search
(
'p.name'
)
&
.
text
address_company
=
company_page
.
css
(
'div.content p'
).
children
[
1
]
&
.
text
introduction_company
=
company_page
.
css
(
'div.main-about-us'
).
text
next
if
name_company
.
blank?
page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
escape
(
link
)))
name
=
page
.
search
(
'p.name'
)
&
.
text
return
if
name
.
blank?
address
=
page
.
css
(
'div.content p'
).
children
[
1
]
&
.
text
introduction
=
page
.
css
(
'div.main-about-us'
).
text
begin
get_name_company
=
Company
.
find_by
(
name:
name_company
)
if
get_name_company
.
nil?
company
=
Company
.
create!
(
name:
name_company
,
address:
address_company
,
introduction:
introduction_company
)
end
puts
name
Company
.
find_or_create_by!
(
name:
name
,
address:
address
,
introduction:
introduction
)
rescue
StandardError
=>
e
@mylogger
.
error
"
#{
e
.
message
}
"
@mylogger
.
error
e
.
message
end
end
end
def
create_job
(
title_job
,
level
,
salary
,
experience
,
expiration_date
,
description
,
company_id
)
Job
.
create!
(
title:
title_job
,
level:
level
,
salary:
salary
,
experience:
experience
,
expiration_date:
expiration_date
,
description:
description
,
company_id:
company_id
)
end
def
create_city_rel
(
get_row
,
job_find
)
location_rel
=
get_row
.
css
(
'div.map p a'
).
children
.
map
{
|
location
|
location
.
text
.
strip
}
def
create_city_rel
(
row
,
info_job
)
location_rel
=
row
.
css
(
'div.map p a'
).
children
.
map
(
&
:text
).
map
(
&
:strip
)
location_rel
.
each
do
|
loc
|
city_table
=
City
.
find_by
(
name:
loc
)
next
if
city_table
.
nil?
unless
CityJob
.
exists?
(
job_id:
job_find
.
id
,
city_id:
city_table
.
id
)
puts
"Created City:
#{
job_find
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
city_jobs
=
CityJob
.
create!
(
job_id:
job_find
.
id
,
city_id:
city_table
.
id
)
end
puts
"Created City:
#{
info_job
.
id
}
-
#{
city_table
.
id
}
.
#{
loc
}
"
CityJob
.
find_or_create_by!
(
job_id:
info_job
.
id
,
city_id:
city_table
.
id
)
end
end
def
create_industry_rel
(
get_row
,
job_find
)
industry_rel
=
get_row
.
css
(
'li a'
).
children
.
map
{
|
industry
|
industry
.
text
.
strip
}
def
create_industry_rel
(
row
,
info_job
)
industry_rel
=
row
.
css
(
'li a'
).
children
.
map
(
&
:text
).
map
(
&
:strip
)
industry_rel
.
each
do
|
ind
|
industry_table
=
Industry
.
find_by
(
name:
ind
)
next
if
industry_table
.
nil?
unless
IndustryJob
.
exists?
(
job_id:
job_find
.
id
,
industry_id:
industry_table
.
id
)
puts
"Created Industry:
#{
job_find
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
industry_jobs
=
IndustryJob
.
create!
(
job_id:
job_find
.
id
,
industry_id:
industry_table
.
id
)
end
puts
"Created Industry:
#{
info_job
.
id
}
-
#{
industry_table
.
id
}
.
#{
ind
}
"
IndustryJob
.
find_or_create_by!
(
job_id:
info_job
.
id
,
industry_id:
industry_table
.
id
)
end
end
def
find_job
(
url
)
page_access
=
Nokogiri
::
HTML
(
URI
.
open
(
url
))
get_link
=
page_access
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
get_link
.
each
do
|
link
|
link_page_job
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
parse
(
URI
.
escape
(
link
))))
get_row
=
link_page_job
.
search
(
'div.bg-blue div.row'
)
if
get_row
.
present?
begin
get_name_company
=
link_page_job
.
search
(
'div.job-desc a.job-company-name'
).
text
.
strip
title_job
=
link_page_job
.
search
(
'div.job-desc p'
).
text
.
strip
description
=
link_page_job
.
search
(
'div.detail-row'
)
salary
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
text
.
strip
experience
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
)
&
.
text
&
.
strip
level
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
text
.
strip
expiration_date
=
get_row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
text
.
strip
company_table
=
Company
.
find_by
(
name:
get_name_company
)
next
if
company_table
.
nil?
job_check
=
Job
.
exists?
(
title:
title_job
,
company_id:
company_table
.
id
)
if
job_check
==
false
create_job
(
title_job
,
level
,
salary
,
experience
,
expiration_date
,
description
,
company_table
.
id
)
end
job_find
=
Job
.
find_by
(
title:
title_job
,
company_id:
company_table
.
id
)
create_city_rel
(
get_row
,
job_find
)
create_industry_rel
(
get_row
,
job_find
)
rescue
StandardError
=>
e
@mylogger
.
error
"
#{
e
.
message
}
"
end
def
create_job
(
title
,
link_page
,
row
,
company_table
)
description
=
link_page
.
search
(
'div.detail-row'
).
to_s
salary
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-usd")]]/p'
).
text
.
strip
experience
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p'
)
&
.
text
&
.
strip
level
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-account")]]/p'
).
text
.
strip
expiration_date
=
row
.
at_xpath
(
'//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p'
).
text
.
strip
info_job
=
Job
.
find_or_create_by!
(
title:
title
,
level:
level
,
salary:
salary
,
experience:
experience
,
expiration_date:
expiration_date
,
description:
description
,
company_id:
company_table
.
id
)
create_city_rel
(
row
,
info_job
)
create_industry_rel
(
row
,
info_job
)
end
def
find_job
info
=
Nokogiri
::
HTML
(
URI
.
open
(
@url
))
link
=
info
.
css
(
'a.job_link'
).
map
{
|
link
|
link
[
'href'
]
}
link
.
each
do
|
link
|
link_page
=
Nokogiri
::
HTML
(
URI
.
open
(
URI
.
escape
(
link
)))
row
=
link_page
.
search
(
'div.bg-blue div.row'
)
next
if
row
.
blank?
begin
name_company
=
link_page
.
search
(
'div.job-desc a.job-company-name'
).
text
.
strip
company_table
=
Company
.
find_by
(
name:
name_company
)
next
if
company_table
.
blank?
title
=
link_page
.
search
(
'div.job-desc p'
).
text
.
strip
next
if
title
.
blank?
create_job
(
title
,
link_page
,
row
,
company_table
)
rescue
StandardError
=>
e
puts
e
# @mylogger.error e.message
end
end
end
end
lib/tasks/crawler_import.rake
View file @
7b024ad9
require
'src/crawler.rb'
require
'src/crontab.rb'
require
'net/ftp'
require
'csv'
require
'zip'
namespace
:import
do
logger
||=
Logger
.
new
(
Rails
.
root
.
join
(
'log'
,
'my.log'
))
url
=
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html'
desc
'crawler data'
task
crawler: :environment
do
action
=
Crawler
.
new
(
logger
)
action
.
crawl_city
action
.
crawl_industry
action
.
crawl_company
action
.
crawl_job_relationships
action
.
crawl_city_industry
end
desc
'Crontab'
task
auto: :environment
do
action
=
Crawler
.
new
(
logger
)
crontab
=
Crontab
.
new
(
logger
)
crontab
.
find_company
(
url
)
crontab
.
find_job
(
url
)
crontab
=
InforJob
.
new
(
logger
,
url
)
crontab
.
crawl_all
action
.
get_file_csv
action
.
extract_zip
(
'./jobs.zip'
,
'lib/csv'
)
action
.
import_file_csv
(
Rails
.
root
.
join
(
'lib'
,
'csv'
,
'jobs.csv'
))
end
task
find_job: :environment
do
crontab
=
Crontab
.
new
(
logger
)
crontab
.
find_company
(
url
)
crontab
.
find_job
(
url
)
def
logger
Logger
.
new
(
Rails
.
root
.
join
(
'log'
,
'my.log'
))
end
def
url
'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html'
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment