Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
veNJOB
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Thanh Hung Pham
veNJOB
Commits
9707d2d0
Commit
9707d2d0
authored
Jun 29, 2017
by
Thanh Hung Pham
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Resolve thread
parent
d6fa267c
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
56 additions
and
4 deletions
+56
-4
lib/tasks/careerbuilder.rb
+48
-4
lib/tasks/crawler.rake
+8
-0
No files found.
lib/tasks/c
rawler_data.rake
→
lib/tasks/c
areerbuilder.rb
View file @
9707d2d0
require
'
nokogiri
'
require
'
thread
'
require
'open-uri'
require
'rake'
require
'nokogiri'
require
'logger'
namespace
:crawler_data
do
task
load_page: :environment
do
class
Careerbuilder
attr_reader
:domain
,
:thread_count
,
:logger
# Sets up a crawler instance.
#
# Stores the given domain and worker-thread count, creates the mutex that the
# import_* methods synchronize on, and opens a dedicated log file under the
# Rails root.
#
# @param domain [String] site address kept on the instance (exposed via #domain)
# @param thread_count [Integer] number of worker threads #crawl starts (default 1)
def initialize(domain, thread_count = 1)
  log_path = "#{Rails.root}/log/careerbuilder_crawler.log"

  @domain, @thread_count = domain, thread_count
  @mutex = Mutex.new
  @logger = Logger.new(log_path)
end
def
crawl
@logger
.
info
(
'Start crawl'
)
workers
=
(
0
...
thread_count
).
map
do
Thread
.
new
do
begin
doc
=
Nokogiri
::
HTML
(
open
(
'http://careerbuilder.vn'
))
import_area
import_category
(
doc
)
...
...
@@ -11,22 +25,41 @@ namespace :crawler_data do
new_jobs_url
=
doc
.
xpath
(
"//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href"
)
inport_job
(
new_jobs_url
.
to_s
)
rescue
ThreadError
end
end
end
workers
.
map
(
&
:join
)
@logger
.
info
(
'Crawl finished'
)
end
# Makes sure the two fixed areas ('Viet Nam' and 'International') exist.
#
# Each area is only saved when no row with that name is found. The whole
# check-and-save sequence runs while holding @mutex. Any StandardError is
# logged (message, then backtrace) instead of being raised.
def import_area
  ensure_area = lambda do |area_name|
    next unless Area.where(name: area_name).blank?

    Area.new(name: area_name).save
  end

  @mutex.synchronize do
    ensure_area.call('Viet Nam')
    ensure_area.call('International')
  end
rescue StandardError => error
  logger.error(error.message)
  logger.error(error.backtrace)
end
# Imports job categories from the industry <select> on the given page.
#
# Reads the <option> nodes of the industry multi-select, ignores the first and
# the last entry (presumably placeholder options - confirm against the live
# markup), and saves a Category for every remaining stripped label that does
# not exist yet. The work runs while holding @mutex.
#
# Any StandardError is logged (method label, message, backtrace) instead of
# being raised.
#
# @param doc [#xpath] parsed page, e.g. the Nokogiri document built in #crawl
# @return [void] the return value is not meaningful
def import_category(doc)
  industry_options_xpath =
    "//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option"

  @mutex.synchronize do
    options = doc.xpath(industry_options_xpath).to_a
    # Slicing an empty list yields nil; treat that as "nothing to import"
    # rather than failing on nil.each.
    (options[1..-2] || []).each do |option|
      name = option.text.strip
      Category.new(name: name).save if Category.where(name: name).blank?
    end
  end
rescue StandardError => e
  # Use __method__ for the label. Interpolating `import_category` here would
  # call this method again without its argument and raise ArgumentError from
  # inside the handler, hiding the original error.
  logger.error("[method: ] #{__method__}")
  logger.error(e.message)
  logger.error(Array(e.backtrace).join("\n"))
end
def
import_city
(
doc
)
@mutex
.
synchronize
do
cities
=
doc
.
xpath
(
"//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option"
).
drop
(
1
)
area_id
=
1
cities
.
each
do
|
city
|
...
...
@@ -34,9 +67,15 @@ namespace :crawler_data do
City
.
new
(
name:
city
.
text
.
strip
,
area:
Area
.
find
(
area_id
)).
save
if
City
.
where
(
name:
city
.
text
.
strip
).
blank?
end
end
rescue
StandardError
=>
e
logger
.
error
(
"[method: ]
#{
import_city
}
"
)
logger
.
error
(
e
.
message
)
logger
.
error
(
e
.
backtrace
)
end
def
inport_job
(
url
)
10
.
times
do
@mutex
.
synchronize
do
doc_new_jobs
=
Nokogiri
::
HTML
(
open
(
url
))
doc_new_jobs
.
encoding
=
'utf-8'
...
...
@@ -80,4 +119,9 @@ namespace :crawler_data do
url
=
doc_new_jobs
.
xpath
(
"//div[@class='paginationTwoStatus']/a[@class='right']/@href"
).
to_s
end
end
rescue
StandardError
=>
e
logger
.
error
(
"[URL]
#{
url
}
"
)
logger
.
error
(
e
.
message
)
logger
.
error
(
e
.
backtrace
)
end
end
lib/tasks/crawler.rake
0 → 100644
View file @
9707d2d0
namespace :crawler do
  desc 'client crawler'
  # Runs one Careerbuilder crawl.
  #
  # The optional THREAD_COUNT environment variable sets how many worker
  # threads the crawler starts. It defaults to 1, and anything that does not
  # parse to a positive integer (empty, "0", non-numeric) is also treated as 1,
  # because a count of zero would start no workers and crawl nothing.
  task load: :environment do
    # Loaded inside the task body; the path is built from Rails.root.
    require "#{Rails.root}/lib/tasks/careerbuilder"

    requested = ENV.fetch('THREAD_COUNT', 1).to_i
    thread_count = [requested, 1].max
    Careerbuilder.new('http://careerbuilder.vn', thread_count).crawl
  end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment