Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
venjob_nth
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ngô Trung Hưng
venjob_nth
Commits
d2e14dc8
Commit
d2e14dc8
authored
Aug 03, 2020
by
Ngô Trung Hưng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
autoload_paths
parent
043ca43e
Pipeline
#757
canceled with stages
in 0 seconds
Changes
7
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
37 additions
and
37 deletions
+37
-37
config/application.rb
+4
-2
lib/src/crawler.rb
+10
-6
lib/src/crawler_job.rb
+20
-19
lib/src/interface/blue_interface.rb
+0
-2
lib/src/interface/green_interface.rb
+0
-2
lib/src/interface/red_interface.rb
+0
-2
lib/tasks/crawler.rake
+3
-4
No files found.
config/application.rb
View file @
d2e14dc8
...
...
@@ -9,9 +9,11 @@ Bundler.require(*Rails.groups)
module
Venjob
class
Application
<
Rails
::
Application
# Initialize configuration defaults for originally generated Rails version.
config
.
autoload_paths
<<
Rails
.
root
.
join
(
'lib'
)
config
.
eager_load_paths
<<
Rails
.
root
.
join
(
'lib'
)
config
.
load_defaults
5.2
config
.
autoload_paths
+=
[
Rails
.
root
.
join
(
'lib/src'
),
Rails
.
root
.
join
(
'lib/src/base'
),
Rails
.
root
.
join
(
'lib/src/interface'
)]
# Settings in config/environments/* take precedence over those specified here.
# Application configuration can go into files in config/initializers
# -- all .rb files in that directory are automatically loaded after loading
...
...
lib/src/crawler.rb
View file @
d2e14dc8
...
...
@@ -5,10 +5,14 @@ require 'open-uri'
# Crawler data
class
Crawler
COMPANY_SECURITY
=
1
NUMBER_LINK
=
5
SIZE_LI
=
8
RANGE
=
69
attr_accessor
:number_link
# Builds a crawler and stores the value later read through +number_link+.
#
# @param number_link [Integer] stored as-is in @number_link; other visible
#   code calls +number_link.times+, so it is presumably a page count
def initialize(number_link)
  @number_link = number_link
end
# Location of the file that holds a single crawled link between runs.
#
# @return [Pathname] <Rails.root>/tmp/link.txt
def path_to_first_link
  segments = ['tmp', 'link.txt']
  Rails.root.join(*segments)
end
...
...
@@ -23,12 +27,12 @@ class Crawler
end
# Downloads +url+ and parses the response as HTML.
#
# The URL is percent-escaped before the request so that addresses containing
# non-ASCII characters can be opened. The page is fetched exactly once.
#
# @param url [String] address of the page to download
# @return [Object] the document returned by Nokogiri::HTML
# @raise [StandardError] whatever URI.open raises on network or HTTP failure
def safe_link(url)
  # URI.escape was removed in Ruby 3.0; the default parser still exposes the
  # same escaping behaviour.
  escaped_url = URI::DEFAULT_PARSER.escape(url)
  Nokogiri::HTML(URI.open(escaped_url))
end
def
crawl_link
(
page
)
def
crawl_link
website_companies
=
[]
page
.
times
do
|
i
|
number_link
.
times
do
|
i
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
i
+
1
}
-vi.html"
))
link_companies
=
page
.
search
(
'.figcaption .caption @href'
)
website_companies
+=
link_companies
.
map
(
&
:value
).
uniq
...
...
@@ -50,7 +54,7 @@ class Crawler
end
def
craw_data_companies
crawl_link
(
NUMBER_LINK
)
.
each
do
|
url
|
crawl_link
.
each
do
|
url
|
page
=
safe_link
(
url
)
company_name
=
page
.
search
(
'.company-info .content .name'
).
text
Company
.
find_or_create_by
(
name:
company_name
)
do
|
company
|
...
...
lib/src/crawler_job.rb
View file @
d2e14dc8
# frozen_string_literal: true
require
'src/crawler.rb'
require_relative
'../src/interface/red_interface.rb'
require_relative
'../src/interface/blue_interface.rb'
require_relative
'../src/interface/green_interface.rb'
# Crawler data job
class
CrawlerJob
<
Crawler
def
crawl_link
(
page
)
SIZE_LI
=
8
def
crawl_link
website_jobs
=
[]
page
.
times
do
|
i
|
number_link
.
times
do
|
i
|
page
=
Nokogiri
::
HTML
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
i
+
1
}
-vi.html"
))
link_jobs
=
page
.
search
(
'.figcaption .title .job_link @href'
)
website_jobs
+=
link_jobs
.
map
(
&
:value
)
break
if
website_jobs
.
include?
(
link_make_stop_crawler
)
link_jobs
.
each
do
|
val
|
link
=
val
.
value
return
website_jobs
if
link
.
include?
(
link_make_stop_crawler
)
website_jobs
<<
link
end
end
File
.
write
(
path_to_first_link
,
website_jobs
[
0
])
website_jobs
.
select
(
&
:present?
)
website_jobs
rescue
StandardError
=>
e
logger
.
error
"Crawler link on page have error
#{
e
}
"
logger
.
error
"Crawler link jobs on page have error
#{
e
}
"
end
# Memoized list of crawled links in reversed order.
#
# The first call runs +crawl_link+ and reverses its result in place; later
# calls return the cached @box_links without crawling again.
#
# @return [Array] the reversed result of +crawl_link+
def parse_data
  return @box_links if @box_links

  @box_links = crawl_link.reverse!
end
def
reverse_arr
arr_link
=
[]
crawl_link
(
NUMBER_LINK
).
each
{
|
val
|
arr_link
<<
val
}
arr_link
.
reverse!
# Overwrites the link file with the last element of +parse_data+.
#
# @return [Integer] number of bytes written, as returned by File.write
def refresh_first_link
  target = path_to_first_link
  File.write(target, parse_data.last)
end
def
craw_data_jobs
reverse_arr
.
each
do
|
path
|
parse_data
.
each
do
|
path
|
page
=
safe_link
(
path
)
if
page
.
search
(
'.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p'
)[
0
].
present?
@data
=
RedInterface
.
new
(
page
).
create_data
...
...
@@ -38,12 +39,12 @@ class CrawlerJob < Crawler
@data
=
GreenInterface
.
new
(
page
).
create_data
end
add_data
(
@data
)
refresh_first_link
end
end
def
add_data
(
data
)
id_company
=
Company
.
find_by
name:
data
[
:company_name
]
id_company
=
id_company
.
present?
?
id_company
.
id
:
COMPANY_SECURITY
id_company
=
(
Company
.
find_by
name:
data
[
:company_name
]).
try
(
:id
)
||
COMPANY_SECURITY
job
=
Job
.
create
(
name:
data
[
:name
],
company_id:
id_company
,
level:
data
[
:level
],
...
...
lib/src/interface/blue_interface.rb
View file @
d2e14dc8
# frozen_string_literal: true
require_relative
'../base/base.rb'
# Inherits from Base
class
BlueInterface
<
Base
def
fill_company_name
...
...
lib/src/interface/green_interface.rb
View file @
d2e14dc8
# frozen_string_literal: true
require_relative
'../base/base.rb'
# Inherits from Base
class
GreenInterface
<
Base
def
fill_name
...
...
lib/src/interface/red_interface.rb
View file @
d2e14dc8
# frozen_string_literal: true
require_relative
'../base/base.rb'
# Inherits from Base without overriding or adding any method.
class
RedInterface
<
Base
end
lib/tasks/crawler.rake
View file @
d2e14dc8
# frozen_string_literal: true
require
'open-uri'
require
'src/crawler'
require
'src/crawler_job'
# rake task
namespace :crawler do
  # Seeds the placeholder company, then crawls cities, companies and jobs.
  #
  # Every crawler is built with NUMBER_LINK_WILL_BE_CRAWLER because
  # Crawler#initialize requires a number_link argument; constructing one
  # without it raises ArgumentError. The job crawl runs exactly once.
  task populate: :environment do
    NUMBER_LINK_WILL_BE_CRAWLER = 100

    # Placeholder company; only filled in when it does not exist yet.
    Company.find_or_create_by(name: 'Bảo mật') do |company|
      company.address = 'Vui lòng xem trong mô tả công việc'
      company.short_description = 'Vui lòng xem trong mô tả công việc'
    end

    cw = Crawler.new(NUMBER_LINK_WILL_BE_CRAWLER)
    cw.craw_data_cities
    cw.craw_data_companies

    CrawlerJob.new(NUMBER_LINK_WILL_BE_CRAWLER).craw_data_jobs
  end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment