Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
VeNJOB
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Nguyen Hoang Mai Phuong
VeNJOB
Commits
6ba9bc64
Commit
6ba9bc64
authored
Jul 23, 2021
by
Nguyen Hoang Mai Phuong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix crawler
parent
fe91b8cf
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
23 additions
and
17 deletions
+23
-17
config/schedule.rb
+1
-1
db/migrate/20210723035105_change_companies.rb
+5
-0
db/schema.rb
+2
-2
lib/tasks/crawler.rake
+15
-14
No files found.
config/schedule.rb
View file @
6ba9bc64
every
1
.
day
,
at:
'08:
18
am'
do
every
1
.
day
,
at:
'08:
00
am'
do
rake
'crawler:all'
rake
'crawler:all'
end
end
db/migrate/20210723035105_change_companies.rb
0 → 100644
View file @
6ba9bc64
class
ChangeCompanies
<
ActiveRecord
::
Migration
[
6.1
]
def
change
change_column
:companies
,
:address
,
:text
end
end
db/schema.rb
View file @
6ba9bc64
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
#
#
# It's strongly recommended that you check this file into your version control system.
# It's strongly recommended that you check this file into your version control system.
ActiveRecord
::
Schema
.
define
(
version:
2021_07_2
0_145646
)
do
ActiveRecord
::
Schema
.
define
(
version:
2021_07_2
3_035105
)
do
create_table
"apply_jobs"
,
charset:
"utf8mb4"
,
collation:
"utf8mb4_0900_ai_ci"
,
force: :cascade
do
|
t
|
create_table
"apply_jobs"
,
charset:
"utf8mb4"
,
collation:
"utf8mb4_0900_ai_ci"
,
force: :cascade
do
|
t
|
t
.
bigint
"user_id"
,
null:
false
t
.
bigint
"user_id"
,
null:
false
...
@@ -44,7 +44,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_145646) do
...
@@ -44,7 +44,7 @@ ActiveRecord::Schema.define(version: 2021_07_20_145646) do
t
.
text
"description"
t
.
text
"description"
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
t
.
string
"address"
t
.
text
"address"
t
.
text
"overview"
t
.
text
"overview"
end
end
...
...
lib/tasks/crawler.rake
View file @
6ba9bc64
...
@@ -4,6 +4,8 @@ require 'logger'
...
@@ -4,6 +4,8 @@ require 'logger'
namespace
:crawler
do
namespace
:crawler
do
desc
'Crawl Jobs and Companies'
desc
'Crawl Jobs and Companies'
task
jobs: :environment
do
task
jobs: :environment
do
logger
=
Logger
.
new
(
"
#{
Rails
.
root
}
/log/crawler_jobs.log"
)
logger
.
info
"Start crawler job at:
#{
Time
.
current
}
"
base_url
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/'
))
base_url
=
Nokogiri
::
HTML
(
URI
.
open
(
'https://careerbuilder.vn/'
))
job_page
=
base_url
.
css
(
'div.menu div.dropdown-menu ul li a'
)[
0
].
attributes
[
'href'
].
value
job_page
=
base_url
.
css
(
'div.menu div.dropdown-menu ul li a'
)[
0
].
attributes
[
'href'
].
value
parse_job_page
=
Nokogiri
::
HTML
(
URI
.
open
(
job_page
))
parse_job_page
=
Nokogiri
::
HTML
(
URI
.
open
(
job_page
))
...
@@ -28,8 +30,8 @@ namespace :crawler do
...
@@ -28,8 +30,8 @@ namespace :crawler do
company_name
=
company
.
css
(
'div.company-info div.content p.name'
)
company_name
=
company
.
css
(
'div.company-info div.content p.name'
)
next
if
company_name
.
nil?
next
if
company_name
.
nil?
logger
=
Logger
.
new
(
"
#{
Rails
.
root
}
/log/crawler_jobs.log
"
)
logger
.
info
(
"Link company:
#{
company_page
}
"
)
logger
.
info
(
"Link company:
#{
company_page
}
.to_s"
)
company_info
=
company
.
css
(
'div.company-info div.content'
)
company_info
=
company
.
css
(
'div.company-info div.content'
)
address
=
company_info
.
css
(
'p'
)[
1
].
try
(
:text
)
address
=
company_info
.
css
(
'p'
)[
1
].
try
(
:text
)
description
=
company_info
.
css
(
'ul li'
).
text
description
=
company_info
.
css
(
'ul li'
).
text
...
@@ -51,22 +53,22 @@ namespace :crawler do
...
@@ -51,22 +53,22 @@ namespace :crawler do
next
if
title
.
nil?
next
if
title
.
nil?
logger
.
info
(
"Link job:
#{
job_detail_page
}
"
)
logger
.
info
(
"Link job:
#{
job_detail_page
}
"
)
salary
,
experience
,
type
,
level
,
expired_at
=
''
salary
,
experience
,
type
,
level
,
expired_at
=
''
detail_content
=
detail_job
.
css
(
'div.
col-lg-4 col-sm-6 item-blue
ul li'
)
detail_content
=
detail_job
.
css
(
'div.
row div.detail-box.has-background
ul li'
)
detail_content
.
each
do
|
content
|
detail_content
.
each
do
|
content
|
case
content
.
css
(
'strong'
).
text
case
content
.
css
(
'strong'
).
text
when
'Lương'
when
'Lương'
salary
=
content
.
css
(
'p'
).
text
salary
=
content
.
css
(
'p'
).
text
when
'Kinh nghiệm'
when
'Kinh nghiệm'
puts
content
.
css
(
'p'
).
text
experience
=
content
.
css
(
'p'
).
text
when
'Hình thức'
when
'Hình thức'
puts
content
.
css
(
'p'
).
text
type
=
content
.
css
(
'p'
).
text
when
'Cấp bậc'
when
'Cấp bậc'
level
=
content
.
css
(
'p'
).
text
level
=
content
.
css
(
'p'
).
text
when
'Hết hạn nộp'
when
'Hết hạn nộp'
expired_at
=
content
.
css
(
'p'
).
text
expired_at
=
content
.
css
(
'p'
).
text
end
end
end
end
benefits
,
overview
,
requirement
,
other_requirement
=
''
benefits
,
overview
,
requirement
,
other_requirement
=
''
detail_require
=
detail_job
.
css
(
'div.detail-row'
)
detail_require
=
detail_job
.
css
(
'div.detail-row'
)
...
@@ -97,26 +99,25 @@ namespace :crawler do
...
@@ -97,26 +99,25 @@ namespace :crawler do
company_id:
Company
.
find_by
(
name:
company_name
.
text
).
id
company_id:
Company
.
find_by
(
name:
company_name
.
text
).
id
)
)
job_industries
=
[]
industries
=
detail_job
.
css
(
'div.detail-box.has-background ul li p a'
)
industries
=
detail_job
.
css
(
'div.detail-box.has-background ul li p a'
)
industries
.
each
do
|
industry
|
industries
.
each
do
|
industry
|
name
=
industry
.
text
.
squish
name
=
industry
.
text
.
squish
industry_name
=
Industry
.
find_or_create_by
(
job_industries
<<
Industry
.
find__by
(
name:
name
)
name:
name
)
job
.
industries
<<
industry_name
end
end
job
.
industries
<<
job_industries
job_cities
=
[]
location
=
detail_job
.
css
(
'div.map p a'
)
location
=
detail_job
.
css
(
'div.map p a'
)
location
.
each
do
|
city
|
location
.
each
do
|
city
|
name
=
city
.
text
name
=
city
.
text
city_name
=
City
.
find_or_create_by
(
job_cities
<<
City
.
find_by
(
name:
name
)
name:
name
)
job
.
cities
<<
city_name
end
end
job
.
cities
<<
job_cities
unless
job_cities
.
nil?
end
end
page
+=
1
page
+=
1
end
end
logger
.
info
"End crawler job at:
#{
Time
.
current
}
"
end
end
desc
'Crawl Industries'
desc
'Crawl Industries'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment