Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
ven-job
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Trịnh Hoàng Phúc
ven-job
Commits
9b293818
Commit
9b293818
authored
May 11, 2020
by
Trịnh Hoàng Phúc
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix review 11/05/2020
parent
ff55254b
Pipeline
#609
failed with stages
in 0 seconds
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
148 additions
and
114 deletions
+148
-114
app/models/city.rb
+2
-0
app/models/company.rb
+2
-0
app/models/industry.rb
+2
-0
app/models/job.rb
+2
-0
db/migrate/20200511055632_add_columns_to_jobs.rb
+9
-0
db/schema.rb
+16
-11
lib/tasks/crawler.rake
+115
-103
No files found.
app/models/city.rb
View file @
9b293818
class
City
<
ApplicationRecord
validates
:title
,
presence:
true
has_and_belongs_to_many
:jobs
end
app/models/company.rb
View file @
9b293818
class
Company
<
ApplicationRecord
validates
:title
,
presence:
true
has_many
:jobs
end
app/models/industry.rb
View file @
9b293818
class
Industry
<
ApplicationRecord
validates
:title
,
presence:
true
has_and_belongs_to_many
:jobs
end
app/models/job.rb
View file @
9b293818
class
Job
<
ApplicationRecord
validates
:title
,
presence:
true
belongs_to
:company
has_many
:applies
...
...
db/migrate/20200511055632_add_columns_to_jobs.rb
0 → 100644
View file @
9b293818
class
AddColumnsToJobs
<
ActiveRecord
::
Migration
[
6.0
]
def
change
add_column
:jobs
,
:min_salary
,
:bigint
,
:default
=>
0
add_column
:jobs
,
:max_salary
,
:bigint
,
:default
=>
0
add_column
:jobs
,
:benefit
,
:text
add_column
:jobs
,
:job_requirements
,
:text
add_column
:jobs
,
:other_information
,
:text
end
end
db/schema.rb
View file @
9b293818
...
...
@@ -10,9 +10,9 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord
::
Schema
.
define
(
version:
2020_0
4_23_044651
)
do
ActiveRecord
::
Schema
.
define
(
version:
2020_0
5_11_055632
)
do
create_table
"admins"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"admins"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"email"
,
default:
""
,
null:
false
t
.
string
"encrypted_password"
,
default:
""
,
null:
false
t
.
string
"reset_password_token"
...
...
@@ -24,7 +24,7 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t
.
index
[
"reset_password_token"
],
name:
"index_admins_on_reset_password_token"
,
unique:
true
end
create_table
"applies"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"applies"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
bigint
"user_id"
,
null:
false
t
.
bigint
"job_id"
,
null:
false
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
...
...
@@ -33,21 +33,21 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t
.
index
[
"user_id"
],
name:
"index_applies_on_user_id"
end
create_table
"cities"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"cities"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"title"
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
t
.
boolean
"foreign"
,
default:
false
end
create_table
"cities_jobs"
,
id:
false
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"cities_jobs"
,
id:
false
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
bigint
"city_id"
,
null:
false
t
.
bigint
"job_id"
,
null:
false
t
.
index
[
"city_id"
,
"job_id"
],
name:
"index_cities_jobs_on_city_id_and_job_id"
t
.
index
[
"job_id"
,
"city_id"
],
name:
"index_cities_jobs_on_job_id_and_city_id"
end
create_table
"companies"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"companies"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"title"
t
.
string
"address"
t
.
string
"logo"
...
...
@@ -56,7 +56,7 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
end
create_table
"favorites"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"favorites"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
bigint
"user_id"
,
null:
false
t
.
bigint
"job_id"
,
null:
false
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
...
...
@@ -65,20 +65,20 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t
.
index
[
"user_id"
],
name:
"index_favorites_on_user_id"
end
create_table
"industries"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"industries"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"title"
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
end
create_table
"industries_jobs"
,
id:
false
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"industries_jobs"
,
id:
false
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
bigint
"industry_id"
,
null:
false
t
.
bigint
"job_id"
,
null:
false
t
.
index
[
"industry_id"
,
"job_id"
],
name:
"index_industries_jobs_on_industry_id_and_job_id"
t
.
index
[
"job_id"
,
"industry_id"
],
name:
"index_industries_jobs_on_job_id_and_industry_id"
end
create_table
"jobs"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"jobs"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"title"
t
.
string
"updated_date_job"
t
.
string
"level"
...
...
@@ -89,10 +89,15 @@ ActiveRecord::Schema.define(version: 2020_04_23_044651) do
t
.
bigint
"company_id"
t
.
datetime
"created_at"
,
precision:
6
,
null:
false
t
.
datetime
"updated_at"
,
precision:
6
,
null:
false
t
.
bigint
"min_salary"
,
default:
0
t
.
bigint
"max_salary"
,
default:
0
t
.
text
"benefit"
t
.
text
"job_requirements"
t
.
text
"other_information"
t
.
index
[
"company_id"
],
name:
"index_jobs_on_company_id"
end
create_table
"users"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci
"
,
force: :cascade
do
|
t
|
create_table
"users"
,
options:
"ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
,
force: :cascade
do
|
t
|
t
.
string
"email"
,
default:
""
,
null:
false
t
.
string
"encrypted_password"
,
default:
""
,
null:
false
t
.
string
"reset_password_token"
...
...
lib/tasks/crawler.rake
View file @
9b293818
...
...
@@ -6,148 +6,160 @@ namespace :crawler do
task
job: :environment
do
# Define exception logger
exception_logger
=
ActiveSupport
::
Logger
.
new
(
"log/exception_logger.log"
)
exception_logger
=
Logger
.
new
(
"log/exception_logger.log"
)
# Define skip logger
skip_url_logger
=
ActiveSupport
::
Logger
.
new
(
"log/skip_url_logger.log"
)
skip_url_logger
=
Logger
.
new
(
"log/skip_url_logger.log"
)
# Loop page
(
1
0
..
1
2
).
each
do
|
page
|
(
1
..
2
).
each
do
|
page
|
# Fetch and parse HTML document
html_jobs
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
))
html_jobs
=
Nokogiri
::
HTML
.
parse
(
URI
.
open
(
"https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-
#{
page
}
-vi.html"
))
# Loop item
html_jobs
.
css
(
".jobs-side-list .job-item"
).
each
do
|
item
|
# Set salary, min-salary, max-salary
salary
=
item
.
at_css
(
".figure .figcaption .caption .salary"
).
text
.
gsub
(
"$ "
,
""
)
if
salary
==
"Cạnh tranh"
min_salary
=
0
max_salary
=
999999999
elsif
salary
.
include?
"Dưới"
min_salary
=
0
max_salary
=
(
salary
.
gsub
(
"Dưới "
,
""
).
gsub
(
" Tr VND"
,
""
).
gsub
(
","
,
"."
).
to_f
*
1000000
).
to_i
elsif
salary
.
include?
"Trên"
min_salary
=
(
salary
.
gsub
(
"Trên "
,
""
).
gsub
(
" Tr VND"
,
""
).
gsub
(
","
,
"."
).
to_f
*
1000000
).
to_i
max_salary
=
999999999
else
range_salary
=
salary
.
split
(
"-"
)
min_salary
=
(
range_salary
[
0
].
gsub
(
"$ "
,
""
).
gsub
(
" Tr "
,
""
).
to_f
*
1000000
).
to_i
max_salary
=
(
range_salary
[
1
].
gsub
(
" Tr VND"
,
""
).
gsub
(
" "
,
""
).
to_f
*
1000000
).
to_i
end
# Job attributes
job_attributes
=
{
title:
item
.
css
(
".figure .figcaption .title a @title"
).
text
,
updated_date_job:
item
.
css
(
".bottom-right-icon .time time"
).
text
,
level:
nil
,
years_of_experience:
nil
,
salary:
item
.
css
(
".figure .figcaption .caption .salary"
).
text
.
gsub
(
"$ "
,
""
),
expiration_date:
nil
,
job_description:
nil
,
company_id:
nil
,
title:
item
.
at_css
(
".figure .figcaption .title a @title"
).
text
,
updated_date_job:
item
.
at_css
(
".bottom-right-icon .time time"
).
text
,
salary:
salary
,
min_salary:
min_salary
,
max_salary:
max_salary
}
# Defind cities array
cities
=
[]
item
.
css
(
".figure .figcaption .caption .location ul li"
).
each
do
|
city
|
city
=
check_exist_or_create_city
(
city
.
text
.
strip
)
cities
<<
city
end
if
item
.
css
(
".figure .image a @href"
).
text
!=
"javascript:void(0);"
# Company attributes
html_company_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .image a @href"
).
text
)))
unless
html_company_detail
.
at_css
(
".jobsby-company"
).
nil?
company_attributes
=
{
title:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content .name"
).
text
,
address:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content p"
)[
1
].
text
,
logo:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .img @src"
).
text
,
description:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content ul"
).
inner_html
.
strip
}
# Check exist or create company
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
end
end
# Defind industry ids array
industries
=
[]
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
)))
unless
html_job_detail
.
at_css
(
".search-result-list-detail"
).
nil
?
html_job_detail
=
Nokogiri
::
HTML
.
parse
(
URI
.
open
(
URI
.
encode
(
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
)))
if
html_job_detail
.
at_css
(
".search-result-list-detail .container .no-gutters"
).
present
?
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li"
).
each
do
|
ele
|
type
=
ele
.
css
(
"strong"
).
text
type
=
ele
.
at_css
(
"strong"
).
text
case
type
when
"Hết hạn nộp"
job_attributes
[
:expiration_date
]
=
ele
.
css
(
"p"
).
text
.
strip
job_attributes
[
:expiration_date
]
=
ele
.
at_css
(
"p"
).
text
.
squish
when
"Cấp bậc"
job_attributes
[
:level
]
=
ele
.
css
(
"p"
).
text
.
strip
job_attributes
[
:level
]
=
ele
.
at_css
(
"p"
).
text
.
squish
when
"Kinh nghiệm"
job_attributes
[
:years_of_experience
]
=
ele
.
css
(
"p"
).
text
.
strip
job_attributes
[
:years_of_experience
]
=
ele
.
at_css
(
"p"
).
text
.
squish
end
end
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row"
).
each
do
|
ele
|
if
ele
.
at_css
(
"h3"
).
present?
type
=
ele
.
at_css
(
"h3"
).
text
case
type
when
"Phúc lợi "
job_attributes
[
:benefit
]
=
ele
.
at_css
(
"ul"
).
inner_html
.
squish
when
"Mô tả Công việc"
job_attributes
[
:job_description
]
=
ele
.
inner_html
.
squish
.
gsub
(
"<h3 class=
\"
detail-title
\"
>Mô tả Công việc</h3>"
,
""
)
when
"Yêu Cầu Công Việc"
job_attributes
[
:job_requirements
]
=
ele
.
inner_html
.
squish
.
gsub
(
"<h3 class=
\"
detail-title
\"
>Yêu Cầu Công Việc</h3>"
,
""
)
when
"Thông tin khác"
job_attributes
[
:other_information
]
=
ele
.
inner_html
.
squish
.
gsub
(
"<h3 class=
\"
detail-title
\"
>Thông tin khác</h3>"
,
""
)
end
end
end
if
item
.
at_css
(
".figure .image a @href"
).
text
!=
"javascript:void(0);"
# Company attributes
html_company_detail
=
Nokogiri
::
HTML
.
parse
(
open
(
URI
.
encode
(
item
.
css
(
".figure .image a @href"
).
text
)))
if
html_company_detail
.
at_css
(
".jobsby-company"
).
present?
company_attributes
=
{
title:
html_company_detail
.
at_css
(
".jobsby-company .company-introduction .company-info .info .content .name"
).
text
,
address:
html_company_detail
.
css
(
".jobsby-company .company-introduction .company-info .info .content p"
)[
1
].
text
,
logo:
html_company_detail
.
at_css
(
".jobsby-company .company-introduction .company-info .info .img @src"
).
text
,
description:
html_company_detail
.
at_css
(
".jobsby-company .company-introduction .company-info .info .content ul"
).
inner_html
.
squish
}
# Check exist or create company
job_attributes
[
:company_id
]
=
check_exist_or_create_company
(
company_attributes
)
end
end
# Create job
job
=
check_exist_or_create_job
(
job_attributes
)
# Defind cities array
cities
=
[]
item
.
css
(
".figure .figcaption .caption .location ul li"
).
each
do
|
city
|
city
=
check_exist_or_create_city
(
city
.
text
.
squish
)
cities
<<
city
end
# Create city_job
if
cities
.
length
>
0
cities
.
each
do
|
city
|
job
.
cities
<<
city
end
end
# Create industry_job
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a"
).
each
do
|
ele
|
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
).
s
trip
)
industry
=
check_exist_or_create_industry
(
ele
.
text
.
gsub
(
","
,
""
).
s
quish
)
industries
<<
industry
end
# Get description for job attributes
description
=
""
html_job_detail
.
css
(
".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row"
).
each
do
|
ele
|
description
<<
ele
.
inner_html
if
industries
.
length
>
0
industries
.
each
do
|
industry
|
job
.
industries
<<
industry
end
end
# Set description for job attributes
job_attributes
[
:job_description
]
=
description
.
strip
else
skip_url_logger
.
info
"another template
#{
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
skip_url_logger
.
info
"another template
#{
item
.
at_
css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
end
# Create job
job
=
check_exist_or_create_job
(
job_attributes
)
# Create city_job
if
cities
.
count
>
0
cities
.
each
do
|
city
|
job
.
cities
<<
city
end
end
# Create industry_job
if
industries
.
count
>
0
industries
.
each
do
|
industry
|
job
.
industries
<<
industry
end
end
rescue
exception_logger
.
info
"Error url:
#{
item
.
css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
rescue
Exception
=>
e
exception_logger
.
info
e
skip_url_logger
.
info
"another template
#{
item
.
at_css
(
".figure .figcaption .title .job_link @href"
).
text
}
"
next
end
end
end
task
city: :environment
do
# Fetch and parse HTML document
html_cities
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/tim-viec-lam.html"
))
unless
html_cities
.
at_css
(
".find-jobsby-categories .main-jobs-by-location"
).
nil?
# Defind cities array
cities
=
[]
# Get city in country
html_cities
.
css
(
".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a"
).
each
do
|
title
|
city
=
{
title:
title
.
text
.
gsub
(
"Việc làm tại "
,
""
).
strip
,
foreign:
false
}
cities
<<
city
end
# Get city foreign
html_cities
.
css
(
".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a"
).
each
do
|
title
|
city
=
{
title:
title
.
text
.
strip
,
foreign:
true
}
cities
<<
city
end
if
cities
.
count
>
0
City
.
import
cities
end
# Get city in country
cities_in_country
=
html_cities
.
css
(
".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a"
).
map
do
|
title
|
{
title:
title
.
text
.
gsub
(
"Việc làm tại "
,
""
).
squish
,
foreign:
false
}
end
# Get city foreign
cities_foreign
=
html_cities
.
css
(
".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a"
).
map
do
|
title
|
{
title:
title
.
text
.
squish
,
foreign:
true
}
end
cities
=
cities_in_country
+
cities_foreign
if
cities
.
length
>
0
City
.
import
cities
end
end
task
industry: :environment
do
# Fetch and parse HTML document
html_industries
=
Nokogiri
::
HTML
.
parse
(
open
(
"https://careerbuilder.vn/tim-viec-lam.html"
))
unless
html_industries
.
at_css
(
".find-jobsby-categories .list-of-working-positions"
).
nil?
# Defind industries array
industries
=
[]
# Get industry
html_industries
.
css
(
".find-jobsby-categories .list-of-working-positions .list-jobs li a"
).
each
do
|
title
|
industry
=
{
title:
title
.
text
.
strip
}
industries
<<
industry
end
if
industries
.
count
>
0
Industry
.
import
industries
end
# Get industry
industries
=
html_industries
.
css
(
".find-jobsby-categories .list-of-working-positions .list-jobs li a"
).
map
do
|
title
|
{
title:
title
.
text
.
squish
}
end
if
industries
.
length
>
0
Industry
.
import
industries
end
end
def
check_exist_or_create_company
(
company_attributes
)
find_company
=
Company
.
find_or_create_by
(
company_attributes
)
return
find_company
.
id
...
...
@@ -155,7 +167,7 @@ namespace :crawler do
def
check_exist_or_create_industry
(
industry_title
)
industries
=
Industry
.
where
(
"title LIKE ?"
,
industry_title
)
if
industries
.
count
==
0
if
industries
.
length
==
0
industry
=
Industry
.
create
(
title:
industry_title
)
else
industry
=
industries
[
0
]
...
...
@@ -165,7 +177,7 @@ namespace :crawler do
def
check_exist_or_create_city
(
city_title
)
cities
=
City
.
where
(
"title LIKE ?"
,
city_title
)
if
cities
.
count
==
0
if
cities
.
length
==
0
city
=
City
.
create
(
title:
city_title
)
else
city
=
cities
[
0
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment