Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
V
Venjob_HungNT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ngô Trung Hưng
Venjob_HungNT
Commits
a1f70d89
Commit
a1f70d89
authored
Jul 16, 2020
by
Ngo Trung Hung
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
finished crawler 50% jobs
parent
c797139b
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
126 additions
and
123 deletions
+126
-123
app/controllers/application_controller.rb
+0
-0
app/controllers/home_controller.rb
+4
-1
app/helpers/crawler_helper.rb
+0
-0
config/database.yml
+1
-1
lib/helper/crawler.rb
+117
-0
lib/tasks/crawler.rake
+4
-121
No files found.
app/controllers/application_controller.rb
View file @
a1f70d89
app/controllers/home_controller.rb
View file @
a1f70d89
...
...
@@ -3,6 +3,9 @@ class HomeController < ApplicationController
# Home page action. As rendered here the body invokes
# crawl_data_jobs_interface_3 and then make_data; neither helper is defined in
# the visible part of this file, and the remaining crawl_data_jobs_interface_*
# calls are commented out.
# NOTE(review): this text comes from a diff view whose +/- markers were
# stripped, so the live crawl_data_jobs_interface_3 call and its commented-out
# twin are probably the old and new versions of the same line - confirm against
# the repository before relying on either.
def
index
# crawl_data_jobs_interface_1()
# crawl_data_jobs_interface_2()
crawl_data_jobs_interface_3
()
# crawl_data_jobs_interface_3()
# crawl_data_jobs_interface_4()
# crawl_data_jobs_interface_5()
make_data
end
end
app/helpers/crawler_helper.rb
View file @
a1f70d89
This diff is collapsed.
Click to expand it.
config/database.yml
View file @
a1f70d89
...
...
@@ -14,7 +14,7 @@ default: &default
encoding
:
utf8
pool
:
<%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username
:
root
password
:
'
1'
password
:
'
1
2345678
'
socket
:
/var/run/mysqld/mysqld.sock
...
...
lib/helper/crawler.rb
0 → 100644
View file @
a1f70d89
require
'open-uri'
# Scrapes careerbuilder.vn and stores what it finds through the City, Industry
# and Company models. Every entry point is a class method; the class is never
# instantiated.
#
# The class name and craw_data_companies are spelled as in the original code
# because callers reference them by those names.
class Clawler
  # Site root; listing pages return absolute links that start with this prefix.
  BASE_URL = 'https://careerbuilder.vn/'.freeze

  # Search page whose #location and #industry dropdowns supply the seed data.
  SEARCH_URL = 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'.freeze

  # Options at indexes 0..69 of the location dropdown are stored with area 1,
  # every later option with area 0.
  # NOTE(review): presumably separates two groups of locations on the site -
  # confirm what the area flag means and whether 69 is still the right cut-off.
  LAST_AREA_ONE_INDEX = 69

  # Number of listing pages crawled when the caller does not say otherwise.
  DEFAULT_PAGES = 1

  # Creates one City per <option> of the #location dropdown.
  #
  # @return [Integer] number of options processed
  # @raise whatever City.create! raises when a record cannot be saved
  def self.make_cities
    names = option_names('#location option')
    names.each_with_index do |name, index|
      City.create!(name: name, area: index <= LAST_AREA_ONE_INDEX ? 1 : 0)
    end
    names.length
  end

  # Creates one Industry per <option> of the #industry dropdown.
  #
  # @return [Integer] number of options processed
  # @raise whatever Industry.create! raises when a record cannot be saved
  def self.make_industries
    names = option_names('#industry option')
    names.each { |name| Industry.create!(name: name) }
    names.length
  end

  # Collects company and job links from the paginated job listing.
  #
  # @param pages [Integer] how many listing pages to read, starting at page 1
  # @return [Array(Array<String>, Array<String>)] company paths (de-duplicated)
  #   followed by job paths, both relative to BASE_URL
  def self.crawl_link_for_companies_jobs(pages: DEFAULT_PAGES)
    company_paths = []
    job_paths = []

    (1..pages).each do |number|
      listing = Nokogiri::HTML(URI.open("#{BASE_URL}viec-lam/tat-ca-viec-lam-trang-#{number}-vi.html"))
      company_paths.concat(relative_links(listing, '.figcaption .caption a/@href'))
      job_paths.concat(relative_links(listing, '.figcaption .title .job_link @href'))
    end

    # Non-bang uniq: uniq! returns nil when nothing was removed, which used to
    # crash the caller whenever every company link was already unique.
    [company_paths.uniq, job_paths]
  end

  # Visits every company link and gathers name, address and description.
  # Companies missing any of the three fields are skipped.
  #
  # @return [Hash{Symbol => Array<String>}] parallel arrays under :name,
  #   :address and :description; index n of each array describes one company
  def self.craw_data_companies
    company_paths, _job_paths = crawl_link_for_companies_jobs
    collected = { name: [], address: [], description: [] }

    company_paths.each do |path|
      # URI.escape was removed in Ruby 3.0; the default parser still offers the
      # same escaping for paths that contain non-ASCII characters.
      target = URI.parse(URI::DEFAULT_PARSER.escape("#{BASE_URL}#{path}"))
      name, address, description = company_fields(Nokogiri::HTML(URI.open(target)))
      next if name.empty? || address.empty? || description.empty?

      collected[:name] << name.rstrip
      collected[:address] << address.rstrip
      # Drop only line breaks and tabs. The previous character set also
      # contained "[", "," and "]", which removed punctuation from the text.
      collected[:description] << description.delete("\n\t\r").strip
    end

    collected
  end

  # Creates one Company per entry returned by craw_data_companies.
  #
  # @return [Integer] number of companies created
  # @raise whatever Company.create! raises when a record cannot be saved
  def self.make_companies
    data = craw_data_companies
    data[:name].each_with_index do |name, index|
      Company.create!(
        name: name,
        address: data[:address][index],
        short_description: data[:description][index]
      )
    end
    data[:name].length
  end

  # Parsed search page, fetched on first use and then reused, so that merely
  # loading this class no longer performs an HTTP request.
  def self.page
    @page ||= Nokogiri::HTML(URI.open(SEARCH_URL))
  end

  # Visible text of every node matching +selector+ on the search page, with
  # line breaks removed and surrounding whitespace stripped. Reading the node
  # text replaces the earlier tag-stripping regex over serialized HTML.
  def self.option_names(selector)
    page.search(selector).map { |option| option.text.delete("\n").strip }
  end

  # Links matched by +selector+ on +listing+, relative to BASE_URL. The matched
  # href values are concatenated and then cut apart at each BASE_URL prefix.
  def self.relative_links(listing, selector)
    listing.search(selector).text.split(BASE_URL).reject(&:empty?)
  end

  # Returns [name, address, description] from a company page. Two sets of
  # selectors are tried: the second is used when the first finds no name.
  def self.company_fields(profile)
    name = profile.search('.company-info .info .content .name').text

    if name.empty?
      [
        profile.search('.section-page #cp_company_name').text,
        profile.search('.section-page .cp_basic_info_details ul li:nth-child(1)').text,
        profile.search('.cp_aboutus_item .content_fck').text
      ]
    else
      [
        name,
        profile.search('.company-info .info .content p:nth-child(3)').text,
        profile.search('.main-about-us .content').text
      ]
    end
  end

  private_class_method :page, :option_names, :relative_links, :company_fields
end
\ No newline at end of file
lib/tasks/crawler.rake
View file @
a1f70d89
require
'
open-uri
'
require
'
helper/crawler
'
namespace :db do
  # Seeds cities, industries and companies by crawling careerbuilder.vn.
  task populate: :environment do
    make_cities
    make_industries
    make_companies
  end

  # The helpers below keep the names this file has always defined, but they now
  # forward to Clawler (loaded via require 'helper/crawler' at the top of this
  # file) instead of carrying a second copy of the crawling code. Dropping the
  # copy also removes the global $page and the unused p1 page, each of which
  # triggered an HTTP request every time rake loaded this file.
  #
  # NOTE(review): the inline copy that used to live here read 3 listing pages;
  # Clawler.crawl_link_for_companies_jobs reads 1. Raise the page count in
  # Clawler if three pages are still wanted.

  # Creates a City for every location option on the search page.
  def make_cities
    Clawler.make_cities
  end

  # Creates an Industry for every industry option on the search page.
  def make_industries
    Clawler.make_industries
  end

  # Returns the company links and job links found on the listing pages.
  def crawl_link_for_companies_jobs
    Clawler.crawl_link_for_companies_jobs
  end

  # Returns the scraped company names, addresses and descriptions. This used to
  # be defined by a def nested inside make_companies, so it only existed after
  # make_companies had run once; it is now always available.
  def craw_data_companies
    Clawler.craw_data_companies
  end

  # Creates a Company for every scraped company page.
  def make_companies
    Clawler.make_companies
  end

  # Runs the three Clawler seeders in order. Despite the name, no job records
  # are created here.
  def make_jobs
    Clawler.make_cities
    Clawler.make_industries
    Clawler.make_companies
  end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment