Commit 36ba17f0 by Hoang Phuc

Address review feedback and customize the crawler rake task

parent a63a3e75
Pipeline #542 failed with stages in 0 seconds
...@@ -61,6 +61,8 @@ GEM ...@@ -61,6 +61,8 @@ GEM
bindex (0.8.1) bindex (0.8.1)
bootsnap (1.4.6) bootsnap (1.4.6)
msgpack (~> 1.0) msgpack (~> 1.0)
bootsnap (1.4.6-java)
msgpack (~> 1.0)
builder (3.2.4) builder (3.2.4)
byebug (11.1.1) byebug (11.1.1)
capybara (3.32.0) capybara (3.32.0)
...@@ -76,6 +78,9 @@ GEM ...@@ -76,6 +78,9 @@ GEM
crass (1.0.6) crass (1.0.6)
erubi (1.9.0) erubi (1.9.0)
ffi (1.12.2) ffi (1.12.2)
ffi (1.12.2-java)
ffi (1.12.2-x64-mingw32)
ffi (1.12.2-x86-mingw32)
globalid (0.4.2) globalid (0.4.2)
activesupport (>= 4.2.0) activesupport (>= 4.2.0)
i18n (1.8.2) i18n (1.8.2)
...@@ -99,13 +104,27 @@ GEM ...@@ -99,13 +104,27 @@ GEM
mini_portile2 (2.4.0) mini_portile2 (2.4.0)
minitest (5.14.0) minitest (5.14.0)
msgpack (1.3.3) msgpack (1.3.3)
msgpack (1.3.3-java)
msgpack (1.3.3-x64-mingw32)
msgpack (1.3.3-x86-mingw32)
mysql2 (0.5.3) mysql2 (0.5.3)
mysql2 (0.5.3-x64-mingw32)
mysql2 (0.5.3-x86-mingw32)
mysql2 (0.5.3-x86-mswin32-60)
nio4r (2.5.2) nio4r (2.5.2)
nio4r (2.5.2-java)
nokogiri (1.10.9) nokogiri (1.10.9)
mini_portile2 (~> 2.4.0) mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-java)
nokogiri (1.10.9-x64-mingw32)
mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0)
public_suffix (4.0.3) public_suffix (4.0.3)
puma (4.3.3) puma (4.3.3)
nio4r (~> 2.0) nio4r (~> 2.0)
puma (4.3.3-java)
nio4r (~> 2.0)
rack (2.2.2) rack (2.2.2)
rack-proxy (0.6.5) rack-proxy (0.6.5)
rack rack
...@@ -148,6 +167,10 @@ GEM ...@@ -148,6 +167,10 @@ GEM
sassc-rails (~> 2.1, >= 2.1.1) sassc-rails (~> 2.1, >= 2.1.1)
sassc (2.2.1) sassc (2.2.1)
ffi (~> 1.9) ffi (~> 1.9)
sassc (2.2.1-x64-mingw32)
ffi (~> 1.9)
sassc (2.2.1-x86-mingw32)
ffi (~> 1.9)
sassc-rails (2.1.2) sassc-rails (2.1.2)
railties (>= 4.0.0) railties (>= 4.0.0)
sassc (>= 2.0) sassc (>= 2.0)
...@@ -170,12 +193,15 @@ GEM ...@@ -170,12 +193,15 @@ GEM
sprockets (>= 3.0.0) sprockets (>= 3.0.0)
thor (1.0.1) thor (1.0.1)
thread_safe (0.3.6) thread_safe (0.3.6)
thread_safe (0.3.6-java)
tilt (2.0.10) tilt (2.0.10)
turbolinks (5.2.1) turbolinks (5.2.1)
turbolinks-source (~> 5.2) turbolinks-source (~> 5.2)
turbolinks-source (5.2.0) turbolinks-source (5.2.0)
tzinfo (1.2.6) tzinfo (1.2.6)
thread_safe (~> 0.1) thread_safe (~> 0.1)
tzinfo-data (1.2019.3)
tzinfo (>= 1.0.0)
web-console (4.0.1) web-console (4.0.1)
actionview (>= 6.0.0) actionview (>= 6.0.0)
activemodel (>= 6.0.0) activemodel (>= 6.0.0)
...@@ -191,13 +217,19 @@ GEM ...@@ -191,13 +217,19 @@ GEM
railties (>= 4.2) railties (>= 4.2)
websocket-driver (0.7.1) websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0) websocket-extensions (>= 0.1.0)
websocket-driver (0.7.1-java)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4) websocket-extensions (0.1.4)
xpath (3.2.0) xpath (3.2.0)
nokogiri (~> 1.8) nokogiri (~> 1.8)
zeitwerk (2.3.0) zeitwerk (2.3.0)
PLATFORMS PLATFORMS
java
ruby ruby
x64-mingw32
x86-mingw32
x86-mswin32
DEPENDENCIES DEPENDENCIES
bootsnap (>= 1.4.2) bootsnap (>= 1.4.2)
......
...@@ -8,17 +8,17 @@ namespace :crawler do ...@@ -8,17 +8,17 @@ namespace :crawler do
# Define exception logger # Define exception logger
exception_logger = ActiveSupport::Logger.new("log/exception_logger.log") exception_logger = ActiveSupport::Logger.new("log/exception_logger.log")
# Define skip logger
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page # Loop page
(1..2).each do |page| (1..2).each do |page|
# Fetch and parse HTML document # Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html")) html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
amount_html_jobs = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length
# Loop item # Loop item
(0..amount_html_jobs - 1).each do |i| (0..html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length - 1).each do |i|
# Define logger
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Get href of a tag and open job detail page # Get href of a tag and open job detail page
job_detail_url = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href")[i].text job_detail_url = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href")[i].text
...@@ -52,20 +52,22 @@ namespace :crawler do ...@@ -52,20 +52,22 @@ namespace :crawler do
# Check what template job belongs to # Check what template job belongs to
if html_job_detail.at_css("#uni_container .MyJobDetail") if html_job_detail.at_css("#uni_container .MyJobDetail")
# CSS DOM
css_dom = "#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
# Title # Title
job_attributes[:title] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .top-job .top-job-info h1").text job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job # Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .datepost span").text job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .datepost span").text
# Hash company # Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .TitleDetailNew span").text company_attributes[:title] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew span").text
company_attributes[:address] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .TitleDetailNew label label").text company_attributes[:address] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew label label").text
company_attributes[:logo] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .align_center.logocompany a img @src").text company_attributes[:logo] = html_job_detail.css("#{css_dom} .box1Detail .align_center.logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .desc_company.content_fck #emp_collapse").text.split("...")[0] company_attributes[:description] = html_job_detail.css("#{css_dom} .desc_company.content_fck #emp_collapse").text.split("...")[0]
# Get value for job attributes # Get value for job attributes
html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box2Detail .DetailJobNew li p").each_with_index do |ele, index| html_job_detail.css("#{css_dom} .box2Detail .DetailJobNew li p").each_with_index do |ele, index|
type = ele.css("span").text type = ele.css("span").text
case type case type
...@@ -97,7 +99,7 @@ namespace :crawler do ...@@ -97,7 +99,7 @@ namespace :crawler do
# Get description for job attributes # Get description for job attributes
description = "" description = ""
html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .MarBot20").each_with_index do |ele, index| html_job_detail.css("#{css_dom} .MarBot20").each_with_index do |ele, index|
description << ele.inner_html description << ele.inner_html
end end
...@@ -105,20 +107,23 @@ namespace :crawler do ...@@ -105,20 +107,23 @@ namespace :crawler do
job_attributes[:job_description] = description job_attributes[:job_description] = description
elsif html_job_detail.at_css("#uni_container .job-template-2") elsif html_job_detail.at_css("#uni_container .job-template-2")
# CSS DOM
css_dom = "#uni_container .job-template-2 .content-job-detail"
# Title # Title
job_attributes[:title] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .top-job .top-job-info h1").text job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job # Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "") job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")
# Hash company # Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .top-job .top-job-info .tit_company").text company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:address] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info p")[0].text company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:logo] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .top-job .logocompany a img @src").text company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .desc").text.split("...")[0] company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.split("...")[0]
# Get value for job attributes # Get value for job attributes
html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .info-career .info li").each_with_index do |ele, index| html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text type = ele.css("b").text
case type case type
...@@ -148,102 +153,25 @@ namespace :crawler do ...@@ -148,102 +153,25 @@ namespace :crawler do
end end
# Set description for job attributes # Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail #showScroll").inner_html job_attributes[:job_description] = html_job_detail.css("#{css_dom} #showScroll").inner_html
elsif html_job_detail.at_css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .TitleJoblarge")
# Title
job_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .TitleJoblarge h1").text
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[0].text
company_attributes[:logo] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .title_into .logoJobs img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[1].text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_right .content_job_info .content_job_left .left_380 li").each_with_index do |ele, index|
type = ele.css(".col_left_76").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css(".two_dot a").each_with_index do |ele, index|
if index > 0
city_id = check_exist_or_create_city(ele.text.gsub(",",""))
city_ids << city_id
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css(".two_dot").text
when "Lương"
job_attributes[:salary] = ele.css(".two_dot").text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css(".two_dot a").each_with_index do |ele, index|
industry_id = check_exist_or_create_industry(ele.text)
industry_ids << industry_id
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css(".two_dot").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left").inner_html
elsif html_job_detail.at_css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left .title320")
# Title
job_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left .title320 h1").text
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[0].text
company_attributes[:description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[1].text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_right .content_job_info .content_job_left .left_380 li").each_with_index do |ele, index|
type = ele.css(".col_left_76").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css(".two_dot a").each_with_index do |ele, index|
if index > 0
city_id = check_exist_or_create_city(ele.text.gsub(",",""))
city_ids << city_id
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css(".two_dot").text
when "Lương"
job_attributes[:salary] = ele.css(".two_dot").text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css(".two_dot a").each_with_index do |ele, index|
industry_id = check_exist_or_create_industry(ele.text)
industry_ids << industry_id
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css(".two_dot").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left").inner_html
elsif html_job_detail.at_css("#uni_container .job-template-201") elsif html_job_detail.at_css("#uni_container .job-template-201")
# CSS DOM
css_dom = "#uni_container .job-template-201"
# Title # Title
job_attributes[:title] = html_job_detail.css("#uni_container .job-template-201 .content-job-detail .top-job .top-job-info h1").text job_attributes[:title] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info h1").text
# Updated date job # Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .job-template-201 .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","") job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","")
# Hash company # Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .top-job .top-job-info .tit_company").text company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:logo] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .top-job .logocompany a img @src").text company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:address] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info p")[0].text company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:description] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "") company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")
# Get value for job attributes # Get value for job attributes
html_job_detail.css("#uni_container .job-template-201 .right-col .info-career .info li").each_with_index do |ele, index| html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text type = ele.css("b").text
case type case type
...@@ -273,7 +201,7 @@ namespace :crawler do ...@@ -273,7 +201,7 @@ namespace :crawler do
end end
# Set description for job attributes # Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container .job-template-201 .left-col #showScroll").inner_html job_attributes[:job_description] = html_job_detail.css("#{css_dom} .left-col #showScroll").inner_html
else else
skip_url_logger.info "another template #{job_detail_url}" skip_url_logger.info "another template #{job_detail_url}"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment