Commit 36ba17f0 by Hoang Phuc

Fix review, customize rake task crawler

parent a63a3e75
Pipeline #542 failed with stages in 0 seconds
......@@ -61,6 +61,8 @@ GEM
bindex (0.8.1)
bootsnap (1.4.6)
msgpack (~> 1.0)
bootsnap (1.4.6-java)
msgpack (~> 1.0)
builder (3.2.4)
byebug (11.1.1)
capybara (3.32.0)
......@@ -76,6 +78,9 @@ GEM
crass (1.0.6)
erubi (1.9.0)
ffi (1.12.2)
ffi (1.12.2-java)
ffi (1.12.2-x64-mingw32)
ffi (1.12.2-x86-mingw32)
globalid (0.4.2)
activesupport (>= 4.2.0)
i18n (1.8.2)
......@@ -99,13 +104,27 @@ GEM
mini_portile2 (2.4.0)
minitest (5.14.0)
msgpack (1.3.3)
msgpack (1.3.3-java)
msgpack (1.3.3-x64-mingw32)
msgpack (1.3.3-x86-mingw32)
mysql2 (0.5.3)
mysql2 (0.5.3-x64-mingw32)
mysql2 (0.5.3-x86-mingw32)
mysql2 (0.5.3-x86-mswin32-60)
nio4r (2.5.2)
nio4r (2.5.2-java)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-java)
nokogiri (1.10.9-x64-mingw32)
mini_portile2 (~> 2.4.0)
nokogiri (1.10.9-x86-mingw32)
mini_portile2 (~> 2.4.0)
public_suffix (4.0.3)
puma (4.3.3)
nio4r (~> 2.0)
puma (4.3.3-java)
nio4r (~> 2.0)
rack (2.2.2)
rack-proxy (0.6.5)
rack
......@@ -148,6 +167,10 @@ GEM
sassc-rails (~> 2.1, >= 2.1.1)
sassc (2.2.1)
ffi (~> 1.9)
sassc (2.2.1-x64-mingw32)
ffi (~> 1.9)
sassc (2.2.1-x86-mingw32)
ffi (~> 1.9)
sassc-rails (2.1.2)
railties (>= 4.0.0)
sassc (>= 2.0)
......@@ -170,12 +193,15 @@ GEM
sprockets (>= 3.0.0)
thor (1.0.1)
thread_safe (0.3.6)
thread_safe (0.3.6-java)
tilt (2.0.10)
turbolinks (5.2.1)
turbolinks-source (~> 5.2)
turbolinks-source (5.2.0)
tzinfo (1.2.6)
thread_safe (~> 0.1)
tzinfo-data (1.2019.3)
tzinfo (>= 1.0.0)
web-console (4.0.1)
actionview (>= 6.0.0)
activemodel (>= 6.0.0)
......@@ -191,13 +217,19 @@ GEM
railties (>= 4.2)
websocket-driver (0.7.1)
websocket-extensions (>= 0.1.0)
websocket-driver (0.7.1-java)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.4)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.3.0)
PLATFORMS
java
ruby
x64-mingw32
x86-mingw32
x86-mswin32
DEPENDENCIES
bootsnap (>= 1.4.2)
......
......@@ -8,17 +8,17 @@ namespace :crawler do
# Define exception logger
exception_logger = ActiveSupport::Logger.new("log/exception_logger.log")
# Define skip logger
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page
(1..2).each do |page|
# Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
amount_html_jobs = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length
# Loop item
(0..amount_html_jobs - 1).each do |i|
# Define logger
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop item
(0..html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length - 1).each do |i|
# Get href of a tag and open job detail page
job_detail_url = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href")[i].text
......@@ -52,20 +52,22 @@ namespace :crawler do
# Check what template job belongs to
if html_job_detail.at_css("#uni_container .MyJobDetail")
# CSS DOM
css_dom = "#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
# Title
job_attributes[:title] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .top-job .top-job-info h1").text
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .datepost span").text
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .datepost span").text
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .TitleDetailNew span").text
company_attributes[:address] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .TitleDetailNew label label").text
company_attributes[:logo] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box1Detail .align_center.logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .desc_company.content_fck #emp_collapse").text.split("...")[0]
company_attributes[:title] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew span").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew label label").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .box1Detail .align_center.logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .desc_company.content_fck #emp_collapse").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .box2Detail .DetailJobNew li p").each_with_index do |ele, index|
html_job_detail.css("#{css_dom} .box2Detail .DetailJobNew li p").each_with_index do |ele, index|
type = ele.css("span").text
case type
......@@ -97,7 +99,7 @@ namespace :crawler do
# Get description for job attributes
description = ""
html_job_detail.css("#uni_container .MyJobDetail .MyJobLeft .LeftJobCB .MarBot20").each_with_index do |ele, index|
html_job_detail.css("#{css_dom} .MarBot20").each_with_index do |ele, index|
description << ele.inner_html
end
......@@ -105,20 +107,23 @@ namespace :crawler do
job_attributes[:job_description] = description
elsif html_job_detail.at_css("#uni_container .job-template-2")
# CSS DOM
css_dom = "#uni_container .job-template-2 .content-job-detail"
# Title
job_attributes[:title] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .top-job .top-job-info h1").text
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:address] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info p")[0].text
company_attributes[:logo] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .aboutustp .info .desc").text.split("...")[0]
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container .job-template-2 .content-job-detail .right-col .info-career .info li").each_with_index do |ele, index|
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
......@@ -148,102 +153,25 @@ namespace :crawler do
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container .job-template-2 .content-job-detail #showScroll").inner_html
elsif html_job_detail.at_css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .TitleJoblarge")
# Title
job_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .TitleJoblarge h1").text
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[0].text
company_attributes[:logo] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .title_into .logoJobs img @src").text
company_attributes[:description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[1].text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_right .content_job_info .content_job_left .left_380 li").each_with_index do |ele, index|
type = ele.css(".col_left_76").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css(".two_dot a").each_with_index do |ele, index|
if index > 0
city_id = check_exist_or_create_city(ele.text.gsub(",",""))
city_ids << city_id
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css(".two_dot").text
when "Lương"
job_attributes[:salary] = ele.css(".two_dot").text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css(".two_dot a").each_with_index do |ele, index|
industry_id = check_exist_or_create_industry(ele.text)
industry_ids << industry_id
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css(".two_dot").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left").inner_html
elsif html_job_detail.at_css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left .title320")
# Title
job_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left .title320 h1").text
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[0].text
company_attributes[:description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .box_470 .intro_company .content_intro p")[1].text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_right .content_job_info .content_job_left .left_380 li").each_with_index do |ele, index|
type = ele.css(".col_left_76").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css(".two_dot a").each_with_index do |ele, index|
if index > 0
city_id = check_exist_or_create_city(ele.text.gsub(",",""))
city_ids << city_id
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css(".two_dot").text
when "Lương"
job_attributes[:salary] = ele.css(".two_dot").text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css(".two_dot a").each_with_index do |ele, index|
industry_id = check_exist_or_create_industry(ele.text)
industry_ids << industry_id
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css(".two_dot").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container #my_kiemviec #col3 #col3_content article #template_vantai #main_content .content_right .content_470 .intro_job .intro_left").inner_html
job_attributes[:job_description] = html_job_detail.css("#{css_dom} #showScroll").inner_html
elsif html_job_detail.at_css("#uni_container .job-template-201")
# CSS DOM
css_dom = "#uni_container .job-template-201"
# Title
job_attributes[:title] = html_job_detail.css("#uni_container .job-template-201 .content-job-detail .top-job .top-job-info h1").text
job_attributes[:title] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#uni_container .job-template-201 .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","")
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","")
# Hash company
company_attributes[:title] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:logo] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:address] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info p")[0].text
company_attributes[:description] = html_job_detail.css("#uni_container .job-template-201 .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")
# Get value for job attributes
html_job_detail.css("#uni_container .job-template-201 .right-col .info-career .info li").each_with_index do |ele, index|
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
......@@ -273,7 +201,7 @@ namespace :crawler do
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#uni_container .job-template-201 .left-col #showScroll").inner_html
job_attributes[:job_description] = html_job_detail.css("#{css_dom} .left-col #showScroll").inner_html
else
skip_url_logger.info "another template #{job_detail_url}"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment