Commit da165766 by Trịnh Hoàng Phúc

Merge branch 'feature/rake_task_crawler' into 'master'

Fix the crawler's attribute extraction because CareerBuilder has changed its UI

See merge request !11
parents 11c5feb9 475317f1
Pipeline #581 failed with stages
in 0 seconds
......@@ -16,216 +16,87 @@ namespace :crawler do
# Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item
(0..html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href").length - 1).each do |i|
# Get href of a tag and open job detail page
job_detail_url = html_jobs.css(".col-ListJobCate .gird_standard dl .brief .jobtitle .job a @href")[i].text
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(job_detail_url)))
html_jobs.css(".jobs-side-list .job-item").each_with_index do |item, index|
# Job attributes
job_attributes = {
title: nil,
updated_date_job: nil,
title: item.css(".figure .figcaption .title a @title").text,
updated_date_job: item.css(".bottom-right-icon .time time").text,
level: nil,
years_of_experience: nil,
salary: nil,
salary: item.css(".figure .figcaption .caption .salary").text.gsub("$ ",""),
expiration_date: nil,
job_description: nil,
company_id: nil,
}
# Define cities array
cities = []
item.css(".figure .figcaption .caption .location ul li").each_with_index do |city|
city = check_exist_or_create_city(city.text)
cities << city
end
if item.css(".figure .image a @href").text != "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
if html_company_detail.at_css(".jobsby-company")
company_attributes = {
title: nil,
address: nil,
logo: nil,
description: nil
title: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content .name").text,
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
logo: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .img @src").text,
description: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content ul").inner_html.strip
}
# Define city ids array
cities = []
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
end
end
# Define industry ids array
industries = []
# Check what template job belongs to
if html_job_detail.at_css("#uni_container .MyJobDetail")
# CSS DOM
css_dom = "#uni_container .MyJobDetail .MyJobLeft .LeftJobCB"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .datepost span").text
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew span").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .box1Detail .TitleDetailNew label label").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .box1Detail .align_center.logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .desc_company.content_fck #emp_collapse").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#{css_dom} .box2Detail .DetailJobNew li p").each_with_index do |ele, index|
type = ele.css("span").text
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text)))
if html_job_detail.at_css(".search-result-list-detail")
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele|
type = ele.css("strong").text
case type
when "Nơi làm việc: "
# Check exist or create city
ele.css("b a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text.gsub(",",""))
cities << city
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("p").text.strip
when "Cấp bậc"
job_attributes[:level] = ele.css("p").text.strip
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("p").text.strip
end
end
when "Cấp bậc: "
job_attributes[:level] = ele.css("label").text
when "Kinh nghiệm: "
job_attributes[:years_of_experience] = ele.text.gsub("Kinh nghiệm: ","")
when "Lương: "
job_attributes[:salary] = ele.text.gsub("Lương: ","")
when "Ngành nghề: "
# Check exist or create industry
ele.css("b a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text.gsub(",",""))
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele|
industry = check_exist_or_create_industry(ele.text.strip.gsub(",",""))
industries << industry
end
else
job_attributes[:expiration_date] = ele.text.gsub("Hết hạn nộp: ","")
end
end
# Get description for job attributes
description = ""
html_job_detail.css("#{css_dom} .MarBot20").each_with_index do |ele, index|
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-row").each do |ele|
description << ele.inner_html
end
# Set description for job attributes
job_attributes[:job_description] = description
elsif html_job_detail.at_css("#uni_container .job-template-2")
# CSS DOM
css_dom = "#uni_container .job-template-2 .content-job-detail"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật:", "")
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.split("...")[0]
# Get value for job attributes
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css("span a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text.gsub(",",""))
cities << city
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css("span").text
when "Kinh nghiệm"
job_attributes[:years_of_experience] = ele.css("span").text
when "Lương"
job_attributes[:salary] = ele.text.gsub("Lương: ","")
when "Ngành nghề"
# Check exist or create industry
ele.css("span a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text.gsub(",",""))
industries << industry
end
else
job_attributes[:expiration_date] = ele.css("span").text
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#{css_dom} #showScroll").inner_html
elsif html_job_detail.at_css("#uni_container .job-template-201")
# CSS DOM
css_dom = "#uni_container .job-template-201"
# Title
job_attributes[:title] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info h1").text
# Updated date job
job_attributes[:updated_date_job] = html_job_detail.css("#{css_dom} .content-job-detail .top-job .top-job-info p")[1].text.gsub("Ngày cập nhật: ","")
# Hash company
company_attributes[:title] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .top-job-info .tit_company").text
company_attributes[:logo] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .top-job .logocompany a img @src").text
company_attributes[:address] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info p")[0].text
company_attributes[:description] = html_job_detail.css("#{css_dom} .right-col .aboutustp .info .desc").text.gsub(" Xem thêm", "")
# Get value for job attributes
html_job_detail.css("#{css_dom} .right-col .info-career .info li").each_with_index do |ele, index|
type = ele.css("b").text
case type
when "Nơi làm việc"
# Check exist or create city
ele.css("span a").each_with_index do |ele, index|
if index > 0
city = check_exist_or_create_city(ele.text)
cities << city
end
end
when "Cấp bậc"
job_attributes[:level] = ele.css("span").text
when "Lương"
job_attributes[:salary] = ele.css("span").text
when "Ngành nghề"
# Check exist or create industry
ele.css("span a").each_with_index do |ele, index|
industry = check_exist_or_create_industry(ele.text)
industries << industry
end
when "Hết hạn nộp"
job_attributes[:expiration_date] = ele.css("span").text
job_attributes[:job_description] = description.strip
else
job_attributes[:years_of_experience] = ele.css("span").text
skip_url_logger.info "another template #{item.css(".figure .figcaption .title .job_link @href").text}"
end
end
# Set description for job attributes
job_attributes[:job_description] = html_job_detail.css("#{css_dom} .left-col #showScroll").inner_html
else
skip_url_logger.info "another template #{job_detail_url}"
end
# Check exist or create company
job_attributes[:company_id] = check_exist_or_create_company(company_attributes)
# Create job
job = check_exist_or_create_job(job_attributes)
# Create city_job
if cities.length > 0
cities.each do |city|
job.cities << city
end
end
# Create industry_job
if industries.length > 0
industries.each do |industry|
job.industries << industry
end
end
rescue
exception_logger.info "Error url: #{job_detail_url}"
next
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment