Commit 2d0503f4 by nnnghia98

Remove unnecessary HTML tag names from CSS selectors

parent 0f713e18
require "nokogiri" require "nokogiri"
require "open-uri" require "open-uri"
require "resolv-replace" require "resolv-replace"
require "openssl"
# NOTE(review): this reassigns the VERIFY_PEER constant to the value of
# VERIFY_NONE, so any code that later passes OpenSSL::SSL::VERIFY_PEER as a
# verify mode will skip TLS certificate verification. Ruby also prints an
# "already initialized constant" warning for the reassignment. Presumably added
# to get past a certificate error while crawling; confirm, and prefer fixing
# the CA bundle instead.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
class CrawlData class CrawlData
def crawl_web def crawl_web
page = Nokogiri::HTML.parse(open(Settings.crawl.base_url)) page = Nokogiri::HTML.parse(open(Settings.crawl.base_url))
total_job = page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f total_job = page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f
total_page = (total_job / 50).floor total_page = (total_job / 50).floor
fixed_total_page = 20 fixed_total_page = 20
crawl_job_title_logger = ActiveSupport::Logger.new("log/crawl_data.log")
crawl_job_title_logger.info "Crawl at #{Time.current}"
(1..fixed_total_page).each do |each_page| (1..fixed_total_page).each do |each_page|
page = Nokogiri::HTML.parse(open(URI.encode("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{each_page}-vi.html"))) page = Nokogiri::HTML.parse(open(URI.encode("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{each_page}-vi.html")))
(0..49).each do |j| (0..49).each do |j|
job_url = page.css("span.jobtitle h3 a @href")[j].text job_url = page.css(".jobtitle h3 a @href")[j].text
job_page = Nokogiri::HTML.parse(open(URI.encode(job_url))) job_page = Nokogiri::HTML.parse(open(URI.encode(job_url)))
# Job code # Job code
job_code = job_url.split("/").last.split(".")[-2] job_code = job_url.split("/").last.split(".")[-2]
next if job_page.css("div.LeftJobCB").nil? next if job_page.css(".LeftJobCB").nil?
# Job title # Job title
job_title = job_page.css("div.top-job-info h1").text.strip job_title = job_page.css(".top-job-info h1").text.strip
crawl_job_title_logger = ActiveSupport::Logger.new("log/crawl_data.log")
crawl_job_title_logger.info "#{job_title}" crawl_job_title_logger.info "#{job_title}"
# Job post date # Job post date
job_post_date = job_page.css("div.datepost span").text job_post_date = job_page.css(".datepost span").text
job_salary, job_position, job_expiration_date, job_industries, job_level = "" job_salary, job_position, job_expiration_date, job_industries, job_level = ""
job_workplace = [] job_workplace = []
detail_job_new = job_page.css("ul.DetailJobNew li p") detail_job_new = job_page.css(".DetailJobNew li p")
(0..detail_job_new.count - 1).each do |detail_part| (0..detail_job_new.count - 1).each do |detail_part|
detail = detail_job_new[detail_part].text detail = detail_job_new[detail_part].text
...@@ -62,18 +65,18 @@ class CrawlData ...@@ -62,18 +65,18 @@ class CrawlData
company_name, company_email, company_address, company_desc, company_code = "" company_name, company_email, company_address, company_desc, company_code = ""
# Company full name # Company full name
unless job_page.css("div.tit_company").nil? unless job_page.css(".tit_company").nil?
company_name = job_page.css("div.tit_company").text.strip company_name = job_page.css("div.tit_company").text.strip
end end
# Company code # Company code
company_code = job_url.split("/").last.split("-").last.split(".")[-2].strip company_code = job_url.split("/").last.split("-").last.split(".")[-2].strip
# Company address # Company address
unless job_page.css("p.TitleDetailNew label")[0].nil? unless job_page.css(".TitleDetailNew label")[0].nil?
company_address = job_page.css("p.TitleDetailNew label")[0].text.strip company_address = job_page.css("p.TitleDetailNew label")[0].text.strip
end end
# Company description # Company description
company_desc = job_page.css("span#emp_more p").text.strip company_desc = job_page.css("#emp_more p").text.strip
job_workplace.each do |city_name| job_workplace.each do |city_name|
city_id = city_id(city_name) city_id = city_id(city_name)
......
require_relative 'boot' require_relative 'boot'
require 'openssl'
require 'rails/all' require 'rails/all'
# Require the gems listed in Gemfile, including any gems # Require the gems listed in Gemfile, including any gems
# you've limited to :test, :development, or :production. # you've limited to :test, :development, or :production.
# NOTE(review): this reassigns the VERIFY_PEER constant to the value of
# VERIFY_NONE, so any code that later passes OpenSSL::SSL::VERIFY_PEER as a
# verify mode will skip TLS certificate verification. Ruby also prints an
# "already initialized constant" warning for the reassignment. Confirm this is
# intentional before it ships beyond development.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
Bundler.require(*Rails.groups) Bundler.require(*Rails.groups)
module Venjob module Venjob
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment