Commit 2f65fef8 by Van Hau Le

Merge branch 'feature/crawl_data' into 'master'

Feature/crawl data

See merge request !21
parents 88267603 872bce8d
......@@ -28,6 +28,7 @@ gem 'draper'
gem 'rsolr'
gem 'rsolr-ext'
gem 'pry'
gem 'nokogiri'
# Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 4.0'
# Use Active Model has_secure_password
......
......@@ -320,6 +320,7 @@ DEPENDENCIES
kaminari
listen (>= 3.0.5, < 3.2)
mysql2 (>= 0.4.4)
nokogiri
pry
puma (~> 3.11)
rails (~> 6.0.0)
......
......@@ -14,7 +14,7 @@ class JobDecorator < ApplicationDecorator
end
def display_short_des
simple_format object.short_des&.truncate(250)
simple_format object.requirement&.truncate(250)
end
def display_description
......
require "nokogiri"
require "open-uri"
require "resolv-replace"
require "openssl"
# SECURITY NOTE(review): this reassigns the OpenSSL::SSL::VERIFY_PEER constant to the
# value of VERIFY_NONE, so any code in this process that passes VERIFY_PEER afterwards
# actually requests no certificate verification. Ruby also emits an
# "already initialized constant" warning for the reassignment.
# Presumably added to work around a certificate problem while crawling — confirm
# whether it is still needed and prefer fixing the trust store instead.
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
# Crawls job postings from the careerbuilder.vn listing pages and stores them
# through the Job, Company, City, CityJob, Industry and IndustryJob models.
class CrawlData
  # Keys of the parsed job hash that are handled separately (cities, industries,
  # company) and therefore must not be passed to Job#update.
  NON_JOB_ATTRIBUTES = %i[workplace industries company_name company_address company_description].freeze

  # Entry point of the crawl.
  #
  # Reads the total job counter from Settings.crawl.base_url and returns early
  # (nil) when it is zero. Otherwise walks listing pages
  # 1..Settings.crawl.fixed_total_page and persists every job linked from them,
  # logging each job title to log/crawl_data.log.
  def crawl_web
    index_page = fetch_html(Settings.crawl.base_url)
    total_job = index_page.css("div.ais-stats h1.col-sm-10 span").text.gsub(",", "").to_f
    return if total_job == 0

    logger = ActiveSupport::Logger.new("log/crawl_data.log")
    logger.info "Crawl at #{Time.current}"
    (1..Settings.crawl.fixed_total_page).each do |page_number|
      listing_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"
      listing = fetch_html(escape_url(listing_url))
      job_urls(listing).each { |job_url| crawl_job(job_url, logger) }
    end
  end

  # Finds or builds the company identified by +code+ and updates its details.
  #
  # @param code [String] company code taken from the job page
  # @param name [String]
  # @param address [String]
  # @param description [String]
  # @return [Company] the record (the result of #update is not checked)
  def get_company(code, name, address, description)
    company = Company.find_or_initialize_by(code: code)
    company.update(name: name, address: address, description: description)
    company
  end

  # Finds or builds a Job and updates it with the parsed attributes.
  #
  # Jobs without an expiration date are looked up by title and company,
  # all others by their code.
  #
  # @param job_attrs [Hash] parsed job hash including :code and :company_id
  # @return [Job] the record (the result of #update is not checked)
  def save_job(job_attrs)
    lookup =
      if job_attrs[:expiration_date].nil?
        { title: job_attrs[:title], company_id: job_attrs[:company_id] }
      else
        { code: job_attrs[:code] }
      end
    job = Job.find_or_initialize_by(lookup)
    # `update` is the non-deprecated spelling of `update_attributes`.
    job.update(job_attrs.except(*NON_JOB_ATTRIBUTES))
    job
  end

  private

  # Downloads +url+ and parses it as HTML. The block form of URI.open closes
  # the underlying IO once parsing is done.
  def fetch_html(url)
    URI.open(url) { |io| Nokogiri::HTML.parse(io) }
  end

  # Percent-escapes unsafe characters the same way the removed URI.encode did.
  def escape_url(url)
    URI::DEFAULT_PARSER.escape(url)
  end

  # Job detail URLs linked from one listing page, limited to
  # Settings.crawl.jobs_per_page entries. Pages with fewer links simply yield
  # fewer URLs instead of raising.
  def job_urls(listing)
    listing.css(".jobtitle h3 a @href").first(Settings.crawl.jobs_per_page).map(&:text)
  end

  # Fetches one job page, parses it and persists the job with its company,
  # cities and industries. Jobs without any workplace are skipped.
  #
  # NOTE(review): the previous `job_page.css(".LeftJobCB").nil?` test could never
  # be true (css returns a node set, not nil), so it has been dropped; confirm
  # whether an `.empty?` layout check is wanted here.
  def crawl_job(job_url, logger)
    job_page = fetch_html(escape_url(job_url))
    job = JobHtml.new(job_page).parse_job
    city_names = clean_names(job[:workplace])
    return if city_names.empty?

    job[:code] = job_url.split("/").last.to_s.split(".")[-2].to_s
    logger.info "#{job[:title]}"

    # Company and job do not depend on the city, so they are saved once per job
    # rather than once per workplace.
    company = get_company(company_code(job_page), job[:company_name], job[:company_address],
                          job[:company_description])
    job[:company_id] = company.id
    saved_job = save_job(job)

    city_names.each do |city_name|
      city = City.find_or_create_by(name: city_name, region: "Việt Nam")
      CityJob.find_or_create_by!(job_id: saved_job.id, city_id: city.id)
    end
    clean_names(job[:industries]).each do |industry_name|
      industry = Industry.find_or_create_by!(name: industry_name)
      IndustryJob.find_or_create_by!(industry_id: industry.id, job_id: saved_job.id)
    end
  end

  # Company code: second-to-last dash-separated part of the last path segment
  # of the ".viewmorejob" link, or "" when it cannot be determined.
  def company_code(job_page)
    href = job_page.css(".viewmorejob a @href").text
    href.split("/").last.to_s.split("-")[-2].to_s.strip
  end

  # Strips every name and drops empty ones; nil is treated as an empty list.
  def clean_names(names)
    Array(names).map(&:strip).reject(&:empty?)
  end
end
# Extracts job fields from a parsed job detail page.
#
# The wrapped object must respond to +css(selector)+ and return a collection
# whose members respond to +text+ (e.g. a Nokogiri document).
class JobHtml
  # Matches runs of CR/LF characters that are removed from the info lines.
  LINE_BREAKS = /[\r\n]+/.freeze
  DESCRIPTION_HEADING = "Mô tả Công việc".freeze
  REQUIREMENT_HEADING = "Yêu Cầu Công Việc".freeze

  # @param html_data [#css] parsed job page
  def initialize(html_data = {})
    @html_data = html_data
  end

  # Builds the job hash from the page.
  #
  # @return [Hash] with keys :title, :salary, :level, :post_date, :description,
  #   :requirement, :expiration_date, :workplace, :industries, :company_name,
  #   :company_address and :company_description. :workplace and :industries
  #   are always arrays (empty when the page does not list them); the other
  #   info/detail fields are nil when absent.
  def parse_job
    job_info = get_job_info
    job_detail = get_job_detail
    {
      title: get_title,
      salary: job_info[:salary],
      level: job_info[:level],
      post_date: get_post_date,
      description: job_detail[:description],
      requirement: job_detail[:requirement],
      expiration_date: job_info[:expiration_date],
      workplace: job_info[:workplace] || [],
      industries: job_info[:industries] || [],
      company_name: get_company_name,
      company_address: get_company_address,
      company_description: get_company_description
    }
  end

  private

  def get_title
    @html_data.css(".top-job-info h1").text.strip
  end

  def get_post_date
    @html_data.css(".datepost span").text
  end

  # Reads the "label: value" lines of the job info box. Line breaks are removed
  # with a real Regexp (a String pattern would only match that literal text),
  # then everything after the first ":" is taken as the value.
  def get_job_info
    @html_data.css(".DetailJobNew li p").each_with_object({}) do |node, job_info|
      info = node.text
      value = info.gsub(LINE_BREAKS, "").partition(":").last
      if info.include?("Nơi làm việc")
        job_info[:workplace] = split_list(value)
      elsif info.include?("Lương")
        job_info[:salary] = value.strip
      elsif info.include?("Cấp bậc")
        job_info[:level] = value.strip
      elsif info.include?("Hết hạn nộp")
        job_info[:expiration_date] = value.strip
      elsif info.include?("Ngành nghề")
        job_info[:industries] = split_list(value)
      end
    end
  end

  # Collects the text following the description / requirement headings.
  def get_job_detail
    @html_data.css("div.MarBot20").each_with_object({}) do |node, job_detail|
      detail = node.text
      if detail.include?(DESCRIPTION_HEADING)
        job_detail[:description] = detail.partition(DESCRIPTION_HEADING).last
      elsif detail.include?(REQUIREMENT_HEADING)
        job_detail[:requirement] = detail.partition(REQUIREMENT_HEADING).last
      end
    end
  end

  # Company name, or "" when the element is missing (an empty result set has
  # empty text, so no separate presence check is needed).
  def get_company_name
    @html_data.css("div.tit_company").text.strip
  end

  def get_company_description
    @html_data.css("#emp_more p").text.strip
  end

  # Text of the first address label, or "" when there is none. The same
  # selector is used for the check and the read so a partial match cannot
  # raise on nil.
  def get_company_address
    label = @html_data.css("p.TitleDetailNew label").first
    label ? label.text.strip : ""
  end

  # Splits a comma separated value into stripped, non-empty entries.
  def split_list(value)
    value.split(",").map(&:strip).reject(&:empty?)
  end
end
......@@ -6,7 +6,7 @@
</dl>
<dl class="job_data_row">
<dt>Short description</dt>
<dd><%= job["short_des"]&.truncate(250) %></dd>
<dd><%= job["description"]&.truncate(250) %></dd>
</dl>
<dl class="job_data_row">
<dt>Salary</dt>
......
......@@ -17,3 +17,8 @@ solr:
retry_503: 1
retry_after_limit: 1
server_url: "http://localhost:8983/solr/venjob"
crawl:
base_url: "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"
jobs_per_page: 50
fixed_total_page: 20
......@@ -25,3 +25,13 @@ namespace :solr do
solr_delete_logger.info "Solr delete all data succesfully at #{Time.current}"
end
end
namespace :crawl do
  desc "crawl data from careerbuilder.vn"
  # Loads the Rails environment, then runs a full crawl.
  task crawl_data: :environment do
    crawler = CrawlData.new
    crawler.crawl_web
    # A completion entry written to log/crawl_data.log was sketched here but is
    # currently disabled.
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment