Commit 992e35b3 by Xuan Trung Le

implement crawler data

parent 661293a4
// Place all the behaviors and hooks related to the matching controller here.
// All this logic will automatically be available in application.js.
// Place all the styles related to the datas controller here.
// They will automatically be included in application.css.
// You can use Sass (SCSS) here: http://sass-lang.com/
class DatasController < ApplicationController
require 'nokogiri'
require 'open-uri'
require 'nokogiri'
require 'open-uri'
class Crawler
BASE_CAREERBUILDER_URL = "https://careerbuilder.vn"
LIST_URL = "#{BASE_CAREERBUILDER_URL}/viec-lam"
# Stores the return value of crawl_data in @datas.
def index
@datas = crawl_data
end
def crawl_data
links = get_link[0..4]
def self.crawl_job_infomation(job_links)
links = job_links
job_details = []
links.each do |link|
......@@ -18,10 +14,33 @@ class DatasController < ApplicationController
params = {}
doc = Nokogiri::HTML(open(link))
# get company information
params[:name] = doc.css('.top-job .top-job-info h1').text
params[:company_name] = doc.css('.top-job .top-job-info .tit_company').text
params[:updated_date] = doc.css('.top-job .datepost').text
if doc.css('#template_vantai').blank? &&
doc.css('#template_1').blank? &&
doc.css('#template_2').blank? &&
doc.css('#template_3').blank? &&
doc.css('#template_4').blank?
params = use_template_default(doc, link)
job_details << params
end
end
return job_details
end
def self.use_template_default(doc, link)
params = {}
params_company = {}
description = []
# get job's name
params[:name] = doc.css('.top-job .top-job-info h1').text.strip
params[:updated_date] = doc.css('.datepost').text.split(':')[1].strip
# get company information
params_company = crawl_company_infomation(doc)
params[:company_name] = params_company[:name]
params[:company_location] = params_company[:location]
params[:company_description] = params_company[:description]
# get employment information
doc.css('.MyJobLeft .box2Detail .DetailJobNew li').children.each do |child|
......@@ -29,7 +48,7 @@ class DatasController < ApplicationController
next if info.blank?
case info[0].strip.upcase
when 'NƠI LÀM VIỆC'
params[:city] = info[1]
params[:city] = info[1].strip
when 'CẤP BẬC'
params[:level] = info[1]
when 'KINH NGHIỆM'
......@@ -45,17 +64,30 @@ class DatasController < ApplicationController
# get job description
doc.css('.MarBot20').children.each do |child|
params[:description] = child.to_html
description << child.to_html
end
params[:description] = description.join("")
job_details << params
# original_link
params[:original_link] = link
return params
end
return job_details
# Extracts the company section of a job page.
#
# @param doc [#css] parsed page; each +css+ result must respond to +text+
#   and +first+ (the call sites in this file pass a Nokogiri document)
# @return [Hash] with keys :name, :location and :description. :location is
#   nil when the page has no matching label element; the other two are
#   whatever +text+ returns for their selector.
def self.crawl_company_infomation(doc)
  # The location is the first <label> in the title box. Pages without one
  # used to raise NoMethodError on nil here; they now yield a nil location.
  location_node = doc.css('.box1Detail .TitleDetailNew label').first

  {
    name: doc.css('.box1Detail .TitleDetailNew span').text,
    location: location_node && location_node.text,
    description: doc.css('.desc_company p').text
  }
end
def get_link
def self.get_job_link
url = "#{LIST_URL}/tat-ca-viec-lam-trang-#{1}-vi.html"
doc = Nokogiri::HTML(open(url))
return doc.css('.brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq
links = doc.css('.gird_standard .brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq
return links.delete_if{|link| link.include?('–')}
end
end
require "crawler.rb"
# Placeholder class: its only method, #job, has an empty body.
class ImportData
# Empty body; returns nil.
def job
end
end
# City record: optionally belongs to a country and has many companies and jobs.
class City < ApplicationRecord
  # Declared exactly once. An earlier plain `belongs_to :country` can register
  # a required-presence validation (when belongs_to is required by default)
  # that a later `optional: true` declaration does not remove.
  belongs_to :country, optional: true
  has_many :companies
  has_many :jobs
end
......@@ -6,4 +6,27 @@ class Job < ApplicationRecord
has_many :favorite_jobs
has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user
has_and_belongs_to_many :industries
# Builds and saves one Job per scraped hash, attaching a City and a Company.
#
# Each element of +arr_jobs+ is read through the symbol keys :name, :salary,
# :description, :level, :original_link, :experience, :expiry_date,
# :updated_date, :city, :company_name, :company_location and
# :company_description. The input hashes are never modified.
#
# A job that fails to save is logged and skipped; the remaining items are
# still processed.
#
# @param arr_jobs [Array<Hash>] scraped job attributes
# @return [Array<Hash>] +arr_jobs+ itself
def self.create_new_jobs(arr_jobs)
  arr_jobs.each do |item|
    job = Job.new(name: item[:name],
                  salary: item[:salary],
                  description: item[:description],
                  level: item[:level],
                  original_link: item[:original_link],
                  experience: item[:experience],
                  expiry_date: item[:expiry_date],
                  updated_date: item[:updated_date])

    # Only the text before the first ':' is used as the city name. +to_s+
    # covers a missing :city without writing a default back into +item+.
    # A missing or empty :city still yields a nil name, as before.
    city_name = item[:city].to_s.split(':').first
    city_name = city_name.strip if city_name
    job.city = City.find_or_initialize_by(name: city_name)

    company = Company.find_or_initialize_by(name: item[:company_name])
    company.location = item[:company_location]
    company.description = item[:company_description]
    company.city = job.city
    job.company = company

    next if job.save

    # Surface the failure instead of dropping the record silently.
    Rails.logger.warn(
      "Job not saved (#{item[:original_link]}): #{job.errors.full_messages.join(', ')}"
    )
  end
end
# Returns the links that are not yet stored as the original_link of any Job.
#
# @param links [Array<String>] candidate job URLs
# @return [Array<String>] the elements of +links+ with no matching Job
def self.filter_link_exist(links)
  # pluck reads only the one column instead of instantiating every Job.
  links - Job.pluck(:original_link)
end
end
# Adds a string column named original_link to the jobs table.
class AddOriginalLinkToJobs < ActiveRecord::Migration[5.1]
def change
add_column :jobs, :original_link, :string
end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171002033456) do
ActiveRecord::Schema.define(version: 20171004094208) do
create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id"
......@@ -82,6 +82,7 @@ ActiveRecord::Schema.define(version: 20171002033456) do
t.datetime "updated_date"
t.datetime "updated_at", null: false
t.datetime "created_at", null: false
t.string "original_link"
t.index ["city_id"], name: "index_jobs_on_city_id"
t.index ["company_id"], name: "index_jobs_on_company_id"
end
......
# NOTE(review): falls back to the production environment whenever RAILS_ENV
# is unset — confirm this default is intended.
ENV["RAILS_ENV"] ||= "production"
# Path is relative to the current working directory.
require "./app/data/crawler.rb"

namespace :data do
  desc "Crawl job links, skip the ones already stored, and insert the rest as jobs"
  task insert_job: :environment do
    # Locals rather than instance variables, so nothing leaks onto the
    # top-level object that rake evaluates task bodies against.
    new_links = Job.filter_link_exist(Crawler.get_job_link)
    job_details = Crawler.crawl_job_infomation(new_links)
    Job.create_new_jobs(job_details)
  end
end
require 'test_helper'
# Integration test for the datas index URL.
class DatasControllerTest < ActionDispatch::IntegrationTest
# Issues a GET to datas_index_url and expects a success response.
test "should get index" do
get datas_index_url
assert_response :success
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment