Commit 91b545e3 by Ba Toi Dang

Merge branch 'features/crawl_data' into 'master'

Implement data crawler for CareerBuilder

See merge request !2
parents fe41d8a4 5c50f4b3
...@@ -11,5 +11,4 @@ ...@@ -11,5 +11,4 @@
// about supported directives. // about supported directives.
// //
//= require rails-ujs //= require rails-ujs
//= require turbolinks
//= require_tree . //= require_tree .
# encoding: UTF-8
require 'nokogiri'
require 'uri'
require 'open-uri'
# Scrapes job postings from careerbuilder.vn and turns each posting into a
# plain Hash (job fields plus company fields).
class Crawler
  BASE_CAREERBUILDER_URL = "https://careerbuilder.vn"
  LIST_URL = "#{BASE_CAREERBUILDER_URL}/viec-lam"

  # Postings that match any of these ids use a custom employer template whose
  # markup differs from the default one, so they are skipped.
  CUSTOM_TEMPLATE_SELECTOR = %w[
    #template_vantai #template_1 #template_2 #template_3 #template_4
    #template_5 #template_6 #template_7 #newyear_02
  ].join(', ').freeze

  # Upper-cased labels found in the job detail list => key in the result hash.
  DETAIL_FIELDS = {
    'NƠI LÀM VIỆC' => :city,
    'CẤP BẬC' => :level,
    'KINH NGHIỆM' => :experience,
    'LƯƠNG' => :salary,
    'NGÀNH NGHỀ' => :industry,
    'HẾT HẠN NỘP' => :expiry_date
  }.freeze

  # Fetches every link and extracts the job details of postings that use the
  # default template.
  #
  # @param job_links [Array<String>] absolute URLs of job postings
  # @return [Array<Hash>] one hash per crawled posting (see use_template_default)
  def self.crawl_job_infomation(job_links)
    job_links.each_with_object([]) do |link, job_details|
      puts "Fetching #{link}..."
      # URI.escape was removed in Ruby 3.0; the default parser offers the same escaping.
      escaped_link = URI::DEFAULT_PARSER.escape(link)
      doc = fetch_document(escaped_link, 'utf-8')
      next unless doc.css(CUSTOM_TEMPLATE_SELECTOR).empty?

      job_details << use_template_default(doc, escaped_link)
    end
  end

  # Extracts job and company fields from a posting rendered with the default
  # template.
  #
  # @param doc [Nokogiri::HTML::Document] parsed posting page
  # @param link [String] URL the page was fetched from, stored as :original_link
  # @return [Hash] :name, :updated_date, :company_name, :company_location,
  #   :company_description, :description, :original_link and, when present on
  #   the page, :city, :level, :experience, :salary, :industry, :expiry_date
  def self.use_template_default(doc, link)
    params = {}
    # get job's name
    params[:name] = doc.css('.top-job .top-job-info h1').text.strip
    # nil (instead of an exception) when the date line has no "label: value" shape
    params[:updated_date] = split_label(doc.css('.datepost').text).last

    # get company information
    company = crawl_company_infomation(doc)
    params[:company_name] = company[:name]
    params[:company_location] = company[:location]
    params[:company_description] = company[:description]

    # get employment information; every entry reads "label: value"
    doc.css('.MyJobLeft .box2Detail .DetailJobNew li').children.each do |child|
      label, value = split_label(child.text.gsub(/\t|\n/, ''))
      key = DETAIL_FIELDS[label]
      params[key] = value if key && value
    end

    # get job description
    params[:description] = doc.css('.MarBot20').children.map(&:to_html).join
    params[:original_link] = link
    params
  end

  # Extracts the company block of a posting.
  #
  # @param doc [Nokogiri::HTML::Document] parsed posting page
  # @return [Hash] :name always set ('Bảo mật' when the page shows no company
  #   name); :location and :description only when the company block exists
  def self.crawl_company_infomation(doc)
    params = {}
    name_nodes = doc.css('.box1Detail .TitleDetailNew span')
    unless name_nodes.empty?
      params[:name] = name_nodes.text
      location = doc.at('.box1Detail .TitleDetailNew label')
      params[:location] = location.text if location
      params[:description] = doc.css('.desc_company p').text
    end
    params[:name] ||= 'Bảo mật'
    params
  end

  # Collects the posting URLs listed on one page of the job index.
  #
  # @param page [Integer] 1-based index page number (defaults to the first page)
  # @return [Array<String>] unique href values of the listed postings
  def self.get_job_links(page = 1)
    url = "#{LIST_URL}/tat-ca-viec-lam-trang-#{page}-vi.html"
    doc = fetch_document(url)
    doc.css('.gird_standard .brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq
  end

  # Downloads and parses a page. URI#open is used instead of Kernel#open so a
  # scraped href can never be interpreted as a local file or a "|command";
  # the block form closes the handle once parsing is done.
  def self.fetch_document(url, encoding = nil)
    URI.parse(url).open { |io| Nokogiri::HTML(io, nil, encoding) }
  end

  # Splits "label: value" at the first colon only, so values that contain a
  # colon are kept whole. Returns [UPCASED_LABEL, value]; value is nil when
  # missing or blank.
  def self.split_label(text)
    label, value = text.split(':', 2)
    value = value.to_s.strip
    [label.to_s.strip.upcase, value.empty? ? nil : value]
  end

  private_class_method :fetch_document, :split_label
end
class City < ApplicationRecord class City < ApplicationRecord
belongs_to :country belongs_to :country, optional: true
has_many :companies has_and_belongs_to_many :companies
has_many :jobs has_and_belongs_to_many :jobs
end end
class Company < ApplicationRecord class Company < ApplicationRecord
belongs_to :city has_and_belongs_to_many :cities
has_many :jobs has_many :jobs
end end
class Job < ApplicationRecord class Job < ApplicationRecord
belongs_to :city
belongs_to :company belongs_to :company
has_many :apply_jobs has_many :apply_jobs
has_many :candidates, through: :apply_jobs, class_name: 'User', source: :user has_many :candidates, through: :apply_jobs, class_name: 'User', source: :user
has_many :favorite_jobs has_many :favorite_jobs
has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
has_and_belongs_to_many :cities
# Persists crawled job hashes as Job records and links each job to its
# cities, company and industries.
#
# @param arr_jobs [Array<Hash>] hashes carrying the keys read below
#   (:name, :salary, :description, :level, :original_link, :experience,
#   :expiry_date, :updated_date, :city, :industry, :company_name,
#   :company_location, :company_description)
# @return [Array<Hash>] arr_jobs itself
def self.create_new_jobs(arr_jobs)
  arr_jobs.each do |item|
    job = Job.new(name: item[:name],
                  salary: item[:salary],
                  description: item[:description],
                  level: item[:level],
                  original_link: item[:original_link],
                  experience: item[:experience],
                  expiry_date: item[:expiry_date],
                  updated_date: item[:updated_date])

    # City: only cities that already exist are linked; unknown names are ignored.
    unless item[:city].blank?
      city_names = item[:city].split(',').map(&:strip)
      job.cities << City.where(name: city_names)
    end

    # Company: reuse the company with this name, or create it with the crawled details.
    company = Company.find_or_create_by(name: item[:company_name]) do |new_company|
      new_company.location = item[:company_location]
      new_company.description = item[:company_description]
    end
    job.company = company
    company.cities << (job.cities - company.cities)

    # Industry: link existing industries and create the ones that are missing.
    unless item[:industry].blank?
      # reject/uniq so a trailing comma or a repeated name cannot create
      # an empty or duplicated Industry row
      industry_names = item[:industry].split(',').map(&:strip).reject(&:empty?).uniq
      known_industries = Industry.where(name: industry_names)
      job.industries << known_industries
      (industry_names - known_industries.pluck(:name)).each do |name|
        job.industries << Industry.create(name: name)
      end
    end

    puts "Saving #{item[:name]} ......................................"
    if job.save
      puts "Job was successfully created"
    else
      # Report why the record was rejected instead of a bare "Error...".
      puts "Error: #{job.errors.full_messages.join(', ')}"
    end
  end
end
# Returns the links that are not yet stored as a Job's original_link.
#
# @param links [Array<String>, nil] candidate posting URLs
# @return [Array<String>] the links with no matching Job; [] for nil or empty input
def self.filter_link_exist(links)
  # Nothing to filter: skip the database round trip.
  return [] if links.nil? || links.empty?

  links - Job.where(original_link: links).pluck(:original_link)
end
end end
<!-- Top navigation bar: brand link plus a navbar-right list with Sign Up and Login links. Every href is still a "#" placeholder and the brand text is the placeholder "WebSiteName". -->
<nav class="navbar navbar-default">
<div class="container-fluid">
<div class="navbar-header">
<a class="navbar-brand" href="#">WebSiteName</a>
</div>
<ul class="nav navbar-nav navbar-right">
<li><a href="#"><span class="glyphicon glyphicon-user"></span> Sign Up</a></li>
<li><a href="#"><span class="glyphicon glyphicon-log-in"></span> Login</a></li>
</ul>
</div>
</nav>
Rails.application.routes.draw do Rails.application.routes.draw do
devise_for :users devise_for :users
# For details on the DSL available within this file, see http://guides.rubyonrails.org/routing.html
end end
# Adds jobs.original_link (the URL a job was crawled from) with a unique index.
class AddOriginalLinkToJobs < ActiveRecord::Migration[5.1]
  def change
    add_column :jobs, :original_link, :string
    # add_column does not act on an `index:` option (and `uniqe:` was a typo),
    # so the index has to be created explicitly.
    add_index :jobs, :original_link, unique: true
  end
end
# Drops the single city reference from the jobs table.
class RemoveCityFromJobs < ActiveRecord::Migration[5.1]
  def change
    remove_reference(:jobs, :city)
  end
end
# Creates the cities_jobs join table used by the has_and_belongs_to_many
# association between jobs and cities.
class CreateJoinTableJobsCites < ActiveRecord::Migration[5.1]
  def change
    # id: false — a pure join table carries no primary key of its own
    # (same as the cities_companies join table).
    create_table :cities_jobs, id: false do |t|
      t.references :job, index: true
      t.references :city, index: true
    end
  end
end
# Drops the single city reference from the companies table.
class RemoveCityFromCompanies < ActiveRecord::Migration[5.1]
  def change
    remove_reference(:companies, :city)
  end
end
# Creates the primary-key-less cities_companies join table with one indexed
# reference column per side.
class CreateJoinTableCitiesCompanies < ActiveRecord::Migration[5.1]
  def change
    create_table(:cities_companies, id: false) do |join_table|
      join_table.references(:company, index: true)
      join_table.references(:city, index: true)
    end
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171002033456) do ActiveRecord::Schema.define(version: 20171005085453) do
create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id" t.bigint "job_id"
...@@ -30,14 +30,26 @@ ActiveRecord::Schema.define(version: 20171002033456) do ...@@ -30,14 +30,26 @@ ActiveRecord::Schema.define(version: 20171002033456) do
t.index ["country_id"], name: "index_cities_on_country_id" t.index ["country_id"], name: "index_cities_on_country_id"
end end
create_table "cities_companies", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "city_id", null: false
t.bigint "company_id", null: false
t.index ["city_id", "company_id"], name: "index_cities_companies_on_city_id_and_company_id"
t.index ["company_id", "city_id"], name: "index_cities_companies_on_company_id_and_city_id"
end
create_table "cities_jobs", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id", null: false
t.bigint "city_id", null: false
t.index ["city_id", "job_id"], name: "index_cities_jobs_on_city_id_and_job_id"
t.index ["job_id", "city_id"], name: "index_cities_jobs_on_job_id_and_city_id"
end
create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name" t.string "name"
t.string "location" t.string "location"
t.text "description" t.text "description"
t.bigint "city_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.index ["city_id"], name: "index_companies_on_city_id"
end end
create_table "countries", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "countries", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
...@@ -76,13 +88,12 @@ ActiveRecord::Schema.define(version: 20171002033456) do ...@@ -76,13 +88,12 @@ ActiveRecord::Schema.define(version: 20171002033456) do
t.text "description" t.text "description"
t.string "level" t.string "level"
t.string "experience" t.string "experience"
t.bigint "city_id"
t.bigint "company_id" t.bigint "company_id"
t.datetime "expiry_date" t.datetime "expiry_date"
t.datetime "updated_date" t.datetime "updated_date"
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.index ["city_id"], name: "index_jobs_on_city_id" t.string "original_link"
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
......
# This file should contain all the record creation needed to seed the database with its default values. viet_nam = Country.create(name: 'Viet Nam')
# The data can then be loaded with the rails db:seed command (or created alongside the database with db:setup). another = Country.create(name: 'another')
#
# Examples: cities_of_vn = ["Hà Nội", "Hồ Chí Minh", "An Giang", "Bà Rịa - Vũng Tàu", "Bạc Liêu", "Bắc Giang",
# "Bắc Ninh", "Bến Tre", "Bình Dương", "Bình Định", "Bình Phước", "Bình Thuận", "Cà Mau",
# movies = Movie.create([{ name: 'Star Wars' }, { name: 'Lord of the Rings' }]) "Cao Bằng", "Cần Thơ", "Dak Lak", "Dak Nông", "Đà Nẵng", "Điện Biên",
# Character.create(name: 'Luke', movie: movies.first) "Đồng Bằng Sông Cửu Long", "Đồng Nai", "Đồng Tháp", "Gia Lai", "Hà Giang", "Hà Nam",
"Hà Tây", "Hà Tĩnh", "Hải Dương", "Hải Phòng", "Hậu Giang", "Hòa Bình", "Hưng Yên",
"Khác", "Khánh Hòa", "Kiên Giang", "Kon Tum", "KV Bắc Trung Bộ", "KV Đông Nam Bộ",
"KV Nam Trung Bộ", "KV Tây Nguyên", "Lai Châu", "Lạng Sơn", "Lào Cai", "Long An",
"Nam Định", "Nghệ An", "Ninh Thuận", "Phú Thọ", "Phú Yên", "Quảng Bình", "Quảng Nam",
"Quảng Ngãi", "Quảng Ninh", "Quảng Trị", "Sóc Trăng", "Sơn La", "Tây Ninh", "Thái Bình",
"Thái Nguyên", "Thanh Hóa", "Thừa Thiên- Huế", "Tiền Giang", "Toàn quốc", "Trà Vinh",
"Tuyên Quang", "Vĩnh Long", "Vĩnh Phúc", "Yên Bái"]
cities_of_another_country = [
"Banteay Meanchey", "Battambang",
"Kampong Chhnang", "Kampong Speu", "Kampot", "Kandal", "Kâmpóng Thum, Cambodia",
"Kep", "Koh Kong", "Kratie", "Mondulkiri", "Otdar Meanchey", "Pailin", "Phnompenh",
"Preah Sihanouk", "Preah Vihear", "Prey Veng", "Pursat", "Rotanak Kiri", "Siem Reap",
"Sihanoukville", "Stung Treng", "Svay Rieng", "Tbong Khmum", "Kinshasa", "Hồng Kông",
"Attapeu", "Bokeo", "Bolikhamsai", "Champasak", "Houaphanh", "Khammouane",
"Louang Namtha", "Luang Prabang", "Oudomxay", "Phongsaly", "Sainyabuli", "Salavan",
"Savannakhet", "Sekong", "Vientiane", "Xaisomboun", "Xiangkhouang", "Qatar"]
# Create one City per name, attached to its country (Viet Nam first, then the rest).
[[cities_of_vn, viet_nam], [cities_of_another_country, another]].each do |names, country|
  names.each { |city_name| City.create(name: city_name, country: country) }
end
require "./app/data/crawler.rb"
namespace :data do
  # The description makes the task show up in `rake -T`.
  desc 'Crawl job postings from careerbuilder.vn and store the ones not saved yet'
  task insert_job: :environment do
    # Only crawl links that are not already stored as a Job's original_link.
    new_links = Job.filter_link_exist(Crawler.get_job_links)
    Job.create_new_jobs(Crawler.crawl_job_infomation(new_links))
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment