Commit 65db38d6 by Xuan Trung Le

Fix crawling data

parent 8d674ad5
# encoding: UTF-8
require 'nokogiri' require 'nokogiri'
require 'uri'
require 'open-uri' require 'open-uri'
class Crawler class Crawler
...@@ -12,13 +14,17 @@ class Crawler ...@@ -12,13 +14,17 @@ class Crawler
links.each do |link| links.each do |link|
puts "Fetching #{link}..." puts "Fetching #{link}..."
params = {} params = {}
doc = Nokogiri::HTML(open(link)) link = URI.escape(link)
doc = Nokogiri::HTML(open(link), nil, 'utf-8')
if doc.css('#template_vantai').blank? && if doc.css('#template_vantai').blank? &&
doc.css('#template_1').blank? && doc.css('#template_1').blank? &&
doc.css('#template_2').blank? && doc.css('#template_2').blank? &&
doc.css('#template_3').blank? && doc.css('#template_3').blank? &&
doc.css('#template_4').blank? doc.css('#template_4').blank? &&
doc.css('#template_5').blank? &&
doc.css('#template_6').blank? &&
doc.css('#template_7').blank?
params = use_template_default(doc, link) params = use_template_default(doc, link)
job_details << params job_details << params
...@@ -89,7 +95,6 @@ class Crawler ...@@ -89,7 +95,6 @@ class Crawler
def self.get_job_link def self.get_job_link
url = "#{LIST_URL}/tat-ca-viec-lam-trang-#{1}-vi.html" url = "#{LIST_URL}/tat-ca-viec-lam-trang-#{1}-vi.html"
doc = Nokogiri::HTML(open(url)) doc = Nokogiri::HTML(open(url))
links = doc.css('.gird_standard .brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq return doc.css('.gird_standard .brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq
return links.delete_if{|link| link.include?('–')}
end end
end end
class City < ApplicationRecord class City < ApplicationRecord
belongs_to :country, optional: true belongs_to :country, optional: true
has_many :companies has_and_belongs_to_many :companies
has_many :jobs has_and_belongs_to_many :jobs
end end
class Company < ApplicationRecord class Company < ApplicationRecord
belongs_to :city has_and_belongs_to_many :cities
has_many :jobs has_many :jobs
end end
class Job < ApplicationRecord class Job < ApplicationRecord
belongs_to :city
belongs_to :company belongs_to :company
has_many :apply_jobs has_many :apply_jobs
has_many :candidates, through: :apply_jobs, class_name: 'User', source: :user has_many :candidates, through: :apply_jobs, class_name: 'User', source: :user
has_many :favorite_jobs has_many :favorite_jobs
has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
has_and_belongs_to_many :cities
def self.create_new_jobs(arr_jobs) def self.create_new_jobs(arr_jobs)
arr_jobs.each do |item| arr_jobs.each do |item|
...@@ -19,14 +19,16 @@ class Job < ApplicationRecord ...@@ -19,14 +19,16 @@ class Job < ApplicationRecord
updated_date: item[:updated_date]) updated_date: item[:updated_date])
# City # City
unless item[:city].blank? unless item[:city].blank?
job.city = City.find_or_create_by(name: (item[:city] ||= '').split(':')[0]) item[:city].split(',').each do |name|
job.cities << City.find_or_create_by(name: name.strip)
end
end end
# Company # Company
job.company = Company.find_or_initialize_by(name: item[:company_name]) job.company = Company.find_or_create_by(name: item[:company_name])
job.company.location = item[:company_location] job.company.location = item[:company_location]
job.company.description = item[:company_description] job.company.description = item[:company_description]
job.company.city = job.city job.company.cities = job.cities
# Industry # Industry
unless item[:industry].blank? unless item[:industry].blank?
...@@ -34,6 +36,7 @@ class Job < ApplicationRecord ...@@ -34,6 +36,7 @@ class Job < ApplicationRecord
job.industries << Industry.find_or_create_by(name: name.strip) job.industries << Industry.find_or_create_by(name: name.strip)
end end
end end
puts "Saving #{item[:name]} ......................................"
job.save job.save
end end
end end
......
# Migration that removes the `city` reference from the `jobs` table.
class RemoveCityFromJobs < ActiveRecord::Migration[5.1]
  # Single schema step: remove the jobs -> city reference.
  def change
    remove_reference(:jobs, :city)
  end
end
# Migration that creates the join table linking jobs and cities, with a
# composite index for each column order.
#
# NOTE(review): "Cites" looks like a typo for "Cities". The class name is
# presumably tied to the migration file name, so it is left unchanged here;
# confirm before renaming.
class CreateJoinTableJobsCites < ActiveRecord::Migration[5.1]
  def change
    create_join_table(:jobs, :cities) do |join|
      # One composite index per column order.
      join.index %i[job_id city_id]
      join.index %i[city_id job_id]
    end
  end
end
# Migration that removes the `city` reference from the `companies` table.
class RemoveCityFromCompanies < ActiveRecord::Migration[5.1]
  # Single schema step: remove the companies -> city reference.
  def change
    remove_reference(:companies, :city)
  end
end
# Migration that creates the join table linking cities and companies, with a
# composite index for each column order.
class CreateJoinTableCitiesCompanies < ActiveRecord::Migration[5.1]
  def change
    # `id: false` is passed through exactly as in the original call.
    create_join_table(:cities, :companies, id: false) do |join|
      # One composite index per column order.
      join.index %i[city_id company_id]
      join.index %i[company_id city_id]
    end
  end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171004094208) do ActiveRecord::Schema.define(version: 20171005085453) do
create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id" t.bigint "job_id"
...@@ -30,14 +30,26 @@ ActiveRecord::Schema.define(version: 20171004094208) do ...@@ -30,14 +30,26 @@ ActiveRecord::Schema.define(version: 20171004094208) do
t.index ["country_id"], name: "index_cities_on_country_id" t.index ["country_id"], name: "index_cities_on_country_id"
end end
create_table "cities_companies", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "city_id", null: false
t.bigint "company_id", null: false
t.index ["city_id", "company_id"], name: "index_cities_companies_on_city_id_and_company_id"
t.index ["company_id", "city_id"], name: "index_cities_companies_on_company_id_and_city_id"
end
create_table "cities_jobs", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id", null: false
t.bigint "city_id", null: false
t.index ["city_id", "job_id"], name: "index_cities_jobs_on_city_id_and_job_id"
t.index ["job_id", "city_id"], name: "index_cities_jobs_on_job_id_and_city_id"
end
create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name" t.string "name"
t.string "location" t.string "location"
t.text "description" t.text "description"
t.bigint "city_id"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.index ["city_id"], name: "index_companies_on_city_id"
end end
create_table "countries", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t| create_table "countries", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
...@@ -76,14 +88,12 @@ ActiveRecord::Schema.define(version: 20171004094208) do ...@@ -76,14 +88,12 @@ ActiveRecord::Schema.define(version: 20171004094208) do
t.text "description" t.text "description"
t.string "level" t.string "level"
t.string "experience" t.string "experience"
t.bigint "city_id"
t.bigint "company_id" t.bigint "company_id"
t.datetime "expiry_date" t.datetime "expiry_date"
t.datetime "updated_date" t.datetime "updated_date"
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.string "original_link" t.string "original_link"
t.index ["city_id"], name: "index_jobs_on_city_id"
t.index ["company_id"], name: "index_jobs_on_company_id" t.index ["company_id"], name: "index_jobs_on_company_id"
end end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment