Commit 65db38d6 by Xuan Trung Le

Fix crawling data

parent 8d674ad5
# encoding: UTF-8
require 'nokogiri'
require 'uri'
require 'open-uri'
class Crawler
......@@ -12,13 +14,17 @@ class Crawler
links.each do |link|
puts "Fetching #{link}..."
params = {}
doc = Nokogiri::HTML(open(link))
link = URI.escape(link)
doc = Nokogiri::HTML(open(link), nil, 'utf-8')
if doc.css('#template_vantai').blank? &&
doc.css('#template_1').blank? &&
doc.css('#template_2').blank? &&
doc.css('#template_3').blank? &&
doc.css('#template_4').blank?
doc.css('#template_4').blank? &&
doc.css('#template_5').blank? &&
doc.css('#template_6').blank? &&
doc.css('#template_7').blank?
params = use_template_default(doc, link)
job_details << params
......@@ -89,7 +95,6 @@ class Crawler
# Collects the job links found on one page of the job listing.
#
# Fetches "#{LIST_URL}/tat-ca-viec-lam-trang-<page>-vi.html", reads the href of
# every anchor matching the listing selector, and returns the distinct,
# non-nil values that do not contain the '–' character.
#
# @param page [Integer] page number interpolated into the listing URL;
#   defaults to 1, the value that used to be hard-coded.
# @return [Array<String>] unique href values, minus any containing '–'.
def self.get_job_link(page = 1)
  url = "#{LIST_URL}/tat-ca-viec-lam-trang-#{page}-vi.html"
  doc = Nokogiri::HTML(open(url))
  links = doc.css('.gird_standard .brief .jobtitle .job a').map { |a| a['href'] }.compact.uniq
  # NOTE(review): links containing '–' are dropped; presumably such URLs could
  # not be fetched as-is — confirm whether the filter is still needed.
  links.reject { |link| link.include?('–') }
end
end
# Model for a city; it is linked to a country, to companies and to jobs.
#
# NOTE(review): `companies` and `jobs` are each declared twice below, first
# with `has_many` and then with `has_and_belongs_to_many`. The two forms rely
# on different schemas (a city_id column on the other table vs. a join table)
# — confirm which pair is intended and remove the other.
class City < ApplicationRecord
# `optional: true` lets a city be saved without a country.
belongs_to :country, optional: true
has_many :companies
has_many :jobs
has_and_belongs_to_many :companies
has_and_belongs_to_many :jobs
end
# Model for a company; it has many jobs and is linked to cities.
#
# NOTE(review): both a singular `city` (belongs_to, which relies on a city_id
# column on companies) and a plural `cities` (has_and_belongs_to_many, which
# relies on a cities/companies join table) are declared — confirm whether both
# are still wanted.
class Company < ApplicationRecord
belongs_to :city
has_and_belongs_to_many :cities
has_many :jobs
end
class Job < ApplicationRecord
belongs_to :city
belongs_to :company
has_many :apply_jobs
has_many :candidates, through: :apply_jobs, class_name: 'User', source: :user
has_many :favorite_jobs
has_many :people_who_liked, through: :favorite_jobs, class_name: 'User', source: :user
has_and_belongs_to_many :industries
has_and_belongs_to_many :cities
def self.create_new_jobs(arr_jobs)
arr_jobs.each do |item|
......@@ -19,14 +19,16 @@ class Job < ApplicationRecord
updated_date: item[:updated_date])
# City
unless item[:city].blank?
job.city = City.find_or_create_by(name: (item[:city] ||= '').split(':')[0])
item[:city].split(',').each do |name|
job.cities << City.find_or_create_by(name: name.strip)
end
end
# Company
job.company = Company.find_or_initialize_by(name: item[:company_name])
job.company = Company.find_or_create_by(name: item[:company_name])
job.company.location = item[:company_location]
job.company.description = item[:company_description]
job.company.city = job.city
job.company.cities = job.cities
# Industry
unless item[:industry].blank?
......@@ -34,6 +36,7 @@ class Job < ApplicationRecord
job.industries << Industry.find_or_create_by(name: name.strip)
end
end
puts "Saving #{item[:name]} ......................................"
job.save
end
end
......
# Migration: removes the :city reference from the :jobs table.
class RemoveCityFromJobs < ActiveRecord::Migration[5.1]
  # Single schema statement; Rails derives the rollback from it.
  def change
    remove_reference(:jobs, :city)
  end
end
# Migration: creates the join table between :jobs and :cities.
#
# NOTE(review): the class name reads "Cites" (presumably "Cities"); it is kept
# as-is because a migration's class name must match its file name.
class CreateJoinTableJobsCites < ActiveRecord::Migration[5.1]
  # Builds the join table and indexes the key pair in both column orders.
  def change
    create_join_table(:jobs, :cities) do |join_table|
      join_table.index(%i[job_id city_id])
      join_table.index(%i[city_id job_id])
    end
  end
end
# Migration: removes the :city reference from the :companies table.
class RemoveCityFromCompanies < ActiveRecord::Migration[5.1]
  # Single schema statement; Rails derives the rollback from it.
  def change
    remove_reference(:companies, :city)
  end
end
# Migration: creates the join table between :cities and :companies.
class CreateJoinTableCitiesCompanies < ActiveRecord::Migration[5.1]
  # Builds the join table (explicitly without an id column) and indexes the
  # key pair in both column orders.
  def change
    create_join_table(:cities, :companies, id: false) do |join_table|
      join_table.index(%i[city_id company_id])
      join_table.index(%i[company_id city_id])
    end
  end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171004094208) do
ActiveRecord::Schema.define(version: 20171005085453) do
create_table "apply_jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id"
......@@ -30,14 +30,26 @@ ActiveRecord::Schema.define(version: 20171004094208) do
t.index ["country_id"], name: "index_cities_on_country_id"
end
create_table "cities_companies", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "city_id", null: false
t.bigint "company_id", null: false
t.index ["city_id", "company_id"], name: "index_cities_companies_on_city_id_and_company_id"
t.index ["company_id", "city_id"], name: "index_cities_companies_on_company_id_and_city_id"
end
create_table "cities_jobs", id: false, force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id", null: false
t.bigint "city_id", null: false
t.index ["city_id", "job_id"], name: "index_cities_jobs_on_city_id_and_job_id"
t.index ["job_id", "city_id"], name: "index_cities_jobs_on_job_id_and_city_id"
end
create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.string "location"
t.text "description"
t.bigint "city_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["city_id"], name: "index_companies_on_city_id"
end
create_table "countries", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
......@@ -76,14 +88,12 @@ ActiveRecord::Schema.define(version: 20171004094208) do
t.text "description"
t.string "level"
t.string "experience"
t.bigint "city_id"
t.bigint "company_id"
t.datetime "expiry_date"
t.datetime "updated_date"
t.datetime "updated_at", null: false
t.datetime "created_at", null: false
t.string "original_link"
t.index ["city_id"], name: "index_jobs_on_city_id"
t.index ["company_id"], name: "index_jobs_on_company_id"
end
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment