Commit 0cef5747 by Thanh Hung Pham

Fix css to xpath, and validates model

parent ee42f340
class Area < ApplicationRecord class Area < ApplicationRecord
has_many :city has_many :city
validates :name, presence: true
end end
class Category < ApplicationRecord class Category < ApplicationRecord
has_many :job_category has_many :job_category
validates :name, presence: true
end end
class City < ApplicationRecord class City < ApplicationRecord
belongs_to :area belongs_to :area
has_many :job has_many :job
validates :name, presence: true
end end
class Company < ApplicationRecord class Company < ApplicationRecord
has_many :job has_many :job
validates :name, presence: true
validates :address, length: { maximum: 200 }
validates :district, length: { maximum: 200 }
validates :province, length: { maximum: 200 }
end end
...@@ -5,4 +5,6 @@ class Job < ApplicationRecord ...@@ -5,4 +5,6 @@ class Job < ApplicationRecord
belongs_to :contact, optional: true belongs_to :contact, optional: true
has_many :job_category has_many :job_category
validates :name, presence: true, length: { maximum: 200 }
end end
class User < ApplicationRecord class User < ApplicationRecord
validates :name, presence: true, length: { maximum: 50 }
VALID_EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-.]+\.[a-z]+\z/i
validates :email, presence: true, length: { maximum: 255 }, format: { with: VALID_EMAIL_REGEX }
end end
class CreateAreas < ActiveRecord::Migration[5.1] class CreateAreas < ActiveRecord::Migration[5.1]
def change def change
create_table :areas do |t| create_table :areas do |t|
t.string :name t.string :name, index: true
t.timestamps t.timestamps
end end
......
class CreateCategories < ActiveRecord::Migration[5.1] class CreateCategories < ActiveRecord::Migration[5.1]
def change def change
create_table :categories do |t| create_table :categories do |t|
t.string :name t.string :name, index: true
t.timestamps t.timestamps
end end
......
class CreateCities < ActiveRecord::Migration[5.1] class CreateCities < ActiveRecord::Migration[5.1]
def change def change
create_table :cities do |t| create_table :cities do |t|
t.string :name t.string :name, index: true
t.references :area, index: true t.references :area, index: true
t.timestamps t.timestamps
......
class CreateJobs < ActiveRecord::Migration[5.1] class CreateJobs < ActiveRecord::Migration[5.1]
def change def change
create_table :jobs do |t| create_table :jobs do |t|
t.string :name t.string :name, index: true
t.text :description t.text :description
t.references :city, index: true t.references :city, index: true
t.string :salary t.string :salary
......
...@@ -9,8 +9,8 @@ namespace :crawler_data do ...@@ -9,8 +9,8 @@ namespace :crawler_data do
import_category(doc) import_category(doc)
import_city(doc) import_city(doc)
new_jobs_url = doc.css('div.logo_nav li.hasmenu li a')[0]['href'] new_jobs_url = doc.xpath("//div[@class='logo_nav']/ul/li[@class=' hasmenu']/ul/li/a[text()='Việc làm mới nhất']/@href")
inport_job(new_jobs_url) inport_job(new_jobs_url.to_s)
end end
def import_area def import_area
...@@ -19,19 +19,19 @@ namespace :crawler_data do ...@@ -19,19 +19,19 @@ namespace :crawler_data do
end end
def import_category(doc) def import_category(doc)
categories = doc.css('div.s-home2 div#NewSearchJob3 div.box_multiSelect_industry option') categories = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
categories = categories.slice(1..categories.size - 2) categories = categories.slice(1..categories.size - 2)
categories.each do |category| categories.each do |category|
Category.new(name: category.text).save if Category.where(name: category.text).blank? Category.new(name: category.text.strip).save if Category.where(name: category.text.strip).blank?
end end
end end
def import_city(doc) def import_city(doc)
cities = doc.css('div.s-home2 div#NewSearchJob3 div.box_multiSelect_location option').drop(1) cities = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
area_id = 1 area_id = 1
cities.each do |city| cities.each do |city|
area_id = 2 if city.text == 'Angola' area_id = 2 if city.text == 'Angola'
City.new(name: city.text, area: Area.find(area_id)).save if City.where(name: city.text).blank? City.new(name: city.text.strip, area: Area.find(area_id)).save if City.where(name: city.text.strip).blank?
end end
end end
...@@ -40,46 +40,44 @@ namespace :crawler_data do ...@@ -40,46 +40,44 @@ namespace :crawler_data do
doc_new_jobs = Nokogiri::HTML(open(url)) doc_new_jobs = Nokogiri::HTML(open(url))
doc_new_jobs.encoding = 'utf-8' doc_new_jobs.encoding = 'utf-8'
doc_new_jobs.css('div.gird_standard dl dd h3 a').each do |link| doc_new_jobs.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href").each do |link|
encoded_url = URI.encode(link['href']) encoded_url = URI.encode(link.to_s)
doc_job_details = Nokogiri::HTML(open(encoded_url)) doc_job_details = Nokogiri::HTML(open(encoded_url))
# Company Information # Company Information
company_name = doc_job_details.css('div.tit_company').text # Company name company_name = doc_job_details.xpath("//div[@class='tit_company']").text.strip # Company name
company_address = doc_job_details.css("div.box1Detail label[itemprop='addressLocality']").text # Company Address company_address = doc_job_details.xpath("//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']").text.strip # Company Address
company_description = doc_job_details.css('div.desc_company').text # Company description company_description = doc_job_details.xpath("//div[@class='desc_company content_fck']").text.strip # Company description
Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank? Company.new(name: company_name, address: company_address, description: company_description).save if Company.where(name: company_name).blank?
# Job Information # Job Information
job_name = doc_job_details.css('div.LeftJobCB h1').text # Job name job_name = doc_job_details.xpath("//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1").text.strip # Job name
job_description = doc_job_details.css('div.MarBot20').text # Job description job_description = doc_job_details.xpath("//div[@class='MarBot20']").text.strip # Job description
doc_job_details.css('ul.DetailJobNew span').each do |detail|
case detail.text.strip job_location = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Nơi làm việc: ']/b[@itemprop='jobLocation']").text.strip
when 'Nơi làm việc:'
@job_location = detail.parent.css("b[itemprop='jobLocation']").text # Job location job_level = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Cấp bậc: ']/label[@itemprop='occupationalCategory']").text.strip
when 'Cấp bậc:'
@job_level = detail.parent.css("label[itemprop='occupationalCategory']").text # Job level job_experience = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()")
when 'Kinh nghiệm:'
@job_experience = detail.parent.children.last.text # Job experience job_salary = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='baseSalary']").text.strip + " " +
when 'Lương:' doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Lương: ']/label[@itemprop='salaryCurrency']").text.strip
@job_salary = detail.parent.css("label[itemprop='baseSalary']").text + detail.parent.css("label[itemprop='salaryCurrency']").text # Job salary
when 'Ngành nghề:' job_category = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Ngành nghề: ']/b/a[@itemprop='industry']").text.strip
@job_category = detail.parent.css("b a[itemprop='industry']").text # Job category
when 'Hết hạn nộp:' job_expiry_date = doc_job_details.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()")
@job_expiry_date = detail.parent.children.last.text # Job expiry date
end
end
Job.new(name: job_name, description: job_description, Job.new(name: job_name, description: job_description,
salary: @job_salary, salary: job_salary,
city: City.find_by_name(@job_location), city: City.find_by_name(job_location),
level: @job_level, experience: @job_experience, status: 0).save level: job_level, experience: job_experience, status: 0,
expiry_date: job_expiry_date.to_datetime.strftime('%Q')).save
@job_category.split(',').each do |job_category| job_category.split(',').each do |category|
JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(job_category)).save JobCategory.new(job: Job.find_by_name(job_name), category: Category.find_by_name(category)).save
end end
end end
url = doc_new_jobs.css('div.paginationTwoStatus a.right')[0]['href'] url = doc_new_jobs.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").to_s
end end
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment