Commit 9d821f37 by Tô Ngọc Ánh

Merge branch 'crawler' into 'master'

crawl companies, industries, locations

See merge request !2
parents 5e004055 6496e46e
Pipeline #689 failed with stages
in 0 seconds
...@@ -61,3 +61,7 @@ end ...@@ -61,3 +61,7 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem # Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
##
gem "nokogiri"
##
...@@ -203,6 +203,7 @@ DEPENDENCIES ...@@ -203,6 +203,7 @@ DEPENDENCIES
jbuilder (~> 2.5) jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mysql2 (>= 0.4.4, < 0.6.0) mysql2 (>= 0.4.4, < 0.6.0)
nokogiri
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0) sass-rails (~> 5.0)
......
# Join record that ties one Job to one Industry.
class IndustriesJob < ApplicationRecord
# Each row points at exactly one job and one industry.
belongs_to :job
belongs_to :industry
end
...@@ -3,6 +3,7 @@ class Job < ApplicationRecord ...@@ -3,6 +3,7 @@ class Job < ApplicationRecord
has_many :applied_jobs has_many :applied_jobs
has_many :histories has_many :histories
has_many :favorites has_many :favorites
has_many :locations_jobs
has_many :locations, through: :locations_jobs has_many :locations, through: :locations_jobs
has_and_belongs_to_many :industries has_and_belongs_to_many :industries
end end
class Location < ApplicationRecord class Location < ApplicationRecord
CITY_VIETNAM_NUMBER = 70
has_many :locations_jobs
has_many :jobs, through: :locations_jobs has_many :jobs, through: :locations_jobs
end end
# Migration that makes company names unique at the database level.
class AddIndexToCompany < ActiveRecord::Migration[5.2]
# Adds a unique index on companies.name.
# NOTE(review): presumably this cannot be applied while duplicate names exist — confirm the data is clean.
def change
add_index :companies, :name, unique: true
end
end
# Migration that swaps the string column locations.area for a boolean column locations.oversea.
class ChangeColAreaToOverseaInLocation < ActiveRecord::Migration[5.2]
# The old column is removed and a new one is added, so existing `area`
# values are not converted into `oversea` values.
# The new column is declared without a default or a null constraint.
def change
remove_column :locations, :area, :string
add_column :locations, :oversea, :boolean
end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_07_17_014308) do ActiveRecord::Schema.define(version: 2020_07_20_075150) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "user_id" t.bigint "user_id"
...@@ -30,6 +30,7 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do ...@@ -30,6 +30,7 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do
t.string "address" t.string "address"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.index ["name"], name: "index_companies_on_name", unique: true
end end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
...@@ -77,10 +78,10 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do ...@@ -77,10 +78,10 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do
end end
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "area"
t.string "city" t.string "city"
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.boolean "oversea"
end end
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t| create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......
require "open-uri"
# Rake entry point: gathers job links via get_job_links(1) and hands them to
# the crawl_jobs helper. Depends on the :environment task.
# The `desc` line makes the task visible in `rake -T`.
desc "Crawl job postings from a careerbuilder.vn listing page into the database"
task crawl_jobs: :environment do
  job_links = get_job_links(1)
  crawl_jobs(job_links)
end
# Rake entry point: runs crawl_industries_and_locations.
# Depends on the :environment task.
# The `desc` line makes the task visible in `rake -T`.
desc "Crawl industries and locations from careerbuilder.vn into the database"
task crawl_industries_locations: :environment do
  crawl_industries_and_locations
end
# Collects job-detail links from the first `page` pages of the careerbuilder.vn job listing.
#
# Pages are requested as trang-1 .. trang-<page>. The previous `page.times`
# loop counted from 0, so it asked for trang-0 and never reached trang-<page>.
# NOTE(review): assumes the site's pagination is 1-based — confirm against the live site.
#
# @param page [Integer] number of listing pages to scan
# @return [Array<String>] href of every `a.job_link` directly under a div, in page order
def get_job_links(page)
  (1..page).flat_map do |page_number|
    listing_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"
    document = Nokogiri::HTML(open(listing_url))
    document.xpath('//div/a[@class="job_link"]/@href').map(&:value)
  end
end
# Fetches a company page and returns the matching Company record.
#
# An existing Company with the scraped name is returned as-is; otherwise a new
# one is created from the scraped name, address and description.
# Any error raised along the way is printed and swallowed.
# NOTE(review): URI.escape is obsolete in newer Ruby versions — confirm the target Ruby still provides it.
#
# @param company_link [String] URL of the company page
# @return [Company, nil] the found or created record; nil when the page has
#   no company name or when any step raises
def crawl_company(company_link)
  # Escape before parsing: a link with non-ASCII characters otherwise fails
  # with "uri must be ascii only".
  page = Nokogiri::HTML(open(URI.parse(URI.escape(company_link))))

  name = page.css(".content .name").text
  return nil if name.empty?

  existing = Company.find_by(name: name)
  return existing if existing.present?

  puts name
  Company.create!(
    name: name,
    address: page.css(".content p")[1].text,
    description: page.css(".main-about-us").css('.content').text
  )
rescue => error
  puts error
  nil
end
# Crawls every job posting in the given list by passing each link to crawl_job.
#
# @param job_links [Array<String>] job page URLs
# @return [Array<String>] the same collection that was passed in
def crawl_jobs(job_links)
  job_links.each(&method(:crawl_job))
end
# Scrapes one job page and stores it as a Job.
#
# Steps: read the title, resolve the company through crawl_company, read the
# three detail boxes (locations, industries, salary/experience/level/expiry),
# then find or create the Job keyed by title and company id.
# Any error raised along the way is printed and returned instead of propagating.
# NOTE(review): URI.escape is obsolete in newer Ruby versions — confirm the target Ruby still provides it.
#
# @param job_link [String] URL of the job page
# @return [nil, StandardError] nil after a normal run (including the early
#   exits for an empty title or an unresolved company); the rescued exception otherwise
def crawl_job(job_link)
  # Escape before parsing: a link with non-ASCII characters otherwise fails
  # with "uri must be ascii only".
  page = Nokogiri::HTML(open(URI.parse(URI.escape(job_link))))

  title = page.at_css('.job-desc p.title').text
  return if title.empty?

  company = crawl_company(page.at_css('.job-desc a.job-company-name')[:href])
  return if company.nil?

  boxes = page.css('.job-detail-content div.detail-box')

  # Box 0 lists the job's cities; match them against known locations.
  city_names = boxes[0].css('p a').map { |node| node.text.strip }
  locations = Location.where(city: city_names)

  # Box 1, second list item, lists the job's industries.
  industry_names = boxes[1].css('ul li')[1].css('p a').map { |node| node.text.strip }
  industries = Industry.where(name: industry_names)

  # Box 2 holds salary, experience, level and expiration date, in that order.
  facts = boxes[2].css('ul li')
  salary, experience, level, expiration_date = (0..3).map { |idx| facts[idx].css('p').text.strip }

  description = page.css('.job-detail-content .detail-row').to_s

  # The block fills in the remaining attributes on the record handed to it
  # (presumably only when a new Job is being created — verify).
  Job.find_or_create_by(title: title, company_id: company.id) do |job|
    job.salary = salary
    job.experience = experience
    job.level = level
    job.expiration_date = expiration_date
    job.description = description
    job.industries << industries
    job.locations << locations
  end
  puts title
rescue => error
  puts error
  error
end
# Seeds Industry and Location records from the option lists on the
# careerbuilder.vn "all jobs" page.
#
# Every `#industry option` text becomes an Industry (found or created by name).
# Every `#location option` text becomes a Location (found or created by city).
# The first Location::CITY_VIETNAM_NUMBER options get oversea = false and the
# rest get oversea = true. The flag is set inside the find_or_create_by block
# (presumably only applied to newly created rows — verify).
#
# Splitting by index also covers a page with fewer options than
# Location::CITY_VIETNAM_NUMBER; the previous `last(count - N)` call raised
# ArgumentError on the resulting negative size.
#
# @return [Array<String>] the scraped location names
def crawl_industries_and_locations
  document = Nokogiri::HTML(open('https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'))

  document.css('#industry option').map(&:text).each do |industry_name|
    Industry.find_or_create_by(name: industry_name)
  end

  city_names = document.css('#location option').map(&:text)
  city_names.each_with_index do |city_name, index|
    Location.find_or_create_by(city: city_name) do |location|
      location.oversea = index >= Location::CITY_VIETNAM_NUMBER
    end
  end
end
\ No newline at end of file
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
# Two sample join rows, each naming a job and an industry by label.
# NOTE(review): presumably these labels refer to entries in the jobs and industries fixture files — verify.
one:
job: one
industry: one
two:
job: two
industry: two
require 'test_helper'
# Placeholder test case for IndustriesJob. It has no active tests yet;
# the example below is still commented out.
class IndustriesJobTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment