Commit b45a5d2a by Tô Ngọc Ánh

crawl companies, industries, locations

parent 178460a2
Pipeline #687 failed with stages
in 0 seconds
......@@ -61,3 +61,7 @@ end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby]
## HTML parser used by the crawler rake tasks (they call Nokogiri::HTML)
gem "nokogiri"
##
......@@ -203,6 +203,7 @@ DEPENDENCIES
jbuilder (~> 2.5)
listen (>= 3.0.5, < 3.2)
mysql2 (>= 0.4.4, < 0.6.0)
nokogiri
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
sass-rails (~> 5.0)
......
# Migration: makes company names unique at the database level.
class AddIndexToCompany < ActiveRecord::Migration[5.2]
  # Adds a unique index on companies.name.
  def change
    add_index(:companies, :name, unique: true)
  end
end
# Migration: replaces the string `area` column on locations with a boolean
# `oversea` column. Existing `area` values are dropped, not converted.
class ChangeColAreaToOverseaInLocation < ActiveRecord::Migration[5.2]
  def change
    # The column type is passed along with the name so the removal can be
    # reversed on rollback.
    remove_column(:locations, :area, :string)
    add_column(:locations, :oversea, :boolean)
  end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_07_17_014308) do
ActiveRecord::Schema.define(version: 2020_07_20_075150) do
create_table "applied_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.bigint "user_id"
......@@ -30,6 +30,7 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do
t.string "address"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["name"], name: "index_companies_on_name", unique: true
end
create_table "favorites", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......@@ -77,10 +78,10 @@ ActiveRecord::Schema.define(version: 2020_07_17_014308) do
end
create_table "locations", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
t.string "area"
t.string "city"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.boolean "oversea"
end
create_table "locations_jobs", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8", force: :cascade do |t|
......
# Without a description the task is omitted from `rake -T`.
desc "Crawl companies linked from the first 3 careerbuilder.vn job listing pages"
task crawl_companies_jobs: :environment do
  # Loaded lazily so open-uri is only required when the task actually runs.
  require "open-uri"
  crawl_companies_and_jobs(3)
end
# Without a description the task is omitted from `rake -T`.
desc "Crawl the industry and location option lists from careerbuilder.vn"
task crawl_industries_locations: :environment do
  # Loaded lazily so open-uri is only required when the task actually runs.
  require "open-uri"
  crawl_industries_and_locations
end
# Walks the job listing pages 1..page and crawls every company linked from
# each of them.
#
# @param page [Integer] number of listing pages to visit, starting at page 1
# @return [Range] the range of page numbers that was iterated
def crawl_companies_and_jobs(page)
  (1..page).each do |page_number|
    listing_url = "https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page_number}-vi.html"
    # The job links are returned as well, but nothing consumes them yet.
    company_links, _job_links = get_company_and_job_links(listing_url)
    crawl_companies(company_links)
  end
end
# Fetches one listing page and extracts the company and job hrefs from it.
#
# @param base_link [String] absolute http(s) URL of the listing page
# @return [Array(Array<String>, Array<String>)] company links, then job links
# @raise [URI::InvalidURIError] if base_link is not a parseable URI
def get_company_and_job_links(base_link)
  # URI.parse(...).open can only perform a URL fetch. Kernel#open on a string
  # would spawn a subprocess for input starting with "|", and it no longer
  # handles URLs at all on Ruby 3. The block form closes the stream afterwards.
  document = URI.parse(base_link).open { |io| Nokogiri::HTML(io) }

  company_links = document.xpath('//div/a[@class="company-name"]/@href').map(&:value)
  job_links = document.xpath('//div/a[@class="job_link"]/@href').map(&:value)
  [company_links, job_links]
end
# Crawls each company page in turn.
#
# @param company_links [Array<String>] company page URLs
# @return [Array<String>] the same list that was passed in
def crawl_companies(company_links)
  company_links.each { |company_url| crawl_company(company_url) }
end
# Scrapes one company page and stores the company unless a company with the
# same name already exists. Failures are best-effort: they are reported on
# stderr and returned instead of raised, so one bad page does not abort a
# whole crawl.
#
# @param company_link [String] absolute http(s) URL of the company page
# @return [Company, nil, StandardError] the created record; nil when the name
#   is empty or already stored; the rescued exception on failure
def crawl_company(company_link)
  # company_link comes from scraped markup, i.e. untrusted input.
  # URI.parse(...).open can only fetch a URL, whereas Kernel#open would run a
  # shell command for a string starting with "|".
  document = URI.parse(company_link).open { |io| Nokogiri::HTML(io) }

  company_name = document.css(".content .name").text
  # Check for an empty name first so no query is issued for it.
  return if company_name.empty? || Company.exists?(name: company_name)

  puts company_name
  Company.create!(
    name: company_name,
    # The address is read from the second paragraph inside .content; if the
    # page has no such paragraph the NoMethodError is handled below.
    address: document.css(".content p")[1].text,
    description: document.css(".main-about-us").css('.content').text
  )
rescue => exception
  warn "crawl_company failed for #{company_link}: #{exception.class}: #{exception.message}"
  exception
end
# Work-in-progress scraper for a single job page. It only extracts values
# into locals so far; nothing is persisted.
#
# @param job_link [String] absolute http(s) URL of the job page
# @return [Object, nil] the last extracted value, or nil when anything fails
def crawl_job(job_link)
  # URI.parse(...).open can only fetch a URL, whereas Kernel#open would run a
  # shell command for a string starting with "|".
  document = URI.parse(job_link).open { |io| Nokogiri::HTML(io) }

  # TODO: every css('') below is a placeholder selector that still has to be
  # filled in; only the title selector is real.
  job_company = document.css('')
  job_title = document.css('.job-desc p.title').text
  job_salary = document.css('')
  job_experience = document.css('')
  job_level = document.css('')
  job_expiration_date = document.css('')
  job_description = document.css('')
rescue => exception
  # Still best-effort, but the failure is reported instead of vanishing.
  warn "crawl_job failed for #{job_link}: #{exception.class}: #{exception.message}"
  nil
end
# Reads the industry and location <option> lists from the job listing page
# and stores every entry that is not in the database yet. Entries that
# already exist are skipped individually, so an interrupted run can simply be
# repeated.
#
# @param domestic_location_count [Integer] how many leading location options
#   are stored with oversea: false; the rest get oversea: true. The default
#   of 70 is the cut-off the crawler has always used; presumably it mirrors
#   the ordering of the site's dropdown (confirm against the page).
# @return [void]
def crawl_industries_and_locations(domestic_location_count: 70)
  listing_url = 'https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'
  # Kernel#open no longer fetches URLs on Ruby 3; URI#open works on every
  # Ruby version once open-uri is loaded. The block form closes the stream.
  document = URI.parse(listing_url).open { |io| Nokogiri::HTML(io) }

  document.css('#industry option').map(&:text).each do |industry|
    # `next`, not `break`: one existing row must not stop the remaining ones.
    next if Industry.exists?(name: industry)

    puts industry
    Industry.create!(name: industry)
  end

  # A single indexed pass replaces take(70) / last(count - 70); the latter
  # raised ArgumentError whenever fewer than 70 options were present.
  document.css('#location option').map(&:text).each_with_index do |city, index|
    next if Location.exists?(city: city)

    puts city
    Location.create!(oversea: index >= domestic_location_count, city: city)
  end
end
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment