Commit 6496e46e by Tô Ngọc Ánh

crawl jobs

parent 5907329d
Pipeline #688 failed with stages
in 0 seconds
# Join record that ties one Job to one Industry.
class IndustriesJob < ApplicationRecord
  # Each row points at exactly one job ...
  belongs_to :job
  # ... and exactly one industry.
  belongs_to :industry
end
......@@ -3,6 +3,7 @@ class Job < ApplicationRecord
has_many :applied_jobs
has_many :histories
has_many :favorites
has_many :locations_jobs
has_many :locations, through: :locations_jobs
has_and_belongs_to_many :industries
end
# A location that jobs can be attached to through the locations_jobs join records.
class Location < ApplicationRecord
  # NOTE(review): presumably the number of Vietnamese cities/provinces the
  # crawler is expected to handle; confirm against the code that reads it.
  CITY_VIETNAM_NUMBER = 70

  # Join rows between this location and its jobs.
  has_many :locations_jobs
  # Jobs reachable through those join rows.
  has_many :jobs, through: :locations_jobs
end
require "open-uri"
task crawl_companies_jobs: :environment do
crawl_companies_and_jobs(10)
task crawl_jobs: :environment do
job_links = get_job_links(1)
crawl_jobs(job_links)
end
# Rake task: depends on :environment, then runs crawl_industries_and_locations.
task(crawl_industries_locations: :environment) { crawl_industries_and_locations }
def crawl_companies_and_jobs(page)
def get_job_links(page)
job_links = []
page.times do |i|
company_links, job_links = get_company_and_job_links("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i}-vi.html")
crawl_companies(company_links)
crawl_jobs(job_links)
end
end
def get_company_and_job_links(base_link)
document = Nokogiri::HTML(open(base_link))
companies_xml = document.xpath('//div/a[@class="company-name"]/@href')
company_links = companies_xml.map(&:value)
document = Nokogiri::HTML(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i}-vi.html"))
jobs_xml = document.xpath('//div/a[@class="job_link"]/@href')
job_links = jobs_xml.map(&:value)
[company_links, job_links]
end
def crawl_companies(company_links)
company_links.each do |link|
crawl_company(link)
jobs_xml.each { |i| job_links << i.value}
end
job_links
end
def crawl_company(company_link)
begin
document = Nokogiri::HTML(open(company_link))
uri = URI.parse(URI.escape(company_link)) #fix error: uri must be ascii only
document = Nokogiri::HTML(open(uri))
company_name = document.css(".content .name").text
return if company_name.empty?
return nil if company_name.empty?
exist = Company.find_by(name: company_name).present?
return if exist
company = Company.find_by(name: company_name)
return company if company.present?
puts company_name
company_address = document.css(".content p")[1].text
......@@ -46,6 +34,7 @@ def crawl_company(company_link)
company = Company.create!(name: company_name, address: company_address, description: company_description)
rescue => exception
puts exception
return nil
end
end
......@@ -57,16 +46,39 @@ end
# Crawls one job posting page and stores it as a Job record.
#
# The posting's company is resolved through crawl_company. The job is skipped
# when the page has an empty title or when crawl_company returns nil.
#
# @param job_link [String] URL of the job posting; may contain non-ASCII characters
# @return [nil, StandardError] nil when the job was stored or skipped, or the
#   rescued exception when anything failed while crawling
def crawl_job(job_link)
  # Escape before parsing: URI.parse rejects non-ASCII links ("URI must be ascii only").
  # URI::DEFAULT_PARSER.escape is the drop-in replacement for the obsolete URI.escape.
  uri = URI.parse(URI::DEFAULT_PARSER.escape(job_link))
  # Fetch the page exactly once, through the escaped URI; the block form closes
  # the downloaded stream as soon as it has been parsed.
  document = uri.open { |page| Nokogiri::HTML(page) }

  job_title = document.at_css('.job-desc p.title').text
  return if job_title.empty?

  job_company_link = document.at_css('.job-desc a.job-company-name')[:href]
  job_company = crawl_company(job_company_link)
  return if job_company.nil?

  job_detail = document.css('.job-detail-content div.detail-box')

  # First detail box: location links, matched against Location#city.
  job_location_names = job_detail[0].css('p a').map { |val| val.text.strip }
  job_locations = Location.where(city: job_location_names)

  # Second detail box, second list item: industry links, matched against Industry#name.
  job_industry_names = job_detail[1].css('ul li')[1].css('p a').map { |val| val.text.strip }
  job_industries = Industry.where(name: job_industry_names)

  # Third detail box: list items read as salary, experience, level, expiration date.
  summary_items = job_detail[2].css('ul li')
  job_salary = summary_items[0].css('p').text.strip
  job_experience = summary_items[1].css('p').text.strip
  job_level = summary_items[2].css('p').text.strip
  job_expiration_date = summary_items[3].css('p').text.strip

  job_description = document.css('.job-detail-content .detail-row').to_s

  # find_or_create_by only yields to the block when it creates a new record,
  # so an existing job with the same title and company is left untouched.
  Job.find_or_create_by(title: job_title, company_id: job_company.id) do |job|
    job.salary = job_salary
    job.experience = job_experience
    job.level = job_level
    job.expiration_date = job_expiration_date
    job.description = job_description
    job.industries << job_industries
    job.locations << job_locations
  end
  puts job_title
rescue => exception
  # Best-effort crawl: report the failure and hand the exception back to the caller.
  puts exception
  exception
end
......
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
# Two sample rows, each pairing a job with an industry by fixture label.
# NOTE(review): assumes the jobs and industries fixture files define entries
# labelled "one" and "two" — confirm.
one:
job: one
industry: one
two:
job: two
industry: two
require 'test_helper'
# Test case for IndustriesJob. It defines no tests yet: the sample test below
# is still commented out.
class IndustriesJobTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment