Commit d2e14dc8 by Ngô Trung Hưng

autoload_paths

parent 043ca43e
Pipeline #757 canceled with stages
in 0 seconds
...@@ -9,9 +9,11 @@ Bundler.require(*Rails.groups) ...@@ -9,9 +9,11 @@ Bundler.require(*Rails.groups)
module Venjob module Venjob
class Application < Rails::Application class Application < Rails::Application
# Initialize configuration defaults for originally generated Rails version. # Initialize configuration defaults for originally generated Rails version.
config.autoload_paths << Rails.root.join('lib')
config.eager_load_paths << Rails.root.join('lib')
config.load_defaults 5.2 config.load_defaults 5.2
config.autoload_paths += [
Rails.root.join('lib/src'),
Rails.root.join('lib/src/base'),
Rails.root.join('lib/src/interface')]
# Settings in config/environments/* take precedence over those specified here. # Settings in config/environments/* take precedence over those specified here.
# Application configuration can go into files in config/initializers # Application configuration can go into files in config/initializers
# -- all .rb files in that directory are automatically loaded after loading # -- all .rb files in that directory are automatically loaded after loading
......
...@@ -5,10 +5,14 @@ require 'open-uri' ...@@ -5,10 +5,14 @@ require 'open-uri'
# Crawler data # Crawler data
class Crawler class Crawler
COMPANY_SECURITY = 1 COMPANY_SECURITY = 1
NUMBER_LINK = 5
SIZE_LI = 8
RANGE = 69 RANGE = 69
attr_accessor :number_link
def initialize(number_link)
@number_link = number_link
end
def path_to_first_link def path_to_first_link
Rails.root.join('tmp', 'link.txt') Rails.root.join('tmp', 'link.txt')
end end
...@@ -23,12 +27,12 @@ class Crawler ...@@ -23,12 +27,12 @@ class Crawler
end end
def safe_link(url) def safe_link(url)
Nokogiri::HTML(URI.open(URI.parse(URI.escape(url)))) Nokogiri::HTML(URI.open(URI.escape(url)))
end end
def crawl_link(page) def crawl_link
website_companies = [] website_companies = []
page.times do |i| number_link.times do |i|
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html")) page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
link_companies = page.search('.figcaption .caption @href') link_companies = page.search('.figcaption .caption @href')
website_companies += link_companies.map(&:value).uniq website_companies += link_companies.map(&:value).uniq
...@@ -50,7 +54,7 @@ class Crawler ...@@ -50,7 +54,7 @@ class Crawler
end end
def craw_data_companies def craw_data_companies
crawl_link(NUMBER_LINK).each do |url| crawl_link.each do |url|
page = safe_link(url) page = safe_link(url)
company_name = page.search('.company-info .content .name').text company_name = page.search('.company-info .content .name').text
Company.find_or_create_by(name: company_name) do |company| Company.find_or_create_by(name: company_name) do |company|
......
# frozen_string_literal: true # frozen_string_literal: true
require 'src/crawler.rb'
require_relative '../src/interface/red_interface.rb'
require_relative '../src/interface/blue_interface.rb'
require_relative '../src/interface/green_interface.rb'
# Crawler data job # Crawler data job
class CrawlerJob < Crawler class CrawlerJob < Crawler
def crawl_link(page) SIZE_LI = 8
def crawl_link
website_jobs = [] website_jobs = []
page.times do |i| number_link.times do |i|
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html")) page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{i + 1}-vi.html"))
link_jobs = page.search('.figcaption .title .job_link @href') link_jobs = page.search('.figcaption .title .job_link @href')
website_jobs += link_jobs.map(&:value) link_jobs.each do |val|
break if website_jobs.include?(link_make_stop_crawler) link = val.value
return website_jobs if link.include?(link_make_stop_crawler)
website_jobs << link
end
end end
File.write(path_to_first_link, website_jobs[0]) website_jobs
website_jobs.select(&:present?)
rescue StandardError => e rescue StandardError => e
logger.error "Crawler link on page have error #{e}" logger.error "Crawler link jobs on page have error #{e}"
end
def parse_data
@box_links ||= crawl_link.reverse!
end end
def reverse_arr def refresh_first_link
arr_link = [] File.write(path_to_first_link, parse_data.last)
crawl_link(NUMBER_LINK).each { |val| arr_link << val }
arr_link.reverse!
end end
def craw_data_jobs def craw_data_jobs
reverse_arr.each do |path| parse_data.each do |path|
page = safe_link(path) page = safe_link(path)
if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present? if page.search('.item-blue .detail-box:nth-child(1) ul li:nth-child(1) p')[0].present?
@data = RedInterface.new(page).create_data @data = RedInterface.new(page).create_data
...@@ -38,12 +39,12 @@ class CrawlerJob < Crawler ...@@ -38,12 +39,12 @@ class CrawlerJob < Crawler
@data = GreenInterface.new(page).create_data @data = GreenInterface.new(page).create_data
end end
add_data(@data) add_data(@data)
refresh_first_link
end end
end end
def add_data(data) def add_data(data)
id_company = Company.find_by name: data[:company_name] id_company = (Company.find_by name: data[:company_name]).try(:id) || COMPANY_SECURITY
id_company = id_company.present? ? id_company.id : COMPANY_SECURITY
job = Job.create(name: data[:name], job = Job.create(name: data[:name],
company_id: id_company, company_id: id_company,
level: data[:level], level: data[:level],
......
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../base/base.rb'
# Inherits from Base # Inherits from Base
class BlueInterface < Base class BlueInterface < Base
def fill_company_name def fill_company_name
......
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../base/base.rb'
# ahihi # ahihi
class GreenInterface < Base class GreenInterface < Base
def fill_name def fill_name
......
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../base/base.rb'
# Inherits from Base # Inherits from Base
class RedInterface < Base class RedInterface < Base
end end
# frozen_string_literal: true # frozen_string_literal: true
require 'open-uri' require 'open-uri'
require 'src/crawler'
require 'src/crawler_job'
# rake task # rake task
namespace :crawler do namespace :crawler do
task populate: :environment do task populate: :environment do
NUMBER_LINK_WILL_BE_CRAWLER = 100
Company.find_or_create_by(name: 'Bảo mật') do |company| Company.find_or_create_by(name: 'Bảo mật') do |company|
company.address = 'Vui lòng xem trong mô tả công việc' company.address = 'Vui lòng xem trong mô tả công việc'
company.short_description = 'Vui lòng xem trong mô tả công việc' company.short_description = 'Vui lòng xem trong mô tả công việc'
end end
cw = Crawler.new cw = Crawler.new(NUMBER_LINK_WILL_BE_CRAWLER)
cw.craw_data_cities cw.craw_data_cities
cw.craw_data_companies cw.craw_data_companies
CrawlerJob.new.craw_data_jobs CrawlerJob.new(NUMBER_LINK_WILL_BE_CRAWLER).craw_data_jobs
end end
end end
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment