Fix code and remove css.file

parent b29fa28d
Pipeline #734 canceled with stages
in 0 seconds
...@@ -44,6 +44,7 @@ end ...@@ -44,6 +44,7 @@ end
group :development do group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code. # Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'pry'
gem 'web-console', '>= 3.3.0' gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2' gem 'listen', '>= 3.0.5', '< 3.2'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
......
...@@ -66,6 +66,7 @@ GEM ...@@ -66,6 +66,7 @@ GEM
archive-zip (~> 0.10) archive-zip (~> 0.10)
nokogiri (~> 1.8) nokogiri (~> 1.8)
chronic (0.10.2) chronic (0.10.2)
coderay (1.1.3)
coffee-rails (4.2.2) coffee-rails (4.2.2)
coffee-script (>= 2.2.0) coffee-script (>= 2.2.0)
railties (>= 4.0.0) railties (>= 4.0.0)
...@@ -130,6 +131,9 @@ GEM ...@@ -130,6 +131,9 @@ GEM
parallel (1.19.2) parallel (1.19.2)
parser (2.7.1.4) parser (2.7.1.4)
ast (~> 2.4.1) ast (~> 2.4.1)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5) public_suffix (4.0.5)
puma (3.12.6) puma (3.12.6)
rack (2.2.3) rack (2.2.3)
...@@ -247,6 +251,7 @@ DEPENDENCIES ...@@ -247,6 +251,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mechanize (~> 2.7.6) mechanize (~> 2.7.6)
mysql2 (~> 0.5.3) mysql2 (~> 0.5.3)
pry
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
rubocop (~> 0.88.0) rubocop (~> 0.88.0)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
env :PATH, ENV['PATH'] env :PATH, ENV['PATH']
every 5.minutes do every 20.minutes do
rake 'import:auto' rake 'import:auto'
end end
File added
class Crawler class Crawler
def initialize(logger)
@mylogger = logger
end
def crawl_city def crawl_city
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")) page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
get_name = page.search('select#location') get_name = page.search('select#location')
...@@ -26,7 +30,7 @@ class Crawler ...@@ -26,7 +30,7 @@ class Crawler
end end
def crawl_company def crawl_company
for n in 1..10 (1..10).each do |n|
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html")) company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] } company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] }
company_link.each do |link| company_link.each do |link|
...@@ -47,8 +51,8 @@ class Crawler ...@@ -47,8 +51,8 @@ class Crawler
address: address_company, address: address_company,
introduction: introduction_company) introduction: introduction_company)
end end
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end end
end end
end end
...@@ -57,71 +61,55 @@ class Crawler ...@@ -57,71 +61,55 @@ class Crawler
end end
def crawl_job_relationships def crawl_job_relationships
for n in 1..10 (1..10).each do |n|
page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html")) page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
get_link = page_access.css('a.job_link').map { |link| link['href'] } get_link = page_access.css('a.job_link').map { |link| link['href'] }
get_link.each do |link| get_link.each do |link|
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link)))) page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row') get_row = page_job.search('div.bg-blue div.row')
if get_row != "" if get_row != ""
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip begin
company_table = Company.find_by(name: get_name_company) get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
title_job = page_job.search('div.job-desc p').text company_table = Company.find_by(name: get_name_company)
description = page_job.search('div.detail-row') title_job = page_job.search('div.job-desc p').text
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') } description = page_job.search('div.detail-row')
arr_column.each_with_index do |val, key | next if company_table.nil?
unless company_table.nil? job_check = Job.find_by(title: title_job, company_id: company_table.id)
job_check = Job.find_by(title: title_job, company_id: company_table.id) salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
if val.include?('Ngày cập nhật') experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
arr_data = val.gsub('Ngày cập nhật ', '').split(' ') level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
date = arr_data.first expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil? if job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*') job = Job.create!(title: title_job,
salary = arr_sub[0] level: level,
experience = arr_sub[1] salary: salary,
level = arr_sub[2] experience: experience,
expiration_date = arr_sub[3] expiration_date: expiration_date,
job = Job.create!(title: title_job, description: description,
level: level, company_id: company_table.id)
salary: salary, end
experience: experience, find_job = Job.find_by(title: title_job, company_id: company_table.id)
expiration_date: expiration_date, puts find_job.title
description: description, unless find_job.nil?
company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end
next if !company_table.nil?
job_table = Job.find_by(title: title_job)
unless job_table.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip } location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc| location_rel.each do |loc|
city_table = City.find_by(name: loc) city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil? if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{job_table.id} - #{city_table.id}.#{loc}" puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id) city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end end
end end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip } industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind| industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind) industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil? if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{job_table.id} - #{industry_table.id}.#{ind}" puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id) industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end end
end end
end end
rescue StandardError => e
@mylogger.error "#{e.message}"
end end
end end
end end
...@@ -130,9 +118,6 @@ class Crawler ...@@ -130,9 +118,6 @@ class Crawler
def get_file_csv def get_file_csv
Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp| Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp|
files = ftp.list
puts "list files:"
puts files
ftp.getbinaryfile('jobs.zip') ftp.getbinaryfile('jobs.zip')
end end
end end
...@@ -147,24 +132,24 @@ class Crawler ...@@ -147,24 +132,24 @@ class Crawler
end end
end end
def import_file_csv def import_file_csv(file)
file = "jobs.csv"
CSV.foreach(file, headers: true) do |row| CSV.foreach(file, headers: true) do |row|
begin begin
company_name = row["company name"] company_name = row["company name"]
company_address = row["company address"] company_address = row["company address"]
company_introduction = row[:benefit] company_introduction = row["benefit"]
company_table = Company.find_by(name: company_name) company_table = Company.find_by(name: company_name)
if company_table.nil? if company_table.nil?
company_table = Company.create!(name: company_name, company_table = Company.create!(name: company_name,
address: company_address, address: company_address,
introduction: company_introduction) introduction: company_introduction)
end end
title_job = row[:name] title_job = row["name"]
description_job = row[:description] description_job = "#{row["description"]} #{row["requirement"]}"
level = row[:level] level = row["level"]
salary = row[:salary] salary = row["salary"]
unless company_table.nil? job_table = Job.find_by(title: title_job)
if !company_table.nil? && job_table.nil?
job_table = Job.create!(title: title_job, job_table = Job.create!(title: title_job,
description: description_job, description: description_job,
level: level, level: level,
...@@ -172,7 +157,7 @@ class Crawler ...@@ -172,7 +157,7 @@ class Crawler
company_id: company_table.id) company_id: company_table.id)
puts job_table.id puts job_table.id
end end
industry = row[:category] industry = row["category"]
industry_find = Industry.find_by(name: industry) industry_find = Industry.find_by(name: industry)
if industry_find.nil? if industry_find.nil?
industry_table = Industry.create!(name: industry) industry_table = Industry.create!(name: industry)
...@@ -192,16 +177,8 @@ class Crawler ...@@ -192,16 +177,8 @@ class Crawler
end end
puts "Location: #{location}" puts "Location: #{location}"
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end end
end end
end end
def logger
# config.log_level = :info
Rails.logger = Logger.new(STDOUT)
Rails.logger = Logger.new "#{Rails.root}/log/#{Rails.env}.log"
Rails.logger.level = Logger::DEBUG
Rails.logger.datetime_format = "%Y-%m-%d %H:%M:%S"
end
end end
class Crontab class Crontab
def initialize(logger)
@mylogger = logger
end
def find_company def find_company
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html")) company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html"))
company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] } company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] }
...@@ -18,7 +22,7 @@ class Crontab ...@@ -18,7 +22,7 @@ class Crontab
introduction: introduction_company) introduction: introduction_company)
end end
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end end
end end
end end
...@@ -31,63 +35,48 @@ class Crontab ...@@ -31,63 +35,48 @@ class Crontab
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link)))) page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row') get_row = page_job.search('div.bg-blue div.row')
if get_row != "" if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company) company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row') description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') } next if company_table.nil?
job_table = Job.find_by(title: title_job) job_check = Job.find_by(title: title_job, company_id: company_table.id)
arr_column.each do |val| salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
unless company_table.nil? experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
job_check = Job.find_by(title: title_job, company_id: company_table.id) level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
if val.include?('Ngày cập nhật') expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
arr_data = val.gsub('Ngày cập nhật ', '').split(' ') if job_check.nil?
date_update = arr_data.first
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
job = Job.create!(title: title_job, job = Job.create!(title: title_job,
level: level, level: level,
salary: salary, salary: salary,
experience: experience, experience: experience,
expiration_date: expiration_date, expiration_date: expiration_date,
description: description, description: description,
company_id: company_table.id) company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end end
end find_job = Job.find_by(title: title_job, company_id: company_table.id)
end puts find_job.title
if !job_table.nil? && !company_table.nil? unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip } location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc| location_rel.each do |loc|
city_table = City.find_by(name: loc) city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil? if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City #{city_table.id} => #{loc}" puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id) city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end
end end
end industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip } industry_rel.each do |ind|
industry_rel.each do |ind| industry_table = Industry.find_by(name: ind)
industry_table = Industry.find_by(name: ind) if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil? puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
puts "Created Industry #{job_table.id} - #{industry_table.id} => #{ind}" industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id) end
end end
end end
rescue StandardError => e
@mylogger.error "#{e.message}"
end end
end end
end end
......
...@@ -3,34 +3,27 @@ require 'src/crontab.rb' ...@@ -3,34 +3,27 @@ require 'src/crontab.rb'
require 'net/ftp' require 'net/ftp'
require 'csv' require 'csv'
require 'zip' require 'zip'
action = Crawler.new
crontab = Crontab.new
namespace :import do namespace :import do
logger ||= Logger.new(Rails.root.join('log','my.log'))
desc 'crawler data' desc 'crawler data'
task crawler: :environment do task crawler: :environment do
action = Crawler.new(logger)
action.crawl_city action.crawl_city
action.crawl_industry action.crawl_industry
action.crawl_company action.crawl_company
action.crawl_job_relationships action.crawl_job_relationships
end end
desc 'get file CSV from Server'
task csv_get: :environment do
action.get_file_csv
action.extract_zip('./jobs.zip','.')
end
desc 'Import data from CSV'
task data_csv: :environment do
action.import_file_csv
end
desc 'Crontab' desc 'Crontab'
task auto: :environment do task auto: :environment do
action = Crawler.new(logger)
crontab = Crontab.new(logger)
crontab.find_company crontab.find_company
crontab.find_job crontab.find_job
action.logger action.get_file_csv
end action.extract_zip('./jobs.zip', 'lib/csv')
action.import_file_csv('lib/csv')
task log: :environment do
action.logger
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment