Fix code and remove css.file

parent b29fa28d
Pipeline #734 canceled with stages
in 0 seconds
...@@ -44,6 +44,7 @@ end ...@@ -44,6 +44,7 @@ end
group :development do group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code. # Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'pry'
gem 'web-console', '>= 3.3.0' gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2' gem 'listen', '>= 3.0.5', '< 3.2'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring # Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
......
...@@ -66,6 +66,7 @@ GEM ...@@ -66,6 +66,7 @@ GEM
archive-zip (~> 0.10) archive-zip (~> 0.10)
nokogiri (~> 1.8) nokogiri (~> 1.8)
chronic (0.10.2) chronic (0.10.2)
coderay (1.1.3)
coffee-rails (4.2.2) coffee-rails (4.2.2)
coffee-script (>= 2.2.0) coffee-script (>= 2.2.0)
railties (>= 4.0.0) railties (>= 4.0.0)
...@@ -130,6 +131,9 @@ GEM ...@@ -130,6 +131,9 @@ GEM
parallel (1.19.2) parallel (1.19.2)
parser (2.7.1.4) parser (2.7.1.4)
ast (~> 2.4.1) ast (~> 2.4.1)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5) public_suffix (4.0.5)
puma (3.12.6) puma (3.12.6)
rack (2.2.3) rack (2.2.3)
...@@ -247,6 +251,7 @@ DEPENDENCIES ...@@ -247,6 +251,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2) listen (>= 3.0.5, < 3.2)
mechanize (~> 2.7.6) mechanize (~> 2.7.6)
mysql2 (~> 0.5.3) mysql2 (~> 0.5.3)
pry
puma (~> 3.11) puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3) rails (~> 5.2.4, >= 5.2.4.3)
rubocop (~> 0.88.0) rubocop (~> 0.88.0)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
env :PATH, ENV['PATH'] env :PATH, ENV['PATH']
every 5.minutes do every 20.minutes do
rake 'import:auto' rake 'import:auto'
end end
File added
class Crawler class Crawler
def initialize(logger)
@mylogger = logger
end
def crawl_city def crawl_city
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html")) page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
get_name = page.search('select#location') get_name = page.search('select#location')
...@@ -26,7 +30,7 @@ class Crawler ...@@ -26,7 +30,7 @@ class Crawler
end end
def crawl_company def crawl_company
for n in 1..10 (1..10).each do |n|
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html")) company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] } company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] }
company_link.each do |link| company_link.each do |link|
...@@ -48,7 +52,7 @@ class Crawler ...@@ -48,7 +52,7 @@ class Crawler
introduction: introduction_company) introduction: introduction_company)
end end
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end end
end end
end end
...@@ -57,30 +61,25 @@ class Crawler ...@@ -57,30 +61,25 @@ class Crawler
end end
def crawl_job_relationships def crawl_job_relationships
for n in 1..10 (1..10).each do |n|
page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html")) page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
get_link = page_access.css('a.job_link').map { |link| link['href'] } get_link = page_access.css('a.job_link').map { |link| link['href'] }
get_link.each do |link| get_link.each do |link|
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link)))) page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row') get_row = page_job.search('div.bg-blue div.row')
if get_row != "" if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company) company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row') description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') } next if company_table.nil?
arr_column.each_with_index do |val, key |
unless company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id) job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật') salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
arr_data = val.gsub('Ngày cập nhật ', '').split(' ') experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
date = arr_data.first level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil? expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*') if job_check.nil?
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
job = Job.create!(title: title_job, job = Job.create!(title: title_job,
level: level, level: level,
salary: salary, salary: salary,
...@@ -88,40 +87,29 @@ class Crawler ...@@ -88,40 +87,29 @@ class Crawler
expiration_date: expiration_date, expiration_date: expiration_date,
description: description, description: description,
company_id: company_table.id) company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end end
next if !company_table.nil? find_job = Job.find_by(title: title_job, company_id: company_table.id)
job_table = Job.find_by(title: title_job) puts find_job.title
unless job_table.nil? unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip } location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc| location_rel.each do |loc|
city_table = City.find_by(name: loc) city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil? if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{job_table.id} - #{city_table.id}.#{loc}" puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id) city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end end
end end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip } industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind| industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind) industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil? if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{job_table.id} - #{industry_table.id}.#{ind}" puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id) industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end end
end end
end end
rescue StandardError => e
@mylogger.error "#{e.message}"
end end
end end
end end
...@@ -130,9 +118,6 @@ class Crawler ...@@ -130,9 +118,6 @@ class Crawler
def get_file_csv def get_file_csv
Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp| Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp|
files = ftp.list
puts "list files:"
puts files
ftp.getbinaryfile('jobs.zip') ftp.getbinaryfile('jobs.zip')
end end
end end
...@@ -147,24 +132,24 @@ class Crawler ...@@ -147,24 +132,24 @@ class Crawler
end end
end end
def import_file_csv def import_file_csv(file)
file = "jobs.csv"
CSV.foreach(file, headers: true) do |row| CSV.foreach(file, headers: true) do |row|
begin begin
company_name = row["company name"] company_name = row["company name"]
company_address = row["company address"] company_address = row["company address"]
company_introduction = row[:benefit] company_introduction = row["benefit"]
company_table = Company.find_by(name: company_name) company_table = Company.find_by(name: company_name)
if company_table.nil? if company_table.nil?
company_table = Company.create!(name: company_name, company_table = Company.create!(name: company_name,
address: company_address, address: company_address,
introduction: company_introduction) introduction: company_introduction)
end end
title_job = row[:name] title_job = row["name"]
description_job = row[:description] description_job = "#{row["description"]} #{row["requirement"]}"
level = row[:level] level = row["level"]
salary = row[:salary] salary = row["salary"]
unless company_table.nil? job_table = Job.find_by(title: title_job)
if !company_table.nil? && job_table.nil?
job_table = Job.create!(title: title_job, job_table = Job.create!(title: title_job,
description: description_job, description: description_job,
level: level, level: level,
...@@ -172,7 +157,7 @@ class Crawler ...@@ -172,7 +157,7 @@ class Crawler
company_id: company_table.id) company_id: company_table.id)
puts job_table.id puts job_table.id
end end
industry = row[:category] industry = row["category"]
industry_find = Industry.find_by(name: industry) industry_find = Industry.find_by(name: industry)
if industry_find.nil? if industry_find.nil?
industry_table = Industry.create!(name: industry) industry_table = Industry.create!(name: industry)
...@@ -192,16 +177,8 @@ class Crawler ...@@ -192,16 +177,8 @@ class Crawler
end end
puts "Location: #{location}" puts "Location: #{location}"
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end
end end
end end
def logger
# config.log_level = :info
Rails.logger = Logger.new(STDOUT)
Rails.logger = Logger.new "#{Rails.root}/log/#{Rails.env}.log"
Rails.logger.level = Logger::DEBUG
Rails.logger.datetime_format = "%Y-%m-%d %H:%M:%S"
end end
end end
class Crontab class Crontab
def initialize(logger)
@mylogger = logger
end
def find_company def find_company
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html")) company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html"))
company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] } company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] }
...@@ -18,7 +22,7 @@ class Crontab ...@@ -18,7 +22,7 @@ class Crontab
introduction: introduction_company) introduction: introduction_company)
end end
rescue StandardError => e rescue StandardError => e
puts e @mylogger.error "#{e.message}"
end end
end end
end end
...@@ -31,24 +35,18 @@ class Crontab ...@@ -31,24 +35,18 @@ class Crontab
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link)))) page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row') get_row = page_job.search('div.bg-blue div.row')
if get_row != "" if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company) company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row') description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') } next if company_table.nil?
job_table = Job.find_by(title: title_job)
arr_column.each do |val|
unless company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id) job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật') salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
arr_data = val.gsub('Ngày cập nhật ', '').split(' ') experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
date_update = arr_data.first level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil? expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*') if job_check.nil?
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
job = Job.create!(title: title_job, job = Job.create!(title: title_job,
level: level, level: level,
salary: salary, salary: salary,
...@@ -56,38 +54,29 @@ class Crontab ...@@ -56,38 +54,29 @@ class Crontab
expiration_date: expiration_date, expiration_date: expiration_date,
description: description, description: description,
company_id: company_table.id) company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end end
end find_job = Job.find_by(title: title_job, company_id: company_table.id)
if !job_table.nil? && !company_table.nil? puts find_job.title
unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip } location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc| location_rel.each do |loc|
city_table = City.find_by(name: loc) city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil? if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City #{city_table.id} => #{loc}" puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id) city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end end
end end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip } industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind| industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind) industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil? if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry #{job_table.id} - #{industry_table.id} => #{ind}" puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id) industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end
end end
end end
rescue StandardError => e
@mylogger.error "#{e.message}"
end end
end end
end end
......
...@@ -3,34 +3,27 @@ require 'src/crontab.rb' ...@@ -3,34 +3,27 @@ require 'src/crontab.rb'
require 'net/ftp' require 'net/ftp'
require 'csv' require 'csv'
require 'zip' require 'zip'
action = Crawler.new
crontab = Crontab.new
namespace :import do namespace :import do
logger ||= Logger.new(Rails.root.join('log','my.log'))
desc 'crawler data' desc 'crawler data'
task crawler: :environment do task crawler: :environment do
action = Crawler.new(logger)
action.crawl_city action.crawl_city
action.crawl_industry action.crawl_industry
action.crawl_company action.crawl_company
action.crawl_job_relationships action.crawl_job_relationships
end end
desc 'get file CSV from Server'
task csv_get: :environment do
action.get_file_csv
action.extract_zip('./jobs.zip','.')
end
desc 'Import data from CSV'
task data_csv: :environment do
action.import_file_csv
end
desc 'Crontab' desc 'Crontab'
task auto: :environment do task auto: :environment do
action = Crawler.new(logger)
crontab = Crontab.new(logger)
crontab.find_company crontab.find_company
crontab.find_job crontab.find_job
action.logger action.get_file_csv
end action.extract_zip('./jobs.zip', 'lib/csv')
action.import_file_csv('lib/csv')
task log: :environment do
action.logger
end end
end end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment