Fix code and remove unused CSS file

parent b29fa28d
Pipeline #734 canceled with stages
in 0 seconds
......@@ -44,6 +44,7 @@ end
group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'pry'
gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
......
......@@ -66,6 +66,7 @@ GEM
archive-zip (~> 0.10)
nokogiri (~> 1.8)
chronic (0.10.2)
coderay (1.1.3)
coffee-rails (4.2.2)
coffee-script (>= 2.2.0)
railties (>= 4.0.0)
......@@ -130,6 +131,9 @@ GEM
parallel (1.19.2)
parser (2.7.1.4)
ast (~> 2.4.1)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5)
puma (3.12.6)
rack (2.2.3)
......@@ -247,6 +251,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2)
mechanize (~> 2.7.6)
mysql2 (~> 0.5.3)
pry
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
rubocop (~> 0.88.0)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# Crontab schedule (whenever gem DSL).
# Inherit the invoking shell's PATH so rake/bundler resolve under cron.
env :PATH, ENV['PATH']

# Every 20 minutes, refresh crawled data and re-import the CSV feed.
# (Diff residue had left both the old `every 5.minutes` header and the new
# `every 20.minutes` header with a single `end`; the 20-minute interval is
# the added side of the change.)
every 20.minutes do
  rake 'import:auto'
end
File added
class Crawler
# Builds a crawler that reports errors through the given logger.
#
# logger - an object responding to #error (e.g. a Logger); stored in
#          @mylogger and used by the crawl_*/import_* rescue blocks.
def initialize(logger)
@mylogger = logger
end
def crawl_city
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
get_name = page.search('select#location')
......@@ -26,7 +30,7 @@ class Crawler
end
def crawl_company
for n in 1..10
(1..10).each do |n|
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] }
company_link.each do |link|
......@@ -48,7 +52,7 @@ class Crawler
introduction: introduction_company)
end
rescue StandardError => e
puts e
@mylogger.error "#{e.message}"
end
end
end
......@@ -57,30 +61,25 @@ class Crawler
end
def crawl_job_relationships
for n in 1..10
(1..10).each do |n|
page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
get_link = page_access.css('a.job_link').map { |link| link['href'] }
get_link.each do |link|
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row')
if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') }
arr_column.each_with_index do |val, key |
unless company_table.nil?
next if company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật')
arr_data = val.gsub('Ngày cập nhật ', '').split(' ')
date = arr_data.first
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
if job_check.nil?
job = Job.create!(title: title_job,
level: level,
salary: salary,
......@@ -88,40 +87,29 @@ class Crawler
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end
next if !company_table.nil?
job_table = Job.find_by(title: title_job)
unless job_table.nil?
find_job = Job.find_by(title: title_job, company_id: company_table.id)
puts find_job.title
unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc|
city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil?
puts "Created City: #{job_table.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id)
if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end
end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{job_table.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id)
if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end
end
end
rescue StandardError => e
@mylogger.error "#{e.message}"
end
end
end
......@@ -130,9 +118,6 @@ class Crawler
def get_file_csv
Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp|
files = ftp.list
puts "list files:"
puts files
ftp.getbinaryfile('jobs.zip')
end
end
......@@ -147,24 +132,24 @@ class Crawler
end
end
def import_file_csv
file = "jobs.csv"
def import_file_csv(file)
CSV.foreach(file, headers: true) do |row|
begin
company_name = row["company name"]
company_address = row["company address"]
company_introduction = row[:benefit]
company_introduction = row["benefit"]
company_table = Company.find_by(name: company_name)
if company_table.nil?
company_table = Company.create!(name: company_name,
address: company_address,
introduction: company_introduction)
end
title_job = row[:name]
description_job = row[:description]
level = row[:level]
salary = row[:salary]
unless company_table.nil?
title_job = row["name"]
description_job = "#{row["description"]} #{row["requirement"]}"
level = row["level"]
salary = row["salary"]
job_table = Job.find_by(title: title_job)
if !company_table.nil? && job_table.nil?
job_table = Job.create!(title: title_job,
description: description_job,
level: level,
......@@ -172,7 +157,7 @@ class Crawler
company_id: company_table.id)
puts job_table.id
end
industry = row[:category]
industry = row["category"]
industry_find = Industry.find_by(name: industry)
if industry_find.nil?
industry_table = Industry.create!(name: industry)
......@@ -192,16 +177,8 @@ class Crawler
end
puts "Location: #{location}"
rescue StandardError => e
puts e
end
@mylogger.error "#{e.message}"
end
end
def logger
# config.log_level = :info
Rails.logger = Logger.new(STDOUT)
Rails.logger = Logger.new "#{Rails.root}/log/#{Rails.env}.log"
Rails.logger.level = Logger::DEBUG
Rails.logger.datetime_format = "%Y-%m-%d %H:%M:%S"
end
end
class Crontab
# Builds a Crontab worker that reports errors through the given logger.
#
# logger - an object responding to #error (e.g. a Logger); stored in
#          @mylogger and used by find_company/find_job rescue blocks.
def initialize(logger)
@mylogger = logger
end
def find_company
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html"))
company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] }
......@@ -18,7 +22,7 @@ class Crontab
introduction: introduction_company)
end
rescue StandardError => e
puts e
@mylogger.error "#{e.message}"
end
end
end
......@@ -31,24 +35,18 @@ class Crontab
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row')
if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') }
job_table = Job.find_by(title: title_job)
arr_column.each do |val|
unless company_table.nil?
next if company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật')
arr_data = val.gsub('Ngày cập nhật ', '').split(' ')
date_update = arr_data.first
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
if job_check.nil?
job = Job.create!(title: title_job,
level: level,
salary: salary,
......@@ -56,38 +54,29 @@ class Crontab
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end
end
if !job_table.nil? && !company_table.nil?
find_job = Job.find_by(title: title_job, company_id: company_table.id)
puts find_job.title
unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc|
city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil?
puts "Created City #{city_table.id} => #{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id)
if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end
end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil?
puts "Created Industry #{job_table.id} - #{industry_table.id} => #{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id)
if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end
end
end
rescue StandardError => e
@mylogger.error "#{e.message}"
end
end
end
......
......@@ -3,34 +3,27 @@ require 'src/crontab.rb'
require 'net/ftp'
require 'csv'
require 'zip'
# Rake tasks for importing job data.
#
# Diff residue fixed here: the stale top-level `Crawler.new` / `Crontab.new`
# calls (zero-arg — the constructors now require a logger) and the removed
# csv_get/data_csv/log tasks are dropped; workers are built per-task with the
# shared file logger, matching the added side of the change.
namespace :import do
  # Shared logger for all import tasks; writes to log/my.log.
  logger ||= Logger.new(Rails.root.join('log', 'my.log'))

  desc 'crawler data'
  task crawler: :environment do
    action = Crawler.new(logger)
    action.crawl_city
    action.crawl_industry
    action.crawl_company
    action.crawl_job_relationships
  end

  desc 'Crontab'
  task auto: :environment do
    action = Crawler.new(logger)
    crontab = Crontab.new(logger)
    # Refresh companies and jobs from the live site first...
    crontab.find_company
    crontab.find_job
    # ...then fetch the CSV archive from the FTP server and import it.
    action.get_file_csv
    action.extract_zip('./jobs.zip', 'lib/csv')
    action.import_file_csv('lib/csv')
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment