Commit 3fdc0746 by Hoang Nam Nguyen

Finish crawl and import data

parent 332760aa
# Migration that adds a +benefit+ text column to the +jobs+ table.
class AddBenefitToJob < ActiveRecord::Migration[5.1]
  # Written as a reversible +change+, so rolling back removes the column.
  def change
    change_table(:jobs) do |t|
      t.text :benefit
    end
  end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20170905095755) do
ActiveRecord::Schema.define(version: 20170906075139) do
create_table "cities", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "location"
......@@ -45,6 +45,7 @@ ActiveRecord::Schema.define(version: 20170905095755) do
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.string "experence"
t.text "benefit"
end
end
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -14,26 +14,34 @@ namespace :crawl do
p.each do |detail|
begin
each_page = Nokogiri::HTML(open(detail['href']))
# Get the company name, address and description, and save them to the database
name_company = each_page.css('.tit_company').text
title_address = each_page.css('.TitleDetailNew label[itemprop="address"]').text
detail_company = each_page.xpath("//*[@id='emp_collapse']").text.strip
company = Company.find_or_create_by(company_name: name_company, company_address: title_address,company_descripton: detail_company)
#get industry name and save database
industry_name = each_page.css('.fl_left a[itemprop="industry"]').text
industryname = Industry.find_or_create_by(industry_name: industry_name)
#get locationwork and save database
location_work = each_page.css('p.fl_left b[itemprop="jobLocation"]').text
locationwork = City.find_or_create_by(location: location_work)
# Get the job title, posting date, level, experience, salary, job details and expiry date, and save them to the database
job_title = each_page.css('h1[itemprop="title"]').text
date_post = each_page.css('.datepost').text
level = each_page.css('label[itemprop="occupationalCategory"]').text
experence = each_page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Kinh nghiệm: ']/text()").text.strip
salary = each_page.xpath("//ul[@class='DetailJobNew']//li//p[span/text()='Lương: ']").text.gsub('Lương:', '').strip
detail_work = each_page.css('.MarBot20').text
benefits = each_page.css('.list-benefits').text.delete("\t").split("\n").map(&:strip).reject(&:blank?).join(',')
time_off = each_page.xpath("//ul[@class='DetailJobNew']/li/p[span/text()='Hết hạn nộp: ']/text()").text.strip
Job.create(job_title: job_title, update_job: date_post, level: level, expiry_date: time_off, company_id: company.id, city_id: locationwork.id, industry_id: industryname.id, descripton: detail_work,experence: experence,salary: salary) if Job.find_by(job_title: job_title).blank?
Job.find_or_create_by(job_title: job_title, update_job: date_post, level: level, expiry_date: time_off,
company_id: company.id, city_id: locationwork.id, industry_id: industryname.id,
descripton: detail_work,experence: experence,salary: salary,benefit: benefits)
#show pages on console screen
puts "#{detail['href']} - Page #{i}/#{last_page}"
rescue => e
p e
......
......@@ -3,7 +3,52 @@ require 'net/ftp'
require 'open-uri'
require 'zip'
# Connection settings for the content server that hosts jobs.zip.
# Each value can be overridden through an environment variable of the same
# name so credentials do not have to live in the repository; the previous
# hard-coded values remain the defaults, so existing setups keep working.
# (The unused top-level `filename` local that used to sit here was removed.)
CONTENT_SERVER_DOMAIN_NAME = ENV.fetch("CONTENT_SERVER_DOMAIN_NAME", "192.168.1.156").freeze
CONTENT_SERVER_FTP_LOGIN = ENV.fetch("CONTENT_SERVER_FTP_LOGIN", "training").freeze
CONTENT_SERVER_FTP_PASSWORD = ENV.fetch("CONTENT_SERVER_FTP_PASSWORD", "training").freeze
namespace :import do
  # Downloads jobs.zip from the content server over FTP, unpacks it to
  # tmp/import_data/jobs.csv and imports every CSV row as a Job together with
  # its Industry, City and Company records.
  #
  # Rows that raise while being imported are printed and skipped, so a single
  # bad row does not abort the whole import.
  task :data => :environment do
    import_dir = Rails.root.join('tmp', 'import_data')
    # The download fails if the target directory is missing, so create it first.
    import_dir.mkpath
    data_path = import_dir.join('jobs.zip').to_s
    unzip_path = import_dir.join('jobs.csv').to_s

    # download file
    ftp = Net::FTP.new
    begin
      ftp.connect(CONTENT_SERVER_DOMAIN_NAME, 21)
      ftp.login(CONTENT_SERVER_FTP_LOGIN, CONTENT_SERVER_FTP_PASSWORD)
      ftp.passive = true
      ftp.getbinaryfile("jobs.zip", data_path)
    ensure
      # Always release the control connection, even when the download fails.
      ftp.close
    end

    # unzip file
    # Remove the CSV left over from a previous run; otherwise the stale file
    # would be imported again instead of the freshly downloaded data.
    File.delete(unzip_path) if File.exist?(unzip_path)
    # `require 'zip'` loads rubyzip >= 1.0, where the archive class is Zip::File
    # (Zip::ZipFile is the pre-1.0 name).
    Zip::File.open(data_path) do |zip_file|
      # Every entry used to be extracted to the same path, so only one could
      # ever land there. Prefer the CSV entry and fall back to the first entry.
      entry = zip_file.find { |e| e.name.downcase.end_with?('.csv') } || zip_file.first
      raise "#{data_path} contains no entries" if entry.nil?
      zip_file.extract(entry, unzip_path)
    end

    # After unzip, import the file into the database
    # NOTE(review): no header handling is configured, so a header row (if the
    # file has one) is processed like any other row - confirm against the data.
    CSV.foreach(unzip_path) do |row|
      begin
        # get industry from jobs.csv after unzip
        category = Industry.find_or_create_by(industry_name: row[1])
        # get workplace from jobs.csv, stripping double quotes and square brackets
        workplace = row[16].delete('"[]')
        city = City.find_or_create_by(location: workplace)
        # get company_address and company_name
        company = Company.find_or_create_by(company_address: row[2], company_name: row[5])
        # get details job
        descripton_work = "#{row[7]} + #{row[10]} + #{row[12]}"
        job = Job.find_or_create_by(benefit: row[0], descripton: descripton_work, level: row[8],
                                    job_title: row[9], salary: row[11], industry_id: category.id,
                                    city_id: city.id, company_id: company.id)
        p "[Success] job ##{job.id}"
      rescue => e
        # Best effort: report the failing row's error and continue with the next row.
        p e
      end
    end
  end
end
\ No newline at end of file
SUMMARY OF LESS COMMANDS
Commands marked with * may be preceded by a number, N.
Notes in parentheses indicate the behavior if N is given.
A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K.
h H Display this help.
q :q Q :Q ZZ Exit.
---------------------------------------------------------------------------
MOVING
e ^E j ^N CR * Forward one line (or N lines).
y ^Y k ^K ^P * Backward one line (or N lines).
f ^F ^V SPACE * Forward one window (or N lines).
b ^B ESC-v * Backward one window (or N lines).
z * Forward one window (and set window to N).
w * Backward one window (and set window to N).
ESC-SPACE * Forward one window, but don't stop at end-of-file.
d ^D * Forward one half-window (and set half-window to N).
u ^U * Backward one half-window (and set half-window to N).
ESC-) RightArrow * Left one half screen width (or N positions).
ESC-( LeftArrow * Right one half screen width (or N positions).
F Forward forever; like "tail -f".
ESC-F Like F but stop when search pattern is found.
r ^R ^L Repaint screen.
R Repaint screen, discarding buffered input.
---------------------------------------------------
Default "window" is the screen height.
Default "half-window" is half of the screen height.
---------------------------------------------------------------------------
SEARCHING
/pattern * Search forward for (N-th) matching line.
?pattern * Search backward for (N-th) matching line.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment