Fix code and remove unused CSS file

parent b29fa28d
Pipeline #734 canceled with stages
in 0 seconds
......@@ -44,6 +44,7 @@ end
group :development do
# Access an interactive console on exception pages or by calling 'console' anywhere in the code.
gem 'pry'
gem 'web-console', '>= 3.3.0'
gem 'listen', '>= 3.0.5', '< 3.2'
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
......
......@@ -66,6 +66,7 @@ GEM
archive-zip (~> 0.10)
nokogiri (~> 1.8)
chronic (0.10.2)
coderay (1.1.3)
coffee-rails (4.2.2)
coffee-script (>= 2.2.0)
railties (>= 4.0.0)
......@@ -130,6 +131,9 @@ GEM
parallel (1.19.2)
parser (2.7.1.4)
ast (~> 2.4.1)
pry (0.13.1)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (4.0.5)
puma (3.12.6)
rack (2.2.3)
......@@ -247,6 +251,7 @@ DEPENDENCIES
listen (>= 3.0.5, < 3.2)
mechanize (~> 2.7.6)
mysql2 (~> 0.5.3)
pry
puma (~> 3.11)
rails (~> 5.2.4, >= 5.2.4.3)
rubocop (~> 0.88.0)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# Crontab schedule (whenever gem DSL).
# Inherit the invoking shell's PATH so rake/bundler resolve under cron.
env :PATH, ENV['PATH']

# Every 20 minutes, refresh crawled data and re-import the CSV feed.
# (Diff residue had left both the old `every 5.minutes` header and the new
# `every 20.minutes` header with a single `end`; the 20-minute interval is
# the added side of the change.)
every 20.minutes do
  rake 'import:auto'
end
File added
class Crawler
# Builds a crawler that reports errors through the given logger.
#
# logger - an object responding to #error (e.g. a Logger); stored in
#          @mylogger and used by the crawl_*/import_* rescue blocks.
def initialize(logger)
@mylogger = logger
end
def crawl_city
page = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html"))
get_name = page.search('select#location')
......@@ -26,7 +30,7 @@ class Crawler
end
def crawl_company
for n in 1..10
(1..10).each do |n|
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
company_link = company_info.css('div.caption a.company-name').map{ |link| link['href'] }
company_link.each do |link|
......@@ -48,7 +52,7 @@ class Crawler
introduction: introduction_company)
end
rescue StandardError => e
puts e
@mylogger.error "#{e.message}"
end
end
end
......@@ -57,30 +61,25 @@ class Crawler
end
def crawl_job_relationships
for n in 1..10
(1..10).each do |n|
page_access = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{n}-vi.html"))
get_link = page_access.css('a.job_link').map { |link| link['href'] }
get_link.each do |link|
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row')
if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') }
arr_column.each_with_index do |val, key |
unless company_table.nil?
next if company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật')
arr_data = val.gsub('Ngày cập nhật ', '').split(' ')
date = arr_data.first
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
if job_check.nil?
job = Job.create!(title: title_job,
level: level,
salary: salary,
......@@ -88,40 +87,29 @@ class Crawler
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end
next if !company_table.nil?
job_table = Job.find_by(title: title_job)
unless job_table.nil?
find_job = Job.find_by(title: title_job, company_id: company_table.id)
puts find_job.title
unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc|
city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil?
puts "Created City: #{job_table.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id)
if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end
end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{job_table.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id)
if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end
end
end
rescue StandardError => e
@mylogger.error "#{e.message}"
end
end
end
......@@ -130,9 +118,6 @@ class Crawler
def get_file_csv
Net::FTP.open('192.168.1.156', 'training', 'training') do |ftp|
files = ftp.list
puts "list files:"
puts files
ftp.getbinaryfile('jobs.zip')
end
end
......@@ -147,24 +132,24 @@ class Crawler
end
end
def import_file_csv
file = "jobs.csv"
def import_file_csv(file)
CSV.foreach(file, headers: true) do |row|
begin
company_name = row["company name"]
company_address = row["company address"]
company_introduction = row[:benefit]
company_introduction = row["benefit"]
company_table = Company.find_by(name: company_name)
if company_table.nil?
company_table = Company.create!(name: company_name,
address: company_address,
introduction: company_introduction)
end
title_job = row[:name]
description_job = row[:description]
level = row[:level]
salary = row[:salary]
unless company_table.nil?
title_job = row["name"]
description_job = "#{row["description"]} #{row["requirement"]}"
level = row["level"]
salary = row["salary"]
job_table = Job.find_by(title: title_job)
if !company_table.nil? && job_table.nil?
job_table = Job.create!(title: title_job,
description: description_job,
level: level,
......@@ -172,7 +157,7 @@ class Crawler
company_id: company_table.id)
puts job_table.id
end
industry = row[:category]
industry = row["category"]
industry_find = Industry.find_by(name: industry)
if industry_find.nil?
industry_table = Industry.create!(name: industry)
......@@ -192,16 +177,8 @@ class Crawler
end
puts "Location: #{location}"
rescue StandardError => e
puts e
end
@mylogger.error "#{e.message}"
end
end
def logger
# config.log_level = :info
Rails.logger = Logger.new(STDOUT)
Rails.logger = Logger.new "#{Rails.root}/log/#{Rails.env}.log"
Rails.logger.level = Logger::DEBUG
Rails.logger.datetime_format = "%Y-%m-%d %H:%M:%S"
end
end
class Crontab
# Builds a Crontab worker that reports errors through the given logger.
#
# logger - an object responding to #error (e.g. a Logger); stored in
#          @mylogger and used by find_company/find_job rescue blocks.
def initialize(logger)
@mylogger = logger
end
def find_company
company_info = Nokogiri::HTML(URI.open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-1-vi.html"))
company_link = company_info.css('div.caption a.company-name').map { |link| link['href'] }
......@@ -18,7 +22,7 @@ class Crontab
introduction: introduction_company)
end
rescue StandardError => e
puts e
@mylogger.error "#{e.message}"
end
end
end
......@@ -31,24 +35,18 @@ class Crontab
page_job = Nokogiri::HTML(URI.open(URI.parse(URI.escape(link))))
get_row = page_job.search('div.bg-blue div.row')
if get_row != ""
begin
get_name_company = page_job.search('div.job-desc a.job-company-name').text.strip
company_table = Company.find_by(name: get_name_company)
title_job = page_job.search('div.job-desc p').text
description = page_job.search('div.detail-row')
arr_column = get_row.css('div.has-background').map { |data| data.text.split(' ').join(' ') }
job_table = Job.find_by(title: title_job)
arr_column.each do |val|
unless company_table.nil?
next if company_table.nil?
job_check = Job.find_by(title: title_job, company_id: company_table.id)
if val.include?('Ngày cập nhật')
arr_data = val.gsub('Ngày cập nhật ', '').split(' ')
date_update = arr_data.first
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == true && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Kinh nghiệm ', '*').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
experience = arr_sub[1]
level = arr_sub[2]
expiration_date = arr_sub[3]
salary = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-usd")]]/p').text.strip
experience = get_row.at_xpath('//li[./strong/i[contains(@class, "fa fa-briefcase")]]/p').text.strip
level = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-account")]]/p').text.strip
expiration_date = get_row.at_xpath('//li[./strong/i[contains(@class, "mdi mdi-calendar-check")]]/p').text.strip
if job_check.nil?
job = Job.create!(title: title_job,
level: level,
salary: salary,
......@@ -56,38 +54,29 @@ class Crontab
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
elsif val.include?('Lương') && val.include?('Kinh nghiệm') == false && job_check.nil?
arr_sub = val.gsub('Lương ', '').gsub(' Cấp bậc ', '*').gsub(' Hết hạn nộp ', '*').split('*')
salary = arr_sub[0]
level = arr_sub[1]
expiration_date = arr_sub[2]
job = Job.create!(title: title_job,
level: level,
salary: salary,
experience: 'Không có',
expiration_date: expiration_date,
description: description,
company_id: company_table.id)
end
end
end
if !job_table.nil? && !company_table.nil?
find_job = Job.find_by(title: title_job, company_id: company_table.id)
puts find_job.title
unless find_job.nil?
location_rel = get_row.css('div.map p a').children.map { |location| location.text.strip }
location_rel.each do |loc|
city_table = City.find_by(name: loc)
if CityJob.find_by(job_id: job_table.id, city_id: city_table.id).nil?
puts "Created City #{city_table.id} => #{loc}"
city_jobs = CityJob.create!(job_id: job_table.id, city_id: city_table.id)
if CityJob.find_by(job_id: find_job.id, city_id: city_table.id).nil?
puts "Created City: #{find_job.id} - #{city_table.id}.#{loc}"
city_jobs = CityJob.create!(job_id: find_job.id, city_id: city_table.id)
end
end
industry_rel = get_row.css('li a').children.map { |industry| industry.text.strip }
industry_rel.each do |ind|
industry_table = Industry.find_by(name: ind)
if IndustryJob.find_by(job_id: job_table.id, industry_id: industry_table.id).nil?
puts "Created Industry #{job_table.id} - #{industry_table.id} => #{ind}"
industry_jobs = IndustryJob.create!(job_id: job_table.id, industry_id: industry_table.id)
if IndustryJob.find_by(job_id: find_job.id, industry_id: industry_table.id).nil?
puts "Created Industry: #{find_job.id} - #{industry_table.id}.#{ind}"
industry_jobs = IndustryJob.create!(job_id: find_job.id, industry_id: industry_table.id)
end
end
end
rescue StandardError => e
@mylogger.error "#{e.message}"
end
end
end
......
......@@ -3,34 +3,27 @@ require 'src/crontab.rb'
require 'net/ftp'
require 'csv'
require 'zip'
# Rake tasks for importing job data.
#
# Diff residue fixed here: the stale top-level `Crawler.new` / `Crontab.new`
# calls (zero-arg — the constructors now require a logger) and the removed
# csv_get/data_csv/log tasks are dropped; workers are built per-task with the
# shared file logger, matching the added side of the change.
namespace :import do
  # Shared logger for all import tasks; writes to log/my.log.
  logger ||= Logger.new(Rails.root.join('log', 'my.log'))

  desc 'crawler data'
  task crawler: :environment do
    action = Crawler.new(logger)
    action.crawl_city
    action.crawl_industry
    action.crawl_company
    action.crawl_job_relationships
  end

  desc 'Crontab'
  task auto: :environment do
    action = Crawler.new(logger)
    crontab = Crontab.new(logger)
    # Refresh companies and jobs from the live site first...
    crontab.find_company
    crontab.find_job
    # ...then fetch the CSV archive from the FTP server and import it.
    action.get_file_csv
    action.extract_zip('./jobs.zip', 'lib/csv')
    action.import_file_csv('lib/csv')
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment