Commit b465fec5 by Trịnh Hoàng Phúc

Merge branch 'feature/rake_task_crawler' into 'master'

Fix crawler, import csv

See merge request !12
parents da165766 eb95d62c
Pipeline #584 canceled with stages
in 0 seconds
......@@ -25,6 +25,8 @@ gem 'bcrypt', '~> 3.1.7'
gem 'devise'
gem 'activerecord-import'
gem 'settingslogic'
# Use Active Storage variant
# gem 'image_processing', '~> 1.2'
......
......@@ -45,6 +45,8 @@ GEM
activerecord (6.0.2.2)
activemodel (= 6.0.2.2)
activesupport (= 6.0.2.2)
activerecord-import (1.0.4)
activerecord (>= 3.2)
activestorage (6.0.2.2)
actionpack (= 6.0.2.2)
activejob (= 6.0.2.2)
......@@ -269,6 +271,7 @@ PLATFORMS
x86-mswin32
DEPENDENCIES
activerecord-import
bcrypt (~> 3.1.7)
bootsnap (>= 1.4.2)
byebug
......
......@@ -2,7 +2,6 @@ class CreateCities < ActiveRecord::Migration[6.0]
def change
create_table :cities do |t|
t.string :title
t.timestamps
end
end
......
# Migration that adds a boolean `foreign` column to the `cities` table.
# Rows that do not set the column explicitly get the default value false.
class AddForeignToCities < ActiveRecord::Migration[6.0]
  def change
    add_column(:cities, :foreign, :boolean, default: false)
  end
end
......@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_04_08_025325) do
ActiveRecord::Schema.define(version: 2020_04_23_044651) do
create_table "applies", options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
t.bigint "user_id", null: false
......@@ -25,6 +25,7 @@ ActiveRecord::Schema.define(version: 2020_04_08_025325) do
t.string "title"
t.datetime "created_at", precision: 6, null: false
t.datetime "updated_at", precision: 6, null: false
t.boolean "foreign", default: false
end
create_table "cities_jobs", id: false, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", force: :cascade do |t|
......
......@@ -2,7 +2,7 @@ require "nokogiri"
require "open-uri"
namespace :crawler do
desc "TODO"
desc "Crawler Careerbuilder"
task job: :environment do
# Define exception logger
......@@ -12,12 +12,12 @@ namespace :crawler do
skip_url_logger = ActiveSupport::Logger.new("log/skip_url_logger.log")
# Loop page
(1..2).each do |page|
(10..12).each do |page|
# Fetch and parse HTML document
html_jobs = Nokogiri::HTML.parse(open("https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-trang-#{page}-vi.html"))
# Loop item
html_jobs.css(".jobs-side-list .job-item").each_with_index do |item, index|
html_jobs.css(".jobs-side-list .job-item").each do |item|
# Job attributes
job_attributes = {
title: item.css(".figure .figcaption .title a @title").text,
......@@ -32,15 +32,15 @@ namespace :crawler do
# Define cities array
cities = []
item.css(".figure .figcaption .caption .location ul li").each_with_index do |city|
city = check_exist_or_create_city(city.text)
item.css(".figure .figcaption .caption .location ul li").each do |city|
city = check_exist_or_create_city(city.text.strip)
cities << city
end
if item.css(".figure .image a @href").text != "javascript:void(0);"
# Company attributes
html_company_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .image a @href").text)))
if html_company_detail.at_css(".jobsby-company")
unless html_company_detail.at_css(".jobsby-company").nil?
company_attributes = {
title: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content .name").text,
address: html_company_detail.css(".jobsby-company .company-introduction .company-info .info .content p")[1].text,
......@@ -55,7 +55,7 @@ namespace :crawler do
industries = []
html_job_detail = Nokogiri::HTML.parse(open(URI.encode(item.css(".figure .figcaption .title .job_link @href").text)))
if html_job_detail.at_css(".search-result-list-detail")
unless html_job_detail.at_css(".search-result-list-detail").nil?
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .has-background ul li").each do |ele|
type = ele.css("strong").text
......@@ -69,7 +69,7 @@ namespace :crawler do
end
end
html_job_detail.css(".search-result-list-detail .tabs #tab-1 .job-detail-content .detail-box .industry p a").each do |ele|
industry = check_exist_or_create_industry(ele.text.strip.gsub(",",""))
industry = check_exist_or_create_industry(ele.text.gsub(",","").strip)
industries << industry
end
# Get description for job attributes
......@@ -86,37 +86,91 @@ namespace :crawler do
# Create job
job = check_exist_or_create_job(job_attributes)
# Create city_job
if cities.length > 0
if cities.count > 0
cities.each do |city|
job.cities << city
end
end
# Create industry_job
if industries.length > 0
if industries.count > 0
industries.each do |industry|
job.industries << industry
end
end
rescue
exception_logger.info "Error url: #{job_detail_url}"
exception_logger.info "Error url: #{item.css(".figure .figcaption .title .job_link @href").text}"
next
end
end
end
desc "Crawl cities from Careerbuilder"
task city: :environment do
  # Fetch and parse the HTML document.
  # URI.open is open-uri's explicit entry point: opening a URL through
  # Kernel#open is deprecated since Ruby 2.7 and removed in Ruby 3.0.
  html_cities = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))

  # Nothing to import when the location section is absent from the page.
  next if html_cities.at_css(".find-jobsby-categories .main-jobs-by-location").nil?

  # Cities inside the country: the link text carries a "Việc làm tại " prefix,
  # which is removed before the title is stored.
  domestic_cities = html_cities.css(".find-jobsby-categories .main-jobs-by-location .jobs-in-country .list-jobs-by-country li a").map do |link|
    { title: link.text.gsub("Việc làm tại ","").strip, foreign: false }
  end

  # Overseas locations: stored with foreign set to true.
  foreign_cities = html_cities.css(".find-jobsby-categories .main-jobs-by-location .overseas-jobs .list-overseas-jobs li a").map do |link|
    { title: link.text.strip, foreign: true }
  end

  # Bulk-insert every collected row in a single import call.
  cities = domestic_cities + foreign_cities
  City.import cities unless cities.empty?
end
desc "Crawl industries from Careerbuilder"
task industry: :environment do
  # Fetch and parse the HTML document.
  # URI.open is open-uri's explicit entry point: opening a URL through
  # Kernel#open is deprecated since Ruby 2.7 and removed in Ruby 3.0.
  html_industries = Nokogiri::HTML.parse(URI.open("https://careerbuilder.vn/tim-viec-lam.html"))

  # Nothing to import when the working-positions section is absent from the page.
  next if html_industries.at_css(".find-jobsby-categories .list-of-working-positions").nil?

  # One attribute hash per industry link, titled with the stripped link text.
  industries = html_industries.css(".find-jobsby-categories .list-of-working-positions .list-jobs li a").map do |link|
    { title: link.text.strip }
  end

  # Bulk-insert every collected row in a single import call.
  Industry.import industries unless industries.empty?
end
# Looks up a Company by the given attributes, creating it when no such record
# exists, and returns the id of that record.
def check_exist_or_create_company(company_attributes)
  Company.find_or_create_by(company_attributes).id
end
def check_exist_or_create_city(city_title)
find_city = City.find_or_create_by(title: city_title)
return find_city
# Finds the Industry whose title matches +industry_title+, creating it when
# none exists, and returns the record.
#
# @param industry_title [String] industry title to look up
# @return [Industry] the existing or newly created record
def check_exist_or_create_industry(industry_title)
  # Escape LIKE wildcards so a "%" or "_" inside the title is compared
  # literally instead of matching unrelated titles.
  # NOTE(review): relies on backslash being the database's LIKE escape
  # character (the MySQL default) - confirm if the adapter changes.
  pattern = Industry.sanitize_sql_like(industry_title)
  Industry.where("title LIKE ?", pattern).first || Industry.create(title: industry_title)
end
def check_exist_or_create_industry(industry_title)
find_industry = Industry.find_or_create_by(title: industry_title)
return find_industry
# Finds the City whose title matches +city_title+, creating it when none
# exists, and returns the record.
#
# @param city_title [String] city title to look up
# @return [City] the existing or newly created record
def check_exist_or_create_city(city_title)
  # Escape LIKE wildcards so a "%" or "_" inside the title is compared
  # literally instead of matching unrelated titles.
  # NOTE(review): relies on backslash being the database's LIKE escape
  # character (the MySQL default) - confirm if the adapter changes.
  pattern = City.sanitize_sql_like(city_title)
  City.where("title LIKE ?", pattern).first || City.create(title: city_title)
end
def check_exist_or_create_job(job_attributes)
......
require 'csv'
namespace :import do
desc "TODO"
desc "Import CSV"
task csv: :environment do
......@@ -44,14 +44,24 @@ namespace :import do
return find_company.id
end
def check_exist_or_create_city(city_title)
find_city = City.find_or_create_by(title: city_title)
return find_city
# Finds the Industry whose title matches +industry_title+, creating it when
# none exists, and returns the record.
#
# @param industry_title [String] industry title to look up
# @return [Industry] the existing or newly created record
def check_exist_or_create_industry(industry_title)
  # Escape LIKE wildcards so a "%" or "_" inside the title is compared
  # literally instead of matching unrelated titles.
  # NOTE(review): relies on backslash being the database's LIKE escape
  # character (the MySQL default) - confirm if the adapter changes.
  pattern = Industry.sanitize_sql_like(industry_title)
  Industry.where("title LIKE ?", pattern).first || Industry.create(title: industry_title)
end
def check_exist_or_create_industry(industry_title)
find_industry = Industry.find_or_create_by(title: industry_title)
return find_industry
# Finds the City whose title matches +city_title+, creating it when none
# exists, and returns the record.
#
# @param city_title [String] city title to look up
# @return [City] the existing or newly created record
def check_exist_or_create_city(city_title)
  # Escape LIKE wildcards so a "%" or "_" inside the title is compared
  # literally instead of matching unrelated titles.
  # NOTE(review): relies on backslash being the database's LIKE escape
  # character (the MySQL default) - confirm if the adapter changes.
  pattern = City.sanitize_sql_like(city_title)
  City.where("title LIKE ?", pattern).first || City.create(title: city_title)
end
def check_exist_or_create_job(job_attributes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment