Commit 454004db by Tan Phat Nguyen

Merge branch 'hungpt_crawler_data' into 'master'

Feature crawler data

See merge request !1
parents 0ee72cca 478d6a95
......@@ -28,7 +28,8 @@ gem 'jbuilder', '~> 2.5'
# Use Redis adapter to run Action Cable in production
# gem 'redis', '~> 3.0'
# Use ActiveModel has_secure_password
# gem 'bcrypt', '~> 3.1.7'
gem 'bcrypt', '~> 3.1.7'
gem 'whenever'
# Use Capistrano for deployment
# gem 'capistrano-rails', group: :development
......@@ -48,6 +49,8 @@ group :development do
# Spring speeds up development by keeping your application running in the background. Read more: https://github.com/rails/spring
gem 'spring'
gem 'spring-watcher-listen', '~> 2.0.0'
gem 'better_errors'
gem 'binding_of_caller'
end
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
......
......@@ -41,7 +41,14 @@ GEM
addressable (2.5.1)
public_suffix (~> 2.0, >= 2.0.2)
arel (8.0.0)
bcrypt (3.1.11)
better_errors (2.1.1)
coderay (>= 1.0.0)
erubis (>= 2.6.6)
rack (>= 0.9.0)
bindex (0.5.0)
binding_of_caller (0.7.2)
debug_inspector (>= 0.0.1)
builder (3.2.3)
byebug (9.0.6)
capybara (2.14.3)
......@@ -53,6 +60,8 @@ GEM
xpath (~> 2.0)
childprocess (0.7.0)
ffi (~> 1.0, >= 1.0.11)
chronic (0.10.2)
coderay (1.1.1)
coffee-rails (4.2.2)
coffee-script (>= 2.2.0)
railties (>= 4.0.0)
......@@ -61,7 +70,9 @@ GEM
execjs
coffee-script-source (1.12.2)
concurrent-ruby (1.0.5)
debug_inspector (0.0.3)
erubi (1.6.0)
erubis (2.7.0)
execjs (2.7.0)
ffi (1.9.18)
globalid (0.4.0)
......@@ -163,6 +174,8 @@ GEM
websocket-driver (0.6.5)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.2)
whenever (0.9.7)
chronic (>= 0.6.3)
xpath (2.1.0)
nokogiri (~> 1.3)
......@@ -170,6 +183,9 @@ PLATFORMS
ruby
DEPENDENCIES
bcrypt (~> 3.1.7)
better_errors
binding_of_caller
byebug
capybara (~> 2.13)
coffee-rails (~> 4.2)
......@@ -186,6 +202,7 @@ DEPENDENCIES
tzinfo-data
uglifier (>= 1.3.0)
web-console (>= 3.3.0)
whenever
BUNDLED WITH
1.15.1
# A job application: ties one user to one job.
# The backing `applies` table also carries applied_at, ip_address and
# user_agent columns.
class Apply < ApplicationRecord
  belongs_to :user
  belongs_to :job
end
# A geographic area that groups cities; identified by a mandatory name.
class Area < ApplicationRecord
  # NOTE(review): Rails convention names collection associations in the
  # plural (`has_many :cities`); the accessor is currently `area.city`.
  has_many :city

  validates :name, presence: true
end
# A job category; linked to jobs through JobCategory rows.
class Category < ApplicationRecord
  # NOTE(review): Rails convention would name this `has_many :job_categories`;
  # the accessor is currently `category.job_category`.
  has_many :job_category

  validates :name, presence: true
end
# A city belonging to an area; jobs reference it as their location.
class City < ApplicationRecord
  belongs_to :area
  # NOTE(review): Rails convention would name this `has_many :jobs`;
  # the accessor is currently `city.job`.
  has_many :job

  validates :name, presence: true
end
# An employer that publishes jobs.
class Company < ApplicationRecord
  # NOTE(review): Rails convention would name this `has_many :jobs`.
  has_many :job

  validates :name, presence: true
  # The three free-text location fields share the same 200-character cap.
  validates :address, :district, :province, length: { maximum: 200 }
end
# A contact person (name, email, phone columns) that jobs can reference.
class Contact < ApplicationRecord
  # NOTE(review): Rails convention would name this `has_many :jobs`.
  has_many :job
end
# Marks a job as a favorite of a user.
class Favorite < ApplicationRecord
  belongs_to :user
  belongs_to :job
end
# A history entry that pairs a user with a job.
class History < ApplicationRecord
  belongs_to :user
  belongs_to :job
end
# A job posting. Only the name is mandatory; every parent record is optional.
class Job < ApplicationRecord
  # Each of these parents may be absent, so all are declared optional.
  %i[city company job_type contact].each do |parent|
    belongs_to parent, optional: true
  end

  # NOTE(review): Rails convention would name this `has_many :job_categories`.
  has_many :job_category

  validates :name, presence: true, length: { maximum: 200 }
end
# Join model connecting a job to one of its categories.
class JobCategory < ApplicationRecord
  belongs_to :job
  belongs_to :category
end
# A type of job (single `name` column) that jobs can reference.
class JobType < ApplicationRecord
  # NOTE(review): Rails convention would name this `has_many :jobs`.
  has_many :job
end
# A registered user account.
class User < ApplicationRecord
  # Accepts word characters, "+", "-" and "." before the "@", then host
  # labels and a purely alphabetic last label. \A and \z anchor the whole
  # string; the match is case-insensitive.
  VALID_EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-.]+\.[a-z]+\z/i

  # The users table stores the display name in `fullname`; it has no `name`
  # column, so validating `:name` raised NoMethodError on every validation.
  validates :fullname, presence: true, length: { maximum: 50 }
  validates :email, presence: true, length: { maximum: 255 }, format: { with: VALID_EMAIL_REGEX }
end
......@@ -14,5 +14,7 @@ module VeNJOB
# Settings in config/environments/* take precedence over those specified here.
# Application configuration should go into files in config/initializers
# -- all .rb files in that directory are automatically loaded.
config.autoload_paths += %W[#{config.root}/lib]
config.eager_load_paths += %W[#{config.root}/lib]
end
end
......@@ -14,7 +14,7 @@ default: &default
encoding: utf8
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
username: root
password:
password: 123456
socket: /var/run/mysqld/mysqld.sock
development:
......
# Use this file to easily define all of your cron jobs.
#
# It's helpful, but not entirely necessary to understand cron before proceeding.
# http://en.wikipedia.org/wiki/Cron
# Example:
#
# set :output, "/path/to/my/cron_log.log"
#
# every 2.hours do
# command "/usr/bin/some_great_command"
# runner "MyModel.some_method"
# rake "some:great:rake:task"
# end
#
# every 4.days do
# runner "AnotherModel.prune_old_records"
# end
# Learn more: http://github.com/javan/whenever
# Copy the PATH of the process generating the crontab into the cron jobs.
env :PATH, ENV['PATH']

# Environment the scheduled rake task runs in.
set :environment, 'development'

# Separate log files for the jobs' standard output and error output.
set :output, error: 'log/cron_error_log.log', standard: 'log/cron_log.log'

# Run the crawler once a day at noon.
every(1.day, at: '12:00 pm') { rake 'crawler:load' }
# Creates the `areas` table: an indexed name plus timestamps.
class CreateAreas < ActiveRecord::Migration[5.1]
  def change
    create_table(:areas) do |table|
      table.string :name, index: true
      table.timestamps
    end
  end
end
# Creates the `categories` table: an indexed name plus timestamps.
class CreateCategories < ActiveRecord::Migration[5.1]
  def change
    create_table :categories do |t|
      # The option was misspelled `idnex`, so no index was ever created,
      # although the crawler looks categories up by name.
      # NOTE(review): db/schema.rb must be regenerated after this change
      # (re-run the migration) so the dump includes the new index.
      t.string :name, index: true
      t.timestamps
    end
  end
end
# Creates the `job_types` table: a name plus timestamps.
class CreateJobTypes < ActiveRecord::Migration[5.1]
  def change
    create_table(:job_types) do |table|
      table.string :name
      table.timestamps
    end
  end
end
# Creates the `companies` table.
class CreateCompanies < ActiveRecord::Migration[5.1]
  def change
    create_table(:companies) do |table|
      # Columns are declared in the same order as before; adjacent columns
      # of the same type are grouped into one call.
      table.string :name, :address
      table.text :description
      table.string :district, :province
      table.timestamps
    end
  end
end
# Creates the `contacts` table: name, email and phone strings plus timestamps.
class CreateContacts < ActiveRecord::Migration[5.1]
  def change
    create_table(:contacts) do |table|
      table.string :name, :email, :phone
      table.timestamps
    end
  end
end
# Creates the `users` table.
class CreateUsers < ActiveRecord::Migration[5.1]
  def change
    create_table(:users) do |table|
      # Column order is unchanged; adjacent string columns share one call.
      table.string :email, :password, :fullname, :reset_digest
      table.datetime :reset_sent_at
      table.string :activation_digest
      table.boolean :activated
      table.datetime :activated_at
      table.boolean :admin
      table.string :cv_name
      table.timestamps
    end
  end
end
# Creates the `cities` table: an indexed name, an indexed reference to an
# area, and timestamps.
class CreateCities < ActiveRecord::Migration[5.1]
  def change
    create_table(:cities) do |table|
      table.string :name, index: true
      table.references :area, index: true
      table.timestamps
    end
  end
end
# Creates the `jobs` table with indexed references to city, company,
# job type and contact.
class CreateJobs < ActiveRecord::Migration[5.1]
  def change
    create_table(:jobs) do |table|
      table.string :name, index: true
      table.text :description
      table.references :city, index: true
      table.string :salary
      table.references :company, index: true
      table.text :benefit
      table.string :level
      table.text :requirement
      # Two adjacent references declared in one call; order is unchanged.
      table.references :job_type, :contact, index: true
      table.datetime :expiry_date
      table.string :experience
      table.integer :status
      table.timestamps
    end
  end
end
# Creates the `job_categories` join table between jobs and categories.
class CreateJobCategories < ActiveRecord::Migration[5.1]
  def change
    create_table(:job_categories) do |table|
      table.references :job, :category, index: true
      table.timestamps
    end
  end
end
# Creates the `favorites` table linking users to jobs.
class CreateFavorites < ActiveRecord::Migration[5.1]
  def change
    create_table(:favorites) do |table|
      table.references :user, :job, index: true
      table.timestamps
    end
  end
end
# Creates the `histories` table linking users to jobs.
class CreateHistories < ActiveRecord::Migration[5.1]
  def change
    create_table(:histories) do |table|
      table.references :user, :job, index: true
      table.timestamps
    end
  end
end
# Creates the `applies` table: a user/job pair plus the time of the
# application and the requester's IP address and user agent.
class CreateApplies < ActiveRecord::Migration[5.1]
  def change
    create_table(:applies) do |table|
      table.references :user, :job, index: true
      table.datetime :applied_at
      table.string :ip_address, :user_agent
      table.timestamps
    end
  end
end
# This file is auto-generated from the current state of the database. Instead
# of editing this file, please use the migrations feature of Active Record to
# incrementally modify your database, and then regenerate this schema definition.
#
# Note that this schema.rb definition is the authoritative source for your
# database schema. If you need to create the application database on another
# system, you should be using db:schema:load, not running all the migrations
# from scratch. The latter is a flawed and unsustainable approach (the more migrations
# you'll amass, the slower it'll run and the greater likelihood for issues).
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20170628020034) do
create_table "applies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "user_id"
t.bigint "job_id"
t.datetime "applied_at"
t.string "ip_address"
t.string "user_agent"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["job_id"], name: "index_applies_on_job_id"
t.index ["user_id"], name: "index_applies_on_user_id"
end
create_table "areas", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["name"], name: "index_areas_on_name"
end
create_table "categories", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
create_table "cities", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.bigint "area_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["area_id"], name: "index_cities_on_area_id"
t.index ["name"], name: "index_cities_on_name"
end
create_table "companies", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.string "address"
t.text "description"
t.string "district"
t.string "province"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
create_table "contacts", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.string "email"
t.string "phone"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
create_table "favorites", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "user_id"
t.bigint "job_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["job_id"], name: "index_favorites_on_job_id"
t.index ["user_id"], name: "index_favorites_on_user_id"
end
create_table "histories", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "user_id"
t.bigint "job_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["job_id"], name: "index_histories_on_job_id"
t.index ["user_id"], name: "index_histories_on_user_id"
end
create_table "job_categories", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.bigint "job_id"
t.bigint "category_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["category_id"], name: "index_job_categories_on_category_id"
t.index ["job_id"], name: "index_job_categories_on_job_id"
end
create_table "job_types", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
create_table "jobs", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "name"
t.text "description"
t.bigint "city_id"
t.string "salary"
t.bigint "company_id"
t.text "benefit"
t.string "level"
t.text "requirement"
t.bigint "job_type_id"
t.bigint "contact_id"
t.datetime "expiry_date"
t.string "experience"
t.integer "status"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["city_id"], name: "index_jobs_on_city_id"
t.index ["company_id"], name: "index_jobs_on_company_id"
t.index ["contact_id"], name: "index_jobs_on_contact_id"
t.index ["job_type_id"], name: "index_jobs_on_job_type_id"
t.index ["name"], name: "index_jobs_on_name"
end
create_table "users", force: :cascade, options: "ENGINE=InnoDB DEFAULT CHARSET=utf8" do |t|
t.string "email"
t.string "password"
t.string "fullname"
t.string "reset_digest"
t.datetime "reset_sent_at"
t.string "activation_digest"
t.boolean "activated"
t.datetime "activated_at"
t.boolean "admin"
t.string "cv_name"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
end
......@@ -5,3 +5,5 @@
#
# movies = Movie.create([{ name: 'Star Wars' }, { name: 'Lord of the Rings' }])
# Character.create(name: 'Luke', movie: movies.first)
# Areas the crawler's city import looks up by name.
# find_or_create_by! keeps `rails db:seed` idempotent (re-running it no longer
# inserts duplicate rows) and raises instead of silently ignoring a failed save.
['Viet Nam', 'International'].each do |area_name|
  Area.find_or_create_by!(name: area_name)
end
require 'thread'
require 'open-uri'
require 'nokogiri'
require 'logger'
# Crawls job postings from careerbuilder.vn and stores them through the
# Company, Job, Category, JobCategory, City and Area models.
#
# A crawl first imports the category and city lists from the home page, then
# walks the paginated job listing with a pool of worker threads that share a
# queue of `{ url:, handler: }` entries. `handler` names the instance method
# (`:top` or `:detail`) that processes the fetched page.
class Crawler::Careerbuilder
  # Listing page the crawl starts from.
  START_URL = 'http://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html'.freeze
  # Default value of the page cap checked in #next_link.
  DEFAULT_MAX_PAGES = 500
  # XPath of one labelled row of the job detail list; `label` is the caption
  # text of the row (including its trailing space), `tail` the node below it.
  DETAIL_FIELD_XPATH = "//ul[@class='DetailJobNew']/li/p[span/text()='%<label>s']/%<tail>s".freeze

  attr_reader :domain, :thread_count, :logger

  # @param domain [String] URL of the home page used for category/city import
  # @param thread_count [Integer] number of worker threads
  # @param max_pages [Integer] workers stop taking links once more than this
  #   many URLs have been dequeued
  def initialize(domain, thread_count = 1, max_pages: DEFAULT_MAX_PAGES)
    # A single seed entry: queueing one copy per thread made every worker
    # crawl the first listing page again.
    @links = [{ url: START_URL, handler: :top }]
    @seen = { START_URL => true } # every URL that was ever queued
    @crawled = []                 # URLs handed out to workers
    @in_flight = 0                # links taken by a worker and not yet finished
    @mutex = Mutex.new
    @work_available = ConditionVariable.new
    @domain = domain
    @thread_count = thread_count
    @max_pages = max_pages
    @logger = Logger.new("#{Rails.root}/log/careerbuilder_crawler.log")
  end

  # Runs a complete crawl and blocks until every worker thread has finished.
  # An error while fetching the home page propagates to the caller.
  def crawl
    logger.info('Start crawl')
    home = fetch_document(domain)
    import_category(home)
    import_city(home)
    workers = Array.new(thread_count) { Thread.new { work_loop } }
    workers.each(&:join)
    logger.info('Crawl finished')
  end

  # Hands out the next queued link and records it as crawled.
  #
  # While the queue is empty but another worker is still processing a page
  # (and may therefore add links), the call waits instead of returning nil;
  # previously idle workers exited as soon as the queue was momentarily empty.
  #
  # @return [Hash, nil] `{ url:, handler: }`, or nil when the page cap is
  #   exceeded or no work is left
  def next_link
    @mutex.synchronize do
      loop do
        return nil if @crawled.count > @max_pages

        link = @links.shift
        if link
          @crawled.push(link[:url])
          @in_flight += 1
          return link
        end
        return nil if @in_flight.zero?

        @work_available.wait(@mutex)
      end
    end
  end

  # Appends a link to the end of the queue unless the URL was queued before.
  def push_link(link, handler)
    enqueue(link, handler) { |entry| @links.push(entry) }
  end

  # Puts a link at the FRONT of the queue (despite the name, this is an
  # `unshift`) unless the URL was queued before.
  def shift_link(link, handler)
    enqueue(link, handler) { |entry| @links.unshift(entry) }
  end

  # Handler for a listing page: queues every job detail link, then the next
  # listing page if the pagination offers one.
  def top(doc, _link)
    doc.xpath("//div[@class='gird_standard ']/dl/dd/span/h3[@class='job']/a/@href").each do |href|
      shift_link(href.value, :detail)
    end
    # Take the first match only: `to_s` on the whole node set concatenates
    # all matches and is truthy even when nothing matched.
    next_page = doc.xpath("//div[@class='paginationTwoStatus']/a[@class='right']/@href").first
    push_link(next_page.value, :top) if next_page
  end

  # Handler for a job detail page: stores the company, the job and the
  # job's category links.
  def detail(doc, _link)
    company = import_company(doc)
    job = import_job(doc, company)
    link_categories(doc, job) if job
  end

  # Creates a Category for every option of the industry select on the home
  # page, skipping the first option. Errors are logged, not raised.
  def import_category(doc)
    options = doc.xpath("//div[@class='s-home2']/div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_industry']/select/option")
    options.drop(1).each do |option|
      Category.find_or_create_by(name: option.text.strip)
    end
  rescue StandardError => e
    # The old message interpolated a call to this method without arguments,
    # which raised ArgumentError from inside the rescue.
    log_exception('[method: import_category]', e)
  end

  # Creates a City for every option of the location select on the home page,
  # skipping the first option. Cities are attached to the 'Viet Nam' area
  # until the option 'Angola' is reached; from there on they are attached to
  # 'International'. Errors are logged, not raised.
  def import_city(doc)
    options = doc.xpath("//div[@class='s-home2']//div[@id='NewSearchJob3']/form/div[@class='search-horizontal']/div[@class='ui-widget box_multiSelect_location']/select/option").drop(1)
    area = Area.find_by(name: 'Viet Nam')
    options.each do |option|
      city_name = option.text.strip
      area = Area.find_by(name: 'International') if city_name == 'Angola'
      City.find_or_create_by(name: city_name, area: area)
    end
  rescue StandardError => e
    log_exception('[method: import_city]', e)
  end

  private

  # Body of one worker thread: processes links until #next_link returns nil.
  def work_loop
    # Return the thread's database connection to the pool when it is done.
    ActiveRecord::Base.connection_pool.with_connection do
      while (link = next_link)
        process(link)
      end
    end
    logger.info('Crawler thread finished')
  rescue StandardError => e
    # Keep a failing worker from re-raising in the `join` of #crawl.
    log_exception('[worker aborted]', e)
  end

  # Fetches one queued page and dispatches it to its handler. Failures are
  # logged so a single bad page does not stop the worker.
  def process(link)
    url = link[:url]
    page = fetch_document(url)
    page.encoding = 'utf-8'
    send(link[:handler], page, url)
  rescue StandardError => e
    log_exception("[URL: #{url}]", e)
  ensure
    finish_link
  end

  # Marks one in-flight link as done and wakes waiting workers so they can
  # re-check whether work or the end of the crawl has arrived.
  def finish_link
    @mutex.synchronize do
      @in_flight -= 1
      @work_available.broadcast
    end
  end

  # Shared implementation of #push_link / #shift_link. URLs are normalised
  # to stripped Strings: Nokogiri attribute nodes do not compare equal by
  # content, so using them as keys defeated the duplicate check.
  def enqueue(link, handler)
    url = link.to_s.strip
    return if url.empty?

    @mutex.synchronize do
      unless @seen.key?(url)
        @seen[url] = true
        yield({ url: url, handler: handler })
        @work_available.signal
      end
    end
  end

  # Downloads and parses an HTML page. Going through URI#open instead of
  # Kernel#open keeps a scraped string such as "|command" from being executed,
  # and the block form closes the response IO.
  def fetch_document(url)
    URI.parse(url.to_s).open { |io| Nokogiri::HTML(io) }
  end

  # Stripped text content of everything matching +xpath+ ('' if no match).
  def text_at(doc, xpath)
    doc.xpath(xpath).text.strip
  end

  # XPath for a node below the labelled row of the job detail list.
  def detail_field(label, tail)
    format(DETAIL_FIELD_XPATH, label: label, tail: tail)
  end

  # Finds or creates the company shown on a detail page and refreshes its
  # address and description.
  # @return [Company, nil] nil when the page has no company name or the
  #   company could not be stored
  def import_company(doc)
    name = text_at(doc, "//div[@class='tit_company']")
    return nil if name.empty?

    company = Company.find_or_create_by(name: name)
    company.address = text_at(doc, "//div[@class='box1Detail']/p[@class='TitleDetailNew']/label[@itemprop='address']/label[@itemprop='addressLocality']")
    company.description = text_at(doc, "//div[@class='desc_company content_fck']")
    log_invalid(company) unless company.save
    company.persisted? ? company : nil
  end

  # Finds or creates the job shown on a detail page and refreshes its fields.
  # @return [Job, nil] nil when the page has no job name or the job could
  #   not be stored
  def import_job(doc, company)
    name = text_at(doc, "//div[@class='LeftJobCB']/div[@class='top-job']/div[@class='top-job-info']/h1")
    if name.empty?
      logger.warn('Detail page without a job name; skipped')
      return nil
    end

    city = City.find_by(name: text_at(doc, detail_field('Nơi làm việc: ', "b[@itemprop='jobLocation']")))
    job = Job.find_or_create_by(name: name, city: city, company: company)
    job.description = text_at(doc, "//div[@class='MarBot20']")
    base_salary = text_at(doc, detail_field('Lương: ', "label[@itemprop='baseSalary']"))
    currency = text_at(doc, detail_field('Lương: ', "label[@itemprop='salaryCurrency']"))
    job.salary = "#{base_salary} #{currency}"
    job.level = text_at(doc, detail_field('Cấp bậc: ', "label[@itemprop='occupationalCategory']"))
    # Previously the raw node set (not its text) was assigned here.
    job.experience = text_at(doc, detail_field('Kinh nghiệm: ', 'text()'))
    job.status = 0
    job.expiry_date = parse_expiry_date(text_at(doc, detail_field('Hết hạn nộp: ', 'text()')))
    log_invalid(job) unless job.save
    job.persisted? ? job : nil
  end

  # Links the job to each category named on the page. Names are collected per
  # anchor (and split on commas) and stripped, so several anchors no longer
  # run together into one string; unknown names are logged and skipped
  # instead of being passed on as nil.
  def link_categories(doc, job)
    anchors = doc.xpath(detail_field('Ngành nghề: ', "b/a[@itemprop='industry']"))
    names = anchors.flat_map { |anchor| anchor.text.split(',') }.map(&:strip).reject(&:empty?).uniq
    names.each do |category_name|
      category = Category.find_by(name: category_name)
      if category
        JobCategory.find_or_create_by(job: job, category: category)
      else
        logger.warn("Unknown category #{category_name.inspect} for job #{job.id}")
      end
    end
  end

  # Parses the expiry date text; blank or unparseable text yields nil so a
  # malformed date no longer aborts the import of the whole job.
  def parse_expiry_date(raw)
    value = raw.to_s.strip
    return nil if value.empty?

    value.to_datetime
  rescue ArgumentError
    logger.warn("Unparseable expiry date #{value.inspect}")
    nil
  end

  # Logs the validation errors of a record whose save failed.
  def log_invalid(record)
    logger.error("Could not save #{record.class.name}: #{record.errors.full_messages.join(', ')}")
  end

  # Logs an exception with a context prefix and its backtrace.
  def log_exception(context, error)
    logger.error("#{context} #{error.class}: #{error.message}")
    logger.error(Array(error.backtrace).join("\n"))
  end
end
namespace :crawler do
  desc 'client crawler'
  # Runs the careerbuilder.vn crawler. The number of worker threads is read
  # from the THREAD_COUNT environment variable and defaults to 1.
  task load: :environment do
    # An empty or non-numeric THREAD_COUNT converts to 0, which started no
    # worker at all and made the crawl a silent no-op; use at least one.
    thread_count = [ENV.fetch('THREAD_COUNT', '1').to_i, 1].max
    Crawler::Careerbuilder.new('http://careerbuilder.vn', thread_count).crawl
  end
end
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
# The key was misspelled `user_agrent`; the applies table has no such column
# (it is `user_agent`), which makes loading this fixture fail.
one:
  user: one
  job: one
  applied_at: 2017-06-28 09:00:34
  ip_address: MyString
  user_agent: MyString
two:
  user: two
  job: two
  applied_at: 2017-06-28 09:00:34
  ip_address: MyString
  user_agent: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
two:
name: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
two:
name: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
area: one
two:
name: MyString
area: two
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
address: MyString
description: MyString
district: MyString
province: MyString
two:
name: MyString
address: MyString
description: MyString
district: MyString
province: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
email: MyString
phone: MyString
two:
name: MyString
email: MyString
phone: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
user: one
job: one
two:
user: two
job: two
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
user: one
job: one
two:
user: two
job: two
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
job: one
category: one
two:
job: two
category: two
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
two:
name: MyString
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
name: MyString
description: MyString
city: one
salary: MyString
company: one
benefit: MyString
level: MyString
requirement: MyString
job_type: one
contact: one
expiry_date: 2017-06-28 08:53:37
experience: MyString
status: 1
two:
name: MyString
description: MyString
city: two
salary: MyString
company: two
benefit: MyString
level: MyString
requirement: MyString
job_type: two
contact: two
expiry_date: 2017-06-28 08:53:37
experience: MyString
status: 1
# Read about fixtures at http://api.rubyonrails.org/classes/ActiveRecord/FixtureSet.html
one:
email: MyString
password: MyString
fullname: MyString
reset_digest: MyString
reset_sent_at: 2017-06-28 08:47:36
activation_digest: MyString
activated: false
activated_at: 2017-06-28 08:47:36
admin: false
cv_name: MyString
two:
email: MyString
password: MyString
fullname: MyString
reset_digest: MyString
reset_sent_at: 2017-06-28 08:47:36
activation_digest: MyString
activated: false
activated_at: 2017-06-28 08:47:36
admin: false
cv_name: MyString
require 'test_helper'
# Test case for the Apply model. It defines no tests yet; the commented
# lines below are a sample test.
class ApplyTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Area model. It defines no tests yet; the commented
# lines below are a sample test.
class AreaTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Category model. It defines no tests yet; the commented
# lines below are a sample test.
class CategoryTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the City model. It defines no tests yet; the commented
# lines below are a sample test.
class CityTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Company model. It defines no tests yet; the commented
# lines below are a sample test.
class CompanyTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Contact model. It defines no tests yet; the commented
# lines below are a sample test.
class ContactTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Favorite model. It defines no tests yet; the commented
# lines below are a sample test.
class FavoriteTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the History model. It defines no tests yet; the commented
# lines below are a sample test.
class HistoryTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the JobCategory model. It defines no tests yet; the commented
# lines below are a sample test.
class JobCategoryTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the Job model. It defines no tests yet; the commented
# lines below are a sample test.
class JobTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the JobType model. It defines no tests yet; the commented
# lines below are a sample test.
class JobTypeTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
require 'test_helper'
# Test case for the User model. It defines no tests yet; the commented
# lines below are a sample test.
class UserTest < ActiveSupport::TestCase
# test "the truth" do
# assert true
# end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment