Commit 56eb2218 by Thi Thanh Yen Hoang

first commit

parent 201e696b
require 'csv'
require 'open-uri'
require 'set'

require 'nokogiri'
# Crawler for kenh14.vn pages.
#
# Starting from BASE_URL, each queued link is fetched, its canonical URL,
# title and article text are appended as one row to OUTPUT_FILE, and every
# not-yet-seen ".chn" link found on the page is added to the queue. Links are
# processed in first-in first-out order.
class Crawler
  # Site root: seeds the queue and is prefixed to site-relative links.
  BASE_URL = "http://kenh14.vn".freeze
  # CSV file (opened in "wb" mode, so truncated on every start) receiving the rows.
  OUTPUT_FILE = "test.csv".freeze
  # Only links containing this marker are followed.
  PAGE_MARKER = ".chn".freeze
  # Links containing any of these fragments are never followed.
  EXCLUDED_FRAGMENTS = [".php", "/video"].freeze
  # Site-relative links containing this fragment are skipped.
  # NOTE(review): presumably malformed paging links ("//trang-2.chn") - confirm.
  SKIPPED_RELATIVE_FRAGMENT = "//trang-".freeze

  def initialize
    # Links waiting to be fetched.
    @queue = [BASE_URL]
    # Every link that has ever been queued, so nothing is queued twice.
    @crawled = Set.new
    @data = file
    @url = @title = @link = nil
  end

  # Drains the queue: fetches each link, extracts its data and queues the
  # links it contains. A page that cannot be fetched is reported on stderr and
  # skipped instead of aborting the crawl. The CSV file is closed when the
  # crawl ends, whether normally or through an exception, so buffered rows are
  # written out.
  def run
    while (link = next_link)
      p link
      page = fetch(link)
      next unless page

      page_analyze(page, link)
      push_link(page)
      content(page)
    end
  ensure
    @data.close unless @data.closed?
  end

  # Remembers the page title and the link the page was fetched from.
  #
  # @param doc [#css] parsed page
  # @param link [String] URL the page was fetched from
  # @return [String] the text of the page's <title>
  def page_analyze(doc, link)
    @link = link
    @title = doc.css("title").text
  end

  # Records the page's canonical URL and queues every followable link on the
  # page that has not been queued before.
  #
  # @param doc [#css] parsed page
  def push_link(doc)
    @url = doc.css("link[rel=canonical] @href").text
    doc.css("a @href").each do |attr|
      target = resolve(attr.value)
      # Set#add? returns nil when the link was already known.
      next unless target && @crawled.add?(target)

      @queue << target
    end
  end

  # @return [String, nil] the next link to fetch, or nil when the queue is empty
  def next_link
    @queue.shift
  end

  # Appends one row (URL, title, summary + body text) for the page to the CSV.
  # The URL is the canonical one recorded by push_link; when the page declares
  # none, the link recorded by page_analyze is used instead so the column is
  # not left blank.
  #
  # @param doc [#css] parsed page
  def content(doc)
    @content1 = doc.css("h2[class=knd-sapo]").text
    @content2 = doc.css("div[class=knd-content]").text
    @data << [row_url, @title, @content1 + @content2]
  end

  # Opens the output CSV and writes the header row.
  #
  # @return [CSV] the open CSV writer
  def file
    csv = CSV.open(OUTPUT_FILE, "wb")
    csv << ["URL", "TITLE", "CONTENT"]
    csv
  end

  private

  # Downloads and parses one page.
  #
  # URI.open is used instead of Kernel#open: open-uri no longer patches
  # Kernel#open since Ruby 3.0, and Kernel#open would treat a string starting
  # with "|" as a command to run.
  #
  # @return [Nokogiri::HTML::Document, nil] nil when the page could not be fetched
  def fetch(link)
    URI.open(link) { |io| Nokogiri::HTML(io) }
  rescue StandardError => e
    warn "skipping #{link}: #{e.class}: #{e.message}"
    nil
  end

  # Turns an href into the absolute URL to queue, or nil when the link must
  # not be followed.
  #
  # "http://" links are queued unchanged and links starting with "/" are
  # prefixed with BASE_URL. Anything else ("https://", bare relative paths,
  # "javascript:" ...) is dropped, because prefixing BASE_URL to it would
  # produce a malformed URL.
  def resolve(href)
    href = href.to_s.strip
    return unless href.include?(PAGE_MARKER)
    return if EXCLUDED_FRAGMENTS.any? { |fragment| href.include?(fragment) }

    if href.start_with?("http://")
      href
    elsif href.start_with?("/") && !href.include?(SKIPPED_RELATIVE_FRAGMENT)
      BASE_URL + href
    end
  end

  # URL written to the CSV: the canonical URL, or the fetched link when the
  # page declared no canonical URL.
  def row_url
    (@url.nil? || @url.empty?) ? @link : @url
  end
end
# Script entry point: build a crawler and run it.
Crawler.new.run
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment