Skip to content

Instantly share code, notes, and snippets.

@smucode
Forked from jimweirich/html_parser.rb
Created February 9, 2012 12:55
Show Gist options
  • Select an option

  • Save smucode/1779815 to your computer and use it in GitHub Desktop.

Select an option

Save smucode/1779815 to your computer and use it in GitHub Desktop.
Vital Ruby Advance Lab 2
require("nokogiri")
require("uri")
require("./url_fetcher")
# Collects the link targets of an HTML page as absolute URL strings.
class HtmlParser
  # Fetches the page at +url+ and resolves the href of every <a> element
  # against that same URL.
  #
  # @param url [String] address of the page; also the base for relative links
  # @return [Array<String>] resolved link targets; anchors whose href is
  #   missing or cannot be parsed are left out
  def parse(url)
    html = UrlFetcher.new.fetch(url)
    # No document was fetched, so there are no anchors to scan.
    return [] if html.nil?

    Nokogiri::HTML(html).css('a').map { |anchor| parse_url(url, get_href(anchor)) }.compact
  end

  # Looks up the "href" attribute of an element.
  #
  # @param e [#attribute] element to inspect
  # @return [Object, nil] whatever +e.attribute("href")+ returns, or nil when
  #   +e+ does not respond to +attribute+
  def get_href(e)
    e.attribute("href") if e.respond_to?(:attribute)
  end

  # Resolves +url+ against +base+.
  #
  # @param base [String] absolute URL used as the reference point
  # @param url [#to_s, nil] link target, relative or absolute
  # @return [String, nil] the merged URL, or nil when +url+ is nil or either
  #   argument cannot be handled by URI
  def parse_url(base, url)
    return nil if url.nil?

    # Surrounding whitespace in an href would otherwise make URI.parse reject
    # the value, so it is trimmed first.
    URI.parse(base).merge(URI.parse(url.to_s.strip)).to_s
  rescue URI::Error
    # Covers unparseable input and merges onto a relative base.
    nil
  end
end
require("./url_fetcher");
require("./html_parser");
# Specs for UrlFetcher#fetch.
# NOTE(review): the URLs point at remote hosts, so these examples presumably
# need network access — confirm before running in CI.
describe UrlFetcher do
  let(:fetcher) { UrlFetcher.new }

  it "fetches urls" do
    body = fetcher.fetch("ftp://ftp.powertech.no/test.txt")
    body.should eq("test.txt\n")
  end

  it "returns nil on failures" do
    body = fetcher.fetch("http://foo.bar/")
    body.should eq(nil)
  end
end
# Specs for HtmlParser#parse.
# NOTE(review): expectations depend on the content served at the listed
# URLs at run time — presumably network access is required; confirm.
describe HtmlParser do
  let(:parser) { HtmlParser.new }

  it "parses html and returns empty array when no links exists" do
    links = parser.parse("ftp://ftp.powertech.no/test.txt")
    links.size.should eq(0)
  end

  it "parses html and returns array when links exists" do
    links = parser.parse("http://ulv.no/")
    links.size.should eq(3)
    links.first.should eq("http://www.sau.no/")
  end

  it "should not return empty hrefs" do
    links = parser.parse("http://vg.no/")
    links.each { |link| link.should_not eq(nil) }
  end

  it "should only return absolute paths" do
    links = parser.parse("http://vg.no/")
    links.each { |link| link.should include("http") }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment