Skip to content

Instantly share code, notes, and snippets.

@mostlyfine
Forked from tdtds/nikkei-scraper.rb
Created January 14, 2011 02:01
Show Gist options
  • Select an option

  • Save mostlyfine/779025 to your computer and use it in GitHub Desktop.

Select an option

Save mostlyfine/779025 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# -*- coding: utf-8; -*-
#
# scraping nikkei.com for Kindle
#
require 'cgi'
require 'open-uri'
require 'nokogiri'
# Base URL of the site; article links found on the front page are appended to it.
# Frozen so the shared constant cannot be mutated in place.
TOP = 'http://www.nikkei.com'.freeze
class String
  # Returns a copy of the string in which every FULLWIDTH TILDE (U+FF5E)
  # has been replaced by WAVE DASH (U+301C) -- a workaround for the
  # "wave dash problem". The receiver itself is left untouched.
  def canonical
    gsub( /\u{FF5E}/, "\u{301C}" )
  end
end
# Builds the opening part of a generated HTML page: doctype, <head> (with
# a link to nikkei.css) and an <h1> heading. <body> and <html> are left
# open for the caller to close.
#
# title:: plain text used for both <title> and <h1>. It is HTML-escaped
#         here because callers pass scraped text, which may contain
#         characters such as & or <.
#
# Returns the markup as a String ending in a newline.
def html_header( title )
  safe_title = CGI.escapeHTML( title.to_s )
  # The gsub drops one leading tab from every line of the result, including
  # lines that come from a multi-line title; it is kept from the original.
  <<-HTML.gsub( /^\t/, '' )
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
<title>#{safe_title}</title>
<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
</head>
<body>
<h1>#{safe_title}</h1>
  HTML
end
# Fetches one article, converts it to a standalone HTML page under tmp/,
# and returns the table-of-contents entry pointing at that page.
#
# item:: headline text shown in the table of contents
# uri::  site-relative link of the article; its article id is taken with
#        uri2aid
#
# The parsed page is read from src/<aid>.html when that file exists.
# Otherwise it is downloaded from TOP + uri, stored in src/<aid>.html,
# and the method sleeps for one second before continuing.
#
# Returns a "<li>" line linking to <aid>.html, or '' when the uri carries
# no article id. A download failure is reported on stderr and re-raised.
def html_item( item, uri )
  aid = uri2aid( uri )
  return '' unless aid

  cache_path = "src/#{aid}.html"
  if File.exist?( cache_path ) # loading cache
    html = Nokogiri( File.open( cache_path, 'r:utf-8', &:read ) )
  else
    puts "getting #{aid}"
    url = "#{TOP}#{uri}"
    begin
      # Since Ruby 3.0 Kernel#open no longer fetches URLs; URI.open does.
      # Older rubies without URI.open fall back to open-uri's Kernel#open.
      fetch = URI.respond_to?( :open ) ? URI.method( :open ) : method( :open )
      html = Nokogiri( fetch.call( url, 'r:utf-8', &:read ) )
    rescue
      $stderr.puts "cannot get #{url}."
      raise
    end
    File.open( cache_path, 'w:utf-8' ) { |f| f.write( html.to_html ) }
    sleep 1
  end

  # Resolve the title before opening the output file. A page without the
  # expected heading falls back to the headline instead of failing with
  # NoMethodError on nil after the file has already been truncated.
  title_node = (html / 'h4.cmn-article_title')[0]
  title = title_node ? title_node.text.strip.canonical : item.to_s

  File.open( "tmp/#{aid}.html", 'w:utf-8' ) do |f|
    f.puts html_header( title )
    (html / 'div.cmn-article_text p').each do |para|
      (para / 'span.JSID_urlData').remove
      # NOTE(review): after strip, /^ / can only hit a later line that starts
      # with an ASCII space; presumably the intent was to drop a leading
      # full-width space (U+3000) -- confirm against the upstream script.
      body = para.text.strip.sub( /^ /, '' ).canonical
      # Node text is plain text, so it is escaped before being embedded.
      f.puts "\t<p>#{CGI.escapeHTML( body )}</p>"
    end
    f.puts html_footer
  end
  %Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML( item.to_s )}</a></li>|
end
# Returns the markup that closes a page: "</body>" and "</html>", each on
# its own newline-terminated line.
def html_footer
  closing_tags = %w(</body> </html>)
  closing_tags.map { |tag| "#{tag}\n" }.join
end
# Returns the opening part of the NCX navigation file: XML declaration,
# doctype, the <ncx> root, a document title stamped with the current local
# time (minute resolution), the opening <navMap>, and the fixed navPoint for
# the table of contents (playOrder 0). Every line ends with a newline.
def ncx_header
  stamp = Time.now.strftime( '%Y-%m-%d %H:%M' )
  [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
    "<docTitle><text>日経電子版 (#{stamp})</text></docTitle>",
    '<navMap>',
    '<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>',
    ''
  ].join( "\n" )
end
# Builds the NCX navPoint for one article.
#
# item::  headline used as the navigation label; it is XML-escaped here
#         because a raw & or < in the label would make toc.ncx ill-formed
# uri::   article link; its article id is taken with uri2aid
# index:: value written to the playOrder attribute
#
# Returns the navPoint line, or '' when the uri carries no article id.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid

  label = CGI.escapeHTML( item.to_s )
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
# Returns the markup that closes the NCX file: "</navMap>" and "</ncx>",
# each on its own newline-terminated line.
def ncx_footer
  closing_tags = %w(</navMap> </ncx>)
  closing_tags.map { |tag| "#{tag}\n" }.join
end
# Returns the opening part of the OPF package file: the metadata section
# and the start of <manifest>, including the three fixed manifest entries
# (toc.ncx, nikkei.css, toc.html). <manifest> is left open.
#
# The clock is read once so that the title, the description and the date
# always describe the same instant, even when the call straddles a minute
# or midnight boundary.
#
# NOTE(review): dc:Language is declared en-US although the title is
# Japanese -- confirm this is intentional for the target device.
def opf_header
  now = Time.now
  stamp = now.strftime( '%Y-%m-%d %H:%M' )
  date = now.strftime( '%d/%m/%Y' )
  <<-XML
<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="uid">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Title>日経電子版 (#{stamp})</dc:Title>
<dc:Language>en-US</dc:Language>
<dc:Creator>日本経済新聞社</dc:Creator>
<dc:Description>日経電子版、#{stamp}生成</dc:Description>
<dc:Date>#{date}</dc:Date>
</dc-metadata>
<x-metadata>
<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
<EmbeddedCover>nikkei.jpg</EmbeddedCover>
</x-metadata>
</metadata>
<manifest>
<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
<item id="style" media-type="text/css" href="nikkei.css"></item>
<item id="index" media-type="text/html" href="toc.html"></item>
  XML
end
# Builds the OPF manifest entry for one article page.
#
# uri:: article link; its article id is taken with uri2aid
#
# Returns the <item> line for <aid>.html, or '' when the uri carries no
# article id.
def opf_item( uri )
  aid = uri2aid( uri )
  return '' unless aid

  format( %Q|\t\t<item id="%s" media-type="text/html" href="%s.html"></item>|, aid, aid )
end
# Returns the closing part of the OPF package file: the end of <manifest>,
# the <spine> (the table of contents first, then one itemref per article
# id, each indented by a tab), an empty <tours>, the <guide> and the end
# of <package>.
#
# aids:: article ids, in reading order
def opf_footer( aids )
  before_spine = [
    '</manifest>',
    '<spine toc="toc">',
    '<itemref idref="index" />'
  ]
  spine_refs = aids.map { |aid| %Q|\t<itemref idref="#{aid}" />| }
  after_spine = [
    '</spine>',
    '<tours></tours>',
    '<guide>',
    '<reference type="toc" title="Table of Contents" href="toc.html"></reference>',
    '<reference type="start" title="Table of Contents" href="toc.html"></reference>',
    '</guide>',
    '</package>'
  ]
  (before_spine + spine_refs + after_spine).map { |line| "#{line}\n" }.join
end
# Extracts the article id from a link.
#
# uri:: link text, or nil (an anchor without an href)
#
# Returns the characters that follow the first "g=" up to, but not
# including, the next ";" or "$" (inside a character class "$" is a literal
# dollar sign, not an end-of-string anchor). Returns nil when the link
# contains no "g=" part or is nil.
def uri2aid( uri )
  return nil unless uri

  uri.to_s[/g=([^;$]+)/, 1]
end
# Writes the table of contents (tmp/toc.html), the navigation file
# (tmp/toc.ncx) and the package file (tmp/nikkei.opf). Through html_item it
# also produces one HTML page per article.
#
# toc:: array of categories. Each category is an array in which a String
#       element starts a new section with that heading, and any other
#       element is a [headline, uri] pair describing one article.
#
# Articles whose uri carries no article id are skipped entirely, so they
# neither consume a playOrder number nor leave blank lines in the output.
# An article that appears more than once gets a TOC and NCX entry for each
# occurrence, but only one manifest/spine entry.
def generate( toc )
  File.open( 'tmp/toc.html', 'w:utf-8' ) do |html|
    File.open( 'tmp/toc.ncx', 'w:utf-8' ) do |ncx|
      File.open( 'tmp/nikkei.opf', 'w:utf-8' ) do |opf|
        first = true
        toc_index = 0
        aids = []
        ncx.puts ncx_header
        opf.puts opf_header
        toc.each do |category|
          category.each do |article|
            if article.is_a?( String )
              # The first heading opens the document; every later one closes
              # the previous list and starts a new page.
              if first
                html.puts html_header( 'Table of Contents' )
              else
                html.puts "\t</ul>\n\t<mbp:pagebreak />"
              end
              # Headings are scraped text, so escape them before embedding.
              html.puts "\t<h2>#{CGI.escapeHTML( article )}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              headline, uri = article
              aid = uri2aid( uri )
              next unless aid

              html.puts html_item( headline, uri )
              ncx.puts ncx_item( headline, uri, toc_index += 1 )
              next if aids.include?( aid )

              opf.puts opf_item( uri )
              aids << aid
            end
          end
        end
        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( aids )
      end
    end
  end
end
#
# Entry point: scrape the front page and generate the Kindle source files.
# The page is read from ARGV[0] when given, otherwise from TOP.
#

# html_item caches downloads in src/ and generate writes into tmp/;
# create both so a first run does not fail on a missing directory.
%w(src tmp).each { |dir| Dir.mkdir( dir ) unless File.directory?( dir ) }

toc = []
# Since Ruby 3.0 Kernel#open no longer fetches URLs; URI.open does, and it
# still accepts a local path. Older rubies without URI.open fall back to
# open-uri's Kernel#open.
reader = URI.respond_to?( :open ) ? URI.method( :open ) : method( :open )
top = Nokogiri( reader.call( ARGV[0] || TOP, 'r:utf-8', &:read ) )
#
# scraping top news
#
toc_top = ['TOP NEWS']
%w(first second_alone third fourth).each do |category|
  (top / "div.nx-top_news_#{category} h3 a").each do |a|
    href = a.attr( 'href' )
    next unless href # an anchor without a link cannot be turned into an article

    toc_top << [a.text.strip.canonical, href]
  end
end
toc << toc_top
#
# scraping all categories
#
(top / 'div.cmnc-genre').each do |genre|
  toc_cat = []
  (genre / 'h4.cmnc-genre_title a').each do |cat|
    next if /local/ =~ cat.attr( 'href' )

    toc_cat << cat.text
    (genre / 'li a').each do |article|
      href = article.attr( 'href' )
      next unless href

      toc_cat << [article.text.canonical, href]
    end
  end
  # A genre whose only heading was skipped contributes nothing.
  toc << toc_cat unless toc_cat.empty?
end
generate( toc )
/* Presumably the nikkei.css stylesheet linked from the generated pages -- confirm against the gist's file list. */
/* Reset: remove margin, padding and first-line indent from every element. */
* {
margin: 0px;
padding: 0px;
text-indent: 0px;
}
/* Page title: larger and bold. */
h1 {
font-size: 150%;
font-weight: bold;
}
/* Section heading: slightly larger, bold, with one em of space above. */
h2 {
font-size: 120%;
font-weight: bold;
margin: 1em 0em 0em 0em;
}
/* Paragraph: no first-line indent, one em of space above, double line height. */
p {
text-indent: 0em;
margin: 1em 0em 0em 0em;
line-height: 200%;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment