mostlyfine · January 14, 2011 02:01
diff --git a/nikkei-scraper.rb b/nikkei-scraper.rb
 #!/usr/bin/env ruby
 # -*- coding: utf-8; -*-
 #
 # scraping nikkei.com for Kindle
 #

 require 'erb'
 require 'nokogiri'
 require 'open-uri'

 TOP = 'http://www.nikkei.com'

 # Core-class extension used to normalise scraped text.
 class String
 	# Returns a copy of the receiver with every FULLWIDTH TILDE (U+FF5E)
 	# replaced by WAVE DASH (U+301C) -- the "WAVE DASH problem" workaround.
 	def canonical
 		tr( "\uFF5E", "\u301C" )
 	end
 end

 # Builds the opening part of an HTML page: doctype, <head> and the <h1> heading.
 #
 # title:: page title as plain text. It is HTML-escaped before being placed in
 #         <title> and <h1>, so a scraped headline containing &, < or > cannot
 #         break the markup.
 #
 # Returns the markup as a String. <body> and <html> are left open; html_footer
 # supplies the closing tags.
 def html_header( title )
 	title = ERB::Util.html_escape( title )
 	<<-HTML.gsub( /^\t/, '' )
 	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
 	<html>
 	<head>
 		<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
 		<title>#{title}</title>
 		<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
 	</head>
 	<body>
 		<h1>#{title}</h1>
 	HTML
 end

 # Fetches one article, writes its simplified page and returns the TOC entry.
 #
 # item:: headline shown in the table of contents (plain text)
 # uri::  site-relative article link; the article id is extracted by uri2aid
 #
 # The downloaded page is cached as src/<aid>.html, so an article is fetched
 # at most once; the simplified page is written to tmp/<aid>.html.
 #
 # Returns an <li> line linking to the generated page, or '' when no article
 # id can be extracted from +uri+. A failed download is reported on stderr and
 # the exception is re-raised.
 def html_item( item, uri )
 	aid = uri2aid( uri )
 	return '' unless aid

 	cache = "src/#{aid}.html"
 	if File.exist?( cache ) # loading cache
 		html = Nokogiri( File.open( cache, 'r:utf-8', &:read ) )
 	else
 		puts "getting #{aid}"
 		begin
 			# Kernel#open stopped opening URLs in Ruby 3.0; URI.open is the
 			# open-uri entry point.
 			html = Nokogiri( URI.open( "#{TOP}#{uri}", 'r:utf-8', &:read ) )
 		rescue
 			$stderr.puts "cannot get #{TOP}#{uri}."
 			raise
 		end
 		File.open( cache, 'w:utf-8' ) { |f| f.write( html.to_html ) }
 		sleep 1 # pause between downloads (presumably to go easy on the server)
 	end

 	# Fall back to the TOC headline when the page has no title element,
 	# instead of failing with NoMethodError on nil.
 	title_node = (html / 'h4.cmn-article_title')[0]
 	title = title_node ? title_node.text.strip.canonical : item

 	File.open( "tmp/#{aid}.html", 'w:utf-8' ) do |f|
 		f.puts html_header( title )
 		(html / 'div.cmn-article_text p').each do |text|
 			(text / 'span.JSID_urlData').remove
 			# drop one leading ideographic space, then escape the decoded text
 			body = text.text.strip.sub( /^　/, '' ).canonical
 			f.puts "\t<p>#{ERB::Util.html_escape( body )}</p>"
 		end
 		f.puts html_footer
 	end

 	%Q|\t\t<li><a href="#{aid}.html">#{ERB::Util.html_escape( item )}</a></li>|
 end

 # Closing markup for a page started with html_header: ends <body> and <html>.
 # Returns the two closing tags as a String, one per line.
 def html_footer
 	closing = <<-HTML
 	</body>
 	</html>
 	HTML
 	closing.gsub( /^\t/, '' )
 end

 # Opening part of the NCX navigation file: XML declaration, doctype, a
 # document title stamped with the current local time, and the first navPoint,
 # which points at toc.html. <navMap> and <ncx> are left open.
 def ncx_header
 	generated_at = Time.now.strftime( '%Y-%m-%d %H:%M' )
 	xml = <<-XML
 	<?xml version="1.0" encoding="UTF-8"?>
 	<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
 	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
 	<docTitle><text>日経電子版 (#{generated_at})</text></docTitle>
 	<navMap>
 		<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
 	XML
 	xml.gsub( /^\t/, '' )
 end

 # Builds the NCX <navPoint> line for one article.
 #
 # item::  headline used as the navigation label (plain text; XML-escaped here
 #         so that &, < and > in a headline keep the NCX well-formed)
 # uri::   article link; the article id is extracted by uri2aid
 # index:: value written to the playOrder attribute
 #
 # Returns the navPoint line, or '' when +uri+ carries no article id.
 def ncx_item( item, uri, index )
 	aid = uri2aid( uri )
 	return '' unless aid

 	label = ERB::Util.html_escape( item )
 	%Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
 end

 # Closing markup for the NCX file started with ncx_header: ends <navMap> and
 # <ncx>. One leading tab is removed from each line of the template.
 def ncx_footer
 	<<-XML.each_line.map { |line| line.sub( /\A\t/, '' ) }.join
 	</navMap>
 	</ncx>
 	XML
 end

 # Opening part of the OPF package file: metadata plus the fixed manifest
 # entries (toc.ncx, nikkei.css, toc.html). <manifest> and <package> are left
 # open for opf_item / opf_footer output.
 #
 # The clock is read exactly once, so the title, the description and the date
 # element always describe the same instant even when generation crosses a
 # minute or day boundary.
 def opf_header
 	now = Time.now
 	stamp = now.strftime( '%Y-%m-%d %H:%M' )
 	<<-XML.gsub( /^\t/, '' )
 	<?xml version="1.0" encoding="utf-8"?>
 	<package unique-identifier="uid">
 		<metadata>
 			<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
 				<dc:Title>日経電子版 (#{stamp})</dc:Title>
 				<dc:Language>en-US</dc:Language>
 				<dc:Creator>日本経済新聞社</dc:Creator>
 				<dc:Description>日経電子版、#{stamp}生成</dc:Description>
 				<dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
 			</dc-metadata>
 			<x-metadata>
 				<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
 				<EmbeddedCover>nikkei.jpg</EmbeddedCover>
 			</x-metadata>
 		</metadata>
 		<manifest>
 			<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
 			<item id="style" media-type="text/css" href="nikkei.css"></item>
 			<item id="index" media-type="text/html" href="toc.html"></item>
 	XML
 end

 # Builds the OPF manifest <item> line for one article.
 #
 # uri:: article link; the article id is extracted by uri2aid
 #
 # Returns the item line, or '' when +uri+ carries no article id.
 def opf_item( uri )
 	aid = uri2aid( uri )
 	return '' unless aid

 	%Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
 end

 # Closing part of the OPF package file: ends the manifest, lists the reading
 # order (spine) and adds the guide section.
 #
 # aids:: article ids, in reading order; each becomes one <itemref> placed
 #        after the fixed "index" entry
 #
 # Returns the markup as a String.
 def opf_footer( aids )
 	spine_open = <<-XML.gsub( /^\t/, '' )
 	</manifest>
 	<spine toc="toc">
 		<itemref idref="index" />
 	XML
 	spine_close = <<-XML.gsub( /^\t/, '' )
 	</spine>
 	<tours></tours>
 	<guide>
 	  <reference type="toc" title="Table of Contents" href="toc.html"></reference>
 	  <reference type="start" title="Table of Contents" href="toc.html"></reference>
 	</guide>
 	</package>
 	XML
 	itemrefs = aids.map { |aid| %Q|\t<itemref idref="#{aid}" />\n| }
 	spine_open + itemrefs.join + spine_close
 end

 # Extracts the article id from an article link.
 #
 # uri:: link text such as ".../g=<id>;..."; may be nil, which is what an
 #       anchor without an href attribute yields
 #
 # Returns the characters following the first "g=" up to (not including) the
 # next ';' or '$', or nil when +uri+ is nil or contains no "g=" part.
 def uri2aid( uri )
 	return nil if uri.nil?

 	# Inside the character class `$` is a literal dollar sign, not an
 	# end-of-string anchor; the pattern is kept unchanged for compatibility.
 	uri.to_s[ /g=([^;$]+)/, 1 ]
 end

 # Writes the three index files of the book -- tmp/toc.html, tmp/toc.ncx and
 # tmp/nikkei.opf -- and, through html_item, one page per article.
 #
 # toc:: Array of categories. Each category is an Array in which a String
 #       element starts a new section with that heading and every other
 #       element is a [headline, uri] pair describing one article.
 #
 # Articles whose uri yields no article id are skipped entirely, so they leave
 # neither blank lines nor gaps in the NCX playOrder sequence. Each article id
 # is listed once in the OPF manifest and spine even when the article appears
 # in several sections.
 def generate( toc )
 	File.open( 'tmp/toc.html', 'w:utf-8' ) do |html|
 	File.open( 'tmp/toc.ncx', 'w:utf-8' ) do |ncx|
 	File.open( 'tmp/nikkei.opf', 'w:utf-8' ) do |opf|
 		list_open = false # true once the page header and a first <ul> exist
 		toc_index = 0
 		aids = []
 		ncx.puts ncx_header
 		opf.puts opf_header
 		toc.each do |category|
 			category.each do |article|
 				if String === article
 					# the first heading opens the page, later ones close the previous list
 					html.puts list_open ?
 						"\t</ul>\n\t<mbp:pagebreak />" :
 						html_header( 'Table of Contents' )
 					html.puts "\t<h2>#{ERB::Util.html_escape( article )}</h2>"
 					html.puts "\t<ul>"
 					list_open = true
 				else
 					item, uri = article
 					aid = uri && uri2aid( uri )
 					next unless aid

 					html.puts html_item( item, uri )
 					# NOTE(review): an article listed in two sections gets two
 					# navPoints with the same id -- confirm this is acceptable.
 					ncx.puts ncx_item( item, uri, toc_index += 1 )
 					unless aids.include?( aid )
 						opf.puts opf_item( uri )
 						aids << aid
 					end
 				end
 			end
 		end
 		# With no heading at all, still emit the header so the page is complete.
 		html.puts list_open ? "\t</ul>" : html_header( 'Table of Contents' )
 		html.puts html_footer
 		ncx.puts ncx_footer
 		opf.puts opf_footer( aids )
 	end
 	end
 	end
 end


 # Main script: scrape the front page into a table of contents, then generate
 # the book files from it.
 toc = []
 # ARGV[0] may name an alternative front page. URI.open (open-uri) opens URLs
 # and falls back to Kernel#open for anything else, e.g. a saved local file;
 # plain Kernel#open stopped opening URLs in Ruby 3.0.
 top = Nokogiri( URI.open( ARGV[0] || TOP, 'r:utf-8', &:read ) )

 #
 # scraping top news
 #
 toc_top = ['TOP NEWS']

 %w(first second_alone third fourth).each do |category|
 	(top / "div.nx-top_news_#{category} h3 a").each do |a|
 		href = a.attr( 'href' )
 		next unless href # an anchor without a link cannot become an article
 		toc_top << [a.text.strip.canonical, href]
 	end
 end
 toc << toc_top

 #
 # scraping all categories
 #
 (top / 'div.cmnc-genre').each do |genre|
 	toc_cat = []
 	(genre / 'h4.cmnc-genre_title a').each do |cat|
 		next if /local/ =~ cat.attr( 'href' )
 		toc_cat << cat.text
 		(genre / 'li a').each do |article|
 			href = article.attr( 'href' )
 			next unless href
 			toc_cat << [article.text.canonical, href]
 		end
 	end
 	# a genre whose title was skipped contributes nothing to the book
 	toc << toc_cat unless toc_cat.empty?
 end

 generate( toc )
diff --git a/nikkei.css b/nikkei.css
 /* Stylesheet for the generated pages (linked as nikkei.css). */

 /* Reset: no margin, padding or text indent on any element. */
 * {
 	margin: 0px;
 	padding: 0px;
 	text-indent: 0px;
 }

 /* Top-level heading: larger and bold. */
 h1 {
 	font-size: 150%;
 	font-weight: bold;
 }

 /* Second-level heading: slightly larger, bold, one line of space above. */
 h2 {
 	font-size: 120%;
 	font-weight: bold;
 	margin: 1em 0em 0em 0em;
 }

 /* Paragraph: no first-line indent, one line of space above, double line height. */
 p {
 	text-indent: 0em;
 	margin: 1em 0em 0em 0em;
 	line-height: 200%;
 }
	#!/usr/bin/env ruby
	# -- coding: utf-8; --
	#
	# scraping nikkei.com for Kindle
	#

	require 'nokogiri'
	require 'open-uri'

	TOP = 'http://www.nikkei.com'

	class String
	def canonical
	self.gsub( /\uFF5E/, "\u301C" ) # for WAVE DASH problem
	end
	end

	def html_header( title )
	<<-HTML.gsub( /^\t/, '' )
	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
	<title>#{title}</title>
	<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
	</head>
	<body>
	<h1>#{title}</h1>
	HTML
	end

	def html_item( item, uri )
	aid = uri2aid( uri )
	return '' unless aid
	html = nil
	if File::exist?( "src/#{aid}.html" ) # loading cache
	html = Nokogiri( open( "src/#{aid}.html", 'r:utf-8', &:read ) )
	else
	puts "getting #{aid}"
	begin
	html = Nokogiri( open( "#{TOP}#{uri}", 'r:utf-8', &:read ) )
	rescue
	$stderr.puts "cannot get #{TOP}#{uri}."
	raise
	end
	open( "src/#{aid}.html", 'w:utf-8' ) do \|f\|
	f.write( html.to_html )
	end
	sleep 1
	end
	html

	open( "tmp/#{aid}.html", 'w:utf-8' ) do \|f\|
	f.puts html_header( (html / 'h4.cmn-article_title')[0].text.strip.canonical )
	(html / 'div.cmn-article_text p').each do \|text\|
	(text / 'span.JSID_urlData').remove
	f.puts "\t<p>#{text.text.strip.sub( /^　/, '' ).canonical}</p>"
	end
	f.puts html_footer
	end

	%Q\|\t\t<li><a href="#{aid}.html">#{item}</a></li>\|
	end

	def html_footer
	<<-HTML.gsub( /^\t/, '' )
	</body>
	</html>
	HTML
	end

	def ncx_header
	<<-XML.gsub( /^\t/, '' )
	<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
	<docTitle><text>日経電子版 (#{Time::now.strftime '%Y-%m-%d %H:%M'})</text></docTitle>
	<navMap>
	<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
	XML
	end

	def ncx_item( item, uri, index )
	aid = uri2aid( uri )
	aid ? %Q\|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{item}</text></navLabel><content src="#{aid}.html" /></navPoint>\| : ''
	end

	def ncx_footer
	<<-XML.gsub( /^\t/, '' )
	</navMap>
	</ncx>
	XML
	end

	def opf_header
	<<-XML.gsub( /^\t/, '' )
	<?xml version="1.0" encoding="utf-8"?>
	<package unique-identifier="uid">
	<metadata>
	<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
	<dc:Title>日経電子版 (#{Time::now.strftime '%Y-%m-%d %H:%M'})</dc:Title>
	<dc:Language>en-US</dc:Language>
	<dc:Creator>日本経済新聞社</dc:Creator>
	<dc:Description>日経電子版、#{Time::now.strftime '%Y-%m-%d %H:%M'}生成</dc:Description>
	<dc:Date>#{Time::now.strftime( '%d/%m/%Y' )}</dc:Date>
	</dc-metadata>
	<x-metadata>
	<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
	<EmbeddedCover>nikkei.jpg</EmbeddedCover>
	</x-metadata>
	</metadata>
	<manifest>
	<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
	<item id="style" media-type="text/css" href="nikkei.css"></item>
	<item id="index" media-type="text/html" href="toc.html"></item>
	XML
	end

	def opf_item( uri )
	aid = uri2aid( uri )
	aid ? %Q\|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>\| : ''
	end

	def opf_footer( aids )
	r = <<-XML.gsub( /^\t/, '' )
	</manifest>
	<spine toc="toc">
	<itemref idref="index" />
	XML
	aids.each do \|aid\|
	r << %Q\|\t<itemref idref="#{aid}" />\n\|
	end
	r << <<-XML.gsub( /^\t/, '' )
	</spine>
	<tours></tours>
	<guide>
	<reference type="toc" title="Table of Contents" href="toc.html"></reference>
	<reference type="start" title="Table of Contents" href="toc.html"></reference>
	</guide>
	</package>
	XML
	r
	end

	def uri2aid( uri )
	uri.scan( /g=([^;$]+)/ ).flatten[0]
	end

	def generate( toc )
	open( 'tmp/toc.html', 'w:utf-8' ) do \|html\|
	open( 'tmp/toc.ncx', 'w:utf-8' ) do \|ncx\|
	open( 'tmp/nikkei.opf', 'w:utf-8' ) do \|opf\|
	first = true
	toc_index = 0
	aids = []
	ncx.puts ncx_header
	opf.puts opf_header
	toc.each do \|category\|
	category.each do \|article\|
	if article.class == String
	html.puts first ?
	html_header( 'Table of Contents' ) :
	"\t</ul>\n\t<mbp:pagebreak />"
	html.puts "\t<h2>#{article}</h2>"
	html.puts "\t<ul>"
	first = false
	else
	html.puts html_item( article[0], article[1] )
	ncx.puts ncx_item( article[0], article[1], toc_index += 1 )
	unless aids.index( uri2aid( article[1] ) )
	opf.puts opf_item( article[1] )
	aids << uri2aid( article[1] ) if uri2aid( article[1] )
	end
	end
	end
	end
	html.puts "\t</ul>"
	html.puts html_footer
	ncx.puts ncx_footer
	opf.puts opf_footer( aids )
	end
	end
	end
	end


	toc = []
	top = Nokogiri( open( ARGV[0] \|\| TOP, 'r:utf-8', &:read ) )

	#
	# scraping top news
	#
	toc_top = ['TOP NEWS']

	%w(first second_alone third fourth).each do \|category\|
	(top / "div.nx-top_news_#{category} h3 a").each do \|a\|
	toc_top << [a.text.strip.canonical, a.attr( 'href' )]
	end
	end
	toc << toc_top

	#
	# scraping all categories
	#
	(top / 'div.cmnc-genre').each do \|genre\|
	toc_cat = []
	(genre / 'h4.cmnc-genre_title a').each do \|cat\|
	next if /local/ =~ cat.attr( 'href' )
	toc_cat << cat.text
	(genre / 'li a').each do \|article\|
	toc_cat << [article.text.canonical, article.attr( 'href' )]
	end
	end
	toc << toc_cat
	end

	generate( toc )
	* {
	margin: 0px;
	padding: 0px;
	text-indent: 0px;
	}

	h1 {
	font-size: 150%;
	font-weight: bold;
	}

	h2 {
	font-size: 120%;
	font-weight: bold;
	margin: 1em 0em 0em 0em;
	}

	p {
	text-indent: 0em;
	margin: 1em 0em 0em 0em;
	line-height: 200%;
	}