-
-
Save mostlyfine/779025 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# -*- coding: utf-8; -*-
#
# Scrapes nikkei.com and generates the HTML, NCX and OPF source files for a Kindle book.
#
require 'cgi/escape'
require 'nokogiri'
require 'open-uri'
# Base URL of the site: prepended to article hrefs when fetching, and used as
# the page to scrape when no command-line argument is given.
TOP = 'http://www.nikkei.com'.freeze
# Reopens String to add the normalisation helper applied to scraped text.
class String
  # Returns a copy of the string with every FULLWIDTH TILDE (U+FF5E) replaced
  # by WAVE DASH (U+301C) -- the workaround for the "WAVE DASH problem".
  def canonical
    tr("\uFF5E", "\u301C")
  end
end
# Returns the opening part of an HTML page (doctype through the <h1> heading).
#
# title - plain text used for both <title> and <h1>. It is HTML-escaped here,
#         so callers pass raw text (a title containing "&" or "<" would
#         otherwise produce broken markup).
#
# The page links the "nikkei.css" stylesheet; the matching closing tags come
# from html_footer.
def html_header(title)
  safe_title = CGI.escapeHTML(title.to_s)
  <<~HTML
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    <html>
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
    <title>#{safe_title}</title>
    <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
    </head>
    <body>
    <h1>#{safe_title}</h1>
  HTML
end
# Writes the Kindle page for one article and returns its table-of-contents row.
#
# item - link text shown in the table of contents (plain text; escaped here).
# uri  - site-relative article link; its article id is extracted with uri2aid.
#
# Returns the "<li>" line linking to "<aid>.html", or '' when the uri carries
# no article id.
#
# Side effects: the raw page is read from "src/<aid>.html" when that file
# exists; otherwise it is downloaded from TOP + uri and saved there. The
# cleaned-up article is written to "tmp/<aid>.html". Both directories must
# already exist. A download failure is reported on stderr and re-raised.
def html_item(item, uri)
  aid = uri2aid(uri)
  return '' unless aid

  cache_path = "src/#{aid}.html"
  if File.exist?(cache_path) # loading cache
    html = Nokogiri(File.read(cache_path, encoding: 'utf-8'))
  else
    puts "getting #{aid}"
    begin
      # URI.open is required here: plain Kernel#open no longer accepts URLs
      # with open-uri on Ruby 3.
      html = Nokogiri(URI.open("#{TOP}#{uri}", 'r:utf-8', &:read))
    rescue StandardError
      $stderr.puts "cannot get #{TOP}#{uri}."
      raise
    end
    File.open(cache_path, 'w:utf-8') { |f| f.write(html.to_html) }
    sleep 1 # pause after each live download (presumably to throttle requests)
  end

  # Fall back to the link text when the page has no title element instead of
  # crashing on nil.
  title_node = (html / 'h4.cmn-article_title').first
  title = title_node ? title_node.text.strip.canonical : item.to_s

  File.open("tmp/#{aid}.html", 'w:utf-8') do |f|
    f.puts html_header(title)
    (html / 'div.cmn-article_text p').each do |paragraph|
      (paragraph / 'span.JSID_urlData').remove
      # NOTE(review): after strip, this sub only affects lines after the first;
      # possibly an ideographic space was intended here -- confirm.
      body = paragraph.text.strip.sub(/^ /, '').canonical
      f.puts "\t<p>#{CGI.escapeHTML(body)}</p>"
    end
    f.puts html_footer
  end

  %(\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML(item.to_s)}</a></li>)
end
# Returns the closing tags that end a page started by html_header.
def html_footer
  <<~HTML
    </body>
    </html>
  HTML
end
# Returns the opening part of the NCX navigation file: XML declaration,
# doctype, a document title stamped with the current local time, the opening
# <navMap> and the navPoint for the table of contents (playOrder 0).
def ncx_header
  generated_at = Time.now.strftime('%Y-%m-%d %H:%M')
  <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
    <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <docTitle><text>日経電子版 (#{generated_at})</text></docTitle>
    <navMap>
    <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
  XML
end
# Returns the NCX <navPoint> line for one article, or '' when the uri has no
# article id.
#
# item  - label text (plain text; XML-escaped here so "&" or "<" in a
#         headline cannot break the document).
# uri   - article link from which the id is extracted with uri2aid.
# index - value written to the playOrder attribute.
def ncx_item(item, uri, index)
  aid = uri2aid(uri)
  return '' unless aid

  label = CGI.escapeHTML(item.to_s)
  %(\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>)
end
# Returns the closing tags that end the NCX document started by ncx_header.
def ncx_footer
  <<~XML
    </navMap>
    </ncx>
  XML
end
# Returns the opening part of the OPF package file: metadata (title and
# description stamped with the current local time, date as dd/mm/yyyy, cover
# image "nikkei.jpg") and the start of the manifest with the fixed entries
# for toc.ncx, nikkei.css and toc.html.
#
# The timestamp is taken once so that title, description and date always agree.
def opf_header
  now = Time.now
  stamp = now.strftime('%Y-%m-%d %H:%M')
  # NOTE(review): the language is declared en-US although the text is
  # Japanese; left unchanged in case this is deliberate -- confirm.
  <<~XML
    <?xml version="1.0" encoding="utf-8"?>
    <package unique-identifier="uid">
    <metadata>
    <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
    <dc:Title>日経電子版 (#{stamp})</dc:Title>
    <dc:Language>en-US</dc:Language>
    <dc:Creator>日本経済新聞社</dc:Creator>
    <dc:Description>日経電子版、#{stamp}生成</dc:Description>
    <dc:Date>#{now.strftime('%d/%m/%Y')}</dc:Date>
    </dc-metadata>
    <x-metadata>
    <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
    <EmbeddedCover>nikkei.jpg</EmbeddedCover>
    </x-metadata>
    </metadata>
    <manifest>
    <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
    <item id="style" media-type="text/css" href="nikkei.css"></item>
    <item id="index" media-type="text/html" href="toc.html"></item>
  XML
end
# Returns the OPF manifest <item> line for the article behind uri, or '' when
# the uri has no article id (see uri2aid).
def opf_item(uri)
  aid = uri2aid(uri)
  return '' unless aid

  %(\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>)
end
# Returns the closing part of the OPF package file: end of the manifest, the
# spine (the table of contents first, then one <itemref> per article id in
# the given order), an empty <tours>, the guide and </package>.
#
# aids - Array of article ids, in reading order.
def opf_footer(aids)
  spine_items = aids.map { |aid| %(\t<itemref idref="#{aid}" />\n) }.join

  spine_open = <<~XML
    </manifest>
    <spine toc="toc">
    <itemref idref="index" />
  XML

  closing = <<~XML
    </spine>
    <tours></tours>
    <guide>
    <reference type="toc" title="Table of Contents" href="toc.html"></reference>
    <reference type="start" title="Table of Contents" href="toc.html"></reference>
    </guide>
    </package>
  XML

  spine_open + spine_items + closing
end
# Extracts the article id from a link: the text following the first "g=" up
# to (not including) the next ";" or "$", or to the end of the string.
#
# uri - link String; nil is tolerated (a link without an href) and treated
#       like a link without an id.
#
# Returns the id String, or nil when the link contains no "g=" parameter.
def uri2aid(uri)
  uri.to_s[/g=([^;$]+)/, 1]
end
# Writes the three book index files -- tmp/toc.html, tmp/toc.ncx and
# tmp/nikkei.opf -- and, through html_item, one HTML page per article.
#
# toc - Array of categories. Each category is an Array whose String elements
#       are section headings and whose other elements are [title, uri] pairs
#       for articles.
#
# The first heading opens the table-of-contents page; every later heading
# closes the previous list and starts a new page with <mbp:pagebreak />.
# Links without an article id are skipped entirely, so they leave neither
# blank lines nor gaps in the NCX playOrder sequence. An article seen more
# than once is listed in the manifest and spine only once.
def generate(toc)
  File.open('tmp/toc.html', 'w:utf-8') do |html|
    File.open('tmp/toc.ncx', 'w:utf-8') do |ncx|
      File.open('tmp/nikkei.opf', 'w:utf-8') do |opf|
        first = true
        toc_index = 0
        aids = []
        ncx.puts ncx_header
        opf.puts opf_header

        toc.each do |category|
          category.each do |article|
            if article.is_a?(String)
              html.puts(first ? html_header('Table of Contents') : "\t</ul>\n\t<mbp:pagebreak />")
              html.puts "\t<h2>#{CGI.escapeHTML(article)}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              title, uri = article
              aid = uri && uri2aid(uri)
              next unless aid

              html.puts html_item(title, uri)
              ncx.puts ncx_item(title, uri, toc_index += 1)
              next if aids.include?(aid)

              opf.puts opf_item(uri)
              aids << aid
            end
          end
        end

        # With no heading at all there is no open <ul>; emit the page header
        # instead so toc.html is still a complete document.
        html.puts(first ? html_header('Table of Contents') : "\t</ul>")
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer(aids)
      end
    end
  end
end
# Main script: load the front page (a path or URL given as the first
# command-line argument, otherwise TOP), collect headings and article links
# into toc, then generate the book files.
toc = []
# URI.open handles both URLs and local paths; plain Kernel#open no longer
# accepts URLs with open-uri on Ruby 3.
top = Nokogiri(URI.open(ARGV[0] || TOP, 'r:utf-8', &:read))

#
# scraping top news
#
toc_top = ['TOP NEWS']
%w[first second_alone third fourth].each do |category|
  (top / "div.nx-top_news_#{category} h3 a").each do |a|
    href = a.attr('href')
    next unless href # an anchor without a link target cannot be fetched

    toc_top << [a.text.strip.canonical, href]
  end
end
toc << toc_top

#
# scraping all categories
#
(top / 'div.cmnc-genre').each do |genre|
  toc_cat = []
  (genre / 'h4.cmnc-genre_title a').each do |cat|
    # Categories whose link contains "local" are left out.
    next if cat.attr('href').to_s.include?('local')

    toc_cat << cat.text
    (genre / 'li a').each do |article|
      href = article.attr('href')
      next unless href

      toc_cat << [article.text.canonical, href]
    end
  end
  toc << toc_cat unless toc_cat.empty?
end

generate(toc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* nikkei.css -- stylesheet linked from the generated HTML pages and listed
   in the OPF manifest. */

/* Reset margins, padding and indentation on every element. */
* {
  margin: 0px;
  padding: 0px;
  text-indent: 0px;
}

/* Page title. */
h1 {
  font-size: 150%;
  font-weight: bold;
}

/* Section heading, separated from the content above by one line. */
h2 {
  font-size: 120%;
  font-weight: bold;
  margin: 1em 0em 0em 0em;
}

/* Paragraphs: no first-line indent, one line of space above, double line height. */
p {
  text-indent: 0em;
  margin: 1em 0em 0em 0em;
  line-height: 200%;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment