-
-
Save fpaint/1d6bdfdd7d609d8677c4a8211fa81938 to your computer and use it in GitHub Desktop.
| require 'nokogiri' | |
| require 'faraday' | |
| require 'date' | |
| require 'digest' | |
| class Ward | |
| BASE = 'https://www.parahumans.net' | |
# Read-only holder for one scraped chapter: its heading and its body markup.
class Page
  # Chapter heading, exactly as passed to the constructor.
  attr_reader :title
  # Chapter body, exactly as passed to the constructor.
  attr_reader :content

  # @param title [Object] chapter heading
  # @param content [Object] chapter body
  def initialize(title, content)
    @title, @content = title, content
  end
end
# Downloads chapter pages, keeps the raw HTML in an on-disk cache under
# "cache/", and reduces each page to a Page (title + cleaned body markup).
class Loader
  # Returns the parsed page for +url+, reading the raw HTML from the cache
  # file when it exists and downloading (then caching) it otherwise.
  #
  # @param url [String] address of the page to load
  # @return [Page] result of #parse on the page HTML
  # @raise [RuntimeError] when the HTTP response is not successful
  def get(url)
    cache = "cache/#{Digest::MD5.hexdigest(url)}"
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    return parse(File.read(cache)) if File.exist?(cache)

    res = Faraday.get(url)
    raise "Get #{url} failed: #{res.status}" unless res.success?

    body = res.body
    # The cache directory is not guaranteed to exist on a first run.
    cache_dir = File.dirname(cache)
    Dir.mkdir(cache_dir) unless Dir.exist?(cache_dir)
    File.write(cache, body)
    parse(body)
  end

  # Extracts the entry title and the cleaned entry body from page HTML.
  #
  # @param html [String] raw HTML of a chapter page
  # @return [Page] title text plus the inner HTML of the cleaned body
  def parse(html)
    doc = Nokogiri::HTML(html)
    title = doc.css('main article header.entry-header h1.entry-title').text
    content = doc.css('main article div.entry-content')
    content.css('.sharedaddy').remove
    # The first and last paragraphs are dropped (presumably chapter
    # navigation links - confirm against the site). `&.` keeps a body with
    # fewer than two paragraphs from raising NoMethodError.
    content.css('p').first&.remove
    content.css('p').last&.remove
    # Unwrap links and spans, keeping only their children.
    content.css('a, span').each { |node| node.replace(node.children) }
    content.css('em').each { |node| node.name = 'emphasis' }
    # Relative path: only style attributes inside the content are touched.
    content.xpath('.//@style').remove
    Page.new(title, content.inner_html)
  end
end
# Serialises a list of pages into a FictionBook 2.0 XML document.
class Book
  # @param pages [Array] objects responding to #title and #content
  def initialize(pages = [])
    @pages = pages
  end

  # Builds the document and writes it to +filename+.
  #
  # The builder blocks take the builder as an explicit argument, so +self+
  # stays this Book and the private helpers below can be called from them.
  #
  # @param filename [String] path of the file to write
  # @return [Integer] the value returned by File.write
  def generate(filename)
    builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
      xml.FictionBook(xmlns: 'http://www.gribuser.ru/xml/fictionbook/2.0', 'xmlns:l' => 'http://www.w3.org/1999/xlink') do
        xml.description do
          write_title_info(xml)
          write_document_info(xml)
        end
        write_body(xml)
      end
    end
    File.write(filename, builder.to_xml)
  end

  private

  # Emits <title-info>: author, book title, annotation and language.
  # `send` is used for element names that contain a hyphen.
  def write_title_info(xml)
    xml.send('title-info') do
      xml.author do
        xml.send 'first-name', 'John Charles'
        xml.send 'last-name', 'McCrae'
      end
      xml.send 'book-title', 'Ward'
      xml.annotation do
        xml.p 'The unwritten rules that govern the fights and outright wars between ‘capes’ have been amended: everyone gets their second chance. It’s an uneasy thing to come to terms with when notorious supervillains and even monsters are playing at being hero. The world ended two years ago, and as humanity straddles the old world and the new, there aren’t records, witnesses, or facilities to answer the villains’ past actions in the present. One of many compromises, uneasy truces and deceptions that are starting to splinter as humanity rebuilds.'
        xml.p 'None feel the injustice of this new status quo or the lack of established footing more than the past residents of the parahuman asylums. The facilities hosted parahumans and their victims, but the facilities are ruined or gone; one of many fragile ex-patients is left to find a place in a fractured world. She’s perhaps the person least suited to have anything to do with this tenuous peace or to stand alongside these false heroes. She’s put in a position to make the decision: will she compromise to help forge what they call, with dark sentiment, a second golden age? Or will she stand tall as a gilded dark age dawns?'
      end
      xml.lang 'en'
    end
  end

  # Emits <document-info>: converter nickname, tooling, date, id and source.
  def write_document_info(xml)
    # Read the clock once so the element text and its value attribute
    # cannot disagree when the run crosses midnight.
    today = Date.today
    xml.send('document-info') do
      xml.author do
        xml.nickname 'Cuterebra'
      end
      xml.send 'program-used', 'Ruby, Faraday, Nokogiri'
      xml.date(today, value: today)
      xml.id 'fbab61de-f08a-4763-88f8-7b3808e153d0'
      xml.send 'src-url', BASE
    end
  end

  # Emits <body> with one <section> per page, in the order given.
  def write_body(xml)
    xml.body do
      @pages.each do |page|
        xml.section do |section|
          section.title page.title
          # `<<` appends the page content as raw markup, not as escaped text.
          section << page.content
        end
      end
    end
  end
end
# Collects the page URLs listed on the site's front page.
#
# Reads the anchors under `aside #nav_menu-5` and `aside #nav_menu-6`,
# removes duplicates and drops every URL containing 'category/story'.
#
# @return [Array<String>] URLs in the order they were found
# @raise [RuntimeError] when the front page request is not successful
def contents
  res = Faraday.get(BASE)
  # Fail loudly instead of parsing an error page into an empty book.
  raise "Get #{BASE} failed: #{res.status}" unless res.success?

  doc = Nokogiri::HTML(res.body)
  menus = ['aside #nav_menu-5', 'aside #nav_menu-6']
  found = menus.flat_map { |selector| links(doc.css(selector)) }
  # compact: an anchor without href would otherwise break include? below.
  found.compact.uniq.reject { |url| url.include?('category/story') }
end
# Returns the href value of every <a> found under +node+.
#
# Anchors that carry no href attribute are skipped, so callers never see nil.
#
# @param node [#css] node or node set to search
# @return [Array<String>] href values in document order
def links(node)
  node.css('a').map { |a| a.attr(:href) }.compact
end
# Runs the whole pipeline: list the page URLs, load each page, and write the
# resulting book to +filename+, printing a progress line for every step.
#
# @param filename [String] path of the book file to write
# @return [Object] whatever Ward::Book#generate returns
def create_book(filename)
  $stdout.print('Getting contents...', $/)
  loader = Loader.new
  pages = contents.map do |url|
    $stdout.print("Reading #{url}", $/)
    loader.get(url)
  end
  $stdout.print('Generating book', $/)
  Ward::Book.new(pages).generate(filename)
end
| end | |
# Script entry point: build the book and write it to ward.fb2.
Ward.new.create_book('ward.fb2')
One more fix with forum name corrected
`require 'nokogiri'
require 'faraday'
require 'date'
require 'digest'
require 'fileutils'
class Ward
BASE = 'https://www.parahumans.net'
# Read-only holder for one scraped chapter: its heading and its body markup.
class Page
  # Chapter heading, exactly as passed to the constructor.
  attr_reader :title
  # Chapter body, exactly as passed to the constructor.
  attr_reader :content

  # @param title [Object] chapter heading
  # @param content [Object] chapter body
  def initialize(title, content)
    @title, @content = title, content
  end
end
# Downloads chapter pages, keeps the raw HTML in an on-disk cache under
# "cache/", and reduces each page to a Page (title + cleaned body markup).
class Loader
  # Returns the parsed page for +url+, reading the raw HTML from the cache
  # file when it exists and downloading (then caching) it otherwise.
  #
  # @param url [String] address of the page to load
  # @return [Page] result of #parse on the page HTML
  # @raise [RuntimeError] when the HTTP response is not successful
  def get(url)
    cache = "cache/#{Digest::MD5.hexdigest(url)}"
    return parse(File.read(cache)) if File.exist?(cache)

    res = Faraday.get(url)
    raise "Get #{url} failed: #{res.status}" unless res.success?

    body = res.body
    FileUtils.mkdir_p(File.dirname(cache))
    File.write(cache, body)
    parse(body)
  end

  # Extracts the entry title and the cleaned entry body from page HTML.
  #
  # @param html [String] raw HTML of a chapter page
  # @return [Page] title text plus the inner HTML of the cleaned body
  def parse(html)
    doc = Nokogiri::HTML(html)
    title = doc.css('main article header.entry-header h1.entry-title').text
    content = doc.css('main article div.entry-content')
    content.css('.sharedaddy').remove
    # The first and last paragraphs are dropped (presumably chapter
    # navigation links - confirm against the site). `&.` keeps a body with
    # fewer than two paragraphs from raising NoMethodError.
    content.css('p').first&.remove
    content.css('p').last&.remove
    # Decode Cloudflare Email Protection placeholders. The decoded address
    # is inserted as a text node so it is never re-parsed as markup.
    content.css('.__cf_email__, a[data-cfemail]').each do |node|
      encoded = node['data-cfemail']
      next unless encoded

      decoded = decode_cf_email(encoded)
      node.replace(Nokogiri::XML::Text.new(decoded, doc)) if decoded
    end
    # Unwrap the remaining links and spans, keeping only their children.
    content.css('a, span').each { |node| node.replace(node.children) }
    content.css('em').each { |node| node.name = 'emphasis' }
    # Relative path: only style attributes inside the content are touched.
    content.xpath('.//@style').remove
    Page.new(title, content.inner_html)
  end

  private

  # Decodes a Cloudflare Email Protection payload: hex pairs where the first
  # byte is an XOR key applied to every following byte.
  #
  # @param encoded [String] value of a data-cfemail attribute
  # @return [String, nil] decoded UTF-8 text, or nil when +encoded+ is not
  #   an even-length hex string holding a key and at least one byte, or when
  #   the decoded bytes are not valid UTF-8
  def decode_cf_email(encoded)
    return nil unless encoded.match?(/\A(?:\h\h){2,}\z/)

    key, *bytes = encoded.scan(/\h\h/).map { |pair| pair.to_i(16) }
    # Pack all bytes at once; building the string with Integer#chr would
    # yield binary-encoded pieces for bytes above 127.
    decoded = bytes.map { |byte| byte ^ key }.pack('C*').force_encoding(Encoding::UTF_8)
    decoded.valid_encoding? ? decoded : nil
  end
end
# Serialises a list of pages into a FictionBook 2.0 XML document.
class Book
  # @param pages [Array] objects responding to #title and #content
  def initialize(pages = [])
    @pages = pages
  end

  # Builds the document and writes it to +filename+.
  #
  # The builder blocks take the builder as an explicit argument, so +self+
  # stays this Book and the private helpers below can be called from them.
  #
  # @param filename [String] path of the file to write
  # @return [Integer] the value returned by File.write
  def generate(filename)
    builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
      xml.FictionBook(xmlns: 'http://www.gribuser.ru/xml/fictionbook/2.0', 'xmlns:l' => 'http://www.w3.org/1999/xlink') do
        xml.description do
          write_title_info(xml)
          write_document_info(xml)
        end
        write_body(xml)
      end
    end
    File.write(filename, builder.to_xml)
  end

  private

  # Emits <title-info>: author, book title, annotation and language.
  # `send` is used for element names that contain a hyphen.
  def write_title_info(xml)
    xml.send('title-info') do
      xml.author do
        xml.send 'first-name', 'John Charles'
        xml.send 'last-name', 'McCrae'
      end
      xml.send 'book-title', 'Ward'
      xml.annotation do
        xml.p 'The unwritten rules that govern the fights and outright wars between ‘capes’ have been amended: everyone gets their second chance. It’s an uneasy thing to come to terms with when notorious supervillains and even monsters are playing at being hero. The world ended two years ago, and as humanity straddles the old world and the new, there aren’t records, witnesses, or facilities to answer the villains’ past actions in the present. One of many compromises, uneasy truces and deceptions that are starting to splinter as humanity rebuilds.'
        xml.p 'None feel the injustice of this new status quo or the lack of established footing more than the past residents of the parahuman asylums. The facilities hosted parahumans and their victims, but the facilities are ruined or gone; one of many fragile ex-patients is left to find a place in a fractured world. She’s perhaps the person least suited to have anything to do with this tenuous peace or to stand alongside these false heroes. She’s put in a position to make the decision: will she compromise to help forge what they call, with dark sentiment, a second golden age? Or will she stand tall as a gilded dark age dawns?'
      end
      xml.lang 'en'
    end
  end

  # Emits <document-info>: converter nickname, tooling, date, id and source.
  def write_document_info(xml)
    # Read the clock once so the element text and its value attribute
    # cannot disagree when the run crosses midnight.
    today = Date.today
    xml.send('document-info') do
      xml.author do
        xml.nickname 'Cuterebra'
      end
      xml.send 'program-used', 'Ruby, Faraday, Nokogiri'
      xml.date(today, value: today)
      xml.id 'fbab61de-f08a-4763-88f8-7b3808e153d0'
      xml.send 'src-url', BASE
    end
  end

  # Emits <body> with one <section> per page, in the order given.
  def write_body(xml)
    xml.body do
      @pages.each do |page|
        xml.section do |section|
          section.title page.title
          # `<<` appends the page content as raw markup, not as escaped text.
          section << page.content
        end
      end
    end
  end
end
# Collects the page URLs listed on the site's front page.
#
# Reads the anchors under `aside #nav_menu-5` and `aside #nav_menu-6`,
# removes duplicates and drops every URL containing 'category/story'.
#
# @return [Array<String>] URLs in the order they were found
# @raise [RuntimeError] when the front page request is not successful
def contents
  res = Faraday.get(BASE)
  # Fail loudly instead of parsing an error page into an empty book.
  raise "Get #{BASE} failed: #{res.status}" unless res.success?

  doc = Nokogiri::HTML(res.body)
  menus = ['aside #nav_menu-5', 'aside #nav_menu-6']
  found = menus.flat_map { |selector| links(doc.css(selector)) }
  # compact: an anchor without href would otherwise break include? below.
  found.compact.uniq.reject { |url| url.include?('category/story') }
end
# Returns the href value of every <a> found under +node+.
#
# Anchors that carry no href attribute are skipped, so callers never see nil.
#
# @param node [#css] node or node set to search
# @return [Array<String>] href values in document order
def links(node)
  node.css('a').map { |a| a.attr(:href) }.compact
end
# Runs the whole pipeline: list the page URLs, load each page, and write the
# resulting book to +filename+, printing a progress line for every step.
#
# @param filename [String] path of the book file to write
# @return [Object] whatever Ward::Book#generate returns
def create_book(filename)
  $stdout.print('Getting contents...', $/)
  loader = Loader.new
  pages = contents.map do |url|
    $stdout.print("Reading #{url}", $/)
    loader.get(url)
  end
  $stdout.print('Generating book', $/)
  Ward::Book.new(pages).generate(filename)
end
end
# Script entry point: build the book and write it to ward.fb2.
ward = Ward.new
ward.create_book('ward.fb2')`
Fixed some issues
`require 'nokogiri'
require 'faraday'
require 'date'
require 'digest'
require 'fileutils'
class Ward
BASE = 'https://www.parahumans.net'
# NOTE(review): Page has no body in this listing; create_book below does not
# use it directly, but the listing was presumably abbreviated - confirm.
class Page
end
# NOTE(review): Loader has no body in this listing, yet create_book below
# calls Loader#get; the definition was presumably elided - confirm.
class Loader
end
# NOTE(review): Book has no body in this listing, yet create_book below calls
# Book.new(pages) and #generate; the definition was presumably elided - confirm.
class Book
end
# Collects the page URLs listed on the site's front page.
#
# Reads the anchors under `aside #nav_menu-5` and `aside #nav_menu-6`,
# removes duplicates and drops every URL containing 'category/story'.
#
# @return [Array<String>] URLs in the order they were found
# @raise [RuntimeError] when the front page request is not successful
def contents
  res = Faraday.get(BASE)
  # Fail loudly instead of parsing an error page into an empty book.
  raise "Get #{BASE} failed: #{res.status}" unless res.success?

  doc = Nokogiri::HTML(res.body)
  menus = ['aside #nav_menu-5', 'aside #nav_menu-6']
  found = menus.flat_map { |selector| links(doc.css(selector)) }
  # compact: an anchor without href would otherwise break include? below.
  found.compact.uniq.reject { |url| url.include?('category/story') }
end
# Returns the href value of every <a> found under +node+.
#
# Anchors that carry no href attribute are skipped, so callers never see nil.
#
# @param node [#css] node or node set to search
# @return [Array<String>] href values in document order
def links(node)
  node.css('a').map { |a| a.attr(:href) }.compact
end
# Runs the whole pipeline: list the page URLs, load each page, and write the
# resulting book to +filename+, printing a progress line for every step.
#
# @param filename [String] path of the book file to write
# @return [Object] whatever Ward::Book#generate returns
def create_book(filename)
  $stdout.print('Getting contents...', $/)
  loader = Loader.new
  pages = contents.map do |url|
    $stdout.print("Reading #{url}", $/)
    loader.get(url)
  end
  $stdout.print('Generating book', $/)
  Ward::Book.new(pages).generate(filename)
end
end
# Script entry point: build the book and write it to ward.fb2.
Ward.new.create_book('ward.fb2')
`