-
-
Save m040601/659996 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # One-off script I wrote to slurp sample sentences off http://smart.fm | |
| require 'rubygems' | |
| require 'open-uri' | |
| require 'hpricot' | |
# Normalizes the whitespace of a scraped HTML fragment.
#
# Every run of whitespace (spaces, tabs, newlines) is collapsed into a single
# space, then leading and trailing whitespace is removed. Markup such as <b>
# tags is kept; for plain human-readable output it could additionally be
# removed with str.gsub(/<\/?b>/, "").
#
# str - the String to clean up.
#
# Returns a new String; the argument itself is not modified.
def cleanup_text(str)
  # The call is parenthesized on purpose: `gsub /re/, x` makes Ruby emit an
  # "ambiguous first argument" warning because the slash may also be parsed
  # as a division operator.
  str.gsub(/\s+/, " ").strip
end
# Writes one sentence as a single row of a tab-separated Anki import file.
#
# The row has two columns: the original sentence and its explanation.
# Line breaks inside a column become "<br>" so the row stays on one line,
# and tabs inside a column become spaces so they cannot be mistaken for the
# column separator.
#
# (For a plain human-readable dump instead of Anki input, write
# original + "\n\n" + explanation + "\n\n".)
#
# savefile    - an IO-like object responding to #puts.
# original    - String for the first column.
# explanation - String for the second column.
def save_sentence(savefile, original, explanation)
  front, back = [original, explanation].map do |field|
    # Handle \r\n and bare \r as well as \n, otherwise a stray carriage
    # return would survive inside the field.
    field.gsub(/\r\n?|\n/, "<br>").gsub("\t", " ")
  end
  # puts appends the terminating newline of the row.
  savefile.puts "#{front}\t#{back}"
end
# Downloads one page of a sentence listing and saves every sentence on it.
#
# The page is fetched from source_url with "?page=<page_num>" appended and
# parsed with Hpricot. For each "li.sentence_package" item the sentence text
# (with all whitespace removed) becomes the first column, and the
# transliteration and translation, separated by a newline, become the second.
#
# source_url - String base URL of the sentence listing.
# savefile   - IO-like object handed on to save_sentence.
# page_num   - page number to request.
def download_one_page(source_url, savefile, page_num)
  # NOTE(review): opening a URL through Kernel#open relies on open-uri
  # behaviour that was removed in Ruby 3.0; use URI.open when running there.
  doc = Hpricot(open(source_url + "?page=" + page_num.to_s))

  (doc/"li.sentence_package").each do |sentence|
    text_link = (sentence/"p.text > a").first
    if text_link.nil?
      # Without this guard a single item lacking the text link would raise
      # NoMethodError and abort the whole download.
      warn "Skipping a sentence without text on page #{page_num} of #{source_url}"
      next
    end

    # The sentence text is stored without any whitespace at all.
    original = cleanup_text(text_link.inner_html).gsub(/\s/, "")
    transliteration = cleanup_text((sentence/"p.transliteration").inner_html)
    translation = cleanup_text((sentence/"p.translation > a").inner_html)

    save_sentence(savefile, original, transliteration + "\n" + translation)
  end
end
# Downloads all sentence pages of one list into "<list_title>.txt".
#
# The number of pages is read from the last link inside "div.pagination" on
# the first listing page; each page is then handed to download_one_page.
#
# list_num   - numeric id of the list (used to build the URL).
# list_title - String used as the output file name, without the extension.
def download_sentences(list_num, list_title)
  source_url = "http://smart.fm/lists/" + list_num.to_s + "/sentences"
  output_path = list_title + ".txt"
  puts "Saving " + source_url + " to " + output_path

  # File.open instead of Kernel#open: the title originates from a remote
  # page, and Kernel#open would spawn a subprocess for a name starting with
  # "|". The block form also closes the file when a download raises.
  File.open(output_path, "w") do |savefile|
    # NOTE(review): opening a URL through Kernel#open relies on open-uri
    # behaviour that was removed in Ruby 3.0; use URI.open when running there.
    doc = Hpricot(open(source_url))
    last_page = (doc/"div.pagination > a:nth-last-of-type(1)").inner_html.to_i

    # When no pagination link is found the selection is empty and to_i
    # yields 0, which used to skip the list entirely; always fetch page 1.
    num_pages = [last_page, 1].max

    1.upto(num_pages) do |page_num|
      download_one_page(source_url, savefile, page_num)
    end
  end
end
# Downloads the sentences of every list linked from a series page.
#
# The series page is parsed with Hpricot; each "div.list-det > h4 > a" link
# whose href looks like "http://smart.fm/lists/<number>-<title>" is passed to
# download_sentences with the number and the title.
#
# series_num - numeric id of the series.
def download_series(series_num)
  # NOTE(review): opening a URL through Kernel#open relies on open-uri
  # behaviour that was removed in Ruby 3.0; use URI.open when running there.
  doc = Hpricot(open("http://smart.fm/series/" + series_num.to_s))

  (doc/"div.list-det > h4 > a").each do |element|
    href = element.attributes['href']
    list_link = /http:\/\/smart\.fm\/lists\/(\d+)-(.*)/.match(href.to_s)

    unless list_link
      # Previously a non-matching href left the captures nil, which made
      # download_sentences fail with a TypeError.
      warn "Skipping link that is not a list URL: #{href.inspect}"
      next
    end

    download_sentences(list_link[1], list_link[2])
  end
end
# Script entry point: download the sentences of series 3318 and 3321, in
# that order.
[3318, 3321].each { |series_num| download_series(series_num) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment