Last active
March 22, 2019 02:45
-
-
Save KeithCu/29f6a2131a7dcad7de9a270c9d11b62c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from functools import lru_cache | |
| import mwclient | |
| from mwclient import Site | |
# User-Agent string passed to mwclient (as clients_useragent) by Wiktionary().
ua = "LibreOffice Wiktionary/0.1 run by User:xxy"
# This API returns MediaWiki markup.
@lru_cache(maxsize=10)
def Wiktionary(lang, word):
    """Fetch the text of a Wiktionary page through mwclient.

    Args:
        lang: Language name (e.g. "english", "french"). It selects which
            language edition of Wiktionary is contacted, using the same
            name-to-subdomain rule as ``lang_to_url``.
        word: Title of the page to look up.

    Returns:
        The value of ``page.text()`` for the requested title.

    The ten most recent (lang, word) results are memoised by ``lru_cache``.
    """
    # Function-scope import so this block carries its own dependency.
    from urllib.parse import urlsplit

    # mwclient wants a bare host name, so strip the scheme from the site URL.
    # Deriving the host from ``lang`` keeps the cache key (lang, word) and the
    # edition actually queried in agreement; a fixed en.wiktionary.org host
    # would return the English edition for every language.
    host = urlsplit(lang_to_url(lang, False)).netloc
    site = mwclient.Site(host=host, clients_useragent=ua)
    return site.pages[word].text()
| import requests | |
| #TODO: Convert to work with LibreOffice langids? | |
| #Is this information exposed to UNO / Python? | |
| #https://docs.libreoffice.org/setup_native/html/sellang_8cxx_source.html | |
# Language names whose Wiktionary subdomain is NOT the first two letters of
# the English name.  Everything else falls back to the two-letter prefix.
_LANG_CODE_OVERRIDES = {
    "bulgarian": "bg",
    "chinese": "zh",
    "croatian": "hr",
    "czech": "cs",
    "dutch": "nl",
    "estonian": "et",
    "german": "de",
    "greek": "el",
    "icelandic": "is",
    "indonesian": "id",
    "latvian": "lv",
    "lithuanian": "lt",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "serbian": "sr",
    "slovak": "sk",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",
}


def lang_to_url(lang, api):
    """Build the base URL of the Wiktionary edition for a language name.

    Args:
        lang: English language name such as "english" or "french".
        api: When truthy, return the ``/w/api.php`` endpoint instead of the
            bare site root.

    Returns:
        A string of the form ``https://<code>.wiktionary.org`` with
        ``/w/api.php`` appended when ``api`` is truthy.
    """
    # The two-letter prefix usually works (french -> fr, english -> en); the
    # override table covers names where it does not (german -> de, ...).
    # The lookup is case-insensitive; the fallback keeps the caller's text.
    wiki_url_lang = _LANG_CODE_OVERRIDES.get(lang.lower(), lang[0:2])

    url = "https://" + wiki_url_lang + ".wiktionary.org"
    if api:
        url += "/w/api.php"
    return url
# This API returns HTML.
@lru_cache(maxsize=10)
def Wiktionary2(lang, word):
    """Return the filtered HTML of a Wiktionary entry.

    The raw HTML is read from ``/tmp/wiktionary<lang><word>.html`` when that
    file exists; otherwise it is fetched with the MediaWiki ``action=parse``
    API and written to that file.  The HTML is then passed through
    ``BS4Filter`` and the result is also written to
    ``/tmp/wiktionary<lang><word>filtered.html``.

    Args:
        lang: Language name, forwarded to ``lang_to_url`` and ``BS4Filter``.
        word: Page title to fetch.

    Returns:
        The string produced by ``BS4Filter``.

    Raises:
        KeyError: If the JSON reply has no ``parse`` / ``text`` / ``*`` entry.

    The ten most recent (lang, word) results are memoised by ``lru_cache``.
    """
    # NOTE(review): lang and word are concatenated into the path unescaped;
    # a word containing "/" would point outside /tmp - confirm inputs.
    cache_prefix = "/tmp/wiktionary" + lang + word

    # A persistent cache for debugging purposes to save fetches.
    try:
        with open(cache_prefix + ".html", mode="r") as fin:
            text = fin.read()
    except FileNotFoundError:
        p = {
            "action": "parse",
            "page": word,
            "format": "json",
            # "disablelimitreport" : True,
            "disableeditsection": True,
            "disabletoc": True,
            # "prop" : "text|categories|templates|images|externallinks|sections|properties|parsewarnings"
        }
        # Setting p['action'] = "mobileview" retrieves the mobile view
        # instead; it still returns just as many sections by default.

        # The context manager releases the session's connections afterwards.
        with requests.Session() as session:
            r = session.get(url=lang_to_url(lang, True), params=p)
            data = r.json()
        text = data['parse']['text']['*']

        with open(cache_prefix + ".html", mode="w") as fout:
            fout.write(text)

    string = BS4Filter(lang, text)

    # Keep the filtered output next to the raw file for inspection.
    with open(cache_prefix + "filtered.html", mode="w") as fout:
        fout.write(string)

    return string
| from bs4 import BeautifulSoup | |
def BS4Filter(lang, text):
    """Clean up Wiktionary HTML for display outside the wiki.

    Steps, in order:
      1. Rewrite site-relative links so they point at the full Wiktionary URL.
      2. Remove the ``span`` whose id is the title-cased language name and
         the ``span`` whose id is ``Etymology``.
      3. Remove every ``table`` with class ``translations``.
      4. Remove every ``span`` with class ``mw-headline``.

    Args:
        lang: Language name; used for the site URL and the heading id.
        text: HTML markup to filter.

    Returns:
        The filtered document as produced by ``soup.prettify()``.
    """
    soup = BeautifulSoup(text, 'lxml')
    lang_url = lang_to_url(lang, False)

    # Fix local links to point to full Wiktionary URL.
    # Could find no way to do this via the API.
    for link in soup.find_all("a"):
        href = link.get('href')
        if not href or not href.startswith("/"):
            continue
        if href.startswith("//"):
            # Protocol-relative link to another host: only the scheme is
            # missing.  Prefixing the site URL here would yield a broken
            # "https://xx.wiktionary.org//other.host/..." address.
            link["href"] = "https:" + href
        else:
            link["href"] = lang_url + href

    # Drop the language heading and the Etymology heading, if present.
    for heading_id in (lang.title(), "Etymology"):
        tag = soup.find("span", id=heading_id)
        if tag is not None:
            tag.decompose()

    for tag in soup.find_all("table", {"class": "translations"}):
        tag.decompose()

    for tag in soup.find_all("span", {"class": "mw-headline"}):
        tag.decompose()

    return soup.prettify()
| from googletrans import Translator | |
# One googletrans client, created at import time and reused by GTranslate.
g_trans = Translator()


def GTranslate(src, dest_lang):
    """Translate ``src`` into ``dest_lang`` with the shared googletrans client.

    Returns the ``text`` attribute of the object that
    ``g_trans.translate`` hands back.
    """
    translation = g_trans.translate(src, dest=dest_lang)
    return translation.text
| import requests, json | |
def AzureTranslate(src, dest_lang):
    """Translate ``src`` into ``dest_lang`` with the Microsoft Translator v3 API.

    Args:
        src: Text to translate.
        dest_lang: Target language code, sent as the ``to`` query parameter.

    Returns:
        The ``text`` of the first translation of the first item in the reply.

    Raises:
        requests.HTTPError: If the service answers with an error status
            (for example while the placeholder subscription key is in use).
    """
    subscriptionKey = 'put_key_here'
    endpoint = 'https://api.cognitive.microsofttranslator.com/translate'

    # Let requests build the query string so dest_lang is URL-encoded rather
    # than spliced into the URL verbatim.
    query = {'api-version': '3.0', 'to': dest_lang}
    headers = {
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Content-type': 'application/json',
    }
    body = [{'text': src}]

    response = requests.post(endpoint, params=query, headers=headers, json=body)
    # An error reply is a JSON object, not a list; fail with the HTTP status
    # instead of an obscure "KeyError: 0" from the indexing below.
    response.raise_for_status()

    return response.json()[0]["translations"][0]["text"]
| from timeit import default_timer as timer | |
def HTMLFilterSpeedTest(lang, word, iterations=100):
    """Time ``BS4Filter`` on a cached page and print the achieved rate.

    Reads ``/tmp/wiktionary<lang><word>.html``, runs ``BS4Filter`` on its
    contents ``iterations`` times and prints how many runs complete per
    second.

    Args:
        lang: Language name; part of the cache file name and passed to
            ``BS4Filter``.
        word: Page title; part of the cache file name.
        iterations: Number of filter runs to time (default 100).

    Raises:
        FileNotFoundError: If the cache file does not exist.
    """
    with open("/tmp/wiktionary" + lang + word + ".html", mode="r") as fin:
        text = fin.read()

    start = timer()
    for _ in range(iterations):
        BS4Filter(lang, text)
    end = timer()

    print("Can complete: %f per second." % (iterations / (end - start)))
# Smoke test executed at import time: fetch two entries, time the filter,
# then exercise both translation back ends.
for demo_lang in ("english", "french"):
    res = Wiktionary2(demo_lang, "compression")
    print(res)

HTMLFilterSpeedTest("english", "compression")

for target in ("ja", "fr"):
    res = GTranslate("This is a Google translation test.", target)
    print(res)

for target in ("ja", "fr"):
    res = AzureTranslate("This is an Azure translation test.", target)
    print(res)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment