Skip to content

Instantly share code, notes, and snippets.

@KeithCu
Last active March 22, 2019 02:45
Show Gist options
  • Select an option

  • Save KeithCu/29f6a2131a7dcad7de9a270c9d11b62c to your computer and use it in GitHub Desktop.

Select an option

Save KeithCu/29f6a2131a7dcad7de9a270c9d11b62c to your computer and use it in GitHub Desktop.
from functools import lru_cache
import mwclient
from mwclient import Site
# User-Agent string identifying this tool; it is handed to mwclient as
# `clients_useragent` when Wiktionary() creates its Site object.
ua = 'LibreOffice Wiktionary/0.1 run by User:xxy'
#This API returns Mediawiki markup
@lru_cache(maxsize = 10)
def Wiktionary(lang, word):
    """Return the raw MediaWiki markup of the Wiktionary page titled *word*.

    The page is fetched through mwclient from en.wiktionary.org.

    NOTE(review): `lang` is not used to select the wiki -- the host is always
    en.wiktionary.org. It only takes part in the lru_cache key. Confirm
    whether that is intentional.

    Results for the 10 most recently used (lang, word) pairs are memoised.
    """
    wiki = mwclient.Site(host='en.wiktionary.org', clients_useragent=ua)
    return wiki.pages[word].text()
import requests
#TODO: Convert to work with LibreOffice langids?
#Is this information exposed to UNO / Python?
#https://docs.libreoffice.org/setup_native/html/sellang_8cxx_source.html

# English language names whose Wiktionary subdomain (the ISO 639-1 code) is
# NOT simply the first two letters of the name. Without these entries the
# prefix heuristic would produce a non-existent host ("ge", "sp") or, worse,
# silently point at a different language ("esperanto" -> "es" = Spanish).
_WIKTIONARY_LANG_CODES = {
    "albanian": "sq",
    "armenian": "hy",
    "basque": "eu",
    "bengali": "bn",
    "bulgarian": "bg",
    "chinese": "zh",
    "croatian": "hr",
    "czech": "cs",
    "dutch": "nl",
    "esperanto": "eo",
    "estonian": "et",
    "galician": "gl",
    "georgian": "ka",
    "german": "de",
    "greek": "el",
    "icelandic": "is",
    "indonesian": "id",
    "irish": "ga",
    "latvian": "lv",
    "lithuanian": "lt",
    "macedonian": "mk",
    "malay": "ms",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "serbian": "sr",
    "slovak": "sk",
    "spanish": "es",
    "swedish": "sv",
    "tagalog": "tl",
    "turkish": "tr",
    "welsh": "cy",
}


def lang_to_url(lang, api):
    """Build the base URL of the Wiktionary edition for a language name.

    Args:
        lang: English name of the language (e.g. "english", "french").
            Matching is case-insensitive and ignores surrounding whitespace.
            A two-letter code such as "fr" is passed through unchanged.
        api: When true, append the MediaWiki API endpoint ("/w/api.php").

    Returns:
        "https://<code>.wiktionary.org", optionally followed by "/w/api.php".
    """
    name = lang.strip().lower()
    # The first two letters usually work (french -> fr, english -> en);
    # the table covers the known names for which they do not.
    code = _WIKTIONARY_LANG_CODES.get(name, name[:2])
    url = "https://" + code + ".wiktionary.org"
    if api:
        url += "/w/api.php"
    return url
#This API returns HTML.
@lru_cache(maxsize = 10)
def Wiktionary2(lang, word):
    """Return the filtered HTML of the Wiktionary entry for *word*.

    The rendered page is requested from the MediaWiki "parse" API of the
    Wiktionary edition selected by lang_to_url(lang, True), then passed
    through BS4Filter(). Raw HTML is cached on disk under
    /tmp/wiktionary<lang><word>.html so repeated debugging runs do not hit
    the network; the filtered result is also written next to it as
    /tmp/wiktionary<lang><word>filtered.html.

    Args:
        lang: Language name understood by lang_to_url().
        word: Page title to look up.

    Returns:
        The filtered HTML as a string.

    Raises:
        requests.HTTPError: the API answered with an HTTP error status.
        KeyError: the API response has no 'parse' result (e.g. the page
            does not exist).
    """
    #A persistent cache for debugging purposes to save fetches.
    cache_base = "/tmp/wiktionary" + lang + word
    try:
        # Explicit UTF-8: Wiktionary HTML is full of non-ASCII text and the
        # locale default encoding cannot be relied on.
        with open(cache_base + ".html", mode="r", encoding="utf-8") as fin:
            text = fin.read()
    except FileNotFoundError:
        wiki_url = lang_to_url(lang, True)
        # Setting "action" to "mobileview" instead retrieves the mobile
        # view, but that still returns just as many sections by default.
        p = {
            "action": "parse",
            "page": word,
            "format": "json",
            # "disablelimitreport" : True,
            "disableeditsection": True,
            "disabletoc": True,
            #"prop" : "text|categories|templates|images|externallinks|sections|properties|parsewarnings"
        }
        with requests.Session() as session:
            r = session.get(url=wiki_url, params=p)
            r.raise_for_status()
            data = r.json()

        if 'parse' not in data:
            # Keep KeyError (what the bare lookup raised before) but say why.
            raise KeyError(
                "Wiktionary API returned no 'parse' result for %r: %r"
                % (word, data.get('error'))
            )
        text = data['parse']['text']['*']

        with open(cache_base + ".html", mode="w", encoding="utf-8") as fout:
            fout.write(text)

    string = BS4Filter(lang, text)

    with open(cache_base + "filtered.html", mode="w", encoding="utf-8") as fout:
        fout.write(string)

    return string
from bs4 import BeautifulSoup
def BS4Filter(lang, text):
    """Clean up Wiktionary HTML and return it pretty-printed.

    Steps, in order:
      1. Rewrite site-relative links ("/wiki/...") so they point at the full
         Wiktionary URL for *lang*; protocol-relative links ("//host/...")
         only get an "https:" scheme, since they already name their host.
      2. Remove the <span> whose id is lang.title() and the <span> whose id
         is "Etymology" (presumably the section headings).
      3. Remove every <table class="translations">.
      4. Remove every <span class="mw-headline">.

    Args:
        lang: Language name understood by lang_to_url().
        text: HTML markup to filter.

    Returns:
        The filtered document as produced by BeautifulSoup.prettify().
    """
    soup = BeautifulSoup(text, 'lxml')
    lang_url = lang_to_url(lang, False)

    #Fix local links to point to full Wiktionary URL
    #Could find no way to do this via the API.
    for link in soup.find_all("a"):
        href = link.get('href')
        if not href:
            continue
        if href.startswith("//"):
            # Protocol-relative URL: prefixing the wiki host would yield a
            # broken "https://xx.wiktionary.org//other.host/..." link.
            link["href"] = "https:" + href
        elif href.startswith("/"):
            link["href"] = lang_url + href

    for span_id in (lang.title(), "Etymology"):
        tag = soup.find("span", id=span_id)
        if tag is not None:
            tag.decompose()

    for tag in soup.find_all("table", {"class": "translations"}):
        tag.decompose()

    for tag in soup.find_all("span", {"class": "mw-headline"}):
        tag.decompose()

    return soup.prettify()
from googletrans import Translator
# Module-level googletrans client, created once and reused by GTranslate().
g_trans = Translator()


def GTranslate(src, dest_lang):
    """Translate *src* into *dest_lang* via googletrans and return the text."""
    translation = g_trans.translate(src, dest=dest_lang)
    return translation.text
import requests, json
def AzureTranslate(src, dest_lang, subscription_key='put_key_here'):
    """Translate *src* into *dest_lang* with the Azure Translator v3 API.

    Args:
        src: Text to translate.
        dest_lang: Target language code (e.g. "ja", "fr").
        subscription_key: Azure subscription key sent in the
            'Ocp-Apim-Subscription-Key' header. The default is a
            placeholder and must be replaced with a real key.

    Returns:
        The translated text of the first translation in the response.

    Raises:
        requests.HTTPError: the service answered with an HTTP error status
            (for instance when the key is missing or invalid).
    """
    endpoint = 'https://api.cognitive.microsofttranslator.com/translate'
    # Let requests assemble the query string so that dest_lang is properly
    # URL-encoded instead of being concatenated in verbatim.
    query = {'api-version': '3.0', 'to': dest_lang}
    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Content-type': 'application/json',
    }
    body = [{'text': src}]

    response = requests.post(endpoint, params=query, headers=headers, json=body)
    # Fail with the HTTP error itself rather than an opaque KeyError/IndexError
    # when the service returns an error payload instead of translations.
    response.raise_for_status()

    return response.json()[0]["translations"][0]["text"]
from timeit import default_timer as timer
def HTMLFilterSpeedTest(lang, word, iterations=100):
    """Time BS4Filter() on a cached page and print the achieved rate.

    Reads /tmp/wiktionary<lang><word>.html (the raw-HTML cache file written
    by Wiktionary2), runs BS4Filter on it *iterations* times and prints how
    many runs per second were completed.

    Args:
        lang: Language name, used for the cache file name and the filter.
        word: Page title, used for the cache file name.
        iterations: Number of filter runs to time (default 100).

    Raises:
        FileNotFoundError: the cache file for (lang, word) does not exist.
    """
    cache_path = "/tmp/wiktionary" + lang + word + ".html"
    with open(cache_path, mode="r", encoding="utf-8") as fin:
        text = fin.read()

    start = timer()
    for _ in range(iterations):
        BS4Filter(lang, text)
    elapsed = timer() - start

    print ("Can complete: %f per second." % (iterations / elapsed))
# Demo run: look up "compression" in two Wiktionary editions, time the HTML
# filter on the cached English page, then translate a sample sentence into
# Japanese and French with each translation backend.
for demo_lang in ("english", "french"):
    res = Wiktionary2(demo_lang, "compression")
    print (res)

HTMLFilterSpeedTest("english", "compression")

for translate, sample in (
    (GTranslate, "This is a Google translation test."),
    (AzureTranslate, "This is an Azure translation test."),
):
    for target in ("ja", "fr"):
        res = translate(sample, target)
        print (res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment