KeithCu · March 22, 2019 02:45
diff --git a/Wiktionary.py b/Wiktionary.py
 from functools import lru_cache


 import mwclient
 from mwclient import Site

 ua = 'LibreOffice Wiktionary/0.1 run by User:xxy'
 # This variant returns MediaWiki markup (wikitext), not HTML.
 @lru_cache(maxsize = 10)
 def Wiktionary(lang, word):
     """Return the page text of the en.wiktionary.org entry titled ``word``.

     ``lang`` is accepted but not used by this function: the host is always
     en.wiktionary.org. Results are memoized per (lang, word) by lru_cache.
     """
     wiki = mwclient.Site(host='en.wiktionary.org', clients_useragent=ua)
     entry = wiki.pages[word]
     return entry.text()

 import requests

 #TODO: Convert to work with LibreOffice langids?
 #Is this information exposed to UNO / Python?
 #https://docs.libreoffice.org/setup_native/html/sellang_8cxx_source.html
 # Wiktionary subdomains for language names whose first two letters are NOT
 # the site code (e.g. "german"[0:2] is "ge", but the site is "de"; "sw" is
 # Swahili, not Swedish).
 _WIKI_LANG_CODES = {
     "albanian": "sq",
     "armenian": "hy",
     "basque": "eu",
     "bulgarian": "bg",
     "chinese": "zh",
     "croatian": "hr",
     "czech": "cs",
     "dutch": "nl",
     "estonian": "et",
     "german": "de",
     "greek": "el",
     "icelandic": "is",
     "indonesian": "id",
     "irish": "ga",
     "latvian": "lv",
     "lithuanian": "lt",
     "malay": "ms",
     "persian": "fa",
     "polish": "pl",
     "portuguese": "pt",
     "serbian": "sr",
     "slovak": "sk",
     "spanish": "es",
     "swedish": "sv",
     "turkish": "tr",
     "welsh": "cy",
 }

 def lang_to_url(lang, api):
     """Build the base URL of the Wiktionary site for an English language name.

     Args:
         lang: Language name such as "english" or "french" (case-insensitive).
         api: If true, append "/w/api.php" so the URL points at the MediaWiki
             API endpoint instead of the site root.

     Returns:
         "https://<code>.wiktionary.org", with "/w/api.php" appended when
         ``api`` is true.
     """
     name = lang.lower()

     # Names in the table get their explicit code; everything else falls back
     # to the first two letters, which usually works (french -> fr,
     # english -> en).
     code = _WIKI_LANG_CODES.get(name, name[0:2])

     url = "https://" + code + ".wiktionary.org"
     if api:
         url += "/w/api.php"
     return url

 #This API returns HTML.
 @lru_cache(maxsize = 10)
 def Wiktionary2(lang, word):
     """Return the filtered HTML of the Wiktionary entry for ``word``.

     The rendered HTML is read from a debugging cache file under /tmp when one
     exists. Otherwise it is fetched with the MediaWiki "parse" API action from
     the URL given by ``lang_to_url(lang, True)`` and written to that cache
     file. The HTML is then passed through ``BS4Filter``; the filtered result
     is written to a second file next to the cache and returned.

     Results are also memoized in-process per (lang, word) by lru_cache.

     Raises:
         KeyError: if the decoded API response lacks the 'parse'/'text'/'*'
             entries that are indexed below.
     """
     cache_base = "/tmp/wiktionary" + lang + word

     # A persistent cache for debugging purposes to save fetches.
     try:
         # Context managers close the handles; explicit UTF-8 keeps reads and
         # writes independent of the platform default encoding.
         with open(cache_base + ".html", mode = "r", encoding = "utf-8") as fin:
             text = fin.read()
     except FileNotFoundError:
         wiki_url = lang_to_url(lang, True)

         # Setting "action" to "mobileview" retrieves the mobile view instead;
         # it still returns just as many sections by default.
         p = { "action": "parse",
             "page": word,
             "format": "json",
             # "disablelimitreport" : True,
             "disableeditsection" : True,
             "disabletoc" : True,
             # "prop" : "text|categories|templates|images|externallinks|sections|properties|parsewarnings"
             }

         with requests.Session() as session:
             r = session.get(url = wiki_url, params = p)
             data = r.json()

         text = data['parse']['text']['*']

         with open(cache_base + ".html", mode = "w", encoding = "utf-8") as fout:
             fout.write(text)

     string = BS4Filter(lang, text)

     with open(cache_base + "filtered.html", mode = "w", encoding = "utf-8") as fout:
         fout.write(string)

     return string


 from bs4 import BeautifulSoup

 def BS4Filter(lang, text):
     """Clean up Wiktionary HTML and return it pretty-printed.

     Site-relative links (href starting with "/") are prefixed with the URL
     from ``lang_to_url(lang, False)``. The first <span> whose id is
     ``lang.title()``, the first <span> with id "Etymology", every <table>
     of class "translations" and every <span> of class "mw-headline" are
     removed from the document.
     """
     doc = BeautifulSoup(text, 'lxml')
     site_root = lang_to_url(lang, False)

     # Fix local links to point to the full Wiktionary URL; no way was found
     # to get this from the API.
     for anchor in doc.find_all("a"):
         target = anchor.get('href')
         if not target or not target.startswith("/"):
             continue
         anchor["href"] = site_root + target

     # Only the first match of each of these ids is dropped.
     for span_id in (lang.title(), "Etymology"):
         heading = doc.find("span", id = span_id)
         if heading is not None:
             heading.decompose()

     for table in doc.find_all("table", {"class" : "translations"}):
         table.decompose()

     for headline in doc.find_all("span", {"class" : "mw-headline"}):
         headline.decompose()

     return doc.prettify()

 from googletrans import Translator
 g_trans = Translator()

 def GTranslate(src, dest_lang):
     """Translate ``src`` into ``dest_lang`` using the shared ``g_trans`` translator.

     Returns the ``text`` attribute of the translation result.
     """
     translated = g_trans.translate(src, dest = dest_lang)
     return translated.text

 import requests, json

 def AzureTranslate(src, dest_lang):
     """Translate ``src`` into ``dest_lang`` with the Microsoft Translator REST API.

     Posts ``[{'text': src}]`` as JSON to the /translate endpoint with
     api-version 3.0 and returns the text of the first translation of the
     first result.

     The subscription key below is a placeholder and must be replaced.
     There is no error handling: indexing the decoded JSON fails if the
     service returns anything other than a list of translations.
     """
     subscriptionKey = 'put_key_here'

     url = 'https://api.cognitive.microsofttranslator.com/translate'
     # Let requests build the query string so dest_lang is percent-encoded
     # instead of being concatenated into the URL verbatim.
     query = { 'api-version': '3.0',
         'to': dest_lang }

     headers = { 'Ocp-Apim-Subscription-Key': subscriptionKey,
         'Content-type': 'application/json' }

     body = [{ 'text' : src }]

     response = requests.post(url, params = query, headers = headers, json = body)
     result = response.json()

     return result[0]["translations"][0]["text"]


 from timeit import default_timer as timer

 def HTMLFilterSpeedTest(lang, word, iterations = 100):
     """Time ``BS4Filter`` on a cached page and print the runs per second.

     Reads the cache file "/tmp/wiktionary<lang><word>.html", which must
     already exist (open() raises FileNotFoundError otherwise), runs
     ``BS4Filter(lang, text)`` ``iterations`` times and prints the rate.

     Args:
         lang: Language name, used for the cache path and passed to BS4Filter.
         word: Entry title, used for the cache path.
         iterations: Number of filter runs to time (default 100).
     """
     # The context manager closes the handle; explicit UTF-8 avoids depending
     # on the platform default encoding.
     with open("/tmp/wiktionary" + lang + word + ".html", mode = "r", encoding = "utf-8") as fin:
         text = fin.read()

     start = timer()

     for _ in range(iterations):
         BS4Filter(lang, text)

     end = timer()
     print ("Can complete: %f per second." % (iterations / (end - start)))

 # Smoke tests, run at import time: fetch and filter one entry in two
 # languages, time the filter, then try both translation back ends.
 for _lang in ("english", "french"):
     res = Wiktionary2(_lang, "compression")
     print (res)

 HTMLFilterSpeedTest("english", "compression")

 for _translate, _sample in ((GTranslate, "This is a Google translation test."),
                             (AzureTranslate, "This is an Azure translation test.")):
     for _dest in ("ja", "fr"):
         res = _translate(_sample, _dest)
         print (res)
	from functools import lru_cache


	import mwclient
	from mwclient import Site

	ua = 'LibreOffice Wiktionary/0.1 run by User:xxy'
	# This variant returns MediaWiki markup (wikitext), not HTML.
	@lru_cache(maxsize = 10)
	def Wiktionary(lang, word):
	    """Return the page text of the en.wiktionary.org entry titled ``word``.

	    ``lang`` is accepted but not used by this function: the host is always
	    en.wiktionary.org. Results are memoized per (lang, word) by lru_cache.
	    """
	    wiki = mwclient.Site(host='en.wiktionary.org', clients_useragent=ua)
	    entry = wiki.pages[word]
	    return entry.text()

	import requests

	#TODO: Convert to work with LibreOffice langids?
	#Is this information exposed to UNO / Python?
	#https://docs.libreoffice.org/setup_native/html/sellang_8cxx_source.html
	# Wiktionary subdomains for language names whose first two letters are NOT
	# the site code (e.g. "german"[0:2] is "ge", but the site is "de"; "sw" is
	# Swahili, not Swedish).
	_WIKI_LANG_CODES = {
	    "albanian": "sq",
	    "armenian": "hy",
	    "basque": "eu",
	    "bulgarian": "bg",
	    "chinese": "zh",
	    "croatian": "hr",
	    "czech": "cs",
	    "dutch": "nl",
	    "estonian": "et",
	    "german": "de",
	    "greek": "el",
	    "icelandic": "is",
	    "indonesian": "id",
	    "irish": "ga",
	    "latvian": "lv",
	    "lithuanian": "lt",
	    "malay": "ms",
	    "persian": "fa",
	    "polish": "pl",
	    "portuguese": "pt",
	    "serbian": "sr",
	    "slovak": "sk",
	    "spanish": "es",
	    "swedish": "sv",
	    "turkish": "tr",
	    "welsh": "cy",
	}

	def lang_to_url(lang, api):
	    """Build the base URL of the Wiktionary site for an English language name.

	    Args:
	        lang: Language name such as "english" or "french" (case-insensitive).
	        api: If true, append "/w/api.php" so the URL points at the MediaWiki
	            API endpoint instead of the site root.

	    Returns:
	        "https://<code>.wiktionary.org", with "/w/api.php" appended when
	        ``api`` is true.
	    """
	    name = lang.lower()

	    # Names in the table get their explicit code; everything else falls back
	    # to the first two letters, which usually works (french -> fr,
	    # english -> en).
	    code = _WIKI_LANG_CODES.get(name, name[0:2])

	    url = "https://" + code + ".wiktionary.org"
	    if api:
	        url += "/w/api.php"
	    return url

	#This API returns HTML.
	@lru_cache(maxsize = 10)
	def Wiktionary2(lang, word):
	    """Return the filtered HTML of the Wiktionary entry for ``word``.

	    The rendered HTML is read from a debugging cache file under /tmp when one
	    exists. Otherwise it is fetched with the MediaWiki "parse" API action from
	    the URL given by ``lang_to_url(lang, True)`` and written to that cache
	    file. The HTML is then passed through ``BS4Filter``; the filtered result
	    is written to a second file next to the cache and returned.

	    Results are also memoized in-process per (lang, word) by lru_cache.

	    Raises:
	        KeyError: if the decoded API response lacks the 'parse'/'text'/'*'
	            entries that are indexed below.
	    """
	    cache_base = "/tmp/wiktionary" + lang + word

	    # A persistent cache for debugging purposes to save fetches.
	    try:
	        # Context managers close the handles; explicit UTF-8 keeps reads and
	        # writes independent of the platform default encoding.
	        with open(cache_base + ".html", mode = "r", encoding = "utf-8") as fin:
	            text = fin.read()
	    except FileNotFoundError:
	        wiki_url = lang_to_url(lang, True)

	        # Setting "action" to "mobileview" retrieves the mobile view instead;
	        # it still returns just as many sections by default.
	        p = { "action": "parse",
	            "page": word,
	            "format": "json",
	            # "disablelimitreport" : True,
	            "disableeditsection" : True,
	            "disabletoc" : True,
	            # "prop" : "text|categories|templates|images|externallinks|sections|properties|parsewarnings"
	            }

	        with requests.Session() as session:
	            r = session.get(url = wiki_url, params = p)
	            data = r.json()

	        text = data['parse']['text']['*']

	        with open(cache_base + ".html", mode = "w", encoding = "utf-8") as fout:
	            fout.write(text)

	    string = BS4Filter(lang, text)

	    with open(cache_base + "filtered.html", mode = "w", encoding = "utf-8") as fout:
	        fout.write(string)

	    return string


	from bs4 import BeautifulSoup

	def BS4Filter(lang, text):
	    """Clean up Wiktionary HTML and return it pretty-printed.

	    Site-relative links (href starting with "/") are prefixed with the URL
	    from ``lang_to_url(lang, False)``. The first <span> whose id is
	    ``lang.title()``, the first <span> with id "Etymology", every <table>
	    of class "translations" and every <span> of class "mw-headline" are
	    removed from the document.
	    """
	    doc = BeautifulSoup(text, 'lxml')
	    site_root = lang_to_url(lang, False)

	    # Fix local links to point to the full Wiktionary URL; no way was found
	    # to get this from the API.
	    for anchor in doc.find_all("a"):
	        target = anchor.get('href')
	        if not target or not target.startswith("/"):
	            continue
	        anchor["href"] = site_root + target

	    # Only the first match of each of these ids is dropped.
	    for span_id in (lang.title(), "Etymology"):
	        heading = doc.find("span", id = span_id)
	        if heading is not None:
	            heading.decompose()

	    for table in doc.find_all("table", {"class" : "translations"}):
	        table.decompose()

	    for headline in doc.find_all("span", {"class" : "mw-headline"}):
	        headline.decompose()

	    return doc.prettify()

	from googletrans import Translator
	g_trans = Translator()

	def GTranslate(src, dest_lang):
	    """Translate ``src`` into ``dest_lang`` using the shared ``g_trans`` translator.

	    Returns the ``text`` attribute of the translation result.
	    """
	    translated = g_trans.translate(src, dest = dest_lang)
	    return translated.text

	import requests, json

	def AzureTranslate(src, dest_lang):
	    """Translate ``src`` into ``dest_lang`` with the Microsoft Translator REST API.

	    Posts ``[{'text': src}]`` as JSON to the /translate endpoint with
	    api-version 3.0 and returns the text of the first translation of the
	    first result.

	    The subscription key below is a placeholder and must be replaced.
	    There is no error handling: indexing the decoded JSON fails if the
	    service returns anything other than a list of translations.
	    """
	    subscriptionKey = 'put_key_here'

	    url = 'https://api.cognitive.microsofttranslator.com/translate'
	    # Let requests build the query string so dest_lang is percent-encoded
	    # instead of being concatenated into the URL verbatim.
	    query = { 'api-version': '3.0',
	        'to': dest_lang }

	    headers = { 'Ocp-Apim-Subscription-Key': subscriptionKey,
	        'Content-type': 'application/json' }

	    body = [{ 'text' : src }]

	    response = requests.post(url, params = query, headers = headers, json = body)
	    result = response.json()

	    return result[0]["translations"][0]["text"]


	from timeit import default_timer as timer

	def HTMLFilterSpeedTest(lang, word, iterations = 100):
	    """Time ``BS4Filter`` on a cached page and print the runs per second.

	    Reads the cache file "/tmp/wiktionary<lang><word>.html", which must
	    already exist (open() raises FileNotFoundError otherwise), runs
	    ``BS4Filter(lang, text)`` ``iterations`` times and prints the rate.

	    Args:
	        lang: Language name, used for the cache path and passed to BS4Filter.
	        word: Entry title, used for the cache path.
	        iterations: Number of filter runs to time (default 100).
	    """
	    # The context manager closes the handle; explicit UTF-8 avoids depending
	    # on the platform default encoding.
	    with open("/tmp/wiktionary" + lang + word + ".html", mode = "r", encoding = "utf-8") as fin:
	        text = fin.read()

	    start = timer()

	    for _ in range(iterations):
	        BS4Filter(lang, text)

	    end = timer()
	    print ("Can complete: %f per second." % (iterations / (end - start)))

	# Smoke tests, run at import time: fetch and filter one entry in two
	# languages, time the filter, then try both translation back ends.
	for _lang in ("english", "french"):
	    res = Wiktionary2(_lang, "compression")
	    print (res)

	HTMLFilterSpeedTest("english", "compression")

	for _translate, _sample in ((GTranslate, "This is a Google translation test."),
	                            (AzureTranslate, "This is an Azure translation test.")):
	    for _dest in ("ja", "fr"):
	        res = _translate(_sample, _dest)
	        print (res)
No results found