Precision: 0.84776 ± 0.00871
Recall: 0.83987 ± 0.00888
| testing_corpus | precision | recall |
|---|---|---|
| kaz.cleaned_0 | 0.838899 | 0.829193 |
| kaz.cleaned_1 | 0.841818 | 0.834286 |
| kaz.cleaned_2 | 0.860011 | 0.84795 |
```python
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "AMR-KELEG/Sentence-ALDi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def compute_score(sentence):
    """Returns a normalized divergence 'distance' score from MSA in [0, 1]"""
    # Warning -- inputs longer than 512 subtokens are truncated

import re
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

DIALECTS = [
    "Algeria",
    "Bahrain",
    "Egypt",