Skip to content

Instantly share code, notes, and snippets.

View chumpblocckami's full-sized avatar
🎯
Focusing

Matteo Mazzola chumpblocckami

🎯
Focusing
View GitHub Profile
@chumpblocckami
chumpblocckami / gist:dc977f129bc504f96f1d6b4201c34ebf
Created November 6, 2024 21:24
Utility functions for statistical relevance
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest
def draw_boxplot(df: pd.DataFrame) -> None:
    """Reshape the index columns of *df* to long format and print the result.

    The ``BARI``, ``MIRI`` and ``ORPI`` columns are unpivoted into two
    columns: ``Index`` (the original column name) and ``Value`` (the cell
    value). The ``checkpoints`` and ``label`` columns are kept as identifier
    columns and repeated for every unpivoted row.

    NOTE(review): despite the name, the visible body only melts and prints;
    no plotting call is present here. The source may be a truncated
    preview -- confirm against the full gist.

    Args:
        df: Frame containing at least the columns ``checkpoints``,
            ``label``, ``BARI``, ``MIRI`` and ``ORPI``.

    Raises:
        KeyError: Raised by ``DataFrame.melt`` when one of the expected
            columns is missing from *df*.
    """
    # Wide -> long: one row per (checkpoints, label, index-name) combination.
    df_melted = df.melt(
        id_vars=["checkpoints", "label"],
        value_vars=["BARI", "MIRI", "ORPI"],
        var_name="Index",
        value_name="Value",
    )
    print(df_melted)
import nltk
from nltk import *
import random
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# Root directory of the corpus; passed to the reader below.
file = "dataset"

# Reader over every ``.txt`` file under ``file``. Each file's category is
# taken from the first group of ``cat_pattern`` matched against its file id.
collCategorized = CategorizedPlaintextCorpusReader(
    file,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding="utf8",
)

# One (word list, category) pair per file, iterating categories first and
# then the file ids belonging to each category.
documents = [
    (list(collCategorized.words(fileid)), category)
    for category in collCategorized.categories()
    for fileid in collCategorized.fileids(category)
]

# Shuffle in place so the pairs are no longer grouped by category.
random.shuffle(documents)