Skip to content

Instantly share code, notes, and snippets.

@idshklein
Created October 27, 2025 19:39
Show Gist options
  • Select an option

  • Save idshklein/de006300813ca3ee91680b1b623a4f9e to your computer and use it in GitHub Desktop.

Select an option

Save idshklein/de006300813ca3ee91680b1b623a4f9e to your computer and use it in GitHub Desktop.
pacman::p_load(plotly,RColorBrewer,httr,jsonlite,stringi,tidyverse,word2vec,lsa,pheatmap,ggfortify,gginnards)
# Fetch one chapter of Hebrew text from the Sefaria texts API.
#
# Args:
#   book:    book name as used in Sefaria references (e.g. "Genesis").
#   chapter: chapter number within `book`.
#
# Returns:
#   A data frame with columns `book`, `chapter`, `verse` (1-based position
#   within the chapter) and `text` (the Hebrew verse text), or NULL when the
#   request fails or the response carries no Hebrew verses.
get_chapter <- function(book, chapter) {
  url <- sprintf("https://www.sefaria.org/api/texts/%s.%s?lang=he", book, chapter)
  resp <- GET(url)

  # A failed request has no usable JSON body; report it and skip the chapter
  # instead of letting the parser abort the whole download.
  if (http_error(resp)) {
    warning(
      sprintf("Sefaria request failed for %s %s (HTTP %s)",
              book, chapter, status_code(resp)),
      call. = FALSE
    )
    return(NULL)
  }

  res <- fromJSON(content(resp, "text", encoding = "UTF-8"))

  # Guard on the field that is actually used below, so the emptiness check
  # and the data frame construction can never disagree.
  verses <- res$he
  if (length(verses) == 0) {
    return(NULL)
  }

  data.frame(
    book = book,
    chapter = chapter,
    verse = seq_along(verses),
    text = verses,
    stringsAsFactors = FALSE
  )
}
# Download the Hebrew text of the five books listed below, requesting
# chapters 1-50 of each. Requests for which get_chapter() returns NULL
# contribute no rows once `res` is unnested.
books <- c("Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy")
bible <- expand_grid(books, ch = 1:50) %>%
  mutate(res = map2(books, ch, get_chapter, .progress = TRUE))
# Expand the per-chapter data frames into one row per verse.
bible <- bible %>%
  unnest(res)
# Reduce Hebrew text to bare letters separated by single spaces.
#
# Args:
#   txt: character vector of Hebrew text, possibly carrying niqqud,
#        cantillation marks, punctuation or non-Hebrew characters.
#
# Returns:
#   A character vector of the same length as `txt`: nonspacing marks removed,
#   hyphen / maqaf / sof pasuq turned into word breaks, every other
#   non-Hebrew, non-whitespace character deleted, runs of whitespace
#   collapsed to one space, and leading/trailing whitespace trimmed.
clean_hebrew <- function(txt) {
  # Decompose, drop every nonspacing mark (niqqud / cantillation), recompose.
  unmarked <- stringi::stri_trans_general(
    txt, "NFD; [:Nonspacing Mark:] Remove; NFC"
  )

  # Word separators become spaces BEFORE the script filter: the filter would
  # otherwise delete an ASCII hyphen outright and fuse the two words.
  # U+05BE is maqaf, U+05C3 is sof pasuq.
  separated <- gsub("[-\u05BE\u05C3]", " ", unmarked, perl = TRUE)

  # Keep only Hebrew-script characters and whitespace.
  hebrew_only <- gsub("[^\\p{Hebrew}\\s]", "", separated, perl = TRUE)

  # Collapse whitespace last, so spaces introduced above cannot leave
  # doubled spaces inside the string.
  trimws(gsub("\\s+", " ", hebrew_only))
}
# Add a cleaned copy of every verse alongside the raw text.
bible[["clean"]] <- clean_hebrew(bible[["text"]])

# Write the cleaned verses, one per line, to the file word2vec trains on.
writeLines(text = bible[["clean"]], con = "tanakh.txt")
model <- word2vec(
  x = "tanakh.txt",
  type = "skip-gram",
  dim = 200,
  window = 20,
  iter = 20
)

# Open the 10 nearest neighbours of the query word in the data viewer.
predict(model, newdata = "משה", type = "nearest", top_n = 10)[[1]] %>% View()

# Embedding matrix of the trained model; its row names are the vocabulary
# words that verse vectors are looked up by.
emb <- as.matrix(model)
# Represent a verse as the mean of the embeddings of its known words.
#
# Args:
#   txt:        a single string of whitespace-separated words.
#   embeddings: numeric matrix with one row per word and the words as row
#               names. Defaults to the script-level `emb` matrix, so
#               existing one-argument calls behave as before.
#
# Returns:
#   A numeric vector of length ncol(embeddings): the column means over the
#   distinct words of `txt` found in `embeddings`, or all zeros when none of
#   the words is known.
verse_vec <- function(txt, embeddings = emb) {
  tokens <- strsplit(txt, "\\s+")[[1]]

  # intersect() de-duplicates, so a word repeated in the verse counts once.
  known <- intersect(tokens, rownames(embeddings))
  if (length(known) == 0) {
    return(numeric(ncol(embeddings)))
  }

  # drop = FALSE keeps a one-word verse as a 1-row matrix for colMeans().
  colMeans(embeddings[known, , drop = FALSE])
}
# One embedding vector per verse, stored as a list column on `bible`.
bible$vec <- map(bible$clean, verse_vec)
#
# chapter_vecs <- bible |>
# group_by(book, chapter) |>
# summarise(vec = list(Reduce(`+`, vec) / length(vec)))
#
#
#
# X <- do.call(rbind, chapter_vecs$vec)
# rownames(X) <- paste(chapter_vecs$book, chapter_vecs$chapter)
#
#
#
# sim <- cosine(t(X)) # cosine similarity
# pheatmap(sim,cutree_cols = 13,cutree_rows = 13)
# pheatmap(sim,
# cluster_rows = FALSE,
# cluster_cols = TRUE, main = "דמיון סמנטי בין פרקים בתנ״ך",
# cutree_cols = 10)
# pheatmap(sim,
# cluster_rows = FALSE,
# cluster_cols = FALSE, main = "דמיון סמנטי בין פרקים בתנ״ך")
# Stack the per-verse vectors into one matrix (a row per verse) and label
# each row "<book> <chapter> <verse>".
X <- do.call("rbind", bible$vec)
rownames(X) <- paste(bible$book, bible$chapter, bible$verse, sep = " ")

# PCA of the verse vectors, coloured by chapter, one panel per book.
p <- prcomp(X) %>%
  autoplot(data = select(bible, -vec), colour = "chapter")
p <- p + facet_wrap(~book)

# Replace the default point layer with the verse numbers as text labels.
p <- delete_layers(p, "GeomPoint")
p + geom_text(aes(label = verse, color = chapter), size = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment