Created
October 27, 2025 19:39
-
-
Save idshklein/de006300813ca3ee91680b1b623a4f9e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pacman::p_load(plotly,RColorBrewer,httr,jsonlite,stringi,tidyverse,word2vec,lsa,pheatmap,ggfortify,gginnards) | |
#' Download one chapter of a book from the Sefaria texts API.
#'
#' @param book Sefaria book title, e.g. "Genesis".
#' @param chapter Chapter number within `book`.
#' @return A data frame with one row per verse and the columns `book`,
#'   `chapter`, `verse` and `text` (taken from the `he` field of the
#'   response), or `NULL` when the request fails or the response has no
#'   `he` content.
get_chapter <- function(book, chapter) {
  # Percent-encode the title so multi-word books yield a valid URL;
  # single-word titles such as "Genesis" are left unchanged.
  url <- sprintf(
    "https://www.sefaria.org/api/texts/%s.%s?lang=he",
    utils::URLencode(book, reserved = TRUE),
    chapter
  )
  resp <- GET(url)

  if (http_error(resp)) {
    # Callers may probe chapter numbers that do not exist, so a failed
    # request means "no data" rather than an error. Server-side failures are
    # still surfaced because they can silently leave gaps in the corpus.
    if (status_code(resp) >= 500) {
      warning(
        sprintf("Sefaria request failed (HTTP %d): %s", status_code(resp), url),
        call. = FALSE
      )
    }
    return(NULL)
  }

  res <- fromJSON(content(resp, "text", encoding = "UTF-8"))

  # Guard on the field that is actually used below. `[[` is used instead of
  # `$` because `$` partial-matches list names.
  verses <- res[["he"]]
  if (length(verses) == 0) {
    return(NULL)
  }

  data.frame(
    book = book,
    chapter = chapter,
    verse = seq_along(verses),
    text = verses,
    stringsAsFactors = FALSE
  )
}
# Download the five books listed below, chapter by chapter.
books <- c("Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy")
# Chapters 1-50 are requested for every book; get_chapter() returns NULL when
# a chapter has no text, and unnest() drops those rows.
bible <- expand_grid(books, ch = 1:50) %>%
  mutate(res = map2(books, ch, get_chapter, .progress = TRUE))
bible <- bible %>% unnest(res)
#' Reduce Hebrew text to bare, single-space-separated words.
#'
#' @param txt Character vector of Hebrew text (may contain niqqud,
#'   cantillation marks, punctuation or non-Hebrew characters).
#' @return Character vector of the same length as `txt`, containing only
#'   Hebrew-script characters separated by single spaces, with no leading or
#'   trailing whitespace.
clean_hebrew <- function(txt) {
  # Decompose, drop combining marks (niqqud / cantillation), recompose.
  out <- stringi::stri_trans_general(
    txt,
    "NFD; [:Nonspacing Mark:] Remove; NFC"
  )
  # Turn word separators into spaces BEFORE stripping non-Hebrew characters,
  # otherwise an ASCII hyphen is deleted and the two words are glued together.
  # Separators: "-" hyphen, U+05BE maqaf, U+05C0 paseq, U+05C3 sof pasuq.
  # Written as \u escapes to keep bidirectional characters out of the source.
  out <- gsub("[-\u05BE\u05C0\u05C3]", " ", out, perl = TRUE)
  # Keep only Hebrew-script characters and whitespace.
  out <- gsub("[^\\p{Hebrew}\\s]", "", out, perl = TRUE)
  # Collapse whitespace last, so spaces introduced above are normalized too.
  out <- gsub("\\s+", " ", out)
  trimws(out)
}
# Add a cleaned copy of every verse next to the raw text.
bible[["clean"]] <- clean_hebrew(bible[["text"]])

# Write the cleaned verses, one per line, as the word2vec training corpus.
corpus_path <- "tanakh.txt"
writeLines(bible[["clean"]], corpus_path)

# Train a skip-gram model on the corpus file.
model <- word2vec(
  corpus_path,
  type = "skip-gram",
  dim = 200,
  window = 20,
  iter = 20
)

# Interactive sanity check: show the 10 nearest neighbours of one word.
nearest_words <- predict(model, c("משה"), type = "nearest", top_n = 10)
nearest_words[[1]] %>% View()

# Embedding matrix for the whole vocabulary.
emb <- as.matrix(model)
#' Embed one verse as the mean of its word vectors.
#'
#' @param txt A single string of whitespace-separated words.
#' @param embeddings Numeric matrix with one row per vocabulary word, the
#'   words being the row names. Defaults to the script-level matrix `emb`.
#' @return Numeric vector of length `ncol(embeddings)`: the column means over
#'   the distinct words of `txt` found in the vocabulary, or all zeros when
#'   none of the words is in the vocabulary.
verse_vec <- function(txt, embeddings = emb) {
  words <- strsplit(txt, "\\s+")[[1]]
  # intersect() de-duplicates, so a word repeated within the verse
  # contributes to the mean only once.
  known <- intersect(words, rownames(embeddings))
  if (length(known) == 0) {
    return(rep(0, ncol(embeddings)))
  }
  colMeans(embeddings[known, , drop = FALSE])
}
# One embedding vector per verse, stored as a list column.
bible[["vec"]] <- lapply(X = bible[["clean"]], FUN = verse_vec)
| # | |
| # chapter_vecs <- bible |> | |
| # group_by(book, chapter) |> | |
| # summarise(vec = list(Reduce(`+`, vec) / length(vec))) | |
| # | |
| # | |
| # | |
| # X <- do.call(rbind, chapter_vecs$vec) | |
| # rownames(X) <- paste(chapter_vecs$book, chapter_vecs$chapter) | |
| # | |
| # | |
| # | |
| # sim <- cosine(t(X)) # cosine similarity | |
| # pheatmap(sim,cutree_cols = 13,cutree_rows = 13) | |
| # pheatmap(sim, | |
| # cluster_rows = FALSE, | |
| # cluster_cols = TRUE, main = "דמיון סמנטי בין פרקים בתנ״ך", | |
| # cutree_cols = 10) | |
| # pheatmap(sim, | |
| # cluster_rows = FALSE, | |
| # cluster_cols = FALSE, main = "דמיון סמנטי בין פרקים בתנ״ך") | |
# Stack the per-verse vectors into a matrix with one row per verse,
# labelled "<book> <chapter> <verse>".
verse_labels <- paste(bible$book, bible$chapter, bible$verse)
X <- do.call(rbind, bible$vec)
rownames(X) <- verse_labels

# PCA of the verse vectors, one facet per book, coloured by chapter.
pca <- prcomp(X)
verse_meta <- bible %>% select(-vec)
p <- autoplot(pca, data = verse_meta, colour = "chapter") +
  facet_wrap(~book)

# Swap the default point layer for verse-number labels.
p <- delete_layers(p, "GeomPoint")
p + geom_text(aes(label = verse, color = chapter), size = 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment