A simple application of word embeddings with the GloVe algorithm via text2vec
# helper_functions.R
# See http://text2vec.org/glove.html for more details about text2vec

get_embeddings <- function(text) {
  # Create iterator over tokens
  tokens <- text2vec::space_tokenizer(text)

  # Create vocabulary. Terms will be unigrams (simple words).
  message("Creating vocabulary...")
  it <- text2vec::itoken(tokens, progressbar = FALSE)
  vocab <- text2vec::create_vocabulary(it)
  vocab <- text2vec::prune_vocabulary(vocab, term_count_min = 5L)

  # Use our filtered vocabulary
  vectorizer <- text2vec::vocab_vectorizer(vocab)

  # Use a window of 5 for context words
  message("Creating term co-occurrence matrix...")
  tcm <- text2vec::create_tcm(it, vectorizer, skip_grams_window = 5L)

  # Fit the model
  message("Computing embeddings based on GloVe algorithm...")
  glove <- text2vec::GlobalVectors$new(
    word_vectors_size = 50,
    vocabulary = vocab,
    x_max = 10
  )
  wv_main <- glove$fit_transform(tcm, n_iter = 20, convergence_tol = 0.01)
  wv_context <- glove$components

  # The word vectors are the sum of the main and context embeddings
  wv_main + t(wv_context)
}
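Note: the `GlobalVectors$new(word_vectors_size = ..., vocabulary = ...)` call above matches older text2vec releases. If it errors on your installation, text2vec 0.6+ renamed the size argument and dropped the vocabulary argument; a hedged sketch of the drop-in replacement for those lines (check `?text2vec::GlobalVectors` for your installed version):

# Assumed constructor for text2vec >= 0.6: `rank` replaces
# `word_vectors_size`, and the vocabulary argument is gone
glove <- text2vec::GlobalVectors$new(rank = 50, x_max = 10)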
get_similar_words <- function(reference_word, word_embeddings) {
  # Find the closest word embeddings based on cosine similarity
  word <- tryCatch(
    word_embeddings[reference_word, , drop = FALSE],
    error = function(e) {
      stop("The supplied word (", reference_word, ") is not part of the created vocabulary.")
    }
  )
  cos_sim <- text2vec::sim2(
    x = word_embeddings,
    y = word,
    method = "cosine",
    norm = "l2"
  )
  head(sort(cos_sim[, 1], decreasing = TRUE), 5)
}
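As a quick sanity check of the helpers, the sketch below trains on a tiny repeated toy corpus. This is illustrative only; `toy_text` is not part of the gist, and real embeddings need far more text than this:

# Minimal smoke test for the helpers above (toy_text is a made-up corpus;
# repetition keeps every word above the term_count_min = 5 pruning threshold)
source("helper_functions.R")
toy_text <- rep(c("the cat sat on the mat", "the dog sat on the rug"), 50)
toy_embeddings <- get_embeddings(toy_text)
get_similar_words("cat", toy_embeddings)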
# main script: train GloVe embeddings on Amazon fine-food reviews

# download file of 500K+ Amazon reviews
url <- "https://snap.stanford.edu/data/finefoods.txt.gz"
download.file(url, "finefoods.txt.gz", mode = "wb")

# import reviews
reviews <- readr::read_lines("finefoods.txt.gz")
text <- reviews[stringr::str_detect(reviews, "review/text:")]
text <- stringr::str_remove_all(text, "review/text:")

# standardize text
text <- iconv(text, to = "UTF-8")
text <- tolower(text)
text <- stringr::str_replace_all(text, "[[:punct:] ]+", " ")
text <- stringr::str_trim(text)

# helper functions wrapping text2vec
source("helper_functions.R")

# train word embeddings based on the GloVe model; this will take
# a few minutes
word_embeddings <- get_embeddings(text)

# find words with similar embeddings
get_similar_words("dog", word_embeddings)
get_similar_words("2", word_embeddings)
get_similar_words("delicious", word_embeddings)