A simple application of word embeddings with the GloVe algorithm via text2vec
# helper_functions.R
# See http://text2vec.org/glove.html for more details about text2vec

get_embeddings <- function(text) {
  # Create iterator over tokens
  tokens <- text2vec::space_tokenizer(text)

  # Create vocabulary. Terms will be unigrams (simple words).
  message("Creating vocabulary...")
  it <- text2vec::itoken(tokens, progressbar = FALSE)
  vocab <- text2vec::create_vocabulary(it)
  vocab <- text2vec::prune_vocabulary(vocab, term_count_min = 5L)

  # Use our filtered vocabulary
  vectorizer <- text2vec::vocab_vectorizer(vocab)

  # Use a window of 5 for context words
  message("Creating term co-occurrence matrix...")
  tcm <- text2vec::create_tcm(it, vectorizer, skip_grams_window = 5L)

  # Fit the model
  message("Computing embeddings based on GloVe algorithm...")
  glove <- text2vec::GlobalVectors$new(
    word_vectors_size = 50,
    vocabulary = vocab,
    x_max = 10
  )
  wv_main <- glove$fit_transform(tcm, n_iter = 20, convergence_tol = 0.01)
  wv_context <- glove$components

  # The word vectors are the sum of the main and context embeddings
  wv_main + t(wv_context)
}
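Note: the `GlobalVectors$new(word_vectors_size = ..., vocabulary = ...)` call above matches older text2vec releases. If it errors on your installation, text2vec 0.6+ renamed the size argument and dropped the vocabulary argument; a hedged sketch of the drop-in replacement for those lines (check `?text2vec::GlobalVectors` for your installed version):

# Assumed constructor for text2vec >= 0.6: `rank` replaces
# `word_vectors_size`, and the vocabulary argument is gone
glove <- text2vec::GlobalVectors$new(rank = 50, x_max = 10)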
get_similar_words <- function(reference_word, word_embeddings) {
  # Find the closest word embeddings based on cosine similarity
  word <- tryCatch(
    word_embeddings[reference_word, , drop = FALSE],
    error = function(e) {
      stop("The supplied word (", reference_word, ") is not part of the created vocabulary.")
    }
  )
  cos_sim <- text2vec::sim2(
    x = word_embeddings,
    y = word,
    method = "cosine",
    norm = "l2"
  )
  head(sort(cos_sim[, 1], decreasing = TRUE), 5)
}
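As a quick sanity check of the helpers, the sketch below trains on a tiny repeated toy corpus. This is illustrative only; `toy_text` is not part of the gist, and real embeddings need far more text than this:

# Minimal smoke test for the helpers above (toy_text is a made-up corpus;
# repetition keeps every word above the term_count_min = 5 pruning threshold)
source("helper_functions.R")
toy_text <- rep(c("the cat sat on the mat", "the dog sat on the rug"), 50)
toy_embeddings <- get_embeddings(toy_text)
get_similar_words("cat", toy_embeddings)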
# main script: train GloVe embeddings on Amazon fine-food reviews

# download file of 500K+ Amazon reviews
url <- "https://snap.stanford.edu/data/finefoods.txt.gz"
download.file(url, "finefoods.txt.gz", mode = "wb")

# import reviews
reviews <- readr::read_lines("finefoods.txt.gz")
text <- reviews[stringr::str_detect(reviews, "review/text:")]
text <- stringr::str_remove_all(text, "review/text:")

# standardize text
text <- iconv(text, to = "UTF-8")
text <- tolower(text)
text <- stringr::str_replace_all(text, "[[:punct:] ]+", " ")
text <- stringr::str_trim(text)

# helper functions wrapping text2vec
source("helper_functions.R")

# train word embeddings based on the GloVe model; this will take
# a few minutes
word_embeddings <- get_embeddings(text)

# find words with similar embeddings
get_similar_words("dog", word_embeddings)
get_similar_words("2", word_embeddings)
get_similar_words("delicious", word_embeddings)