drewhendrickson · July 22, 2017 17:45
diff --git a/sample_build_cooccurance_matrix.R b/sample_build_cooccurance_matrix.R
 # Build some random sample data: a rounded uniform nomem_encr column plus a
 # cluster assignment (1-4) on each of two dimensions.
 # TRUE is spelled out because T is an ordinary variable that can be reassigned.
 entries <- 1000
 data <- data.frame(nomem_encr = round(runif(entries, 100, 10000)),
                    Cluster.x = sample(1:4, entries, replace = TRUE),
                    Cluster.y = sample(1:4, entries, replace = TRUE))

 # Sanity check: look at the first few rows of the sample data.
 head(data)

 # Collect the distinct cluster labels on each dimension, in order of first
 # appearance. Working from the observed labels (rather than assuming 1:4)
 # keeps the script working whether the labels are numbers or letters.
 x_cluster_labels <- unique(data$Cluster.x)
 y_cluster_labels <- unique(data$Cluster.y)

 # Start from an all-zero co-occurrence matrix: one row per x label and one
 # column per y label, with dimnames derived from the labels themselves.
 cooccurance_counts <- matrix(0, nrow = length(x_cluster_labels),
                              ncol = length(y_cluster_labels),
                              dimnames = list(paste0("x_", x_cluster_labels),
                                              paste0("y_", y_cluster_labels)))

 # Map every observation to its row / column position in the matrix in one
 # vectorized step: match() returns, for each value, the index of the label
 # it equals. This replaces a row-by-row loop over 1:nrow(data).
 x_index <- match(data$Cluster.x, x_cluster_labels)
 y_index <- match(data$Cluster.y, y_cluster_labels)

 # Count how many observations fall in each cell. Matrices are stored
 # column-major, so cell (row, col) sits at linear position
 # (col - 1) * nrow + row; tabulate() counts the occurrences of each position.
 # Assigning through `[]` keeps the matrix shape and dimnames intact.
 cell_index <- (y_index - 1L) * length(x_cluster_labels) + x_index
 cooccurance_counts[] <- tabulate(cell_index,
                                  nbins = length(cooccurance_counts))

 # Print out the co-occurrence counts.
 cooccurance_counts

 # Simple check: every row of the data contributes exactly one count, so the
 # matrix total should equal the number of rows.
 sum(cooccurance_counts) == nrow(data)
	# Build some random sample data: a rounded uniform nomem_encr column plus a
	# cluster assignment (1-4) on each of two dimensions.
	# TRUE is spelled out because T is an ordinary variable that can be reassigned.
	entries <- 1000
	data <- data.frame(nomem_encr = round(runif(entries, 100, 10000)),
	                   Cluster.x = sample(1:4, entries, replace = TRUE),
	                   Cluster.y = sample(1:4, entries, replace = TRUE))

	# Sanity check: look at the first few rows of the sample data.
	head(data)

	# Collect the distinct cluster labels on each dimension, in order of first
	# appearance. Working from the observed labels (rather than assuming 1:4)
	# keeps the script working whether the labels are numbers or letters.
	x_cluster_labels <- unique(data$Cluster.x)
	y_cluster_labels <- unique(data$Cluster.y)

	# Start from an all-zero co-occurrence matrix: one row per x label and one
	# column per y label, with dimnames derived from the labels themselves.
	cooccurance_counts <- matrix(0, nrow = length(x_cluster_labels),
	                             ncol = length(y_cluster_labels),
	                             dimnames = list(paste0("x_", x_cluster_labels),
	                                             paste0("y_", y_cluster_labels)))

	# Map every observation to its row / column position in the matrix in one
	# vectorized step: match() returns, for each value, the index of the label
	# it equals. This replaces a row-by-row loop over 1:nrow(data).
	x_index <- match(data$Cluster.x, x_cluster_labels)
	y_index <- match(data$Cluster.y, y_cluster_labels)

	# Count how many observations fall in each cell. Matrices are stored
	# column-major, so cell (row, col) sits at linear position
	# (col - 1) * nrow + row; tabulate() counts the occurrences of each position.
	# Assigning through `[]` keeps the matrix shape and dimnames intact.
	cell_index <- (y_index - 1L) * length(x_cluster_labels) + x_index
	cooccurance_counts[] <- tabulate(cell_index,
	                                 nbins = length(cooccurance_counts))

	# Print out the co-occurrence counts.
	cooccurance_counts

	# Simple check: every row of the data contributes exactly one count, so the
	# matrix total should equal the number of rows.
	sum(cooccurance_counts) == nrow(data)
No results found