Skip to content

Instantly share code, notes, and snippets.

@samionb
Created April 17, 2023 07:53
Show Gist options
  • Select an option

  • Save samionb/5b366f106ff1b1b64a5f81b9c9fc7887 to your computer and use it in GitHub Desktop.

Select an option

Save samionb/5b366f106ff1b1b64a5f81b9c9fc7887 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <random>
#include <stdexcept>
#include <utility>
#include <vector>
// Clusters `dataset` into `k` groups with Lloyd's k-means algorithm.
//
// Parameters:
//   dataset        - points to cluster; every row must have the same number
//                    of coordinates as the first row.
//   k              - number of centroids to compute.
//   max_iterations - upper bound on assignment/update rounds; the loop stops
//                    earlier once no point changes cluster.
//
// Returns k centroids, each with the dimensionality of the input points.
// An empty dataset or a non-positive k yields an empty result.
//
// Throws std::invalid_argument if the rows of `dataset` differ in length.
//
// Initial centroids are drawn with a generator seeded from
// std::random_device, so results can differ between calls.
std::vector<std::vector<float>> KMeans(const std::vector<std::vector<float>>& dataset, int k, int max_iterations)
{
    // Nothing to cluster (or nothing requested): avoid indexing dataset[0]
    // and building a distribution over an empty range.
    if (dataset.empty() || k <= 0) {
        return {};
    }

    const std::size_t num_points = dataset.size();
    const std::size_t num_clusters = static_cast<std::size_t>(k);
    const std::size_t dim = dataset.front().size();

    // Every loop below indexes rows with [0, dim); reject ragged input up
    // front instead of reading out of bounds.
    for (const std::vector<float>& point : dataset) {
        if (point.size() != dim) {
            throw std::invalid_argument("KMeans: all data points must have the same dimension");
        }
    }

    std::random_device rd;
    std::mt19937 gen(rd());

    // Seed the centroids from distinct data points (partial Fisher-Yates
    // shuffle of the point indices). Picking the same point twice would
    // create two identical centroids, one of which could never win a point.
    std::vector<std::vector<float>> centroids(num_clusters);
    std::vector<std::size_t> order(num_points);
    for (std::size_t i = 0; i < num_points; ++i) {
        order[i] = i;
    }
    const std::size_t distinct_seeds = std::min(num_clusters, num_points);
    for (std::size_t i = 0; i < distinct_seeds; ++i) {
        std::uniform_int_distribution<std::size_t> pick(i, num_points - 1);
        std::swap(order[i], order[pick(gen)]);
        centroids[i] = dataset[order[i]];
    }
    // More clusters requested than points exist: the surplus centroids
    // necessarily reuse points, so draw them with replacement.
    std::uniform_int_distribution<std::size_t> any_point(0, num_points - 1);
    for (std::size_t i = distinct_seeds; i < num_clusters; ++i) {
        centroids[i] = dataset[any_point(gen)];
    }

    // `num_clusters` is not a valid cluster index, so it serves as the
    // "not yet assigned" marker and forces the first round to count as changed.
    std::vector<std::size_t> assignment(num_points, num_clusters);

    // Scratch buffers reused across iterations. Sums are kept in double to
    // limit rounding error when many points fall into one cluster.
    std::vector<std::vector<double>> sums(num_clusters, std::vector<double>(dim, 0.0));
    std::vector<std::size_t> counts(num_clusters, 0);

    for (int iter = 0; iter < max_iterations; ++iter) {
        // Assignment step: attach every point to its nearest centroid.
        bool changed = false;
        for (std::size_t i = 0; i < num_points; ++i) {
            const std::vector<float>& point = dataset[i];
            float best = std::numeric_limits<float>::max();
            std::size_t closest = 0;
            for (std::size_t j = 0; j < num_clusters; ++j) {
                // Squared Euclidean distance orders centroids the same way
                // as the true distance, so the square root is not needed.
                float squared = 0.0f;
                for (std::size_t d = 0; d < dim; ++d) {
                    const float diff = point[d] - centroids[j][d];
                    squared += diff * diff;
                }
                if (squared < best) {
                    best = squared;
                    closest = j;
                }
            }
            if (assignment[i] != closest) {
                assignment[i] = closest;
                changed = true;
            }
        }

        // Converged: identical assignments would reproduce identical centroids.
        if (!changed) {
            break;
        }

        // Update step: move each centroid to the mean of its points.
        for (std::vector<double>& sum : sums) {
            std::fill(sum.begin(), sum.end(), 0.0);
        }
        std::fill(counts.begin(), counts.end(), std::size_t{0});
        for (std::size_t i = 0; i < num_points; ++i) {
            const std::size_t cluster = assignment[i];
            ++counts[cluster];
            for (std::size_t d = 0; d < dim; ++d) {
                sums[cluster][d] += dataset[i][d];
            }
        }
        for (std::size_t j = 0; j < num_clusters; ++j) {
            // A cluster that attracted no points keeps its previous centroid
            // rather than collapsing to the origin.
            if (counts[j] == 0) {
                continue;
            }
            const double size = static_cast<double>(counts[j]);
            for (std::size_t d = 0; d < dim; ++d) {
                centroids[j][d] = static_cast<float>(sums[j][d] / size);
            }
        }
    }

    return centroids;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment