Created
April 17, 2023 07:53
-
-
Save samionb/5b366f106ff1b1b64a5f81b9c9fc7887 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <vector> | |
| #include <random> | |
| #include <algorithm> | |
| #include <cmath> | |
| #include <limits> | |
// Clusters `dataset` into `k` groups using Lloyd's k-means algorithm.
//
// Parameters:
//   dataset        - points to cluster. Every row is expected to have the same
//                    number of coordinates as dataset[0]; this is a
//                    precondition and is not checked.
//   k              - number of clusters (and of returned centroids).
//   max_iterations - upper bound on the number of assign/update rounds. The
//                    loop stops earlier once the centroids no longer change.
//
// Returns the k cluster centers. An empty vector is returned when `dataset`
// is empty or `k` is not positive. When `max_iterations` is not positive the
// randomly chosen initial centroids are returned unchanged.
//
// The initial centroids are seeded from std::random_device, so results may
// differ between calls.
std::vector<std::vector<float>> KMeans(const std::vector<std::vector<float>>& dataset, int k, int max_iterations)
{
    // Nothing to cluster, or no clusters requested: avoid indexing dataset[0]
    // and building an invalid random distribution.
    if (dataset.empty() || k <= 0) {
        return {};
    }

    const std::size_t num_points = dataset.size();
    const std::size_t num_dims = dataset[0].size();
    const std::size_t num_clusters = static_cast<std::size_t>(k);

    std::random_device rd;
    std::mt19937 gen(rd());

    // Seed the centroids with distinct data points (random permutation of the
    // indices). Drawing with replacement can pick the same point twice, which
    // leaves one of the clusters permanently empty.
    std::vector<std::size_t> order(num_points);
    for (std::size_t i = 0; i < num_points; ++i) {
        order[i] = i;
    }
    std::shuffle(order.begin(), order.end(), gen);

    std::vector<std::vector<float>> centroids;
    centroids.reserve(num_clusters);
    for (std::size_t c = 0; c < num_clusters; ++c) {
        // Wrap around only when more clusters than points were requested;
        // duplicates are unavoidable in that case.
        centroids.push_back(dataset[order[c % num_points]]);
    }

    // Scratch buffers reused across iterations.
    std::vector<std::vector<float>> next_centroids(num_clusters);
    std::vector<std::size_t> cluster_sizes(num_clusters);

    for (int iter = 0; iter < max_iterations; ++iter) {
        for (auto& sum : next_centroids) {
            sum.assign(num_dims, 0.0f);
        }
        std::fill(cluster_sizes.begin(), cluster_sizes.end(), std::size_t{0});

        // Assignment step: find the nearest centroid of every point and add
        // the point to that cluster's running sum.
        for (const auto& point : dataset) {
            float best_dist_sq = std::numeric_limits<float>::max();
            std::size_t nearest = 0;
            for (std::size_t c = 0; c < num_clusters; ++c) {
                const std::vector<float>& centroid = centroids[c];
                // Squared Euclidean distance is enough for comparison: the
                // square root is monotonic and would not change the winner.
                float dist_sq = 0.0f;
                for (std::size_t d = 0; d < num_dims; ++d) {
                    const float diff = point[d] - centroid[d];
                    dist_sq += diff * diff;
                }
                // Strict '<' keeps the lowest-index centroid on ties.
                if (dist_sq < best_dist_sq) {
                    best_dist_sq = dist_sq;
                    nearest = c;
                }
            }

            ++cluster_sizes[nearest];
            std::vector<float>& sum = next_centroids[nearest];
            for (std::size_t d = 0; d < num_dims; ++d) {
                sum[d] += point[d];
            }
        }

        // Update step: each centroid becomes the mean of its members.
        for (std::size_t c = 0; c < num_clusters; ++c) {
            if (cluster_sizes[c] == 0) {
                // An empty cluster keeps its previous centroid instead of
                // collapsing to the origin.
                next_centroids[c] = centroids[c];
                continue;
            }
            const float count = static_cast<float>(cluster_sizes[c]);
            for (float& coord : next_centroids[c]) {
                coord /= count;
            }
        }

        // Exact fixed point: identical centroids would produce identical
        // assignments and means on every further iteration.
        const bool converged = (next_centroids == centroids);
        centroids.swap(next_centroids);
        if (converged) {
            break;
        }
    }

    return centroids;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment