在C++中,有多种聚类算法可以处理非线性数据。以下是一些常用的算法:
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
using namespace std;
// K-means clustering (Lloyd's algorithm).
//
// data:           n points, all with the same dimensionality (non-ragged).
// k:              requested cluster count; clamped to n when larger.
// max_iterations: cap on assignment/update rounds; exits early once labels stop changing.
// Returns one label in [0, k) per input point; empty input yields an empty vector.
std::vector<int> kMeans(const std::vector<std::vector<double>>& data, int k, int max_iterations = 100) {
    const int n = static_cast<int>(data.size());
    std::vector<int> labels(n, -1);
    if (n == 0 || k <= 0) return labels;
    if (k > n) k = n;  // cannot have more clusters than points
    const int dim = static_cast<int>(data[0].size());

    // Squared Euclidean distance; sqrt is unnecessary for nearest-centroid tests.
    auto sq_dist = [dim](const std::vector<double>& a, const std::vector<double>& b) {
        double s = 0.0;
        for (int m = 0; m < dim; ++m) {
            const double d = a[m] - b[m];
            s += d * d;
        }
        return s;
    };

    // Farthest-first ("maximin") seeding: one random data point, then repeatedly
    // the point farthest from every centroid chosen so far. Much more robust
    // than purely random seeding against degenerate starts.
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> pick(0, n - 1);
    std::vector<std::vector<double>> centroids;
    centroids.reserve(k);
    centroids.push_back(data[pick(gen)]);
    while (static_cast<int>(centroids.size()) < k) {
        int far_idx = 0;
        double far_dist = -1.0;
        for (int j = 0; j < n; ++j) {
            double nearest = std::numeric_limits<double>::max();
            for (const auto& c : centroids) nearest = std::min(nearest, sq_dist(data[j], c));
            if (nearest > far_dist) {
                far_dist = nearest;
                far_idx = j;
            }
        }
        centroids.push_back(data[far_idx]);
    }

    for (int iter = 0; iter < max_iterations; ++iter) {
        // Assignment step: each point goes to its nearest centroid.
        bool changed = false;
        for (int j = 0; j < n; ++j) {
            int best = 0;
            double best_dist = sq_dist(data[j], centroids[0]);
            for (int l = 1; l < k; ++l) {
                const double d = sq_dist(data[j], centroids[l]);
                if (d < best_dist) {
                    best_dist = d;
                    best = l;
                }
            }
            if (labels[j] != best) {
                labels[j] = best;
                changed = true;
            }
        }
        if (!changed) break;  // converged: labels are stable

        // Update step: move each centroid to the mean of its members.
        std::vector<std::vector<double>> sums(k, std::vector<double>(dim, 0.0));
        std::vector<int> counts(k, 0);
        for (int j = 0; j < n; ++j) {
            ++counts[labels[j]];
            for (int m = 0; m < dim; ++m) sums[labels[j]][m] += data[j][m];
        }
        for (int l = 0; l < k; ++l) {
            if (counts[l] == 0) continue;  // empty cluster: keep the old centroid
            for (int m = 0; m < dim; ++m) centroids[l][m] = sums[l][m] / counts[l];
        }
    }
    return labels;
}
#include <iostream>
#include <vector>
#include <cmath>
#include <queue>
#include <unordered_set>
using namespace std;
// DBSCAN density-based clustering.
//
// data:        n points of identical dimensionality.
// eps:         neighbourhood radius (strict '<' on distance, matching get_neighbors).
// min_samples: minimum neighbourhood size (the point itself included) for a core point.
// Returns one label per point: cluster ids starting at 0, or -1 for noise.
std::vector<int> dbscan(const std::vector<std::vector<double>>& data, double eps, int min_samples) {
    const int n = static_cast<int>(data.size());
    std::vector<int> labels(n, -1);
    if (n == 0) return labels;
    const double eps_sq = eps * eps;

    // Indices of points strictly within eps of point p (p itself excluded).
    // Local lambda keeps this block self-contained; squared distances avoid sqrt.
    auto region_query = [&](int p) {
        std::vector<int> result;
        const int dim = static_cast<int>(data[p].size());
        for (int i = 0; i < n; ++i) {
            if (i == p) continue;
            double dist_sq = 0.0;
            for (int m = 0; m < dim; ++m) {
                const double d = data[p][m] - data[i][m];
                dist_sq += d * d;
            }
            if (dist_sq < eps_sq) result.push_back(i);
        }
        return result;
    };

    std::vector<char> visited(n, 0);
    int next_cluster = 0;
    for (int i = 0; i < n; ++i) {
        if (visited[i]) continue;
        visited[i] = 1;
        std::vector<int> seeds = region_query(i);
        // Non-core point: provisionally noise; it may still be claimed later
        // as a border point of some cluster.
        if (static_cast<int>(seeds.size()) + 1 < min_samples) continue;

        // i is a core point: grow a new cluster by breadth-first expansion.
        labels[i] = next_cluster;
        std::queue<int> frontier;
        for (int s : seeds) frontier.push(s);
        while (!frontier.empty()) {
            const int p = frontier.front();
            frontier.pop();
            if (labels[p] == -1) labels[p] = next_cluster;  // claim border/noise point
            if (visited[p]) continue;                        // already expanded
            visited[p] = 1;
            std::vector<int> nbrs = region_query(p);
            if (static_cast<int>(nbrs.size()) + 1 >= min_samples) {
                // p is core too: its whole neighbourhood joins the frontier.
                for (int s : nbrs) frontier.push(s);
            }
        }
        ++next_cluster;
    }
    return labels;
}
// Returns the indices of all points strictly within distance eps of `point`
// (the point itself is excluded). Compares squared distances against eps*eps,
// so no sqrt is taken per candidate.
std::vector<int> get_neighbors(const std::vector<std::vector<double>>& data, int point, double eps) {
    const int n = static_cast<int>(data.size());
    std::vector<int> neighbors;
    const double eps_sq = eps * eps;  // hoisted: loop-invariant
    const int dim = static_cast<int>(data[point].size());
    for (int i = 0; i < n; ++i) {
        if (i == point) continue;
        double dist_sq = 0.0;
        for (int m = 0; m < dim; ++m) {
            const double d = data[point][m] - data[i][m];
            dist_sq += d * d;  // d*d is far cheaper than pow(d, 2)
        }
        if (dist_sq < eps_sq) neighbors.push_back(i);
    }
    return neighbors;
}
#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
using namespace std;
// Gaussian mixture model clustering (diagonal covariances), fitted with
// hard-assignment ("classification") EM.
//
// data:         n points of identical dimensionality.
// n_components: number of Gaussian components; clamped to n when larger.
// max_iter:     iteration cap (kept as double for interface compatibility).
// tol:          convergence threshold on the largest per-coordinate mean shift.
// Returns one component label in [0, n_components) per point.
std::vector<int> gmm(const std::vector<std::vector<double>>& data, int n_components,
                     double max_iter = 100, double tol = 1e-4) {
    const int n = static_cast<int>(data.size());
    std::vector<int> labels(n, -1);
    if (n == 0 || n_components <= 0) return labels;
    if (n_components > n) n_components = n;
    const int dim = static_cast<int>(data[0].size());
    const double var_floor = 1e-6;  // keeps every variance positive and log() finite
    // Avoids relying on the non-standard M_PI macro.
    const double log_two_pi = std::log(2.0 * std::acos(-1.0));

    auto sq_dist = [dim](const std::vector<double>& a, const std::vector<double>& b) {
        double s = 0.0;
        for (int m = 0; m < dim; ++m) {
            const double d = a[m] - b[m];
            s += d * d;
        }
        return s;
    };

    // Mixture parameters: uniform weights, unit variances, means seeded
    // farthest-first from the data (robust against degenerate random starts).
    std::vector<double> weights(n_components, 1.0 / n_components);
    std::vector<std::vector<double>> variances(n_components, std::vector<double>(dim, 1.0));
    std::vector<std::vector<double>> means;
    means.reserve(n_components);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> pick(0, n - 1);
    means.push_back(data[pick(gen)]);
    while (static_cast<int>(means.size()) < n_components) {
        int far_idx = 0;
        double far_dist = -1.0;
        for (int j = 0; j < n; ++j) {
            double nearest = std::numeric_limits<double>::max();
            for (const auto& mu : means) nearest = std::min(nearest, sq_dist(data[j], mu));
            if (nearest > far_dist) {
                far_dist = nearest;
                far_idx = j;
            }
        }
        means.push_back(data[far_idx]);
    }

    for (int iter = 0; iter < static_cast<int>(max_iter); ++iter) {
        // E-step (hard): assign each point to its highest-posterior component.
        bool changed = false;
        for (int j = 0; j < n; ++j) {
            double best_ll = -std::numeric_limits<double>::max();
            int best_k = 0;
            for (int c = 0; c < n_components; ++c) {
                // log( weight * N(x | mean, diag(var)) )
                double ll = std::log(weights[c]);
                for (int m = 0; m < dim; ++m) {
                    const double var = variances[c][m];
                    const double diff = data[j][m] - means[c][m];
                    ll -= 0.5 * (log_two_pi + std::log(var) + diff * diff / var);
                }
                if (ll > best_ll) {
                    best_ll = ll;
                    best_k = c;
                }
            }
            if (labels[j] != best_k) {
                labels[j] = best_k;
                changed = true;
            }
        }

        // M-step: re-estimate weights and means from the hard assignments.
        std::vector<int> counts(n_components, 0);
        std::vector<std::vector<double>> mean_sum(n_components, std::vector<double>(dim, 0.0));
        for (int j = 0; j < n; ++j) {
            ++counts[labels[j]];
            for (int m = 0; m < dim; ++m) mean_sum[labels[j]][m] += data[j][m];
        }
        double max_shift = 0.0;
        for (int c = 0; c < n_components; ++c) {
            if (counts[c] == 0) continue;  // empty component: keep previous parameters
            weights[c] = static_cast<double>(counts[c]) / n;
            for (int m = 0; m < dim; ++m) {
                const double mu = mean_sum[c][m] / counts[c];
                max_shift = std::max(max_shift, std::abs(mu - means[c][m]));
                means[c][m] = mu;
            }
        }
        // Variances from the freshly updated means.
        std::vector<std::vector<double>> var_sum(n_components, std::vector<double>(dim, 0.0));
        for (int j = 0; j < n; ++j) {
            for (int m = 0; m < dim; ++m) {
                const double diff = data[j][m] - means[labels[j]][m];
                var_sum[labels[j]][m] += diff * diff;
            }
        }
        for (int c = 0; c < n_components; ++c) {
            if (counts[c] == 0) continue;
            for (int m = 0; m < dim; ++m)
                variances[c][m] = std::max(var_sum[c][m] / counts[c], var_floor);
        }

        if (!changed || max_shift < tol) break;  // labels stable or means converged
    }
    return labels;
}
这些算法可以处理非线性数据,但可能需要调整参数以获得最佳聚类效果。在实际应用中,可以尝试多种算法并比较它们的聚类结果,以选择最适合特定数据的算法。
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。