我嘗試比較sklearn包中和從頭開始的 kmean 聚類結果。暫存代碼如下所示:import matplotlib.pyplot as pltfrom matplotlib import stylestyle.use('ggplot')import numpy as npcolors = 10 * ["g", "r", "c", "b", "k"]class K_Means: def __init__(self, k=3, tol=0.001, max_iter=300): self.k = k self.tol = tol self.max_iter = max_iter def fit(self, data): self.centroids = {} for i in range(self.k): self.centroids[i] = data[i] for i in range(self.max_iter): self.classifications = {} for i in range(self.k): self.classifications[i] = [] for featureset in data: distances = [np.linalg.norm(featureset - self.centroids[centroid]) for centroid in self.centroids] classification = distances.index(min(distances)) self.classifications[classification].append(featureset) prev_centroids = dict(self.centroids) for classification in self.classifications: self.centroids[classification] = np.average(self.classifications[classification], axis=0) optimized = True for c in self.centroids: original_centroid = prev_centroids[c] current_centroid = self.centroids[c] if np.sum((current_centroid - original_centroid) / original_centroid * 100.0) > self.tol: print(np.sum((current_centroid - original_centroid) / original_centroid * 100.0)) optimized = False if optimized: break def predict(self, data): distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids] classification = distances.index(min(distances)) return classification但由于收斂質心不同,結果也不同。sklearn 的散點圖:同時,上面代碼的散點圖:我想知道臨時代碼中存在哪些錯誤。
sklearn 和從頭開始的不同 Kmean 結果
慕碼人8056858
2023-08-08 15:03:24