2 Answers

Contributor: 1866 experience points, 5+ upvotes
The key point I see is the initialization of means. Following the default behaviour of sklearn's GaussianMixture, I switched to a KMeans initialization instead of a random one.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')
eps = 1e-8

def PDF(data, means, variances):
    return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))
def EM_GMM(data, k=3, iterations=100, init_strategy='kmeans'):
    weights = np.ones((k, 1)) / k  # shape=(k, 1)

    if init_strategy == 'kmeans':
        from sklearn.cluster import KMeans

        km = KMeans(k).fit(data[:, None])
        means = km.cluster_centers_  # shape=(k, 1)
    else:  # init_strategy == 'random'
        means = np.random.choice(data, k)[:, np.newaxis]  # shape=(k, 1)

    variances = np.random.random_sample(size=k)[:, np.newaxis]  # shape=(k, 1)
    data = np.repeat(data[np.newaxis, :], k, 0)  # shape=(k, n)

    for step in range(iterations):
        # Expectation step
        likelihood = PDF(data, means, np.sqrt(variances))  # shape=(k, n)

        # Maximization step
        b = likelihood * weights  # shape=(k, n)
        b /= np.sum(b, axis=1)[:, np.newaxis] + eps

        # update means, variances, and weights
        means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps)
        weights = np.mean(b, axis=1)[:, np.newaxis]

    return means, variances
This seems to produce the desired output much more consistently:
s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,
              41.48666667, 37.54, 41.175, 44.81, 44.44571429,
              44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
              44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
              44.44571429, 44.44571429, 39.71, 26.69, 34.15,
              24.94, 24.75, 24.56, 24.38, 35.25,
              44.62, 44.94, 44.815, 44.69, 42.31,
              40.81, 44.38, 44.56, 44.44, 44.25,
              43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
              40.75, 32.31, 36.08, 30.135, 24.19])
k=3
n_iter=100
means, variances = EM_GMM(s, k, n_iter)
print(means,variances)
[[44.42596231]
 [24.509301  ]
 [35.4137508 ]]
[[0.07568723]
 [0.10583743]
 [0.52125856]]
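As a sanity check (not part of the original answer, just a sketch), the fitted parameters can be compared against sklearn.mixture.GaussianMixture, which also defaults to a k-means based initialization; the components should come out around the same centers, though their ordering may differ:
# Optional cross-check against sklearn's own GMM (illustrative only).
# GaussianMixture defaults to init_params='kmeans', mirroring EM_GMM above.
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=3, random_state=0).fit(s[:, None])
print(gm.means_.ravel())        # component means (ordering may differ)
print(gm.covariances_.ravel())  # component variances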
# Plotting the results
colors = ['green', 'red', 'blue', 'yellow']
bins = np.linspace(np.min(s)-2, np.max(s)+2, 100)
plt.figure(figsize=(10,7))
plt.xlabel('$x$')
plt.ylabel('pdf')
sns.scatterplot(s, [0.05] * len(s), color='navy', s=40, marker=2, label='Series data')
for i, (m, v) in enumerate(zip(means, variances)):
? ? sns.lineplot(bins, PDF(bins, m, v), color=colors[i], label=f'Cluster {i+1}')
plt.legend()
plt.plot()
Finally, we can see that a purely random initialization produces different results; let's look at the resulting means:
for _ in range(5):
    print(EM_GMM(s, k, n_iter, init_strategy='random')[0], '\n')
[[44.42596231]
 [44.42596231]
 [44.42596231]]

[[44.42596231]
 [24.509301  ]
 [30.1349997 ]]

[[44.42596231]
 [35.4137508 ]
 [44.42596231]]

[[44.42596231]
 [30.1349997 ]
 [44.42596231]]

[[44.42596231]
 [44.42596231]
 [44.42596231]]
You can see how different these results are; in some cases the resulting means are constant, which means the initialization picked 3 similar values that barely changed over the iterations. Adding some print statements inside EM_GMM will make this clear.
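For example, a small diagnostic along those lines (a sketch, not part of the original code) could print what the random initialization draws, which makes the degenerate cases where three nearly identical values are picked easy to spot:
# Illustrative diagnostic: show what a purely random initialization draws.
# When all three initial means land on the same repeated value (e.g. 44.44571429),
# the EM iterations have little chance to separate the components afterwards.
for trial in range(5):
    init_means = np.sort(np.random.choice(s, k))
    print(f'trial {trial}: initial means = {init_means}')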

Contributor: 1795 experience points, 7+ upvotes
# Expectation step
likelihood = PDF(data, means, np.sqrt(variances))
Why are we taking the sqrt of variances? The PDF function accepts variances, so this should be PDF(data, means, variances).
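A quick numeric check (illustrative, assuming the PDF and eps definitions from the answer above are in scope) shows that PDF parameterized by the variance agrees with scipy's norm.pdf, which is parameterized by the standard deviation:
# The hand-written density takes the variance; scipy's norm.pdf takes the std.
# Arbitrary example values, for illustration only.
from scipy.stats import norm

x, mu, var = 1.0, 0.5, 2.0
print(PDF(x, mu, var))                          # ~0.265
print(norm.pdf(x, loc=mu, scale=np.sqrt(var)))  # same value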
Another issue:
# Maximization step
b = likelihood * weights # shape=(k, n)
b /= np.sum(b, axis=1)[:, np.newaxis] + eps
The second line above should be b /= np.sum(b, axis=0)[np.newaxis, :] + eps, so that each data point's responsibilities are normalized across the k components.
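The point of axis=0 is that the responsibilities of a single data point across the k components must sum to 1. A minimal standalone check (made-up shapes, for illustration only):
# For a (k, n) array of unnormalized responsibilities, normalizing along
# axis=0 makes each column (one data point across the k components) sum to 1.
import numpy as np
b = np.random.random_sample((3, 5))           # fake (k=3, n=5) responsibilities
b /= np.sum(b, axis=0)[np.newaxis, :] + 1e-8
print(np.sum(b, axis=0))                      # ~[1. 1. 1. 1. 1.]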
There is also the initialization of variances:
variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)
Why are we initializing the variances randomly? We already have data and means, so why not compute the current estimated variances, as in vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)?
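For reference, here is a small standalone sketch (made-up data, illustrative only) of how that initialization broadcasts: with the data tiled to shape (k, n) and means of shape (k, 1), the expression yields one initial variance per component:
# Illustrative shapes only: tile the data to (k, n), subtract the (k, 1) means,
# and average the squared deviations per row to get a (k, 1) variance estimate.
import numpy as np
k, n = 3, 50
raw = np.random.randn(n)
data = np.repeat(raw[np.newaxis, :], k, 0)       # shape=(k, n)
means = np.random.choice(raw, k)[:, np.newaxis]  # shape=(k, 1)
vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)
print(vars.shape)  # (3, 1)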
With these changes, here is my implementation:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('seaborn')

eps = 1e-8


def pdf(data, means, vars):
    # Gaussian density parameterized by the variance (not the std)
    denom = np.sqrt(2 * np.pi * vars) + eps
    numer = np.exp(-0.5 * np.square(data - means) / (vars + eps))
    return numer / denom


def em_gmm(data, k, n_iter, init_strategy='k_means'):
    weights = np.ones((k, 1), dtype=np.float32) / k

    if init_strategy == 'k_means':
        from sklearn.cluster import KMeans
        km = KMeans(k).fit(data[:, None])
        means = km.cluster_centers_
    else:
        means = np.random.choice(data, k)[:, np.newaxis]

    data = np.repeat(data[np.newaxis, :], k, 0)
    # initialize the variances from the data and the current means
    vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)

    for step in range(n_iter):
        # Expectation step: responsibilities, normalized over the k components
        p = pdf(data, means, vars)
        b = p * weights
        denom = np.expand_dims(np.sum(b, axis=0), 0) + eps
        b = b / denom

        # Maximization step: update means, variances, and weights
        means_n = np.sum(b * data, axis=1)
        means_d = np.sum(b, axis=1) + eps
        means = np.expand_dims(means_n / means_d, -1)
        vars = np.sum(b * np.square(data - means), axis=1) / means_d
        vars = np.expand_dims(vars, -1)
        weights = np.expand_dims(np.mean(b, axis=1), -1)

    return means, vars


def main():
    s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667,
                  41.48666667, 37.54, 41.175, 44.81, 44.44571429,
                  44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                  44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429,
                  44.44571429, 44.44571429, 39.71, 26.69, 34.15,
                  24.94, 24.75, 24.56, 24.38, 35.25,
                  44.62, 44.94, 44.815, 44.69, 42.31,
                  40.81, 44.38, 44.56, 44.44, 44.25,
                  43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667,
                  40.75, 32.31, 36.08, 30.135, 24.19])

    k = 3
    n_iter = 100
    means, vars = em_gmm(s, k, n_iter)

    colors = ['green', 'red', 'blue', 'yellow']
    bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100)
    plt.figure(figsize=(10, 7))
    plt.xlabel('$x$')
    plt.ylabel('pdf')
    sns.scatterplot(s, [0.0] * len(s), color='navy', s=40, marker=2, label='Series data')
    for i, (m, v) in enumerate(zip(means, vars)):
        sns.lineplot(bins, pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}')
    plt.legend()
    plt.plot()
    plt.show()


if __name__ == '__main__':
    main()
Here are my results.