摘要:聚類效果、數據集、代碼:從文件中加載數據,一次性讀取文件中的所有數據,遍歷數據中的每一行,對……
聚類效果
1.658985 4.285136 -3.453687 3.424321 4.838138 -1.151539 -5.379713 -3.362104 0.972564 2.924086 -3.567919 1.531611 0.450614 -3.302219 -3.487105 -1.724432 2.668759 1.594842 -3.156485 3.191137 3.165506 -3.999838 -2.786837 -3.099354 4.208187 2.984927 -2.123337 2.943366 0.704199 -0.479481 -0.392370 -3.963704 2.831667 1.574018 -0.790153 3.343144 2.943496 -3.357075 -3.195883 -2.283926 2.336445 2.875106 -1.786345 2.554248 2.190101 -1.906020 -3.403367 -2.778288 1.778124 3.880832 -1.688346 2.230267 2.592976 -2.054368 -4.007257 -3.207066 2.257734 3.387564 -2.679011 0.785119 0.939512 -4.023563 -3.674424 -2.261084 2.046259 2.735279 -3.189470 1.780269 4.372646 -0.822248 -2.579316 -3.497576 1.889034 5.190400 -0.798747 2.185588 2.836520 -2.658556 -3.837877 -3.253815 2.096701 3.886007 -2.709034 2.923887 3.367037 -3.184789 -2.121479 -4.232586 2.329546 3.179764 -3.284816 3.273099 3.091414 -3.815232 -3.762093 -2.432191 3.542056 2.778832 -1.736822 4.241041 2.127073 -2.983680 -4.323818 -3.938116 3.792121 5.135768 -4.786473 3.358547 2.624081 -3.260715 -4.009299 -2.978115 2.493525 1.963710 -2.513661 2.642162 1.864375 -3.176309 -3.171184 -3.572452 2.894220 2.489128 -2.562539 2.884438 3.491078 -3.947487 -2.565729 -2.012114 3.332948 3.983102 -1.616805 3.573188 2.280615 -2.559444 -2.651229 -3.103198 2.321395 3.154987 -1.685703 2.939697 3.031012 -3.620252 -4.599622 -2.185829 4.196223 1.126677 -2.133863 3.093686 4.668892 -2.562705 -2.793241 -2.149706 2.884105 3.043438 -2.967647 2.848696 4.479332 -1.764772 -4.905566 -2.911070
import math

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; the clustering code works without it
    plt = None


def loadDataSet(file_name):
    """Load a whitespace-delimited numeric text file into a 2-D array.

    Each non-blank line becomes one row; fields may be separated by spaces
    or tabs. Blank lines are skipped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` holding one row of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    data_mat = []
    with open(file_name) as fr:
        for line in fr:
            # split() with no argument handles tabs, repeated spaces and the
            # trailing newline in one go.
            fields = line.split()
            if not fields:
                continue
            data_mat.append([float(value) for value in fields])
    return np.array(data_mat)


def dist_eclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    For ``vecA = (x1, y1)`` and ``vecB = (x2, y2)`` this is
    ``sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)``.

    Args:
        vecA: First vector (array-like of numbers).
        vecB: Second vector, same length as ``vecA``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(diff ** 2))


def rand_cent(dataSet, k):
    """Build ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D array-like, one sample per row.
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` array, where ``n`` is the number of columns of
        ``dataSet``. Column ``j`` of every centroid lies between the minimum
        and maximum of column ``j`` of the data.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    n = dataSet.shape[1]
    centroids = np.zeros((k, n))
    # Fill column by column so the sequence of random draws is k values per
    # feature, in feature order.
    for j in range(n):
        min_j = dataSet[:, j].min()
        range_j = dataSet[:, j].max() - min_j
        centroids[:, j] = min_j + range_j * np.random.rand(k)
    return centroids


def Kmeans(data_set, k):
    """Cluster the rows of ``data_set`` into ``k`` groups with k-means.

    Centroids start at random positions (see ``rand_cent``); points are
    repeatedly assigned to their nearest centroid and centroids are moved to
    the mean of their members until no assignment changes.

    Args:
        data_set: 2-D array-like, one sample per row.
        k: Number of clusters; must be at least 1.

    Returns:
        A tuple ``(centroids, cluster_assment)``:
        ``centroids`` is a ``(k, n)`` array of cluster centres;
        ``cluster_assment`` is an ``(m, 2)`` array where column 0 is the index
        of the centroid assigned to each point and column 1 is the squared
        distance from the point to that centroid.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    data_set = np.asarray(data_set, dtype=float)
    m = data_set.shape[0]

    cluster_assment = np.zeros((m, 2))
    # Start from an index no real cluster uses, so the first pass always
    # counts as a change (0 would be mistaken for "already in cluster 0").
    cluster_assment[:, 0] = -1

    centroids = rand_cent(data_set, k)
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False

        # Assignment step: attach every point to its nearest centroid.
        for i in range(m):
            dists = [dist_eclud(centroids[j, :], data_set[i, :]) for j in range(k)]
            # argmin returns the first minimum, i.e. the lowest index on ties.
            min_index = int(np.argmin(dists))
            min_dist = dists[min_index]
            if cluster_assment[i, 0] != min_index:
                cluster_changed = True
            cluster_assment[i, :] = min_index, min_dist ** 2

        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            pts_inclust = data_set[cluster_assment[:, 0] == cent]
            # An empty cluster has no mean (it would become NaN); leave its
            # centroid where it is.
            if len(pts_inclust) > 0:
                centroids[cent, :] = pts_inclust.mean(axis=0)

    return centroids, cluster_assment


def _require_pyplot():
    """Return the pyplot module, or raise if matplotlib is not installed."""
    if plt is None:
        raise ImportError("matplotlib is required for plotting")
    return plt


def drawFigure(dataMat):
    """Show a scatter plot of the samples.

    Args:
        dataMat: 2-D array; column 0 is used as x, column 1 as y.
    """
    pyplot = _require_pyplot()
    pointX = dataMat[:, 0]
    pointY = dataMat[:, 1]
    fig, ax = pyplot.subplots(figsize=(10, 5))
    ax.scatter(pointX, pointY, s=30, c="r", marker="o", label="sample point")
    ax.legend()
    ax.set_xlabel("factor1")
    ax.set_ylabel("factor2")
    pyplot.show()


def drawFigure2(data_set, my_centroids):
    """Show a scatter plot of the samples together with the centroids.

    Args:
        data_set: 2-D array of samples; column 0 is x, column 1 is y.
        my_centroids: 2-D array of centroids with the same column layout.
    """
    pyplot = _require_pyplot()
    point_x = data_set[:, 0]
    point_y = data_set[:, 1]
    cent_x = my_centroids[:, 0]
    cent_y = my_centroids[:, 1]
    fig, ax = pyplot.subplots(figsize=(10, 5))
    ax.scatter(point_x, point_y, s=30, c="r", marker="o", label="sample point")
    ax.scatter(cent_x, cent_y, s=100, c="black", marker="v", label="centroids")
    ax.legend()
    ax.set_xlabel("factor1")
    ax.set_ylabel("factor2")
    pyplot.show()


if __name__ == "__main__":
    # Load the text file into a matrix, cluster it into 4 groups and plot.
    data_set = loadDataSet("testSet.txt")
    my_centroids, my_cluster_assment = Kmeans(data_set, 4)
    drawFigure2(data_set, my_centroids)
文章版權歸作者所有,未經允許請勿轉載,若此文章存在違規行為,您可以聯系管理員刪除。
轉載請注明本文地址:http://specialneedsforspecialkids.com/yun/43546.html
閱讀 3243·2021-10-27 14:20
閱讀 2525·2021-10-08 10:05
閱讀 1625·2021-09-09 09:33
閱讀 2902·2019-08-30 13:16
閱讀 1435·2019-08-29 18:34
閱讀 1171·2019-08-29 10:58
閱讀 1228·2019-08-28 18:22
閱讀 1226·2019-08-26 13:33