1. Dimensionality reduction (PCA):
When the data has many attributes, PCA reduces it to however many dimensions you want.
# [Dimensionality reduction with PCA]
import sklearn.decomposition
import matplotlib.pyplot as plt
import numpy as np

def main():
    X, attributes = input_data()
    X = normalize(X)                 # min-max scale each attribute before PCA
    pca, pca_array = run_PCA(X, 2)   # project the data onto 2 principal components
    visualize_2d_wine(pca_array)

def input_data():
    f = open("data/attributes.txt", 'r')
    attributes = []
    while True:
        line = f.readline().strip("\n")
        if not line: break
        attributes.append(line)
    f.close()

    f = open("data/wine.csv", 'r')
    X = []
    while True:
        line = f.readline().strip("\n")
        if not line: break
        X.append(list(map(float, line.split(","))))
    f.close()
    return np.array(X), attributes

def run_PCA(X, num_components):
    pca = sklearn.decomposition.PCA(n_components=num_components)
    pca.fit(X)
    pca_array = pca.transform(X)
    return pca, pca_array

def normalize(X):
    for i in range(X.shape[1]):
        X[:, i] = X[:, i] - np.min(X[:, i])  # subtract the column minimum
        X[:, i] = X[:, i] / np.max(X[:, i])  # divide by the (shifted) column maximum
    return X

def visualize_2d_wine(X):
    plt.scatter(X[:, 0], X[:, 1])  # show the projected points as a scatter plot
    plt.savefig("image.png")

if __name__ == '__main__':
    main()
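To see how much of the original variance the two components actually preserve, you can inspect the fitted PCA object right after run_PCA. This is an optional check, not part of the exercise code above; explained_variance_ratio_ is a standard attribute of scikit-learn's PCA.

# Optional: fraction of variance kept by each of the two components
pca, pca_array = run_PCA(X, 2)
print(pca.explained_variance_ratio_)        # per-component variance ratios
print(pca.explained_variance_ratio_.sum())  # total variance retained by the 2D projection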
2. K-means clustering
: Plot the data reduced to two dimensions as a scatter plot on the coordinate plane, then apply k-means clustering to it.
# [K-means clustering]
import sklearn.decomposition
import sklearn.cluster
import matplotlib.pyplot as plt
import numpy as np

def main():
    X, attributes = input_data()
    X = normalize(X)
    pca, pca_array = run_PCA(X, 2)
    labels = kmeans(pca_array, 3, [0, 1, 2])  # 3 clusters, seeded with the first 3 points
    visualize_2d_wine(pca_array, labels)

def input_data():
    X = []
    attributes = []
    with open('data/wine.csv') as fp:
        for line in fp:
            X.append([float(x) for x in line.strip().split(',')])
    with open('data/attributes.txt') as fp:
        attributes = [x.strip() for x in fp.readlines()]
    return np.array(X), attributes

def run_PCA(X, num_components):
    pca = sklearn.decomposition.PCA(n_components=num_components)
    pca.fit(X)
    pca_array = pca.transform(X)
    return pca, pca_array

def kmeans(X, num_clusters, initial_centroid_indices):
    N = len(X)
    centroids = X[initial_centroid_indices]
    labels = np.zeros(N)
    while True:
        change = False
        # Assignment step: label each point with its nearest centroid.
        for i in range(N):
            distances = []
            for j in range(num_clusters):
                distances.append(distance(centroids[j], X[i]))
            if labels[i] != np.argmin(distances):
                change = True
            labels[i] = np.argmin(distances)
        # Update step: move each centroid to the mean of the points assigned to it.
        for k in range(num_clusters):
            x = np.mean(X[labels == k][:, 0])
            y = np.mean(X[labels == k][:, 1])
            centroids[k] = [x, y]
        # Stop once no label changed during the assignment step.
        if not change:
            break
    return labels

def distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))  # Euclidean distance

def normalize(X):
    for dim in range(len(X[0])):
        X[:, dim] -= np.min(X[:, dim])
        X[:, dim] /= np.max(X[:, dim])
    return X

'''
In addition to the previous step, each data point is now colored by its cluster label.
'''
def visualize_2d_wine(X, labels):
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], X[:, 1], c=labels)
    plt.savefig("image.svg", format="svg")

if __name__ == '__main__':
    main()
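The script imports sklearn.cluster but implements k-means by hand. For comparison, a rough library equivalent is sketched below; it is not the exercise code, and because scikit-learn's KMeans picks its own initial centroids (k-means++ by default) instead of the hand-picked indices [0, 1, 2], the cluster labels may come out permuted relative to the manual version.

# Sketch: library version of the hand-written kmeans() above
import sklearn.cluster

def kmeans_sklearn(pca_array, num_clusters=3):
    km = sklearn.cluster.KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    return km.fit_predict(pca_array)  # cluster index for each row of pca_array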