
k-means Clustering after Dimensionality Reduction

씩씩한 IT블로그 2020. 7. 5. 23:46

1. Dimensionality reduction (PCA):

When the data has many attributes, PCA reduces it to however many dimensions you want (here, two). A short sketch for checking how much variance each component keeps follows the code below.

# [Dimensionality reduction with PCA]

import sklearn.decomposition
import matplotlib.pyplot as plt
import numpy as np

def main():
    
    X, attributes = input_data()
    X = normalize(X)                  # min-max scale each attribute to [0, 1]
    pca, pca_array = run_PCA(X, 2)    # project onto the first two principal components
    visualize_2d_wine(pca_array)
    
    
def input_data():
    # read the attribute names, one per line
    f = open("data/attributes.txt", 'r')
    attributes = []
    while True:
        line = f.readline().strip("\n")
        if not line: break
        attributes.append(line)
    f.close()
    
    # read the wine data, one comma-separated row of floats per line
    f = open("data/wine.csv", 'r')
    X = []
    while True:
        line = f.readline().strip("\n")
        if not line: break
        line = list(map(float, line.split(",")))
        X.append(line)
    f.close()
    
    return np.array(X), attributes

def run_PCA(X, num_components):
    pca = sklearn.decomposition.PCA(n_components=num_components)
    pca.fit(X)
    pca_array = pca.transform(X)
    return pca, pca_array
    
def normalize(X):
    # min-max scale each attribute (column) to the range [0, 1]
    for i in range(X.shape[1]):
        X[:,i] = X[:,i] - np.min(X[:,i])  # subtract the column minimum
        X[:,i] = X[:,i] / np.max(X[:,i])  # divide by the shifted column maximum
    return X

def visualize_2d_wine(X):
    plt.scatter(X[:,0], X[:,1])  # scatter plot of the two principal components
    plt.savefig("image.png")

if __name__ == '__main__':
    main()
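
The listing above fixes the number of components at 2. As an illustrative aside (not from the original post), the sketch below reuses the same data loading and normalization helpers and prints pca.explained_variance_ratio_, the fraction of the total variance each principal component keeps, which helps judge whether two components are enough. The function name explained_variance_report and the max_components argument are made up for this example.

# [Checking explained variance - illustrative sketch]

import sklearn.decomposition
import numpy as np

def explained_variance_report(X, max_components=5):
    pca = sklearn.decomposition.PCA(n_components=max_components)
    pca.fit(X)
    # explained_variance_ratio_[i]: fraction of total variance kept by component i
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        print("component", i, ":", round(float(ratio), 3))
    print("cumulative:", np.cumsum(pca.explained_variance_ratio_))

# usage (assuming the helpers above are already defined):
# X, attributes = input_data()
# explained_variance_report(normalize(X))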

 

2. k-means clustering

: Plot the data reduced to two dimensions as a scatter plot on the coordinate plane and apply k-means clustering. A cross-check with sklearn.cluster.KMeans follows the listing.

# [k-means clustering]

import sklearn.decomposition
import sklearn.cluster
import matplotlib.pyplot as plt
import numpy as np

def main():
    X, attributes = input_data()
    X = normalize(X)
    pca, pca_array = run_PCA(X, 2)
    labels = kmeans(pca_array, 3, [0, 1, 2])
    visualize_2d_wine(pca_array, labels)

def input_data():
    X = []
    attributes = []
    
    with open('data/wine.csv') as fp:
        for line in fp:
            X.append([float(x) for x in line.strip().split(',')])
    
    with open('data/attributes.txt') as fp:
        attributes = [x.strip() for x in fp.readlines()]

    return np.array(X), attributes

def run_PCA(X, num_components):
    pca = sklearn.decomposition.PCA(n_components=num_components)
    pca.fit(X)
    pca_array = pca.transform(X)

    return pca, pca_array

def kmeans(X, num_clusters, initial_centroid_indices):
    N = len(X)
    centroids = X[initial_centroid_indices]   # start from the given rows of X
    labels = np.zeros(N, dtype=int)
    
    while True:
        change = False
        
        # Assignment step: give each point the label of its nearest centroid.
        for i in range(N):
            distances = [distance(centroids[j], X[i]) for j in range(num_clusters)]
            nearest = np.argmin(distances)
            if labels[i] != nearest:
                change = True
            labels[i] = nearest
        
        # Update step: move each centroid to the mean of the points assigned to it.
        for k in range(num_clusters):
            x = np.mean(X[labels == k][:, 0])
            y = np.mean(X[labels == k][:, 1])
            centroids[k] = [x, y]
        
        # Stop once a full pass changes no labels.
        if not change:
            break

    return labels
    
    
def distance(x1, x2):
    # Euclidean distance between two points
    return np.sqrt(np.sum((x1 - x2) ** 2))
    
def normalize(X):
    # min-max scale each attribute (column) to the range [0, 1]
    for dim in range(len(X[0])):
        X[:, dim] -= np.min(X[:, dim])
        X[:, dim] /= np.max(X[:, dim])
    return X

'''
In addition to the previous version, each data point is now colored by its cluster label.
'''

def visualize_2d_wine(X, labels):
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:,0], X[:,1], c=labels)  # color each point by its cluster label
    
    plt.savefig("image.svg", format="svg")

if __name__ == '__main__':
    main()
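
The listing imports sklearn.cluster but never calls it. As an illustrative cross-check (not from the original post), the handwritten kmeans() can be compared against sklearn.cluster.KMeans on the same PCA output. The cluster indices may come back permuted relative to the handwritten version, the n_init/random_state values are arbitrary choices for reproducibility, and kmeans_sklearn is a made-up name for this sketch.

# [Cross-check with sklearn.cluster.KMeans - illustrative sketch]

import sklearn.cluster

def kmeans_sklearn(pca_array, num_clusters=3):
    model = sklearn.cluster.KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    labels = model.fit_predict(pca_array)  # one cluster index per row of pca_array
    return labels

# usage (inside main(), after run_PCA):
# labels_sk = kmeans_sklearn(pca_array, 3)
# visualize_2d_wine(pca_array, labels_sk)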