PCA & PCR Code
# -*- coding: utf-8 -*-
# DO NOT CHANGE
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
def cal_PC(X, n_components):
    """Compute the leading principal components of a data matrix.

    The data is mean-centered per feature on a private copy (the caller's
    array is never modified), the scatter matrix ``Xc.T @ Xc`` is formed, and
    its eigen-decomposition is taken with ``np.linalg.eigh``.

    Args:
        X: array-like of shape (n_samples, n_features). Integer input is
            accepted; it is converted to float internally.
        n_components: number of principal components to return; must satisfy
            ``0 <= n_components <= n_features``.

    Returns:
        A tuple ``(eigenvalue, eigenvector)``:

        * ``eigenvalue`` -- shape (n_components,), the largest eigenvalues of
          the scatter matrix in descending order. These are the eigenvalues
          of ``Xc.T @ Xc``; no division by the sample count is applied.
        * ``eigenvector`` -- shape (n_components, n_features). Each ROW is
          one unit-length principal component, which is the layout
          ``proj_PC`` expects (it multiplies by ``eigvec.T``).

    Raises:
        ValueError: if X is not 2-D or n_components is out of range.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_features = data.shape[1]
    if not 0 <= n_components <= n_features:
        raise ValueError(
            "n_components must be between 0 and n_features ({}), got {}".format(
                n_features, n_components
            )
        )

    # Subtracting the mean builds a new array, so the caller's X is untouched.
    centered = data - data.mean(axis=0)
    scatter = np.matmul(centered.T, centered)

    # The scatter matrix is symmetric, so eigh applies: it returns real
    # eigenvalues in ASCENDING order. Reverse both outputs so that the first
    # entries correspond to the directions of largest variance.
    eigvals, eigvecs = np.linalg.eigh(scatter)
    eigenvalue = eigvals[::-1][:n_components].copy()
    eigenvector = eigvecs[:, ::-1][:, :n_components].T.copy()
    return eigenvalue, eigenvector
def proj_PC(X, eigvec):
    """Project data onto a set of principal components.

    Args:
        X: array-like of shape (n_samples, n_features). It is used as given;
            no centering is applied here.
        eigvec: array-like of shape (n_components, n_features) with one
            principal component per ROW, i.e. the layout returned by
            ``cal_PC``.

    Returns:
        ndarray of shape (n_samples, n_components): ``X`` multiplied by the
        transpose of ``eigvec``.
    """
    components = np.asarray(eigvec)
    # Components are stored row-wise, so transpose to (n_features, n_components).
    return np.matmul(np.asarray(X), components.T)
def PCR(X, y, n_components):
    """Fit a principal component regression model.

    ``X`` is projected onto the ``n_components`` principal components
    returned by ``cal_PC`` (using ``proj_PC``), and a scikit-learn
    ``LinearRegression`` is fitted on the projected data.

    Args:
        X: input data matrix, passed unchanged to ``cal_PC`` and ``proj_PC``.
        y: output target vector.
        n_components: the number of principal components to regress on.

    Returns:
        The fitted ``LinearRegression``. Its inputs are the projected data,
        not the original features, so callers must apply ``proj_PC`` with
        the same components before calling ``score`` or ``predict``.
    """
    # Only the component directions are needed; the eigenvalues are discarded.
    _, components = cal_PC(X, n_components)
    projected = proj_PC(X, components)
    return LinearRegression().fit(projected, y)
# PCA
iris = datasets.load_iris()
X1 = iris.data
y1 = iris.target
n_components = 2
eigval, eigvec = cal_PC(X1, n_components)
# Center explicitly before projecting: scikit-learn's PCA.transform subtracts
# the per-feature mean, so the hand-written projection has to do the same for
# the two plots to be comparable.
T1 = proj_PC(X1 - X1.mean(axis=0), eigvec)

# Get transformed data using PCA implemented by scikit-learn
pca = PCA(n_components=n_components)
pca.fit(X1)
T = pca.transform(X1)

# Plot the two projections side by side. Drawing both on one set of axes
# with identical colours makes them impossible to tell apart. The sign of a
# principal component is arbitrary, so one plot may be a mirror image of the
# other along either axis.
fig, (ax_sklearn, ax_manual) = plt.subplots(1, 2, figsize=(10, 4))
ax_sklearn.scatter(T[:, 0], T[:, 1], c=y1)
ax_sklearn.set_title("scikit-learn PCA")
ax_manual.scatter(T1[:, 0], T1[:, 1], c=y1)
ax_manual.set_title("cal_PC + proj_PC")
for ax in (ax_sklearn, ax_manual):
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
plt.show()
# Regression
n_components = 4
# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this line only works on older releases. Switching to a
# different dataset would change the reported numbers, so it is left as is.
boston = datasets.load_boston()
X2 = boston.data
y2 = boston.target
reg_pca = PCR(X2, y2, n_components)

# Build a regression model using all features
reg = LinearRegression()
reg.fit(X2, y2)

# Compare R-square using all samples of PCR with ordinary regression model
# Reuse the PCR model fitted above rather than fitting an identical one again.
reg_pcr = reg_pca
# The PCR model takes projected data as input, so project X2 onto the same
# number of components before scoring.
eigval, eigvec = cal_PC(X2, n_components)
T2 = proj_PC(X2, eigvec)
r2_all_features = reg.score(X2, y2)
r2_pcr = reg_pcr.score(T2, y2)
# Bare expression statements discard their value in a script; print them.
print("R^2, ordinary regression (all features): {:.4f}".format(r2_all_features))
print("R^2, PCR ({} components): {:.4f}".format(n_components, r2_pcr))
'Study > Data Science' 카테고리의 다른 글
PCA(Principal component analysis) 간단 정리 (0) | 2019.02.21 |
---|---|
Python 감성 분석 및 감성사전 구축. (5) | 2018.12.27 |
NLTK를 이용한 Frequency Distributions, Conditional Frequency Distributions, Stopwords (0) | 2018.11.08 |
Python NLTK를 이용한 sent, word tokenize (0) | 2018.11.06 |
F-test and t-test for OLS regression on the Boston dataset (0) | 2018.10.17 |