TEMPLATE METHOD 템플릿 메소드
- 상위 클래스에 구조를, 하위 클래스에 처리를 구현하는 패턴.
- 공통 사항은 추상클래스에서 구현, 처리는 각각의 서브 클래스에서 구현.
- 코드 중복을 줄이고, 리팩토링에 유용하며, 확장에 용이.




// 샘플 코드 class AbstractClass{ private: virtual void subMethod() = 0; public: void templateMethod() {subMethod();} }; class ConcreteClass : public AbstractClass{ public: void subMethod() override {cout<< "subMethod"<<endl;} }; int main(){ AbstractClass *test = new ConcreteClass; test ->templateMethod(); delete test; return 0; }



//AbstractClass class 라면{ private: virtual void 스프넣기() = 0; void 물끓이기() {cout<<"물을 끓인다"<<endl;} public: void 만들기(){ 물끓이기(); 스프넣기(); } }; //ConcreteClass class 짜장라면 : public 라면{ private: void 스프넣기() override {cout<<"짜장 스프을 넣는다"<<endl;} }; // Main int main(){ 라면* a = new 짜장라면; a->만들기(); delete a; return 0; }



데이터셋 다운로드

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/


UCI Wine Quality Data set

Attribute information

  • 1 - fixed acidity
  • 2 - volatile acidity
  • 3 - citric acid
  • 4 - residual sugar
  • 5 - chlorides
  • 6 - free sulfur dioxide
  • 7 - total sulfur dioxide
  • 8 - density
  • 9 - pH
  • 10 - sulphates
  • 11 - alcohol
  • 12 - quality (score between 0 and 10)

코드 구현 


CSV 데이터 확인.


# Load the UCI white-wine quality CSV (semicolon-separated, all
# numeric) and peek at the first ten rows.
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wine_data = pd.read_csv('winequality-white.csv', delimiter=';', dtype=float)
wine_data.head(10)



데이터 자르기 및 quality 변수 값 변경.


# Split features (all columns but the last) from the target (quality).
x_data = wine_data.iloc[:, 0:-1]
y_data = wine_data.iloc[:, -1]

# Binarize the score: quality >= 7 -> 1, otherwise 0.
# FIX: numpy was used below but never imported in the original.
import numpy as np

y_data = np.array([1 if i >= 7 else 0 for i in y_data])

x_data.head(5)

# Train/test split (70% / 30%).
# FIX: the original called sklearn.model_selection.train_test_split,
# but only the function itself was imported, so the name `sklearn`
# was undefined — call the imported function directly.
train_x, test_x, train_y, test_y = train_test_split(
    x_data, y_data, test_size=0.3, random_state=42)


KNN


모델 구축


# Build a 2-nearest-neighbour classifier with Euclidean distance and
# fit it on the training split.
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(metric='euclidean', n_neighbors=2)
clf.fit(train_x, train_y)



Confusion Matrix


# Confusion matrix on the held-out test set.
# FIX: confusion_matrix was never imported, and y_pred_test was only
# defined in a later cell — compute it here so this cell is
# self-contained.
from sklearn.metrics import confusion_matrix

y_pred_test = clf.predict(test_x)
confusion = confusion_matrix(test_y, y_pred_test)
print("confusion_matrix\n{}".format(confusion))



성능


# Accuracy on both splits plus a per-class precision/recall report.
from sklearn.metrics import classification_report

y_pred_train = clf.predict(train_x)
y_pred_test = clf.predict(test_x)
y_pred_test2 = clf.predict_proba(test_x)  # class probabilities (for the ROC curve)

print("Train Data:", accuracy_score(train_y, y_pred_train))
print("Test Data", accuracy_score(test_y, y_pred_test))

y_true, y_pred = test_y, clf.predict(test_x)
print(classification_report(y_true, y_pred))


ROC CURVE


# ROC curve for the positive class (label 1) from the predicted
# probabilities, plus the area under it.
# FIX: roc_curve, auc and matplotlib were used without being imported.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

fpr, tpr, thresholds = roc_curve(test_y, y_pred_test2[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance (random-classifier) diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



최적 K 찾기


# Find a good k by sweeping n_neighbors and comparing train accuracy
# against test (generalization) accuracy.
# FIX: the original imported load_breast_cancer but never used it
# (copy-paste leftover — removed); matplotlib is imported explicitly;
# the range comment now matches the actual range(1, 20).
import matplotlib.pyplot as plt

training_accuracy = []
test_accuracy = []

# try n_neighbors from 1 to 19
neighbors_settings = range(1, 20)

for n_neighbors in neighbors_settings:
    # build a fresh model for this k
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(train_x, train_y)
    # record training-set accuracy
    training_accuracy.append(clf.score(train_x, train_y))
    # record test-set (generalization) accuracy
    test_accuracy.append(clf.score(test_x, test_y))

plt.plot(neighbors_settings, training_accuracy, label="train accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("accuracy")
plt.xlabel("n_neighbors")
plt.legend()












python을 이용한 Wine Quality 데이터 Decision Tree


데이터셋 다운로드

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/


UCI Wine Quality Data set

Attribute information

  • 1 - fixed acidity
  • 2 - volatile acidity
  • 3 - citric acid
  • 4 - residual sugar
  • 5 - chlorides
  • 6 - free sulfur dioxide
  • 7 - total sulfur dioxide
  • 8 - density
  • 9 - pH
  • 10 - sulphates
  • 11 - alcohol
  • 12 - quality (score between 0 and 10)



Python 에서 Decision Tree를 이미지화 해줄 Graphviz 설치.

https://graphviz.gitlab.io/_pages/Download/Download_windows.html


Zip 파일을 받아 원하는 경로에 설치를 해줍니다.

C:\Program Files (x86)\graphviz-2.38


코드 구현 


CSV 데이터 확인.


# Read the white-wine CSV; values are ';'-separated floats.
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wine_data = pd.read_csv('winequality-white.csv', sep=';', dtype=float)
wine_data.head(10)



데이터 자르기 및 quality 변수 값 변경.


# Features = every column but the last; target = quality.
x_data = wine_data.iloc[:, 0:-1]
y_data = wine_data.iloc[:, -1]

# Binarize the score: quality >= 7 -> 1, otherwise 0.
# FIX: numpy was used but never imported in the original.
import numpy as np

y_data = np.array([1 if i >= 7 else 0 for i in y_data])

x_data.head(5)

# Train/test split (70% / 30%).
# FIX: original referenced sklearn.model_selection.train_test_split
# without `import sklearn`; use the directly imported function.
train_x, test_x, train_y, test_y = train_test_split(
    x_data, y_data, test_size=0.3, random_state=42)



Decision Tree 


모델 구축


# Decision tree, depth-limited to 5.
# criterion: 'gini' (default) or 'entropy'.
# splitter: 'best' (default — picks the best split, a pruning-like
# effect) or 'random' (random split point per candidate feature).
# FIX: the original used `tree` without importing it, and the second
# comment line had lost its leading '#' (a syntax error).
from sklearn import tree

wine_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=5, splitter='random')
wine_tree.fit(train_x, train_y)


성능


# Train/test accuracy and per-class metrics for the fitted tree.
from sklearn.metrics import classification_report

y_pred_train = wine_tree.predict(train_x)
y_pred_test = wine_tree.predict(test_x)

print("Train Data:", accuracy_score(train_y, y_pred_train))
print("Test Data", accuracy_score(test_y, y_pred_test))

y_true, y_pred = test_y, wine_tree.predict(test_x)
print(classification_report(y_true, y_pred))




시각화


# Render the fitted tree as a PNG with Graphviz (via pydotplus).
import os

import pydotplus
from IPython.display import Image
from sklearn.tree import export_graphviz

# Prepend the local Graphviz install to PATH — adjust to your own
# install directory.
os.environ["PATH"] += os.pathsep + r'C:/Program Files (x86)/graphviz-2.38/release/bin'

# FIX: class_names='Qual' is treated as a sequence of characters, so
# the two classes were labelled 'Q' and 'u'. Pass one explicit label
# per class (0 = low quality, 1 = high quality) instead.
dot_data = export_graphviz(wine_tree, out_file=None,
                           feature_names=x_data.columns,
                           class_names=['low', 'high'],
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)

# The rendered depth mirrors the model's max_depth (5 here).
Image(graph.create_png())




Python을 이용한 UCI Wine Quality 데이터 Logistic Regression


데이터셋 다운로드

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/


UCI Wine Quality Data set

Attribute information

  • 1 - fixed acidity
  • 2 - volatile acidity
  • 3 - citric acid
  • 4 - residual sugar
  • 5 - chlorides
  • 6 - free sulfur dioxide
  • 7 - total sulfur dioxide
  • 8 - density
  • 9 - pH
  • 10 - sulphates
  • 11 - alcohol
  • 12 - quality (score between 0 and 10)


코드 구현 


CSV 데이터 확인.


# Imports and data load for the logistic-regression section.
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

wine_data = pd.read_csv('winequality-white.csv', sep=';', dtype=float)
wine_data.head(10)



데이터 자르기 및 quality 변수 값 변경.


# Features = every column but the last; target = quality.
x_data = wine_data.iloc[:, 0:-1]
y_data = wine_data.iloc[:, -1]

# Binarize the score: quality >= 7 -> 1, otherwise 0.
# FIX: numpy was used but never imported in the original.
import numpy as np

y_data = np.array([1 if i >= 7 else 0 for i in y_data])

x_data.head(5)

# Train/test split (70% / 30%).
# FIX: original referenced sklearn.model_selection.train_test_split
# without `import sklearn`; use the directly imported function.
train_x, test_x, train_y, test_y = train_test_split(
    x_data, y_data, test_size=0.3, random_state=42)


로지스틱 리그레션 모델 학습


# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator, so construction and fitting chain).
log_reg = LogisticRegression().fit(train_x, train_y)


성능 평가


# Evaluate: built-in score on the training split, manual accuracy on
# the test split, then a full per-class report.
from sklearn.metrics import classification_report

y_pred = log_reg.predict(test_x)
print("Train Data:", log_reg.score(train_x, train_y))
print("Test Data", sum(y_pred == test_y) / len(test_y))

y_true, y_pred = test_y, log_reg.predict(test_x)
print(classification_report(y_true, y_pred))



odds ratio


# Odds ratios (exp of the logit coefficients) with 95% confidence
# intervals.
# FIX: numpy was never imported here, and `logit` was only fitted in
# the later MLE cell — fit it first so this cell runs on its own.
import numpy as np

logit = sm.Logit(train_y, train_x).fit()

print(np.exp(logit.params))

params = logit.params
conf = logit.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print(np.exp(conf))



MLE


# Maximum-likelihood fit of the logit model and its summary table.
# NOTE(review): no constant column is added (sm.add_constant), so the
# model is fitted without an intercept — confirm that is intended.
logit = sm.Logit(train_y,train_x).fit()

logit.summary()




Boston House Prices dataset을 이용한 Multiple linear Regression.


Attribute information

  • CRIM per capita crime rate by town
  • ZN proportion of residential land zoned for lots over 25,000 sq.ft.
  • INDUS proportion of non-retail business acres per town
  • CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
  • NOX nitric oxides concentration (parts per 10 million)
  • RM average number of rooms per dwelling
  • AGE proportion of owner-occupied units built prior to 1940
  • DIS weighted distances to five Boston employment centres
  • RAD index of accessibility to radial highways
  • TAX full-value property-tax rate per 10,000
  • PTRATIO pupil-teacher ratio by town
  • B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
  • LSTAT lower status of the population
  • MEDV Median value of owner-occupied homes in 1000's

import
# Imports for the Boston multiple-linear-regression section.
# FIX: `import statsmodels.api as sm` appeared twice; the duplicate
# was dropped and the imports sorted into one group.
import matplotlib as mpl
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

데이터 셋 불러오기.

# Load the Boston housing data and assemble features plus the MEDV
# target into a single DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2 — this cell only runs on older versions.
from sklearn.datasets import load_boston

boston = load_boston()

dfX = pd.DataFrame(boston.data, columns=boston.feature_names)

dfy = pd.DataFrame(boston.target, columns=["MEDV"])

df_house = pd.concat([dfX, dfy], axis=1)

df_house.tail(10)



DataFrame의 describe()을 이용한 데이터 요약.


df_house.describe()



Seaborn을 이용한 pairplot


# Pairwise scatter/histogram matrix for a few columns of interest.
cols_of_interest = ["ZN", "NOX", "RM", "MEDV"]
sns.pairplot(df_house[cols_of_interest])
plt.show()




테스트, 검증 데이터 셋 구성과 학습 모델 생성


# Separate the features (every column except MEDV) from the target.
x_data = df_house.iloc[:, :-1]
y_data = df_house.iloc[:, -1]
x_data.tail(5)


# Split into train and test sets.
# Method 1 (manual 70/30 slicing) kept for reference:
# ratio = 0.7
# num_data = int(len(df_house) * ratio)
# train_X, test_X = x_data[:num_data], x_data[num_data:]  # slicing
# train_y, test_y = y_data[:num_data], y_data[num_data:]

# Method 2: random split, 67% train / 33% test.
# FIX: the original called sklearn.model_selection.train_test_split,
# but a bare `import sklearn` does not expose the model_selection
# submodule — import the function explicitly.
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x_data, y_data, test_size=0.33)


# Fit ordinary least squares via scikit-learn.
# fit_intercept=True adds the beta0 (intercept) term.
m_reg = LinearRegression(fit_intercept=True)

# Train the model.
m_reg.fit(train_x, train_y)

# Predict on the held-out data and report the R^2 score.
y_pred = m_reg.predict(test_x)
print(m_reg.score(test_x, test_y))
# example output:
# 0.6704720247887545


OLS 방식


# OLS fit via statsmodels for the full statistical summary table.
# NOTE(review): train_x has no constant column (sm.add_constant), so
# this model has no intercept — its statistics are not directly
# comparable to the scikit-learn fit with fit_intercept=True.
m_reg = sm.OLS(train_y, train_x).fit()

# Print out the statistics
m_reg.summary()

 


Mean Squared Error 구하기.


# Test-set mean squared error.
# FIX: the original used sklearn.metrics.mean_squared_error, but a
# bare `import sklearn` does not expose the metrics submodule —
# import the function explicitly.
from sklearn.metrics import mean_squared_error

Y_pred = m_reg.predict(test_x)
mse = mean_squared_error(test_y, Y_pred)
print(mse)
# example output:
# 25.67338712126145


예측값과 실제 값에 대한 Scatter plot 그리기


# Predicted vs. actual values; the red diagonal marks perfect
# prediction.
plt.scatter(test_y, y_pred)

ideal = np.linspace(min(test_y), max(test_y), 1000)
plt.plot(ideal, ideal, color='r')

plt.xlabel('Test_value')
plt.ylabel('Pred_value')
plt.show()



+ Recent posts