After applying various feature engineering steps, we analyze the time series data with regression.
We also examine how the results differ when no feature engineering is applied.
1. Import the required libraries
# Ignore the warnings
# (e.g. deprecation messages that can appear when library versions change)
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
# System related and data input controls
import os
# Data manipulation and visualization
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 20
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Modeling algorithms
# General
import statsmodels.api as sm
from scipy import stats
# Model selection
from sklearn.model_selection import train_test_split
# Evaluation metrics
# for regression
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
2. Load the data
# location = 'https://raw.githubusercontent.com/cheonbi/DataScience/master/Data/Bike_Sharing_Demand_Full.csv'
# Load the dataset from a local copy (the commented-out line above points to the original source)
location = './Data/BikeSharingDemand/Bike_Sharing_Demand_Full.csv'
raw_all = pd.read_csv(location)
raw_all
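Before preprocessing, it helps to confirm what was actually loaded. The sketch below is only a quick sanity check on the raw table; nothing beyond the load above is assumed:
print(raw_all.shape)           # number of rows and columns
print(raw_all.dtypes)          # 'datetime' is typically read in as a plain object/string at this point
print(raw_all.isnull().sum())  # missing values per column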
3. Preprocessing functions (feature_engineering and non_feature_engineering)
### Functionalize
### Feature engineering of default
def non_feature_engineering(raw):
    raw_nfe = raw.copy()
    if 'datetime' in raw_nfe.columns:
        raw_nfe['datetime'] = pd.to_datetime(raw_nfe['datetime'])
        raw_nfe['DateTime'] = pd.to_datetime(raw_nfe['datetime'])
    if raw_nfe.index.dtype == 'int64':
        raw_nfe.set_index('DateTime', inplace=True)
    # bring back
    # if raw_nfe.index.dtype != 'int64':
    #     raw_nfe.reset_index(drop=False, inplace=True)
    raw_nfe = raw_nfe.asfreq('H', method='ffill')
    return raw_nfe
# raw_rd = non_feature_engineering(raw_all)
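Mirroring the commented-out call above, a minimal check (the variable name raw_nfe_check is illustrative) to confirm that non_feature_engineering produces an hourly DatetimeIndex:
raw_nfe_check = non_feature_engineering(raw_all)
print(raw_nfe_check.index.dtype)  # expected: a datetime64 index
print(raw_nfe_check.index.freq)   # expected: hourly frequency after asfreq('H')
print(raw_nfe_check.shape)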
### Feature engineering of all
def feature_engineering(raw):
    raw_fe = raw.copy()
    # Convert to datetime type
    if 'datetime' in raw_fe.columns:
        raw_fe['datetime'] = pd.to_datetime(raw_fe['datetime'])
        raw_fe['DateTime'] = pd.to_datetime(raw_fe['datetime'])
    # Set DateTime as the index
    if raw_fe.index.dtype == 'int64':
        raw_fe.set_index('DateTime', inplace=True)
    # Resample to an hourly time series
    raw_fe = raw_fe.asfreq('H', method='ffill')
    # Time series decomposition (trend, seasonal)
    result = sm.tsa.seasonal_decompose(raw_fe['count'], model='additive')
    Y_trend = pd.DataFrame(result.trend)
    Y_trend.fillna(method='ffill', inplace=True)
    Y_trend.fillna(method='bfill', inplace=True)
    Y_trend.columns = ['count_trend']
    Y_seasonal = pd.DataFrame(result.seasonal)
    Y_seasonal.fillna(method='ffill', inplace=True)
    Y_seasonal.fillna(method='bfill', inplace=True)
    Y_seasonal.columns = ['count_seasonal']
    pd.concat([raw_fe, Y_trend, Y_seasonal], axis=1).isnull().sum()  # quick null check (result not stored)
    if 'count_trend' not in raw_fe.columns:
        if 'count_seasonal' not in raw_fe.columns:
            raw_fe = pd.concat([raw_fe, Y_trend, Y_seasonal], axis=1)
    # Moving averages
    # (daily window)
    Y_count_Day = raw_fe[['count']].rolling(24).mean()
    Y_count_Day.fillna(method='ffill', inplace=True)
    Y_count_Day.fillna(method='bfill', inplace=True)
    Y_count_Day.columns = ['count_Day']
    # (weekly window)
    Y_count_Week = raw_fe[['count']].rolling(24*7).mean()
    Y_count_Week.fillna(method='ffill', inplace=True)
    Y_count_Week.fillna(method='bfill', inplace=True)
    Y_count_Week.columns = ['count_Week']
    if 'count_Day' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_count_Day], axis=1)
    if 'count_Week' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_count_Week], axis=1)
    # Difference from the previous value
    Y_diff = raw_fe[['count']].diff()
    Y_diff.fillna(method='ffill', inplace=True)
    Y_diff.fillna(method='bfill', inplace=True)
    Y_diff.columns = ['count_diff']
    if 'count_diff' not in raw_fe.columns:
        raw_fe = pd.concat([raw_fe, Y_diff], axis=1)
    # Bin temperature into 10 intervals
    raw_fe['temp_group'] = pd.cut(raw_fe['temp'], 10)
    # Create time-based attributes
    raw_fe['Year'] = raw_fe.datetime.dt.year
    raw_fe['Quater'] = raw_fe.datetime.dt.quarter
    raw_fe['Quater_ver2'] = raw_fe['Quater'] + (raw_fe.Year - raw_fe.Year.min()) * 4
    raw_fe['Month'] = raw_fe.datetime.dt.month
    raw_fe['Day'] = raw_fe.datetime.dt.day
    raw_fe['Hour'] = raw_fe.datetime.dt.hour
    raw_fe['DayofWeek'] = raw_fe.datetime.dt.dayofweek
    # Lagged values
    raw_fe['count_lag1'] = raw_fe['count'].shift(1)
    raw_fe['count_lag2'] = raw_fe['count'].shift(2)
    raw_fe['count_lag1'].fillna(method='bfill', inplace=True)
    raw_fe['count_lag2'].fillna(method='bfill', inplace=True)
    # Create dummy variables
    if 'Quater' in raw_fe.columns:
        if 'Quater_Dummy' not in ['_'.join(col.split('_')[:2]) for col in raw_fe.columns]:
            raw_fe = pd.concat([raw_fe, pd.get_dummies(raw_fe['Quater'], prefix='Quater_Dummy', drop_first=True)], axis=1)
            del raw_fe['Quater']
    return raw_fe
# raw_fe = feature_engineering(raw_all)
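To see exactly which columns feature_engineering adds on top of the raw data, a short sketch (raw_fe_check and added_cols are illustrative names, not from the original notebook):
raw_fe_check = feature_engineering(raw_all)
added_cols = [col for col in raw_fe_check.columns if col not in raw_all.columns]
print(added_cols)  # e.g. count_trend, count_seasonal, count_Day, count_Week, count_diff, lags, time attributes, dummies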
4. Data split functions
### Functionalize
### Data split of cross sectional
def datasplit_cs(raw, Y_colname, X_colname, test_size, random_seed=123):
    X_train, X_test, Y_train, Y_test = train_test_split(raw[X_colname], raw[Y_colname], test_size=test_size, random_state=random_seed)
    print('X_train:', X_train.shape, 'Y_train:', Y_train.shape)
    print('X_test:', X_test.shape, 'Y_test:', Y_test.shape)
    return X_train, X_test, Y_train, Y_test
# X_train, X_test, Y_train, Y_test = datasplit_cs(raw_fe, Y_colname, X_colname, 0.2)
### Data split of time series
def datasplit_ts(raw, Y_colname, X_colname, criteria):
    raw_train = raw.loc[raw.index < criteria,:]
    raw_test = raw.loc[raw.index >= criteria,:]
    Y_train = raw_train[Y_colname]
    X_train = raw_train[X_colname]
    Y_test = raw_test[Y_colname]
    X_test = raw_test[X_colname]
    print('Train_size:', raw_train.shape, 'Test_size:', raw_test.shape)
    print('X_train:', X_train.shape, 'Y_train:', Y_train.shape)
    print('X_test:', X_test.shape, 'Y_test:', Y_test.shape)
    return X_train, X_test, Y_train, Y_test
# X_train, X_test, Y_train, Y_test = datasplit_ts(raw_fe, Y_colname, X_colname, '2012-07-01')
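Assuming the commented-out call above has been run (i.e. raw_fe, Y_colname, and X_colname are defined as in section 6 below), a quick check that the split is strictly chronological:
print(X_train.index.max())  # last training timestamp, expected to fall just before '2012-07-01'
print(X_test.index.min())   # first test timestamp, expected to be '2012-07-01 00:00'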
5. Evaluating the model results
### Functionalize
### Evaluation of 1 pair of set
def evaluation(Y_real, Y_pred, graph_on=False):
    loss_length = len(Y_real.values.flatten()) - len(Y_pred)
    if loss_length != 0:
        Y_real = Y_real[loss_length:]
    if graph_on == True:
        pd.concat([Y_real, pd.DataFrame(Y_pred, index=Y_real.index, columns=['prediction'])], axis=1).plot(
            kind='line', figsize=(20,6),
            xlim=(Y_real.index.min(), Y_real.index.max()),
            linewidth=3, fontsize=20)
        plt.title('Time Series of Target', fontsize=20)
        plt.xlabel('Index', fontsize=15)
        plt.ylabel('Target Value', fontsize=15)
    MAE = abs(Y_real.values.flatten() - Y_pred).mean()
    MSE = ((Y_real.values.flatten() - Y_pred)**2).mean()
    MAPE = (abs(Y_real.values.flatten() - Y_pred)/Y_real.values.flatten()*100).mean()
    Score = pd.DataFrame([MAE, MSE, MAPE], index=['MAE', 'MSE', 'MAPE'], columns=['Score']).T
    Residual = pd.DataFrame(Y_real.values.flatten() - Y_pred, index=Y_real.index, columns=['Error'])
    return Score, Residual
# Score_tr, Residual_tr = evaluation(Y_train, pred_tr_reg1, graph_on=True)
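The MAE and MSE computed by hand inside evaluation can be cross-checked against the sklearn metrics imported in section 1; this sketch assumes Y_train and pred_tr_reg1 from section 6 are available:
print(mean_absolute_error(Y_train.values.flatten(), pred_tr_reg1))  # should match the MAE in Score
print(mean_squared_error(Y_train.values.flatten(), pred_tr_reg1))   # should match the MSE in Score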
### Evaluation of train/test pairs
def evaluation_trte(Y_real_tr, Y_pred_tr, Y_real_te, Y_pred_te, graph_on=False):
    Score_tr, Residual_tr = evaluation(Y_real_tr, Y_pred_tr, graph_on=graph_on)
    Score_te, Residual_te = evaluation(Y_real_te, Y_pred_te, graph_on=graph_on)
    Score_trte = pd.concat([Score_tr, Score_te], axis=0)
    Score_trte.index = ['Train', 'Test']
    return Score_trte, Residual_tr, Residual_te
# Score_reg1, Resid_tr_reg1, Resid_te_reg1 = evaluation_trte(Y_train, pred_tr_reg1, Y_test, pred_te_reg1, graph_on=True)
6. Regression analysis (with feature engineering)
# Preprocess the data
raw_fe = feature_engineering(raw_all)
raw_fe
# Split the data
Y_colname = ['count']  # target attribute to predict
X_remove = ['datetime', 'DateTime', 'temp_group', 'casual', 'registered']  # attributes to exclude from the predictors
X_colname = [x for x in raw_fe.columns if x not in Y_colname+X_remove]
X_train, X_test, Y_train, Y_test = datasplit_ts(raw_fe, Y_colname, X_colname, '2012-07-01')  # split with the time-series split function defined above
# LinearRegression (using statsmodels)
fit_reg1 = sm.OLS(Y_train, X_train).fit()  # fit the regression model
display(fit_reg1.summary())  # summary statistics
# Predictions
pred_tr_reg1 = fit_reg1.predict(X_train).values
pred_te_reg1 = fit_reg1.predict(X_test).values
# Evaluate the predictions
Score_reg1, Resid_tr_reg1, Resid_te_reg1 = evaluation_trte(Y_train, pred_tr_reg1, Y_test, pred_te_reg1, graph_on=True)
print(Score_reg1)
print(Resid_tr_reg1)
print(Resid_te_reg1)
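Since evaluation also returns the residual series, a simple way to eyeball whether the errors drift over time is to plot them (the plotting options below are illustrative):
Resid_tr_reg1.plot(kind='line', figsize=(20,4), title='Train residuals over time')
Resid_te_reg1.plot(kind='line', figsize=(20,4), title='Test residuals over time')
plt.show()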
7. Regression analysis (without feature engineering)
# Preprocess the data (no feature engineering)
raw_nfe = non_feature_engineering(raw_all)
# Split the data
Y_colname = ['count']
X_remove = ['datetime', 'DateTime', 'temp_group', 'casual', 'registered']
X_colname = [x for x in raw_nfe.columns if x not in Y_colname+X_remove]
X_train, X_test, Y_train, Y_test = datasplit_ts(raw_nfe, Y_colname, X_colname, '2012-07-01')
# LinearRegression (using statsmodels)
fit_reg1n = sm.OLS(Y_train, X_train).fit()  # fit the regression model
display(fit_reg1n.summary())  # summary statistics
# Predictions
pred_tr_reg1n = fit_reg1n.predict(X_train).values
pred_te_reg1n = fit_reg1n.predict(X_test).values
# Evaluate the predictions
Score_reg1n, Resid_tr_reg1n, Resid_te_reg1n = evaluation_trte(Y_train, pred_tr_reg1n, Y_test, pred_te_reg1n, graph_on=True)
Score_reg1n
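To make the comparison with the feature-engineered model explicit, the two score tables can be stacked side by side (the 'FE'/'Non-FE' labels below are illustrative):
comparison = pd.concat([Score_reg1, Score_reg1n], axis=0, keys=['FE', 'Non-FE'])
print(comparison)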