Witam,
Dane treningowe: 24 405 wierszy z 24 parametrami, za pomocą których próbujemy przewidzieć 0 lub 1 dla każdego wiersza.
Zacząłem od modelu Sequential z biblioteki Keras i, po dostrojeniu hiperparametrów, uzyskałem około 63% na zbiorze testowym:
# Baseline classifier: a small L1-regularized MLP (24 inputs -> 128 -> 128 -> 1)
# with batch norm + dropout after each hidden layer, trained for 50 epochs.
model = Sequential([
    Dense(128, kernel_regularizer=regularizers.l1(0.00001),
          input_dim=24, activation='relu'),
    layers.BatchNormalization(),
    Dropout(0.5),
    Dense(128, kernel_regularizer=regularizers.l1(0.00001), activation='relu'),
    layers.BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=64, verbose=0)

# Keep the raw sigmoid outputs for later model stacking.
pred1 = model.predict(X_test)
list_of_models.append(pred1)

print('Accuracy on test set:')
print(model.evaluate(X_test, y_test, batch_size=64, verbose=0))
Czy można coś ulepszyć w kodzie, żeby poprawić predykcje na zbiorze testowym?
Są jakieś inne efektywne sposoby na poprawę wydajności modelów podobnych do tego?
Składanie modeli (stacking poniżej, boosting i bagging) nie dało dużo lepszych rezultatów:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from numpy import dstack
# Load the 24 feature columns and the binary labels from CSV.
data = pd.read_csv('Data2.csv', encoding='latin-1', low_memory=False)
X = data[['WRank', 'LRank', 'WIAG',
          'LOAG', 'ODW', 'ODL',
          'Wcomb', 'Lcomb', 'POSW', 'POSL', 'PINW',
          'PINL', 'DSWW', 'DSLW', 'FSIW', 'FSIL',
          'BPFGW', 'DSWL', 'DSLL', 'GPW', 'GPL',
          'BPFGL', 'TSSW', 'TSSL']]
print(X.shape)
labels = pd.read_csv('labels.csv', encoding='latin-1', low_memory=False)
y = labels[['y']]

X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.125, random_state=0)

# BUG FIX: the scaler was previously fit on the full dataset (X), leaking
# test-set statistics (mean/std) into preprocessing. Fit on the training
# split only, then apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

a = X_test.shape[0]   # number of test rows; base-model outputs are reshaped to (a, 1)
list_of_models = []   # collects one (a, 1) prediction column per base model
############################################ 1 model ########################
model = Sequential()
model.add(Dense(128,kernel_regularizer=regularizers.l1(0.00001),input_dim=24, activation='relu'))
model.add(layers.BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(128,kernel_regularizer=regularizers.l1(0.00001), activation='relu'))
model.add(layers.BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
model.fit(X_train, y_train,
epochs=50,
batch_size=64,verbose=0)
pred1 = model.predict(X_test)
list_of_models.append(pred1)
########################### 2 model ##################################
XGB = XGBClassifier(max_depth= 7, learning_rate=0.004549208122492831, n_estimators= 171, min_child_weight= 8,
subsample= 1, colsample_bytree= 0.792184214487922)
XGB = XGB.fit(X_train, y_train)
pred2 = XGB.predict(X_test)
pred2.transpose()
pred2 = pred2.reshape(a,1)
list_of_models.append(pred2)
######################## 3 model #########################
SVM = svm.SVC(kernel="rbf", gamma='scale', C=1)
SVM.fit(X_train, y_train)
pred3 = SVM.predict(X_test)
pred3 = pred3.transpose()
pred3 = pred3.reshape(a,1)
list_of_models.append(pred3)
####################### 4 model ########################
RF = RandomForestClassifier(n_estimators = 28, criterion='entropy',max_depth=7,
min_samples_split=10,max_leaf_nodes=1000,max_features='auto')
RF.fit(X_train, y_train)
pred5 = RF.predict(X_test)
pred5.transpose()
pred5 = pred5.reshape(a,1)
list_of_models.append(pred5)
##################### 5 model #########################
KNN = KNeighborsClassifier(n_neighbors = 100, p=1,
leaf_size = 25, algorithm = 'auto',
weights = 'distance')
KNN.fit(X_train, y_train)
pred4 = KNN.predict(X_test)
pred4.transpose()
pred4 = pred4.reshape(a,1)
list_of_models.append(pred4)
#################### META-LEARNER ########################
# BUG FIX: the original wrapped the stacking loop in a second, redundant
# `for model in list_of_models:` pass, rebuilding the identical matrix once
# per base model. A single hstack of the (a, 1) columns produces the same
# (a, n_models) matrix that the dstack-then-reshape dance did.
stack_X_train = np.hstack(list_of_models)
print(stack_X_train)

# NOTE(review): the meta-learner is trained on base-model predictions for
# the *test* split and on y_test — test-set leakage that inflates the
# reported accuracy. For honest stacking, train it on out-of-fold
# predictions generated from the training split instead.
stack_X_train, stack_X_test, stack_y_train, stack_y_test = train_test_split(
    stack_X_train, y_test, test_size=0.2, random_state=0)

# Meta-model: small MLP over the 5 base-model prediction columns.
model = Sequential()
model.add(Dense(128, input_dim=5, activation='relu'))  # 5 = number of base models
model.add(layers.BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(layers.BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(stack_X_train, stack_y_train, batch_size=128, epochs=40)
print(f"stack predction model accuracy:{model.evaluate(stack_X_test, stack_y_test, batch_size = 128, verbose=0)}")