Sklearn preprocessing.scale ValueError: Input contains infinity or a value too large for dtype


Hi, I have exactly the problem described in the title. I created a generator that saves all bitcoin prices from Binance in OHLC + volume + RSI format to a csv file. That data is then fed into a model that is supposed to predict whether the price will rise or fall over the next 3 candles. Everything works fine as long as I don't add the volume column. I tried converting it to int and searched Google for answers, but nothing helped. I'm adapting the script from this tutorial to learn something: https://pythonprogramming.net/crypto-rnn-model-deep-learning-python-tensorflow-keras/
If I pasted the code badly, please correct me.
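
Could it be something like this? The following toy snippet (just my guess at isolating the problem, not my real data) throws the exact same error when a column contains a zero before pct_change:

import numpy as np
import pandas as pd
from sklearn import preprocessing

# toy 'volume' column with a zero, like a quiet hour on the exchange
volume = pd.Series([12.5, 0.0, 7.3, 9.1])

pct = volume.pct_change()  # (7.3 - 0.0) / 0.0 -> inf
pct = pct.dropna()         # drops the leading NaN, but NOT the inf
print(pct.values)          # [-1.         inf  0.24657534]

preprocessing.scale(pct.values)  # ValueError: Input contains infinity or a value too large for dtype('float64').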

SOURCE CODE FOR DOWNLOADING THE DATA

from binance.client import Client
import pandas as pd
#import talib as ta

client = Client("", "")
klines = client.get_historical_klines("BTCUSDT", Client.KLINE_INTERVAL_1HOUR, "18 Aug, 2017")

for line in klines:
    del line[6:]

df = pd.DataFrame(klines, columns=['time', 'open', 'high', 'low', 'close', 'volume'])  # Binance returns klines as open-time, open, high, low, close, volume
#df['rsi'] = ta.RSI(df['close'])
df.to_csv('btc.csv')
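
On a side note, since talib stayed commented out but the model below expects an 'rsi' column, something like this pandas-only Wilder RSI could stand in for ta.RSI (the rsi() helper is my own sketch, not talib's exact output):

def rsi(close, period=14):
    # Wilder's RSI: smoothed average gain vs. smoothed average loss
    delta = close.astype(float).diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1 / period, min_periods=period).mean()
    avg_loss = loss.ewm(alpha=1 / period, min_periods=period).mean()
    return 100 - 100 / (1 + avg_gain / avg_loss)

# df['rsi'] = rsi(df['close']) before df.to_csv('btc.csv')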

MODEL

import pandas as pd
from collections import deque
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
import time
from sklearn import preprocessing


SEQ_LEN = 6
FUTURE_PERIOD_PREDICT = 3


def classify(current, future):
    if float(future) > float(current):  # if the future price is higher than the current, that's a buy, or a 1
        return 1
    else:  # otherwise... it's a 0!
        return 0

main_df = pd.read_csv('btc.csv')
main_df.set_index("time", drop=True, inplace=True)
main_df = main_df[['low', 'high', 'open', 'close', 'volume', 'rsi']]  # assumes btc.csv actually contains an 'rsi' column
print(main_df)


main_df.fillna(method='ffill', inplace=True)
main_df.dropna(inplace=True)

main_df['future'] = main_df['close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df['close'], main_df['future']))

times = sorted(main_df.index.values)
last_5pct = times[-int(0.05 * len(times))]  # timestamp that separates off the last 5% of the data

validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]


def preprocess_df(df):
    df = df.drop(columns="future")  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # a previous value of 0 (e.g. zero volume) turns into inf here
            df.replace([np.inf, -np.inf], np.nan, inplace=True)  # turn those infs into NaNs...
            df.dropna(inplace=True)  # ...so this removes them along with the NaNs created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # standardize to zero mean and unit variance

    df.dropna(inplace=True)  # cleanup again... jic.

    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have SEQ_LEN (here 6) timesteps!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), np.array(y)


train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

"""print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")"""

model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))


opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(
    train_x, train_y,
    batch_size=64,
    epochs=10,
    validation_data=(validation_x, validation_y),
    callbacks=[checkpoint])  # without this, the checkpoint defined above is never used

# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

This is going to be hard to reproduce; try to isolate it more agnostically. Have you debugged it?
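
For example, before scaling you could count where the non-finite values come from; something along these lines (column names guessed from your csv):

import numpy as np
import pandas as pd

df = pd.read_csv('btc.csv')
num = df.select_dtypes(include=[np.number])

print(np.isinf(num).sum())               # +/-inf per column in the raw csv (probably none)
print(np.isinf(num.pct_change()).sum())  # inf created by pct_change -- watch the volume column
print((df['volume'] == 0).sum())         # zero-volume rows are exactly what pct_change turns into inf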


Not yet, I'm only just learning, but I'll give the debugger a try. I thought I might have made some classic mistake.
