I am trying to experiment with transferring the hidden states of an LSTM from an encoder layer to a decoder layer, as demonstrated in the Keras blog.
My data is randomly generated sine-waves (that is, wavelength and phase are determined randomly, as well as the length of the sequence), and the network is trained to receive a number of sine-waves and predict their progression.
Without transferring the hidden states, my code is as follows:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, TimeDistributed,Lambda, Dropout, Activation ,RepeatVector
from keras.callbacks import ModelCheckpoint
import numpy as np
# Number of sine-wave channels (features) per time step.
features_num = 5

# Encoder: consumes a sequence of unspecified length (time axis is None) and,
# with return_state=False and no return_sequences, emits a single 40-unit
# output vector per sample.
encoder_inputs = Input(shape=(None, features_num))
encoder = LSTM(40, return_state=False)
encoder_outputs = encoder(encoder_inputs)

# Decoder: the encoder vector is repeated 150 times so the decoder LSTM has
# one input per predicted time step (the generators use the last 150 points
# of each sequence as the target).
decoder_input = RepeatVector(150)(encoder_outputs)
decoder_lstm = LSTM(40, return_sequences=True, return_state=True)
# The decoder's final states are returned but not needed here.
decoder_outputs, _, _ = decoder_lstm(decoder_input)
# Project every decoder time step back to features_num values.
decoder_outputs = TimeDistributed(Dense(features_num))(decoder_outputs)

model = Model(encoder_inputs, decoder_outputs)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
model.compile(loss='mean_squared_error', optimizer='adam')
def create_wavelength(min_wavelength, max_wavelength, fluxes_in_wavelength, category):
    """Draw a random wavelength whose fractional offset encodes a data split.

    The value is built from a random grid point plus a category-dependent
    offset, so that different categories never produce the same wavelength.

    Args:
        min_wavelength: Value added to the random grid position.
        max_wavelength: Together with ``min_wavelength`` sets the span
            (``max_wavelength - min_wavelength``) of the random grid position.
        fluxes_in_wavelength: Number of grid steps per unit of wavelength.
        category: Split selector: 0 - train, 2 - validate, 4 - test;
            1, 3 and 5 are dead space separating the splits.

    Returns:
        The grid value plus an offset of ``(category + u) / 6`` grid steps,
        where ``u`` is uniform in [0, 1).
    """
    # Two random draws are made, in this order: the category offset first,
    # then the grid position. The order matters for seeded reproducibility.
    category_offset = (category + np.random.random()) / 6
    steps = fluxes_in_wavelength
    grid_index = np.trunc(steps * np.random.random() * (max_wavelength - min_wavelength))
    grid_value = (grid_index + steps * min_wavelength) / steps
    return grid_value + category_offset / steps
def make_line(length, category):
    """Generate one sine wave with a random phase and random wavelength.

    Args:
        length: Number of samples to generate.
        category: Split selector forwarded to ``create_wavelength``.

    Returns:
        A 1-D array of ``length`` values ``sin(t / wavelength + phase)`` for
        ``t = 0 .. length - 1``.
    """
    # Draw order (phase first, then wavelength) is kept for seeded runs.
    phase = np.random.random()
    # NOTE(review): the arguments are passed as min=30, max=10. With that
    # order the integer part comes out as 30 minus 0..19 (i.e. 11..30).
    # Confirm that this range, rather than 10..29, is what was intended.
    period = create_wavelength(30, 10, 1, category)
    time_steps = np.arange(length)
    return np.sin(time_steps / period + phase)
def make_data(seq_num, seq_len, dim, category):
    """Build a batch of independent random sine-wave sequences.

    Args:
        seq_num: Number of sequences (samples) in the batch.
        seq_len: Number of time steps in every sequence.
        dim: Number of sine-wave channels per time step.
        category: Split selector forwarded to ``make_line``.

    Returns:
        A float array of shape ``(seq_num, seq_len, dim)`` where
        ``data[i, :, j]`` is one independently generated sine wave.
    """
    # Preallocate instead of growing with np.append: np.append copies the
    # whole accumulated array on every call, which made the original loop
    # quadratic in the number of sequences.
    data = np.empty((seq_num, seq_len, dim))
    # Same iteration order as before (samples outer, channels inner) so the
    # sequence of random draws, and hence seeded output, is unchanged.
    for sample_idx in range(seq_num):
        for channel_idx in range(dim):
            data[sample_idx, :, channel_idx] = make_line(seq_len, category)
    return data
def train_generator():
    """Endlessly yield freshly generated training batches.

    Each batch holds 1000 sequences of one randomly chosen total length
    (300 to 449 steps), generated with category 0 (train).

    Yields:
        ``(x_train, y_train)``: everything except the last 150 time steps,
        and the last 150 time steps, of the same batch.
    """
    horizon = 150  # number of trailing steps the network must predict
    while True:
        total_len = np.random.randint(150, 300) + 150
        batch = make_data(1000, total_len, features_num, 0)
        inputs = batch[:, :-horizon, :]
        targets = batch[:, -horizon:, :]
        yield inputs, targets
def val_generator():
    """Endlessly yield freshly generated validation batches.

    Each batch holds 1000 sequences of one randomly chosen total length
    (300 to 449 steps), generated with category 2 (validate).

    Yields:
        ``(x_val, y_val)``: everything except the last 150 time steps,
        and the last 150 time steps, of the same batch.
    """
    horizon = 150  # number of trailing steps the network must predict
    while True:
        total_len = np.random.randint(150, 300) + 150
        batch = make_data(1000, total_len, features_num, 2)
        inputs = batch[:, :-horizon, :]
        targets = batch[:, -horizon:, :]
        yield inputs, targets
def test_maker():
    """Generate a single test batch.

    The batch holds 1000 sequences of one randomly chosen total length
    (300 to 449 steps), generated with category 4 (test).

    Returns:
        ``(x_test, y_test)``: everything except the last 150 time steps,
        and the last 150 time steps, of the same batch.
    """
    horizon = 150  # number of trailing steps the network must predict
    total_len = np.random.randint(150, 300) + 150
    batch = make_data(1000, total_len, features_num, 4)
    return batch[:, :-horizon, :], batch[:, -horizon:, :]
# Train the model, save it, then score it on one generated test batch.

# File that receives the checkpointed weights/model during training.
filepath_for_w= 'flux_vi_model.h5'
# Checkpoint on 'val_loss' with save_best_only=True, so the file is only
# rewritten when the monitored validation loss improves.
checkpointer=ModelCheckpoint(filepath_for_w, monitor='val_loss', verbose=0, save_best_only=True, mode='auto', period=1)
# Train from the endless generators: 30 generated batches per epoch for
# 1000 epochs, validating on 30 generated batches after each epoch.
model.fit_generator(train_generator(),callbacks=[checkpointer], steps_per_epoch=30, epochs=1000, verbose=1,validation_data=val_generator(),validation_steps=30)
# NOTE(review): the argument is the string literal 'filepath_for_w', not the
# variable of that name, so the final model is written to a file literally
# called "filepath_for_w". If the variable was intended, note that saving
# there would overwrite the best checkpoint written above - confirm intent.
model.save('filepath_for_w')
# Score on one freshly generated test batch.
x,y= test_maker()
a=model.predict (x)
# Persist predictions, targets and inputs for later inspection.
np.save ('a.npy',a)
np.save ('y.npy',y)
np.save ('x.npy',x)
# Mean absolute difference between the targets and the predictions.
print (np.mean(np.absolute(y-a)))
The result is the mean absolute difference between the actual last 150 points of the sine-waves and the predicted values.
For this code, I received a result of 0.065.
When I tried to make use of the hidden states of the LSTM, my results, to my surprise, worsened. I used the same code, replacing only the model definition with:
# Variant of the model in which the encoder's final LSTM states are handed
# to the decoder as its initial state.
encoder_inputs = Input(shape=(None, features_num))
# return_state=True makes the layer return its output plus the final hidden
# state (state_h) and cell state (state_c).
encoder = LSTM(40, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]
# The encoder output is still repeated 150 times as the decoder's input
# sequence, exactly as in the variant without state transfer.
decoder_input=RepeatVector(150)(encoder_outputs)
decoder_lstm = LSTM(40, return_sequences=True, return_state=True)
# Start the decoder from the encoder's final states; the decoder's own final
# states are returned but discarded.
decoder_outputs,_,_=decoder_lstm(decoder_input, initial_state=encoder_states)
# Project every decoder time step back to features_num values.
decoder_outputs=TimeDistributed(Dense(features_num))(decoder_outputs)
The result was 0.101, indicating a reduced ability to predict the continuation of the sine-wave when the decoder has access to the hidden states of the encoder.
Is my approach wrong in this case and the hidden states cannot be used to improve prediction? Or did I construct the model incorrectly?
See the Question & Answers page for more detail:
os