Problem Description
I've written an generator which tokenizes and pads text sequences and want to use it in Model.fit().
This works fine if i instruct the generator to yield one element (2 times for train and test) per iteration (batch_size 1).
But when i call the generator with batch_size greater than 1 (i.e. 64) tensorflow complains that my model has only 1 input not 64.
Layer autoencoder expects 1 input(s), but it received 64 input tensors.
Code
Generator
def tokenize_and_pad(filename: str, batch_size: int, tokenizer: Tokenizer, maxlen=50, training=True):
batch = []
try:
for data in filereader_generator(filename, batch_size=batch_size):
data = tokenizer.texts_to_sequences(data)
data = keras.preprocessing.sequence.pad_sequences(
data,
padding='post',
truncating='post',
maxlen=maxlen
)
# data: [[3 2 2 ... 0 0 0]
# [3 2 2 ... 0 0 0]
# [3 2 2 ... 0 0 0]
# ...
# [3 2 2 ... 0 0 0]
# [3 2 2 ... 0 0 0]
# [3 2 2 ... 0 0 0]]
#
# len(data): 64
batch.append([data])
if len(batch) == batch_size:
batch = batch
if training:
yield batch, batch
else:
yield batch
batch = []
except Exception as e:
if training:
yield np.array([batch]), np.array([batch])
raise StopIteration
else:
yield batch
raise StopIteration
snippet from main
autoencoder, encoder, decoder = dense_autoencoder(latent_dim=latent_dims, layers=layers, activation=activation)
autoencoder.compile(optimizer='adam', loss='mse', metrics=['mse'])
print(autoencoder.summary())
train = tokenize_and_pad(f_train, batch_size=1, tokenizer=tokenizer)
val = tokenize_and_pad(f_val, batch_size=1, tokenizer=tokenizer)
steps_per_epoch = file_len(f_train)/batch_size
val_steps_per_epoch = file_len(f_val)/batch_size
autoencoder.fit(
train,
epochs=5,
steps_per_epoch=steps_per_epoch,
batch_size=batch_size,
validation_data=val,
validation_batch_size=batch_size,
validation_steps=val_steps_per_epoch
)
Autoencoder
def dense_autoencoder(latent_dim=100, layers=[1024], input_dims=50, activation='relu'):
layer_counter = 1
enc_in = keras.Input(shape=(input_dims,), name='enc_input')
x = keras.layers.Embedding(input_dim=100, output_dim=10, input_length=input_dims, mask_zero=True, name='embedding')(enc_in)
x = keras.layers.Flatten(name='flatten')(x)
for layer in layers:
x = keras.layers.Dense(layer, activation=activation, name=f"enc_l{layer_counter}")(x)
layer_counter += 1
enc_out = keras.layers.Dense(latent_dim, activation=activation, name='enc_out')(x)
layer_counter = 1
dec_in = keras.Input(shape=(latent_dim,), name='dec_input')
x = dec_in
for layer in reversed(layers):
x = keras.layers.Dense(layer, activation=activation, name=f"dec_l{layer_counter}")(x)
layer_counter += 1
dec_out = keras.layers.Dense(input_dims, name='dec_out')(x)
encoder = keras.Model(inputs=enc_in, outputs=enc_out, name='encoder')
decoder = keras.Model(inputs=dec_in, outputs=dec_out, name='decoder')
autoencoder = keras.Model(inputs=enc_in, outputs=decoder(enc_out), name='autoencoder')
return autoencoder, encoder, decoder
What i tried
yield np.array(batch), np.array(batch)
`
TF error: Invalid argument: Matrix size-incompatible: In[0]: [64,32000], In[1]: [500,2048]
yield [batch], [batch]
Same error as in problem description
yield [[batch], [batch]]
TF error: Layer autoencoder expects 1 input(s), but it received 128 input tensors.
Question
Is there a way to feed batches to the fit function and if yes how can i yield the batches right?
Or is this a misunderstanding of mine and the gernerator has to return only one value and the batch is build thru the batch_size parameter of the fit function ?
question from:
https://stackoverflow.com/questions/65864735/problem-when-generator-for-tf-keras-fit-returns-more-than-one-element