Hi,
I recently trained a Keras text-generation model on about 600 files, each roughly 40-80 lines of fairly long messages. Here is my train function:
    import tensorflow as tf

    def train(self, file_path):
        text = open(file_path, 'rb').read().decode(encoding='utf-8', errors='ignore')
        if self.tokenizer is None:
            self.tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=False)
        self.tokenizer.fit_on_texts([text])
        total_words = len(self.tokenizer.word_index) + 1
        # Slide a window over the raw text one character at a time;
        # each window is word-tokenized below.
        sequences = []
        for i in range(self.sequence_length, len(text)):
            seq = text[i - self.sequence_length:i]
            sequences.append(seq)
        input_sequences = self.tokenizer.texts_to_sequences(sequences)
        input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            input_sequences, maxlen=self.sequence_length, padding='pre')
        # The last token of each window is the prediction target.
        inputs, targets = input_sequences[:, :-1], input_sequences[:, -1]
        targets = tf.keras.utils.to_categorical(targets, num_classes=total_words)
        if self.model is None:
            self.model = tf.keras.Sequential([
                tf.keras.layers.Embedding(total_words, self.embedding_dim,
                                          input_length=self.sequence_length - 1),
                tf.keras.layers.LSTM(self.rnn_units),
                tf.keras.layers.Dense(total_words, activation='softmax')
            ])
            self.model.compile(loss='categorical_crossentropy', optimizer='adam',
                               metrics=['accuracy'])
        self.model.fit(inputs, targets, epochs=3, batch_size=self.batch_size)
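For context, this is roughly how I drive training over the files (the wrapper class name and directory here are placeholders, not my exact code):

    import glob

    gen = MessageModel()  # placeholder name for the class that holds train()
    for path in sorted(glob.glob('data/*.txt')):
        gen.train(path)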
This is a sample of what it outputs:
    that
    l
    on
    a
    im sorry to i a
    to
    im to character a to not like a
    a
    ud83d
    with the one
    are i so
    4
    im sorry in ud83d
    and
    that
    to a
    it's my is
    to
    you a i have in a
    im the server
    im and some i still the
    like 4
    wait
    im
    im to to you in it's to a or
I expected it to at least form sentences, not just total garbage. If my expectations were too high, please do let me know. In case it helps, here is my generation function:
    import numpy as np

    def generate_text(self, seed_text):
        generated_text = ""
        token_list = self.tokenizer.texts_to_sequences([seed_text])[0]
        for _ in range(50):
            token_list = tf.keras.preprocessing.sequence.pad_sequences(
                [token_list], maxlen=self.sequence_length - 1, padding='pre')
            predicted_probs = self.model.predict(token_list, verbose=0)[0]
            # Sample the next word id from the softmax distribution.
            predicted_id = np.random.choice(len(predicted_probs), p=predicted_probs)
            if predicted_id == 0:
                # Index 0 is the padding index, so stop generating.
                break
            predicted_word = self.tokenizer.index_word.get(predicted_id, "")
            generated_text += " " + predicted_word
            # Shift the context window forward by one token.
            token_list = list(token_list[0][1:]) + [predicted_id]
        return generated_text
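Generation is invoked along these lines (the seed word here is just an example; I seed it with a word from the chat data):

    seed = "im"  # example seed word
    print(gen.generate_text(seed))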
Thank you for any support.