#! /bin/env python3 from __future__ import absolute_import, division, print_function, unicode_literals import argparse import logging import os import re import numpy as np import tensorflow as tf import tensorflow_datasets as tfds # Probably remove #import matplotlib.pyplot as plt # Constants MAX_SENTENCE_LENGTH = 40 BATCH_SIZE = 64 BUFFER_SIZE = 20000 # Hyper-parameters # Notebook poitns out that 'num_layers, d_model, and units have been reduced, and to see # https://arxiv.org/abs/1706.03762 for more information. #NUM_LAYERS = 2 NUM_LAYERS = 4 #D_MODEL = 256 D_MODEL = 512 NUM_HEADS = 8 #UNITS = 512 UNITS = 1024 DROPOUT = 0.1 SAMPLE_QUESTIONS = [ "Hi.", "What is your name?", "My name is Fred.", "What's your name?", "How ya doin?", "What do you do for a living?", "Would you like to have sex with me?", "Which sexy cartoon character do you like best?", "Anything else you want to ask?", "Are you human?", "Do you like me?", "What did you to today?", "Do you like pizza?", "Are you a man or a woman?" ] # Globals, set elsewhere START_TOKEN = None END_TOKEN = None VOCAB_SIZE = None EPOCHS = None # random but predictable results: #tf.random.set_seed(4242) logger = logging.getLogger() def initialize_dataset(questions, answers): # decoder inputs use the previous target as input # remove START_TOKEN from targets dataset = tf.data.Dataset.from_tensor_slices(( { 'inputs': questions, 'dec_inputs': answers[:, :-1] }, { 'outputs': answers[:, 1:] }, )) dataset = dataset.cache() dataset = dataset.shuffle(BUFFER_SIZE) dataset = dataset.batch(BATCH_SIZE) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) return dataset def preprocess_sentence(sentence): sentence = sentence.lower().strip() # creating a space between a word and the punctuation following it # eg: "he is a boy." => "he is a boy ." sentence = re.sub(r"([?.!,])", r" \1 ", sentence) sentence = re.sub(r'[" "]+', " ", sentence) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",") sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence) sentence = sentence.strip() # adding a start and an end token to the sentence return sentence # Tokenize, filter and pad sentences def tokenize_and_filter(tokenizer, inputs, outputs): tokenized_inputs, tokenized_outputs = [], [] for (sentence1, sentence2) in zip(inputs, outputs): # tokenize sentence sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN # check tokenized sentence max length if len(sentence1) <= MAX_SENTENCE_LENGTH and len(sentence2) <= MAX_SENTENCE_LENGTH: tokenized_inputs.append(sentence1) tokenized_outputs.append(sentence2) # pad tokenized sentences tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences( tokenized_inputs, maxlen=MAX_SENTENCE_LENGTH, padding='post') tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences( tokenized_outputs, maxlen=MAX_SENTENCE_LENGTH, padding='post') return tokenized_inputs, tokenized_outputs def load_file(filename): ''' Returns an array of questions and answers ''' questions, answers = [], [] count = 0 with open(filename, 'r') as file: while True: q, a = file.readline(), file.readline() if not q or not a: logger.info(f'{count} question and answer pairs read.') return questions, answers count += 1 questions.append(preprocess_sentence(q)) answers.append(preprocess_sentence(a)) # Unreachable def tokenizer_init(questions, answers): global START_TOKEN global END_TOKEN global VOCAB_SIZE # Build tokenizer using tfds for both questions and answers # not great taht this is depecated. TODO: Update to use tensorflow_text try: tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(questions + answers, target_vocab_size=2**13) except: tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(questions + answers, target_vocab_size=2**13) # Define start and end token to indicate the start and end of a sentence START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1] # Vocabulary size plus start and end token VOCAB_SIZE = tokenizer.vocab_size + 2 logger.debug(f'Vocab size: {VOCAB_SIZE}') logger.debug(f'Tokenized sample question: { tokenizer.encode(questions[20]) }') return tokenizer def scaled_dot_product_attention(query, key, value, mask): """Calculate the attention weights. """ matmul_qk = tf.matmul(query, key, transpose_b=True) # scale matmul_qk depth = tf.cast(tf.shape(key)[-1], tf.float32) logits = matmul_qk / tf.math.sqrt(depth) # add the mask to zero out padding tokens if mask is not None: logits += (mask * -1e9) # softmax is normalized on the last axis (seq_len_k) attention_weights = tf.nn.softmax(logits, axis=-1) output = tf.matmul(attention_weights, value) return output class MultiHeadAttention(tf.keras.layers.Layer): def __init__(self, d_model, num_heads, name="multi_head_attention"): super(MultiHeadAttention, self).__init__(name=name) self.num_heads = num_heads self.d_model = d_model assert d_model % self.num_heads == 0 self.depth = d_model // self.num_heads self.query_dense = tf.keras.layers.Dense(units=d_model) self.key_dense = tf.keras.layers.Dense(units=d_model) self.value_dense = tf.keras.layers.Dense(units=d_model) self.dense = tf.keras.layers.Dense(units=d_model) def split_heads(self, inputs, batch_size): inputs = tf.reshape( inputs, shape=(batch_size, -1, self.num_heads, self.depth)) return tf.transpose(inputs, perm=[0, 2, 1, 3]) def call(self, inputs): query, key, value, mask = inputs['query'], inputs['key'], inputs[ 'value'], inputs['mask'] batch_size = tf.shape(query)[0] # linear layers query = self.query_dense(query) key = self.key_dense(key) value = self.value_dense(value) # split heads query = self.split_heads(query, batch_size) key = self.split_heads(key, batch_size) value = self.split_heads(value, batch_size) # scaled dot-product attention scaled_attention = scaled_dot_product_attention(query, key, value, mask) scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) # concatenation of heads concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model)) # final linear layer outputs = self.dense(concat_attention) return outputs def create_padding_mask(x): mask = tf.cast(tf.math.equal(x, 0), tf.float32) # (batch_size, 1, 1, sequence length) return mask[:, tf.newaxis, tf.newaxis, :] def create_look_ahead_mask(x): seq_len = tf.shape(x)[1] look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) padding_mask = create_padding_mask(x) return tf.maximum(look_ahead_mask, padding_mask) class PositionalEncoding(tf.keras.layers.Layer): def __init__(self, position, d_model): super(PositionalEncoding, self).__init__() self.pos_encoding = self.positional_encoding(position, d_model) def get_angles(self, position, i, d_model): angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32)) return position * angles def positional_encoding(self, position, d_model): angle_rads = self.get_angles( position=tf.range(position, dtype=tf.float32)[:, tf.newaxis], i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :], d_model=d_model) # apply sin to even index in the array sines = tf.math.sin(angle_rads[:, 0::2]) # apply cos to odd index in the array cosines = tf.math.cos(angle_rads[:, 1::2]) pos_encoding = tf.concat([sines, cosines], axis=-1) pos_encoding = pos_encoding[tf.newaxis, ...] return tf.cast(pos_encoding, tf.float32) def call(self, inputs): return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :] def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"): inputs = tf.keras.Input(shape=(None, d_model), name="inputs") padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask") attention = MultiHeadAttention( d_model, num_heads, name="attention")({ 'query': inputs, 'key': inputs, 'value': inputs, 'mask': padding_mask }) attention = tf.keras.layers.Dropout(rate=dropout)(attention) attention = tf.keras.layers.LayerNormalization( epsilon=1e-6)(inputs + attention) outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention) outputs = tf.keras.layers.Dense(units=d_model)(outputs) outputs = tf.keras.layers.Dropout(rate=dropout)(outputs) outputs = tf.keras.layers.LayerNormalization( epsilon=1e-6)(attention + outputs) return tf.keras.Model( inputs=[inputs, padding_mask], outputs=outputs, name=name) def encoder(vocab_size, num_layers, units, d_model, num_heads, dropout, name="encoder"): inputs = tf.keras.Input(shape=(None,), name="inputs") padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask") embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs) embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32)) embeddings = PositionalEncoding(vocab_size, d_model)(embeddings) outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings) for i in range(num_layers): outputs = encoder_layer( units=units, d_model=d_model, num_heads=num_heads, dropout=dropout, name="encoder_layer_{}".format(i), )([outputs, padding_mask]) return tf.keras.Model( inputs=[inputs, padding_mask], outputs=outputs, name=name) def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"): inputs = tf.keras.Input(shape=(None, d_model), name="inputs") enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs") look_ahead_mask = tf.keras.Input( shape=(1, None, None), name="look_ahead_mask") padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask') attention1 = MultiHeadAttention( d_model, num_heads, name="attention_1")(inputs={ 'query': inputs, 'key': inputs, 'value': inputs, 'mask': look_ahead_mask }) attention1 = tf.keras.layers.LayerNormalization( epsilon=1e-6)(attention1 + inputs) attention2 = MultiHeadAttention( d_model, num_heads, name="attention_2")(inputs={ 'query': attention1, 'key': enc_outputs, 'value': enc_outputs, 'mask': padding_mask }) attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2) attention2 = tf.keras.layers.LayerNormalization( epsilon=1e-6)(attention2 + attention1) outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2) outputs = tf.keras.layers.Dense(units=d_model)(outputs) outputs = tf.keras.layers.Dropout(rate=dropout)(outputs) outputs = tf.keras.layers.LayerNormalization( epsilon=1e-6)(outputs + attention2) return tf.keras.Model( inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name) def decoder(vocab_size, num_layers, units, d_model, num_heads, dropout, name='decoder'): inputs = tf.keras.Input(shape=(None,), name='inputs') enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs') look_ahead_mask = tf.keras.Input( shape=(1, None, None), name='look_ahead_mask') padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask') embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs) embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32)) embeddings = PositionalEncoding(vocab_size, d_model)(embeddings) outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings) for i in range(num_layers): outputs = decoder_layer( units=units, d_model=d_model, num_heads=num_heads, dropout=dropout, name='decoder_layer_{}'.format(i), )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask]) return tf.keras.Model( inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name) def transformer(vocab_size, num_layers, units, d_model, num_heads, dropout, name="transformer"): inputs = tf.keras.Input(shape=(None,), name="inputs") dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs") enc_padding_mask = tf.keras.layers.Lambda( create_padding_mask, output_shape=(1, 1, None), name='enc_padding_mask')(inputs) # mask the future tokens for decoder inputs at the 1st attention block look_ahead_mask = tf.keras.layers.Lambda( create_look_ahead_mask, output_shape=(1, None, None), name='look_ahead_mask')(dec_inputs) # mask the encoder outputs for the 2nd attention block dec_padding_mask = tf.keras.layers.Lambda( create_padding_mask, output_shape=(1, 1, None), name='dec_padding_mask')(inputs) enc_outputs = encoder( vocab_size=vocab_size, num_layers=num_layers, units=units, d_model=d_model, num_heads=num_heads, dropout=dropout, )(inputs=[inputs, enc_padding_mask]) dec_outputs = decoder( vocab_size=vocab_size, num_layers=num_layers, units=units, d_model=d_model, num_heads=num_heads, dropout=dropout, )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask]) outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs) return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name) def train_model(): tf.keras.backend.clear_session() return transformer( vocab_size=VOCAB_SIZE, num_layers=NUM_LAYERS, units=UNITS, d_model=D_MODEL, num_heads=NUM_HEADS, dropout=DROPOUT) def loss_function(y_true, y_pred): y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1)) loss = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction='none')(y_true, y_pred) mask = tf.cast(tf.not_equal(y_true, 0), tf.float32) loss = tf.multiply(loss, mask) return tf.reduce_mean(loss) # A custom learning rate class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): def __init__(self, d_model, warmup_steps=4000): super(CustomSchedule, self).__init__() self.d_model = d_model self.d_model = tf.cast(self.d_model, tf.float32) self.warmup_steps = warmup_steps def __call__(self, step): arg1 = tf.math.rsqrt(step) arg2 = step * (self.warmup_steps**-1.5) return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) def get_config(self): # TODO: This is related to saving, and may not be correct at all. config = { 'd_model': self.d_model, 'warmup_steps': self.warmup_steps, } return config def accuracy(y_true, y_pred): # ensure labels have shape (batch_size, MAX_LENGTH - 1) y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1)) return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred) def evaluate(tokenizer, model, sentence): sentence = preprocess_sentence(sentence) sentence = tf.expand_dims( START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0) output = tf.expand_dims(START_TOKEN, 0) for i in range(MAX_SENTENCE_LENGTH): predictions = model(inputs=[sentence, output], training=False) # select the last word from the seq_len dimension predictions = predictions[:, -1:, :] predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32) # return the result if the predicted_id is equal to the end token if tf.equal(predicted_id, END_TOKEN[0]): break # concatenated the predicted_id to the output which is given to the decoder # as its input. output = tf.concat([output, predicted_id], axis=-1) return tf.squeeze(output, axis=0) def predict(tokenizer, model, sentence): prediction = evaluate(tokenizer, model, sentence) predicted_sentence = tokenizer.decode( [i for i in prediction if i < tokenizer.vocab_size]) #logger.debug('Input: {}'.format(sentence)) #logger.debug('Output: {}'.format(predicted_sentence)) return predicted_sentence def load_args(): parser = argparse.ArgumentParser() parser.add_argument('source', help='Source File') parser.add_argument('output', help='File to save the model') parser.add_argument('--debug', help='Extra debugging messages', action='store_true') parser.add_argument('--epochs', help='Number of epochs to train', type=int, default=20) parser.add_argument('--checkpointdir', help='Filename for checkpoint state', default='checkpoints') parser.add_argument('--resume', help='Resume and keep training', action='store_true') parser.add_argument('--loadcheckpoint', help='Load checkpoint but skip training', action='store_true') args = parser.parse_args() global logger if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) global EPOCHS EPOCHS = args.epochs return args def main(): global logger handler = logging.StreamHandler() logger.addHandler(handler) args = load_args() logger.debug(f'Loading file {args.source}') questions, answers = load_file(args.source) logger.debug('Sample question: {}'.format(questions[25])) logger.debug('Sample answer: {}'.format(answers[25])) tokenizer = tokenizer_init(questions, answers) questions, answers = tokenize_and_filter(tokenizer, questions, answers) dataset = initialize_dataset(questions, answers) logger.debug(create_look_ahead_mask(tf.constant([[1, 2, 0, 4, 5]]))) model = train_model() learning_rate = CustomSchedule(D_MODEL) optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy]) # Create a callback that saves the model's weights # TODO: Add a resume learning step. See https://www.tensorflow.org/tutorials/keras/save_and_load for # information on how tor esume from a checkpoint cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(args.checkpointdir, f'{args.output}.ckpt'), save_weights_only=True, verbose=1) if args.resume: logger.debug('Loading previously saved checkpoint') model.load_weights(os.path.join(args.checkpointdir, f'{args.output}.ckpt')).expect_partial() if args.loadcheckpoint: logger.debug('Loading previously saved checkpoint but expecting partial') model.load_weights(os.path.join(args.checkpointdir, f'{args.output}.ckpt')).expect_partial() if not args.loadcheckpoint: logger.debug(f'Untrained model:\n{model.summary()}') model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback]) logger.debug(f'Trained model:\n{model.summary()}') # save the model - This doesn't work yet #model.save(args.output) for sentence in SAMPLE_QUESTIONS: print(f'Q: {sentence}') print(f'A: {predict(tokenizer, model, sentence)}') if __name__ == "__main__": main()