#!/usr/bin/env python3
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import re

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Probably remove
#import matplotlib.pyplot as plt

# Constants
MAX_SENTENCE_LENGTH = 40
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Hyper-parameters
# The notebook points out that num_layers, d_model, and units have been
# reduced; see https://arxiv.org/abs/1706.03762 for more information.
#NUM_LAYERS = 2
NUM_LAYERS = 4
#D_MODEL = 256
D_MODEL = 512
NUM_HEADS = 8
#UNITS = 512
UNITS = 1024
DROPOUT = 0.1

SAMPLE_QUESTIONS = [
    "Hi.",
    "What is your name?",
    "My name is Fred.",
    "What's your name?",
    "How ya doin?",
    "What do you do for a living?",
    "Would you like to have sex with me?",
    "Which sexy cartoon character do you like best?",
    "Anything else you want to ask?",
    "Are you human?",
    "Do you like me?",
    "What did you do today?",
    "Do you like pizza?",
    "Are you a man or a woman?"
]

# Globals, set elsewhere
START_TOKEN = None
END_TOKEN = None
VOCAB_SIZE = None
EPOCHS = None

# random but predictable results:
#tf.random.set_seed(4242)

logger = logging.getLogger()

def initialize_dataset(questions, answers):
    # decoder inputs use the previous target as input
    # remove START_TOKEN from targets
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'inputs': questions,
            'dec_inputs': answers[:, :-1]
        },
        {
            'outputs': answers[:, 1:]
        },
    ))
    dataset = dataset.cache()
    dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

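# The dataset above implements teacher forcing: for every pair, the decoder
# sees the answer shifted right (answers[:, :-1], still starting with
# START_TOKEN) and is trained to predict the answer shifted left
# (answers[:, 1:]).
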
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    # start and end tokens are added later, in tokenize_and_filter()
    return sentence

# Tokenize, filter and pad sentences
def tokenize_and_filter(tokenizer, inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []

    for (sentence1, sentence2) in zip(inputs, outputs):
        # tokenize sentence
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
        # check tokenized sentence max length
        if len(sentence1) <= MAX_SENTENCE_LENGTH and len(sentence2) <= MAX_SENTENCE_LENGTH:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)

    # pad tokenized sentences
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=MAX_SENTENCE_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=MAX_SENTENCE_LENGTH, padding='post')
    return tokenized_inputs, tokenized_outputs

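# Every surviving pair is post-padded with zeros to exactly
# MAX_SENTENCE_LENGTH tokens; those zero positions are ignored later via
# create_padding_mask() and the mask in loss_function().
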
def load_file(filename):
    '''Return lists of questions and answers read from alternating lines.'''
    questions, answers = [], []
    count = 0
    with open(filename, 'r') as file:
        while True:
            q, a = file.readline(), file.readline()
            if not q or not a:
                logger.info(f'{count} question and answer pairs read.')
                return questions, answers
            count += 1
            questions.append(preprocess_sentence(q))
            answers.append(preprocess_sentence(a))
    # Unreachable

def tokenizer_init(questions, answers):
    global START_TOKEN
    global END_TOKEN
    global VOCAB_SIZE
    # Build tokenizer using tfds for both questions and answers.
    # Not great that this is deprecated. TODO: update to use tensorflow_text.
    try:
        tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            questions + answers, target_vocab_size=2**13)
    except AttributeError:
        tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
            questions + answers, target_vocab_size=2**13)
    # Define start and end token to indicate the start and end of a sentence
    START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
    # Vocabulary size plus start and end token
    VOCAB_SIZE = tokenizer.vocab_size + 2
    logger.debug(f'Vocab size: {VOCAB_SIZE}')
    logger.debug(f'Tokenized sample question: {tokenizer.encode(questions[20])}')
    return tokenizer

def scaled_dot_product_attention(query, key, value, mask):
    """Calculate the attention weights."""
    matmul_qk = tf.matmul(query, key, transpose_b=True)
    # scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)
    # add the mask to zero out padding tokens
    if mask is not None:
        logits += (mask * -1e9)
    # softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)
    output = tf.matmul(attention_weights, value)
    return output

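# For reference, this is eq. (1) of https://arxiv.org/abs/1706.03762:
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# with d_k taken from the last dimension of the keys, and masked positions
# driven toward zero weight by adding -1e9 before the softmax.
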
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)
        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = (inputs['query'], inputs['key'],
                                   inputs['value'], inputs['mask'])
        batch_size = tf.shape(query)[0]
        # linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)
        # split heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        # scaled dot-product attention
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        # concatenation of heads
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))
        # final linear layer
        outputs = self.dense(concat_attention)
        return outputs

def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, sequence length)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask)

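# Example: for dec_inputs [[1, 2, 0]] the look-ahead component is
#   [[0., 1., 1.],
#    [0., 0., 1.],
#    [0., 0., 0.]]
# and the padding component marks the third (zero) token, so the combined
# tf.maximum of the two also blocks attention to the padded position.
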
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)
        # apply sin to even indices in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd indices in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

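# These are the fixed sinusoids of https://arxiv.org/abs/1706.03762:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# except that here the sine and cosine channels are concatenated rather than
# interleaved; every position still receives a unique, deterministic code.
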
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
    attention = MultiHeadAttention(
        d_model, num_heads, name="attention")({
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': padding_mask
        })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(inputs + attention)
    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention + outputs)
    return tf.keras.Model(
        inputs=[inputs, padding_mask], outputs=outputs, name=name)

def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name="encoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)
    for i in range(num_layers):
        outputs = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i),
        )([outputs, padding_mask])
    return tf.keras.Model(
        inputs=[inputs, padding_mask], outputs=outputs, name=name)

def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    attention1 = MultiHeadAttention(
        d_model, num_heads, name="attention_1")(inputs={
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': look_ahead_mask
        })
    attention1 = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention1 + inputs)
    attention2 = MultiHeadAttention(
        d_model, num_heads, name="attention_2")(inputs={
            'query': attention1,
            'key': enc_outputs,
            'value': enc_outputs,
            'mask': padding_mask
        })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention2 + attention1)
    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(outputs + attention2)
    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)

def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)
    for i in range(num_layers):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i),
        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])
    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)

def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")
    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask,
        output_shape=(1, None, None),
        name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='dec_padding_mask')(inputs)
    enc_outputs = encoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )(inputs=[inputs, enc_padding_mask])
    dec_outputs = decoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])
    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)
    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

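# End-to-end flow: token ids -> encoder (self-attention over the question)
# -> decoder (masked self-attention over the reply generated so far, plus
# attention over the encoder outputs) -> a final Dense layer producing
# logits over the vocabulary.
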
def train_model():
    '''Build a fresh, untrained transformer using the module-level hyper-parameters.'''
    tf.keras.backend.clear_session()
    return transformer(
        vocab_size=VOCAB_SIZE,
        num_layers=NUM_LAYERS,
        units=UNITS,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        dropout=DROPOUT)

def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)
    # mask out padding positions (token id 0) so they do not contribute
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    return tf.reduce_mean(loss)

# A custom learning rate
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        # TODO: This is related to saving, and may not be correct at all.
        config = {
            'd_model': self.d_model,
            'warmup_steps': self.warmup_steps,
        }
        return config

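# This is the schedule from section 5.3 of https://arxiv.org/abs/1706.03762:
#   lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# i.e. a linear warm-up for the first warmup_steps steps, then decay
# proportional to the inverse square root of the step number.
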
def accuracy(y_true, y_pred):
    # ensure labels have shape (batch_size, MAX_SENTENCE_LENGTH - 1)
    y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

def evaluate(tokenizer, model, sentence):
    '''Greedily decode a reply to `sentence`, one token at a time.'''
    sentence = preprocess_sentence(sentence)
    sentence = tf.expand_dims(
        START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)
    output = tf.expand_dims(START_TOKEN, 0)
    for i in range(MAX_SENTENCE_LENGTH):
        predictions = model(inputs=[sentence, output], training=False)
        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # stop once the predicted_id equals the end token
        if tf.equal(predicted_id, END_TOKEN[0]):
            break
        # concatenate the predicted_id to the output, which is fed back to the
        # decoder as its input on the next step
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0)

def predict(tokenizer, model, sentence):
    prediction = evaluate(tokenizer, model, sentence)
    predicted_sentence = tokenizer.decode(
        [i for i in prediction if i < tokenizer.vocab_size])
    #logger.debug('Input: {}'.format(sentence))
    #logger.debug('Output: {}'.format(predicted_sentence))
    return predicted_sentence

def load_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('source', help='Source file of question/answer pairs')
    parser.add_argument('output', help='File to save the model')
    parser.add_argument('--debug', help='Extra debugging messages', action='store_true')
    parser.add_argument('--epochs', help='Number of epochs to train', type=int, default=20)
    parser.add_argument('--checkpointdir', help='Directory for checkpoint state', default='checkpoints')
    parser.add_argument('--resume', help='Resume and keep training', action='store_true')
    parser.add_argument('--loadcheckpoint', help='Load checkpoint but skip training', action='store_true')
    args = parser.parse_args()
    global logger
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    global EPOCHS
    EPOCHS = args.epochs
    return args

def main():
    global logger
    handler = logging.StreamHandler()
    logger.addHandler(handler)
    args = load_args()
    logger.debug(f'Loading file {args.source}')
    questions, answers = load_file(args.source)
    logger.debug('Sample question: {}'.format(questions[25]))
    logger.debug('Sample answer: {}'.format(answers[25]))
    tokenizer = tokenizer_init(questions, answers)
    questions, answers = tokenize_and_filter(tokenizer, questions, answers)
    dataset = initialize_dataset(questions, answers)
    logger.debug(create_look_ahead_mask(tf.constant([[1, 2, 0, 4, 5]])))
    model = train_model()
    learning_rate = CustomSchedule(D_MODEL)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

    # Create a callback that saves the model's weights.
    # TODO: Add a resume learning step. See
    # https://www.tensorflow.org/tutorials/keras/save_and_load for information
    # on how to resume from a checkpoint.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(args.checkpointdir, f'{args.output}.ckpt'),
        save_weights_only=True,
        verbose=1)
    if args.resume:
        logger.debug('Loading previously saved checkpoint')
        model.load_weights(os.path.join(args.checkpointdir, f'{args.output}.ckpt')).expect_partial()
    if args.loadcheckpoint:
        logger.debug('Loading previously saved checkpoint but expecting partial')
        model.load_weights(os.path.join(args.checkpointdir, f'{args.output}.ckpt')).expect_partial()

    if not args.loadcheckpoint:
        logger.debug('Untrained model:')
        model.summary(print_fn=logger.debug)
        model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])
        logger.debug('Trained model:')
        model.summary(print_fn=logger.debug)
        # save the model - This doesn't work yet
        #model.save(args.output)

    for sentence in SAMPLE_QUESTIONS:
        print(f'Q: {sentence}')
        print(f'A: {predict(tokenizer, model, sentence)}')

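# Example invocation (the script and corpus names below are placeholders; the
# source file must contain alternating question/answer lines as read by
# load_file()):
#   python3 chatbot.py conversations.txt mybot --epochs 40 --debug
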
if __name__ == "__main__":
    main()