@@ -1,6 +1,8 @@
 #! /bin/env python3
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import argparse
+import logging
 import os
 import re
 
@@ -11,18 +13,488 @@ import tensorflow_datasets as tfds
 # Probably remove
 #import matplotlib.pyplot as plt
 
+# Constants
+MAX_SENTENCE_LENGTH = 40
+BATCH_SIZE = 64
+BUFFER_SIZE = 20000
+
+# Hyper-parameters
+# The notebook points out that num_layers, d_model, and units have been reduced; see
+# https://arxiv.org/abs/1706.03762 for more information.
+NUM_LAYERS = 2
+D_MODEL = 256
+NUM_HEADS = 8
+UNITS = 512
+DROPOUT = 0.1
+
+# Globals, set elsewhere
+START_TOKEN = None
+END_TOKEN = None
+VOCAB_SIZE = None
+EPOCHS = None
+
 # random but predictable results:
 tf.random.set_seed(4242)
+logger = logging.getLogger()
+
+def initialize_dataset(questions, answers):
+    # decoder inputs use the previous target as input
+    # remove START_TOKEN from targets
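+    # For example, a padded answer [START, w1, w2, END, 0, ...] becomes
+    # dec_inputs [START, w1, w2, END, 0, ...] without its last position and
+    # outputs [w1, w2, END, 0, ...]: the target is the decoder input shifted
+    # left by one token (teacher forcing).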
+    dataset = tf.data.Dataset.from_tensor_slices((
+        {
+            'inputs': questions,
+            'dec_inputs': answers[:, :-1]
+        },
+        {
+            'outputs': answers[:, 1:]
+        },
+    ))
+
+    dataset = dataset.cache()
+    dataset = dataset.shuffle(BUFFER_SIZE)
+    dataset = dataset.batch(BATCH_SIZE)
+    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+    return dataset
+
+
+# Tokenize, filter and pad sentences
+def tokenize_and_filter(tokenizer, inputs, outputs):
+    tokenized_inputs, tokenized_outputs = [], []
+
+    for (sentence1, sentence2) in zip(inputs, outputs):
+        # tokenize sentence
+        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
+        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
+        # check tokenized sentence max length
+        if len(sentence1) <= MAX_SENTENCE_LENGTH and len(sentence2) <= MAX_SENTENCE_LENGTH:
+            tokenized_inputs.append(sentence1)
+            tokenized_outputs.append(sentence2)
+
+    # pad tokenized sentences
+    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
+        tokenized_inputs, maxlen=MAX_SENTENCE_LENGTH, padding='post')
+    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
+        tokenized_outputs, maxlen=MAX_SENTENCE_LENGTH, padding='post')
+
+    return tokenized_inputs, tokenized_outputs
+
+def load_file(filename):
+    ''' Returns parallel lists of questions and answers. '''
+    questions, answers = [], []
+    count = 0
+    with open(filename, 'r') as file:
+        while True:
+            q, a = file.readline(), file.readline()
+            if not q or not a:
+                logger.info(f'{count} question and answer pairs read.')
+                return questions, answers
+            count += 1
+            questions.append(q.replace('\n', ''))
+            answers.append(a.replace('\n', ''))
+    # Unreachable
+
+def load_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source', help='Source File')
+    parser.add_argument('output', help='File to save the model')
+    parser.add_argument('--debug', help='Extra debugging messages', action='store_true')
+    parser.add_argument('--epochs', help='Number of epochs to train', type=int, default=20)
+    args = parser.parse_args()
+
+    global logger
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+
+    global EPOCHS
+    EPOCHS = args.epochs
+
+    return args
+
+def tokenizer_init(questions, answers):
+    global START_TOKEN
+    global END_TOKEN
+    global VOCAB_SIZE
+    # Build tokenizer using tfds for both questions and answers
+    # Not great that this is deprecated. TODO: update to use tensorflow_text
+    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(questions + answers, target_vocab_size=2**13)
+    # Define start and end token to indicate the start and end of a sentence
+    START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
+    # Vocabulary size plus start and end token
+    VOCAB_SIZE = tokenizer.vocab_size + 2
+    logger.debug(f'Vocab size: {VOCAB_SIZE}')
+    logger.debug(f'Tokenized sample question: {tokenizer.encode(questions[20])}')
+    return tokenizer
+
+def scaled_dot_product_attention(query, key, value, mask):
+    """Compute scaled dot-product attention."""
+    matmul_qk = tf.matmul(query, key, transpose_b=True)
+
+    # scale matmul_qk
+    depth = tf.cast(tf.shape(key)[-1], tf.float32)
+    logits = matmul_qk / tf.math.sqrt(depth)
+
+    # add the mask to zero out padding tokens
+    if mask is not None:
+        logits += (mask * -1e9)
+
+    # softmax is normalized on the last axis (seq_len_k)
+    attention_weights = tf.nn.softmax(logits, axis=-1)
+
+    output = tf.matmul(attention_weights, value)
+
+    return output
+
+
+class MultiHeadAttention(tf.keras.layers.Layer):
+
+    def __init__(self, d_model, num_heads, name="multi_head_attention"):
+        super(MultiHeadAttention, self).__init__(name=name)
+        self.num_heads = num_heads
+        self.d_model = d_model
+
+        assert d_model % self.num_heads == 0
+
+        self.depth = d_model // self.num_heads
+
+        self.query_dense = tf.keras.layers.Dense(units=d_model)
+        self.key_dense = tf.keras.layers.Dense(units=d_model)
+        self.value_dense = tf.keras.layers.Dense(units=d_model)
+
+        self.dense = tf.keras.layers.Dense(units=d_model)
+
+    def split_heads(self, inputs, batch_size):
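+        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)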
+        inputs = tf.reshape(
+            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(inputs, perm=[0, 2, 1, 3])
+
+    def call(self, inputs):
+        query, key, value, mask = (
+            inputs['query'], inputs['key'], inputs['value'], inputs['mask'])
+        batch_size = tf.shape(query)[0]
+
+        # linear layers
+        query = self.query_dense(query)
+        key = self.key_dense(key)
+        value = self.value_dense(value)
+
+        # split heads
+        query = self.split_heads(query, batch_size)
+        key = self.split_heads(key, batch_size)
+        value = self.split_heads(value, batch_size)
+
+        # scaled dot-product attention
+        scaled_attention = scaled_dot_product_attention(query, key, value, mask)
+
+        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
+
+        # concatenation of heads
+        concat_attention = tf.reshape(scaled_attention,
+                                      (batch_size, -1, self.d_model))
+
+        # final linear layer
+        outputs = self.dense(concat_attention)
+
+        return outputs
+
+
+def create_padding_mask(x):
+    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
+    # (batch_size, 1, 1, sequence length)
+    return mask[:, tf.newaxis, tf.newaxis, :]
+
+
+def create_look_ahead_mask(x):
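+    # Combines a strictly upper-triangular mask (position i may only attend to
+    # positions <= i) with the padding mask, e.g. for seq_len 3 the look-ahead
+    # part is [[0, 1, 1], [0, 0, 1], [0, 0, 0]], where 1 marks a blocked position.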
+    seq_len = tf.shape(x)[1]
+    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
+    padding_mask = create_padding_mask(x)
+    return tf.maximum(look_ahead_mask, padding_mask)
+
+
+class PositionalEncoding(tf.keras.layers.Layer):
+
+    def __init__(self, position, d_model):
+        super(PositionalEncoding, self).__init__()
+        self.pos_encoding = self.positional_encoding(position, d_model)
+
+    def get_angles(self, position, i, d_model):
+        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
+        return position * angles
+
+    def positional_encoding(self, position, d_model):
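+        # Sinusoidal encoding from the paper: angle(pos, i) = pos / 10000^(2i/d_model),
+        # with sin applied to even indices and cos to odd indices; here the sin
+        # and cos halves are concatenated rather than interleaved, which still
+        # gives each position a distinct encoding.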
+        angle_rads = self.get_angles(
+            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
+            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+            d_model=d_model)
+        # apply sin to even index in the array
+        sines = tf.math.sin(angle_rads[:, 0::2])
+        # apply cos to odd index in the array
+        cosines = tf.math.cos(angle_rads[:, 1::2])
+
+        pos_encoding = tf.concat([sines, cosines], axis=-1)
+        pos_encoding = pos_encoding[tf.newaxis, ...]
+        return tf.cast(pos_encoding, tf.float32)
+
+    def call(self, inputs):
+        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]
+
+
+def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
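+    # One encoder block: self-attention over the inputs with a residual
+    # connection and layer normalization, followed by a two-layer feed-forward
+    # network, again with a residual connection and layer normalization.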
+    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
+    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
+
+    attention = MultiHeadAttention(
+        d_model, num_heads, name="attention")({
+            'query': inputs,
+            'key': inputs,
+            'value': inputs,
+            'mask': padding_mask
+        })
+    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
+    attention = tf.keras.layers.LayerNormalization(
+        epsilon=1e-6)(inputs + attention)
+
+    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
+    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
+    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
+    outputs = tf.keras.layers.LayerNormalization(
+        epsilon=1e-6)(attention + outputs)
+
+    return tf.keras.Model(
+        inputs=[inputs, padding_mask], outputs=outputs, name=name)
+
+
+def encoder(vocab_size,
+            num_layers,
+            units,
+            d_model,
+            num_heads,
+            dropout,
+            name="encoder"):
+    inputs = tf.keras.Input(shape=(None,), name="inputs")
+    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
+
+    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
+    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
+    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
+
+    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)
+
+    for i in range(num_layers):
+        outputs = encoder_layer(
+            units=units,
+            d_model=d_model,
+            num_heads=num_heads,
+            dropout=dropout,
+            name="encoder_layer_{}".format(i),
+        )([outputs, padding_mask])
+
+    return tf.keras.Model(
+        inputs=[inputs, padding_mask], outputs=outputs, name=name)
+
+
+def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
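+    # One decoder block: masked self-attention over the decoder inputs,
+    # encoder-decoder attention over enc_outputs, then a feed-forward network;
+    # each sub-layer adds a residual connection and layer normalization.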
+    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
+    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
+    look_ahead_mask = tf.keras.Input(
+        shape=(1, None, None), name="look_ahead_mask")
+    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
+
+    attention1 = MultiHeadAttention(
+        d_model, num_heads, name="attention_1")(inputs={
+            'query': inputs,
+            'key': inputs,
+            'value': inputs,
+            'mask': look_ahead_mask
+        })
+    attention1 = tf.keras.layers.LayerNormalization(
+        epsilon=1e-6)(attention1 + inputs)
+
+    attention2 = MultiHeadAttention(
+        d_model, num_heads, name="attention_2")(inputs={
+            'query': attention1,
+            'key': enc_outputs,
+            'value': enc_outputs,
+            'mask': padding_mask
+        })
+    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
+    attention2 = tf.keras.layers.LayerNormalization(
+        epsilon=1e-6)(attention2 + attention1)
+
+    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
+    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
+    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
+    outputs = tf.keras.layers.LayerNormalization(
+        epsilon=1e-6)(outputs + attention2)
+
+    return tf.keras.Model(
+        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
+        outputs=outputs,
+        name=name)
+
+
+def decoder(vocab_size,
+            num_layers,
+            units,
+            d_model,
+            num_heads,
+            dropout,
+            name='decoder'):
+    inputs = tf.keras.Input(shape=(None,), name='inputs')
+    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
+    look_ahead_mask = tf.keras.Input(
+        shape=(1, None, None), name='look_ahead_mask')
+    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
+
+    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
+    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
+    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
+
+    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)
+
+    for i in range(num_layers):
+        outputs = decoder_layer(
+            units=units,
+            d_model=d_model,
+            num_heads=num_heads,
+            dropout=dropout,
+            name='decoder_layer_{}'.format(i),
+        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])
+
+    return tf.keras.Model(
+        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
+        outputs=outputs,
+        name=name)
+
+
+def transformer(vocab_size,
+                num_layers,
+                units,
+                d_model,
+                num_heads,
+                dropout,
+                name="transformer"):
+    inputs = tf.keras.Input(shape=(None,), name="inputs")
+    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")
+
+    enc_padding_mask = tf.keras.layers.Lambda(
+        create_padding_mask, output_shape=(1, 1, None),
+        name='enc_padding_mask')(inputs)
+    # mask the future tokens for decoder inputs at the 1st attention block
+    look_ahead_mask = tf.keras.layers.Lambda(
+        create_look_ahead_mask,
+        output_shape=(1, None, None),
+        name='look_ahead_mask')(dec_inputs)
+    # mask the encoder outputs for the 2nd attention block
+    dec_padding_mask = tf.keras.layers.Lambda(
+        create_padding_mask, output_shape=(1, 1, None),
+        name='dec_padding_mask')(inputs)
+
+    enc_outputs = encoder(
+        vocab_size=vocab_size,
+        num_layers=num_layers,
+        units=units,
+        d_model=d_model,
+        num_heads=num_heads,
+        dropout=dropout,
+    )(inputs=[inputs, enc_padding_mask])
+
+    dec_outputs = decoder(
+        vocab_size=vocab_size,
+        num_layers=num_layers,
+        units=units,
+        d_model=d_model,
+        num_heads=num_heads,
+        dropout=dropout,
+    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])
+
+    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)
+
+    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)
+
+
+def train_model():
+    tf.keras.backend.clear_session()
+
+    return transformer(
+        vocab_size=VOCAB_SIZE,
+        num_layers=NUM_LAYERS,
+        units=UNITS,
+        d_model=D_MODEL,
+        num_heads=NUM_HEADS,
+        dropout=DROPOUT)
+
+
+def loss_function(y_true, y_pred):
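+    # The labels are the decoder targets of length MAX_SENTENCE_LENGTH - 1;
+    # padded positions (token id 0) are masked out so they do not contribute
+    # to the loss.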
+    y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1))
+
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(
+        from_logits=True, reduction='none')(y_true, y_pred)
+
+    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
+    loss = tf.multiply(loss, mask)
+
+    return tf.reduce_mean(loss)
+
+# A custom learning rate schedule with warmup
+class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+
+    def __init__(self, d_model, warmup_steps=4000):
+        super(CustomSchedule, self).__init__()
+
+        self.d_model = tf.cast(d_model, tf.float32)
+
+        self.warmup_steps = warmup_steps
+
+    def __call__(self, step):
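+        # Warmup schedule from the paper:
+        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
+        # Defensive cast: some TF versions pass the step as an integer tensor,
+        # and tf.math.rsqrt expects a float.
+        step = tf.cast(step, tf.float32)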
+        arg1 = tf.math.rsqrt(step)
+        arg2 = step * (self.warmup_steps**-1.5)
+
+        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+
+
+def accuracy(y_true, y_pred):
+    # ensure labels have shape (batch_size, MAX_SENTENCE_LENGTH - 1)
+    y_true = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LENGTH - 1))
+    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
+
 
 def main():
-    print(f'Hello World')
+    global logger
+
+    handler = logging.StreamHandler()
+    logger.addHandler(handler)
+
+    args = load_args()
+
+    logger.debug(f'Loading file {args.source}')
+    questions, answers = load_file(args.source)
+    logger.debug('Sample question: {}'.format(questions[25]))
+    logger.debug('Sample answer: {}'.format(answers[25]))
+
+    tokenizer = tokenizer_init(questions, answers)
+    questions, answers = tokenize_and_filter(tokenizer, questions, answers)
+
+    dataset = initialize_dataset(questions, answers)
+
+    logger.debug(create_look_ahead_mask(tf.constant([[1, 2, 0, 4, 5]])))
+
+    model = train_model()
+    learning_rate = CustomSchedule(D_MODEL)
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
+
+    model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])
+
+    model.fit(dataset, epochs=EPOCHS)
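+
+    # The 'output' argument is meant to receive the trained model; saving the
+    # weights is a minimal way to honor it (reloading the full model would also
+    # need the custom loss and schedule passed as custom_objects).
+    model.save_weights(args.output)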
+
+
 if __name__ == "__main__":
     main()