Trying to recreate the TensorFlow NMT example

https://github.com/tensorflow/nmt#bidirectional-rnns

It's just a baseline that copies the source sentence instead of translating it, but in the inference part I get this strange bug where the only output is the same word repeated over and over.

I'm using the TF AttentionWrapper, and the GreedyEmbeddingHelper for inference.
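
For reference, GreedyEmbeddingHelper feeds the argmax of each step's logits back in as the next input, so a single over-confident token can repeat indefinitely. A rough sketch of that feedback loop (greedy_decode_step and its arguments are simplified stand-ins, not the helper's real interface):

import tensorflow as tf

# Simplified sketch of one greedy decoding step; the real helper also tracks
# which sequences have already emitted the end token.
def greedy_decode_step(cell, embeddings, output_layer, inputs, state):
    cell_output, next_state = cell(inputs, state)
    logits = output_layer(cell_output)
    sample_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    # The sampled token is embedded and fed straight back as the next input.
    next_inputs = tf.nn.embedding_lookup(embeddings, sample_ids)
    return sample_ids, next_inputs, next_state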

Using TF 1.3 and Python 3.6, if that helps.

[Screenshot of the buggy prediction]

Strangely, during training the predictions are normal and the loss decreases to around 0.1.

I've already checked the embeddings, and they do change at each time step. I suspect this has to do with the decoding stage, since that's the only part that really changes between training and inference.
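
One quick sanity check on the decoding stage is to list the variables the graph actually created; duplicated scopes such as decoder_1/... would mean the reuse flag never took effect and inference is running on fresh, untrained weights:

# Print every variable after building the graph; duplicated "decoder_1/..."
# scopes would indicate the decoder was rebuilt instead of reused.
for v in tf.global_variables():
    print(v.name, v.shape)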

import os

import tensorflow as tf
from tensorflow.python.layers import core as layers_core

tf.reset_default_graph()
sess = tf.InteractiveSession()

PAD = 0
EOS = 1

max_gradient_norm = 1
learning_rate = 0.02
num_layers = 1
total_epoch = 2
sentence_length = 19
vocab_size = 26236
input_embedding_size = 128

# mode is assumed to be set earlier to either "training" or "inference"
if mode == "training":
    batch_size = 100
    isReused = None
else:
    batch_size = 1
    isReused = True

with tf.name_scope("encoder"):
    encoder_embeddings = tf.get_variable('encoder_embeddings', [vocab_size, input_embedding_size], tf.float32,
                                         tf.random_uniform_initializer(-1.0, 1.0))
    encoder_hidden_units = 128
    encoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='encoder_inputs')
    encoder_lengths = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name='encoder_lengths')

    encoder_cell = tf.contrib.rnn.BasicLSTMCell(encoder_hidden_units, state_is_tuple=True)
    encoder_inputs_embedded = tf.nn.embedding_lookup(encoder_embeddings, encoder_inputs)
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, encoder_inputs_embedded, dtype=tf.float32,
                                                       sequence_length=encoder_lengths, time_major=False)
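    # encoder_outputs becomes the attention memory below; encoder_state seeds
    # the decoder's initial state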

# reuse must be passed by keyword: the second positional argument of
# tf.variable_scope is default_name, not reuse
with tf.variable_scope("decoder", reuse=isReused):
    decoder_hidden_units = encoder_hidden_units

    decoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_inputs')
    decoder_targets = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_targets')
    decoder_lengths = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name="decoder_lengths")

    decoder_embeddings = tf.get_variable('decoder_embeddings', [vocab_size, input_embedding_size], tf.float32,
                                         tf.random_uniform_initializer(-1.0, 1.0))
    decoder_inputs_embedded = tf.nn.embedding_lookup(decoder_embeddings, decoder_inputs)
    decoder_cell = tf.contrib.rnn.BasicLSTMCell(decoder_hidden_units, state_is_tuple=True)
    projection_layer = layers_core.Dense(vocab_size, use_bias=False)  # projects decoder outputs onto the vocabulary
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(encoder_hidden_units, encoder_outputs,
                                                            memory_sequence_length=encoder_lengths)
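    # AttentionWrapper recomputes Luong attention over encoder_outputs at
    # every decode step, in both training and inference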
    attn_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                            attention_layer_size=encoder_hidden_units)

    if mode == "training":
        helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs_embedded, decoder_lengths, time_major=False)  
        maximum_iterations = None
    else:
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, tf.fill([batch_size], EOS), EOS)
        # tf.round only accepts float tensors, so cast the int32 lengths
        # before rounding and cast back for dynamic_decode
        maximum_iterations = tf.to_int32(tf.round(tf.to_float(tf.reduce_max(encoder_lengths)) * 2.0))
    # Decoder
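    # Seed the attention wrapper's zero state with the encoder's final state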
    init_state = attn_decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
    decoder = tf.contrib.seq2seq.BasicDecoder(attn_decoder_cell, helper, init_state, output_layer=projection_layer)
    # Dynamic decoding

    decoder_outputs, decoder_final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False,
                                                                                swap_memory=True,
                                                                                maximum_iterations=maximum_iterations)
    decoder_logits = decoder_outputs.rnn_output
    decoder_prediction = decoder_outputs.sample_id
if mode == "training":
    with tf.name_scope("cross_entropy"):
        labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
        decoder_crossent = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=decoder_logits)
    with tf.name_scope("loss"):
        # maxlen 20 is presumably sentence_length + 1, leaving room for EOS
        target_weights = tf.sequence_mask(decoder_lengths, maxlen=20, dtype=decoder_logits.dtype)
        train_loss = tf.reduce_sum(decoder_crossent * target_weights) / (batch_size * 20)
    tf.summary.scalar('loss', train_loss)

    with tf.name_scope("clip_gradients"):
        params = tf.trainable_variables()
        gradients = tf.gradients(train_loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    with tf.name_scope("Optimizer"):
        optimizer = tf.train.AdamOptimizer(learning_rate)
        update_step = optimizer.apply_gradients(zip(clipped_gradients, params))  

merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(os.getcwd() + '/train', sess.graph)
test_writer = tf.summary.FileWriter(os.getcwd() + '/test')
sess.run(tf.global_variables_initializer())
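
The snippet above only builds the graph; for completeness, the driver loop that feeds it looks roughly like this (next_batch is a hypothetical placeholder for my batching code):

# Minimal driver sketch; next_batch() is a hypothetical helper returning
# padded int32 arrays shaped to match the placeholders above.
if mode == "training":
    for step in range(total_epoch):
        enc_in, enc_len, dec_in, dec_tgt, dec_len = next_batch(batch_size)
        summary, loss, _ = sess.run(
            [merged, train_loss, update_step],
            feed_dict={encoder_inputs: enc_in, encoder_lengths: enc_len,
                       decoder_inputs: dec_in, decoder_targets: dec_tgt,
                       decoder_lengths: dec_len})
        train_writer.add_summary(summary, step)
else:
    enc_in, enc_len, _, _, _ = next_batch(batch_size)
    # Only the encoder placeholders are fed; GreedyEmbeddingHelper
    # generates the decoder inputs itself.
    prediction = sess.run(decoder_prediction,
                          feed_dict={encoder_inputs: enc_in,
                                     encoder_lengths: enc_len})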