I'm trying to use an LSTM in TensorFlow, but I've reached a point where I can't get a simple IMDB sentiment model to converge.

I took a Keras model and tried to replicate the exact same model in TensorFlow. In Keras it trains and converges, but in TensorFlow it just gets stuck at a certain point (0.69 loss).

I tried to make them as similar as possible. The only difference I can tell is that in Keras the padding comes before the sequence ('pre'), whereas in TensorFlow I use 'post' padding because of the TensorFlow convention.
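
For reference, here is a minimal sketch of what the two padding modes do, using the same pad_sequences helper that the code below imports (the toy sequences are made up purely for illustration):

# Sketch: 'pre' padding puts the zeros in front of the sequence, 'post' puts them after.
from tensorflow.contrib.keras.python.keras.preprocessing import sequence

toy = [[1, 2, 3], [4, 5]]
print(sequence.pad_sequences(toy, maxlen=5, padding='pre'))
# [[0 0 1 2 3]
#  [0 0 0 4 5]]
print(sequence.pad_sequences(toy, maxlen=5, padding='post'))
# [[1 2 3 0 0]
#  [4 5 0 0 0]]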

Is there something wrong with my TensorFlow model?

from __future__ import print_function

import random
import numpy as np

from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, Dropout, Activation
from tensorflow.contrib.keras.python.keras.layers import Embedding
from tensorflow.contrib.keras.python.keras.layers import LSTM
from tensorflow.contrib.keras.python.keras.layers import Conv1D, MaxPooling1D
from tensorflow.contrib.keras.python.keras.datasets import imdb

import tensorflow as tf

# Embedding
max_features = 30000
maxlen = 2494
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 2


class TrainData:
    def __init__(self, batch_sz=batch_size):
        (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)


        y_train = [[int(x == 1), int(x != 1)] for x in y_train]
        self._batch_size = batch_sz

        self._train_data = sequence.pad_sequences(x_train, padding='pre')

        self._train_labels = y_train

    def next_batch(self):
        if len(self._train_data) < self._batch_size:
            self.__init__()

        batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
        self._train_data = self._train_data[self._batch_size:]
        self._train_labels = self._train_labels[self._batch_size:]

        return batch_x, batch_y

    def batch_generator(self):
        while True:
            if len(self._train_data) < self._batch_size:
                self.__init__()

            batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
            self._train_data = self._train_data[self._batch_size:]
            self._train_labels = self._train_labels[self._batch_size:]

            yield batch_x, batch_y

    def get_num_batches(self):
        return int(len(self._train_data) / self._batch_size)

def length(sequence):
    # Count the non-zero (non-padding) entries in each row to get the real
    # length of every sequence in the batch.
    used = tf.sign(tf.abs(sequence))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length


def get_model(x, y):
    embedding = tf.get_variable("embedding", [max_features, embedding_size], dtype=tf.float32)
    embedded_x = tf.nn.embedding_lookup(embedding, x)
    print(x)
    print(embedded_x)
    print(length(x))

    cell_1 = tf.contrib.rnn.BasicLSTMCell(lstm_output_size)
    output_1, state_1 = tf.nn.dynamic_rnn(cell_1, embedded_x, dtype=tf.float32, scope="rnn_layer1",
                                          sequence_length=length(x))

    # Select last output.
    last_index = tf.shape(output_1)[1] - 1
    # transpose to [seq_length, batch_size, num_units] so the last time step can be gathered
    output = tf.transpose(output_1, [1, 0, 2])

    last = tf.gather(output, last_index)

    # Softmax layer
    with tf.name_scope('fc_layer'):
        weight = tf.get_variable(name="weights", shape=[lstm_output_size, 2])
        bias = tf.get_variable(shape=[2], name="bias")

    logits = tf.matmul(last, weight) + bias

    loss = tf.losses.softmax_cross_entropy(y, logits=logits)

    optimizer = tf.train.AdamOptimizer()
    optimize_step = optimizer.minimize(loss=loss)

    return loss, optimize_step


def tf_model():
    x_holder = tf.placeholder(tf.int32, shape=[None, maxlen])
    y_holder = tf.placeholder(tf.int32, shape=[None, 2])
    loss, opt_step = get_model(x_holder, y_holder)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        for epoch in range(10):
            cost_epochs = []
            train_data = TrainData()
            cost_batch = 0
            for batch in range(train_data.get_num_batches()):
                x_train, y_train = train_data.next_batch()
                _, cost_batch = sess.run([opt_step, loss],
                                         feed_dict={x_holder: x_train,
                                                    y_holder: y_train})

                cost_epochs.append(cost_batch)


                step += 1
                # if step % 100 == 0:
                print("Epoch: " + str(epoch))
                print("\tcost: " + str(np.mean(cost_epochs)))



def keras_model():
    # print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

    y_test = [[int(x == 1), int(x != 1)] for x in y_test]

    x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='pre')

    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))

    model.add(LSTM(lstm_output_size))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    data = TrainData()
    model.fit_generator(data.batch_generator(), steps_per_epoch=data.get_num_batches(),
                        epochs=epochs,
                        validation_data=(x_test, y_test))


if __name__ == '__main__':
    # keras_model()
    tf_model()

EDIT

When I limit the sequence length to 100, both models converge, so I assume the difference is something in the LSTM layer.
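
Limiting the length just means capping maxlen and padding/truncating to it; roughly like this (a sketch of the truncated setup, not the exact code I ran):

# Cap every review at 100 tokens instead of the full 2494.
maxlen = 100
x_train = sequence.pad_sequences(x_train, maxlen=maxlen, padding='pre')
# and the placeholder in the TensorFlow model becomes
# x_holder = tf.placeholder(tf.int32, shape=[None, maxlen])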