I am trying to train a name-generation LSTM network. I am not using the predefined TensorFlow cells (such as tf.contrib.rnn.BasicLSTMCell); I have implemented the LSTM cell myself. But the error will not decrease beyond a certain point: it drops only about 30% from its initial value (with the random weights used in forward propagation) and then starts increasing. Also, after a few thousand training steps the gradients and the weights become very small.

I think the cause of the non-convergence is one of two things: 1. the design of the TensorFlow graph I created, or 2. the loss function I am using.

At every time step I feed the network a one-hot vector for the corresponding character of the word. The code I use for graph construction and the loss function is given below. Tx is the number of time steps in the RNN, and n_x, n_a and n_y are the sizes of the input vector, the LSTM cell vector and the output vector, respectively. It would be great if someone could help me figure out what I am doing wrong here.

import numpy as np
import tensorflow as tf

# vocab_size and char_to_ix (character -> index map) are assumed to be
# built from the training data before this point.
n_x = vocab_size   # input one-hot size
n_y = vocab_size   # output one-hot size
n_a = 100          # LSTM hidden/cell state size
Tx = 50            # number of unrolled time steps (padded name length)
Ty = Tx

with open("trainingnames_file.txt") as f:
    examples = f.readlines()
examples = [x.lower().strip() for x in examples]

# Encode each name as a list of character indices, zero-pad to Tx, and
# transpose so that X1 has shape (Tx, m) with one example per column.
X0 = [[char_to_ix[x1] for x1 in list(x)] for x in examples]
X1 = np.array([np.concatenate([np.array(x), np.zeros([Tx-len(x)])]) for x in X0], dtype=np.int32).T

# Targets are the inputs shifted left by one character, with "\n" appended
# as the end-of-name symbol, padded the same way.
Y0 = [(x[1:] + [char_to_ix["\n"]]) for x in X0]
Y1 = np.array([np.concatenate([np.array(y), np.zeros([Ty-len(y)])]) for y in Y0], dtype=np.int32).T

m = len(X0)
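# Illustration only (hypothetical mapping, not part of the pipeline): with
# char_to_ix = {"\n": 0, "a": 1, ..., "n": 14, ...} and the name "anna",
#   X0 entry: [1, 14, 14, 1]    # "a", "n", "n", "a"
#   Y0 entry: [14, 14, 1, 0]    # shifted left by one, "\n" appended
# and both entries are then zero-padded to length Tx = Ty = 50.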



Wf = tf.get_variable(name="Wf", shape = [n_a,(n_a+n_x)])
Wu = tf.get_variable(name="Wu", shape = [n_a,(n_a+n_x)])
Wc = tf.get_variable(name="Wc", shape = [n_a,(n_a+n_x)])
Wo = tf.get_variable(name="Wo", shape = [n_a,(n_a+n_x)])
Wy = tf.get_variable(name="Wy", shape = [n_y,n_a])
bf = tf.get_variable(name="bf", shape = [n_a,1])
bu = tf.get_variable(name="bu", shape = [n_a,1])
bc = tf.get_variable(name="bc", shape = [n_a,1])
bo = tf.get_variable(name="bo", shape = [n_a,1])
by = tf.get_variable(name="by", shape = [n_y,1])
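
# The gate weights above act on the concatenation [a_prev; x_t] of length
# (n_a + n_x), so tf.matmul(W, ac) yields shape (n_a, m) and the (n_a, 1)
# biases broadcast across the m examples; Wy and by map the hidden state
# to per-character outputs.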

# Inputs and targets hold character indices, one column per example.
X_input = tf.placeholder(dtype = tf.int32, shape = [Tx,None])
Y_input = tf.placeholder(dtype = tf.int32, shape = [Ty,None])

# One-hot along axis 0: X has shape (n_x, Tx, m), Y has shape (n_y, Ty, m).
X = tf.one_hot(X_input, axis = 0, depth = n_x)
Y = tf.one_hot(Y_input, axis = 0, depth = n_y)

# The hidden state a and cell state c start at zero for all m examples.
a_prev = tf.zeros(shape = [n_a,m])
c_prev = tf.zeros(shape = [n_a,m])

a_all = []
c_all = []


# Unroll the LSTM for Tx steps: slice the one-hot input x_t of shape
# (n_x, m) out of X, concatenate it with a_prev, and apply the standard
# LSTM gate equations.
for i in range(Tx):
    ac = tf.concat([a_prev,tf.squeeze(tf.slice(input_=X,begin=[0,i,0],size=[n_x,1,m]))], axis=0)
    ct = tf.tanh(tf.matmul(Wc,ac) + bc)        # candidate cell state
    tug = tf.sigmoid(tf.matmul(Wu,ac) + bu)    # update (input) gate
    tfg = tf.sigmoid(tf.matmul(Wf,ac) + bf)    # forget gate
    tog = tf.sigmoid(tf.matmul(Wo,ac) + bo)    # output gate
    c = tf.multiply(tug,ct) + tf.multiply(tfg,c_prev)
    a = tf.multiply(tog,tf.tanh(c))
    y = tf.nn.softmax(tf.matmul(Wy,a) + by, axis = 0)
    a_all.append(a)
    c_all.append(c)
    a_prev = a
    c_prev = c
    # Stack the per-step outputs into y_all with shape (n_y, Tx, m).
    y_ex = tf.expand_dims(y,axis=1)
    if i == 0:
        y_all = y_ex
    else:
        y_all = tf.concat([y_all,y_ex], axis=1)



# Cross-entropy over the character dimension (dim 0), averaged over all
# time steps and examples.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y,logits=y_all,dim=0))

opt = tf.train.AdamOptimizer()
train = opt.minimize(loss)
init = tf.global_variables_initializer()


# Sanity check: print the loss before and after a single Adam step.
with tf.Session() as sess:
    sess.run(init)
    o = sess.run(loss, feed_dict = {X_input:X1,Y_input:Y1})
    print(o.shape)
    print(o)
    sess.run(train, feed_dict = {X_input:X1,Y_input:Y1})
    o = sess.run(loss, feed_dict = {X_input:X1,Y_input:Y1})
    print(o)
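
Since the gradients and weights reportedly shrink after a few thousand steps, one thing worth checking is the global gradient norm during training. Below is a minimal sketch of how that could be added to the graph above; the grads and grad_norm names are mine and not part of the original code:

grads = tf.gradients(loss, tf.trainable_variables())
grad_norm = tf.global_norm(grads)

with tf.Session() as sess:
    sess.run(init)
    for step in range(1000):
        _, l, g = sess.run([train, loss, grad_norm],
                           feed_dict={X_input: X1, Y_input: Y1})
        if step % 100 == 0:
            print(step, l, g)   # loss and gradient norm over time

If grad_norm decays toward zero while the loss is still high, that would point at the graph or loss formulation rather than the optimizer settings.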