I want to build a custom Seq2Seq model in TensorFlow, and I have already created this part of the network:

# Input projection: fully connected 1024 -> 1024
wflat0 = tf.get_variable("wflat0", shape=(1024, 1024),
                         initializer=LinearInitializer)
bflat0 = tf.get_variable("bflat0", shape=1024, initializer=BiasInitializer)

# SELU activation, then reshape back to (batch, time, 1024) and
# unstack into a per-time-step list for static_rnn
l0flat = selu(tf.matmul(x, wflat0) + bflat0)
x = tf.reshape(l0flat, [shape[0], shape[1], 1024])
x = tf.unstack(x, constant.Lstm_cell, 1)

# Encoder: layer-normalized LSTM over the input sequence
lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(LSTMHiddenSize, forget_bias=1.0, dropout_keep_prob=0.9)
outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

# Decoder: same layer-normalized LSTM cell, fed with the targets
decoder = tf.contrib.rnn.LayerNormBasicLSTMCell(LSTMHiddenSize, forget_bias=1.0, dropout_keep_prob=0.9)

y = tf.unstack(tf.reshape(self.y_, [tf.shape(self.y_)[0], tf.shape(self.y_)[1], 1]), constant.Lstm_cell, 1)
decoutputs, decstates = tf.contrib.rnn.static_rnn(decoder, y, dtype=tf.float32)

# Output projection: LSTMHiddenSize -> 128 -> 1 per time step
wflat1 = tf.get_variable("wflat1", shape=(LSTMHiddenSize, 128),
                         initializer=LinearInitializer)
bflat1 = tf.get_variable("bflat1", shape=128, initializer=BiasInitializer)

# static_rnn returns a time-major list; flatten it to (batch * time, hidden)
shapes = tf.shape(decoutputs)
decoutputs = tf.transpose(decoutputs, perm=[1, 0, 2])
decoutputs = tf.reshape(decoutputs, (shapes[0] * shapes[1], shapes[2]))
self.sss = decoutputs

l1flat = selu(tf.matmul(decoutputs, wflat1) + bflat1)
wflat2 = tf.get_variable("wflat2", shape=(128, 1), initializer=LinearInitializer)
bflat2 = tf.get_variable("bflat2", shape=1, initializer=BiasInitializer)
l2flat = tf.matmul(l1flat, wflat2) + bflat2
l2flat = tf.reshape(l2flat, (constant.batchsize, constant.Lstm_cell))

self.sigout = tf.nn.softmax(l2flat)
self.out = l2flat
cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=self.out, labels=self.y_))
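
For reference, the training op is built from this cost in the usual way; the sketch below is only an approximation (the AdamOptimizer, learning rate, and clip norm are assumptions, not necessarily what I actually use), with tf.clip_by_global_norm added as a common guard against exploding gradients on long sequences:

optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)  # assumed optimizer and learning rate
grads_and_vars = optimizer.compute_gradients(cost)
grads, variables = zip(*grads_and_vars)
# None entries (variables with no gradient) are passed through unchanged
clipped, _ = tf.clip_by_global_norm(grads, clip_norm=5.0)
train_op = optimizer.apply_gradients(zip(clipped, variables))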

This network produces NaN outputs for large sequences after the first update iteration; it only works on small sequences. Digging in, I found that most of the variables receive no gradient at all, as you can see here:

[Image of computed gradients]

How is it possible that some variables which take part in the loss function end up with no gradient?
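
The per-variable gradient overview in the image comes from a check along these lines (a minimal sketch, not my exact code; only cost and the variables come from the graph posted above):

grads = tf.gradients(cost, tf.trainable_variables())
for var, grad in zip(tf.trainable_variables(), grads):
    # tf.gradients returns None for variables with no path to cost
    print(var.name, "NO GRADIENT" if grad is None else grad)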