First, I can confirm that the training batches are correct:

[screenshot: decoded batch images with their tensor shapes printed]

The batch size is 8; the output and tensor shapes are shown above. These are the preprocessed images, viewed with OpenCV.
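
For reference, a minimal sketch of how such a batch can be dumped to OpenCV windows. It assumes the decoded images come out as float32 RGB in [0, 1]; the session and queue setup are the same as in the full script below:

# Sketch: display one decoded batch with OpenCV.
# Assumes `images` is a float32 RGB tensor in [0, 1] and `sess` plus the
# queue runners are already set up as in the full script below.
import cv2
import numpy as np

imgs = sess.run(images)   # shape: [batch_size, height, width, 3]
for k, img in enumerate(imgs):
    # scale to uint8 and convert RGB -> BGR, which is what OpenCV expects
    bgr = cv2.cvtColor((img * 255).astype(np.uint8), cv2.COLOR_RGB2BGR)
    cv2.imshow('image_%d' % k, bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()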

The problem is that my training loss will not go down; see the figure and the code below:

[screenshot: training loss curve, staying flat]

Here is the computation graph in TensorBoard:

[screenshot: TensorBoard computation graph]

Here is the code:

# import packages
import tensorflow as tf
import os
# provides training batches, batch_size=32
import cifar10_input_pipeline

# images has shape [batch_size, height, width, 3]
def inference(images):
    with tf.variable_scope('conv1'):
        kernel = tf.get_variable('weights',
                                 [3,3,3,32],
                                 tf.float32,
                                 initializer=tf.random_normal_initializer(stddev=1e-3))
        bias = tf.get_variable('bias',
                              [32],
                              tf.float32,
                              initializer=tf.zeros_initializer())
        conv1 = tf.nn.conv2d(images, kernel, [1,1,1,1], padding='SAME', name='conv')
        conv1 = tf.nn.relu(tf.nn.bias_add(conv1, bias), name='relu')

        pool1 = tf.nn.max_pool(conv1, [1,2,2,1], [1,2,2,1], padding='VALID', name='pool1')

    with tf.variable_scope('conv2'):
        kernel = tf.get_variable('weights',
                                 [3,3,32,64],
                                 tf.float32,
                                 initializer=tf.random_normal_initializer(stddev=1e-3))
        bias = tf.get_variable('bias',
                              [64],
                              tf.float32,
                              initializer=tf.zeros_initializer())
        conv2 = tf.nn.conv2d(pool1, kernel, [1,1,1,1], padding='SAME', name='conv')
        conv2 = tf.nn.relu(tf.nn.bias_add(conv2, bias), name='relu')

        pool2 = tf.nn.max_pool(conv2, [1,2,2,1], [1,2,2,1], padding='VALID', name='pool2')

    # flatten the conv features; the static batch size is needed for the reshape
    batch_size = images.get_shape()[0].value
    flatten = tf.reshape(pool2, shape=[batch_size, -1])
    dim = flatten.get_shape()[1].value
    with tf.variable_scope('fc1'):
        weights = tf.get_variable('weights',
                                  [dim, 384],
                                  tf.float32,
                                  initializer=tf.random_normal_initializer(stddev=1e-3))
        bias = tf.get_variable('bias',
                               [384],
                               tf.float32,
                               initializer=tf.zeros_initializer())
        fc1 = tf.matmul(flatten, weights, name='fc')
        fc1 = tf.nn.sigmoid(tf.nn.bias_add(fc1, bias), name='sigmoid')

    with tf.variable_scope('softmax_linear'):
        weights = tf.get_variable('weights',
                                  [384, 10],
                                  tf.float32,
                                  initializer=tf.random_normal_initializer(stddev=1e-3))
        bias = tf.get_variable('bias',
                               [10],
                               tf.float32,
                               initializer=tf.zeros_initializer())
        fc2 = tf.matmul(fc1, weights, name='fc')
        # final fully connected layer; returned without applying softmax
        logits = tf.nn.bias_add(fc2, bias, name='logits')

    print('inference success')
    return logits

# calculate loss
def loss(logits, labels):

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels,
                                                                   name='cross_entropy')
    loss = tf.reduce_mean(cross_entropy, name='loss')

    print('loss success')
    return loss

# returns the training op
def train(loss):

    opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    train_op = opt.minimize(loss)

    print('train success')
    return train_op

# calculate accuracy on the training batch
def accuracy(logits, labels):

    # cast the element-wise comparison to float32 so the mean is a fraction;
    # averaging an int32 tensor would truncate the result to 0
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, axis=1, output_type=tf.int32), labels),
                                      tf.float32),
                              name='accuracy')
    return accuracy

if __name__ == '__main__':

    data_dir = '/home/mao/Notebooks/cifar10/cifar-10-batches-bin/'
    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in range(1, 6)]

    example_batch = cifar10_input_pipeline.input_pipeline(filenames, batch_size=32, num_epochs=None)

    images = example_batch[0]
    labels = example_batch[1]

    _logits = inference(images)
    _loss = loss(_logits, labels)
    _train_op = train(_loss)

    sess = tf.Session()
    #global_step = tf.train.get_or_create_global_step()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    tf.summary.FileWriterCache.clear()
    writer = tf.summary.FileWriter('./test_model', sess.graph)
    Loss = tf.summary.scalar('Loss', _loss)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    try:
        for i in range(1000):
            if not coord.should_stop():
                _, sum1 = sess.run([_train_op, Loss])
                writer.add_summary(sum1, i)
                if i % 100 == 0:
                    print(sess.run(labels))
    except tf.errors.OutOfRangeError:
        print('catch OutOfRangeError')
    finally:
        coord.request_stop()

    coord.join(threads)
    writer.flush()
    writer.close()
    sess.close()
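
For completeness, a small sketch of the same training loop that also fetches the raw scalar loss, so its value can be printed directly rather than read only from the summary. It reuses the graph, session, coordinator, and writer built above and is not part of the original run:

# Sketch: variant of the loop above that also fetches _loss to print it.
try:
    for i in range(1000):
        if not coord.should_stop():
            _, loss_val, sum1 = sess.run([_train_op, _loss, Loss])
            writer.add_summary(sum1, i)
            if i % 100 == 0:
                print('step %d, loss = %.4f' % (i, loss_val))
except tf.errors.OutOfRangeError:
    print('catch OutOfRangeError')
finally:
    coord.request_stop()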

More details:

Below are the loss summary and the label batches printed while running sess.run(train_op). Again, I am sure the label and image batches are shuffled during training.

So, is there any mistake in the function calls? Even with tf.nn.relu removed from the output (logits) layer, the loss still will not go down. I am confused.

[screenshot: loss summary]

[screenshot: printed label batches]
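
For context, with 10 balanced classes an untrained softmax classifier should produce a cross-entropy near the uniform-guess value, so a curve stuck around that number suggests the network never moves away from (near-)uniform predictions. A quick check of that reference value:

# Reference value: cross-entropy of a uniform guess over 10 classes.
import math
print(math.log(10))  # ~2.3026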

Can anyone help?
Thanks!