在Tensorflow中特定迭代次数之后的NaN值不是由于梯度不同引起的-Java 学习之路

尝试在TensorFlow中实现端到端的CNN模型 . 输入是包含11779个图像的文件夹，输出是0-200的int标签 . 我've already created a .tfrecords file for both image and labels and I'm使用 tf.train.string_input_producer() 和 tf.train.shuffle_batch() 来启动培训 .

我在训练集中有11779个例子 . 当我设置batch_size = 128时，使用（'tf.nn.sparse_softmax_cross_entropy_with_logits`计算的训练损失在训练了84个批次后准确地返回NaN（10496个例子） . 当我将batch_size设置为256时，损失在41个批次（10752个示例）被训练后返回NaN . 当我将learning_rate设置为0.0（并使用GradientDescent）时，我正在观察同样的问题 . 我非常肯定这不是由于梯度不同造成的 .

我的问题：

（1）我感觉训练不会无限期地循环遍历批次，导致在特定数量的批次循环后出现NaN值 . 我在这方面需要一些帮助 . 我究竟做错了什么？

感谢一些帮助 . 这是我的代码：

import numpy as np
import os
import os.path
import tensorflow as tf


filename = 'inputdata.tfrecords'

def read_and_decode(filename_queue):
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)
  features = tf.parse_single_example(
      serialized_example,
      # Defaults are not specified since both keys are required.
      features={
          'image_raw': tf.FixedLenFeature([], tf.string),
          'label': tf.FixedLenFeature([], tf.int64),
      })


  image = tf.decode_raw(features['image_raw'], tf.uint8)
  image.set_shape([128*128*3])
  image = tf.reshape(image,[128,128,3])

  # Convert from [0, 255] -> [-0.5, 0.5] floats.
  image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
  image = tf.cast(image, tf.float32)
  # Convert label from a scalar uint8 tensor to an int32 scalar.
  label = tf.cast(features['label'], tf.int32)
  return image,label


def weight_variable(shape):
    initial = tf.truncated_normal(shape,stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1,shape=shape)
    return tf.Variable(initial)

def conv2d(x,W):
    return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')

def max_pool_2X2(x):
    return tf.nn.max_pool(x,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')



with tf.Session() as sess:

    filename_queue = tf.train.string_input_producer(
        [filename], num_epochs=None,shuffle=True)

    image,label = read_and_decode(filename_queue)

    min_after_dequeue = 1000
    batch_size = 128
    capacity = min_after_dequeue + 3 * batch_size

    images_batch, label_batch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue)



    # 1st layer
    W_conv1 = weight_variable([5,5,3,32])
    b_conv1 = bias_variable([32])


    h_conv1 = tf.nn.relu(conv2d(images_batch,W_conv1) + b_conv1)
    h_pool1 = max_pool_2X2(h_conv1)


    ## 2nd layer
    W_conv2 = weight_variable([5,5,32,64])
    b_conv2 = bias_variable([64])

    h_conv2 = tf.nn.relu(conv2d(h_pool1,W_conv2) + b_conv2)
    h_pool2 = max_pool_2X2(h_conv2)

    ## 3rd layer
    W_conv3 = weight_variable([5,5,64,64])
    b_conv3 = bias_variable([64])

    h_conv3 = tf.nn.relu(conv2d(h_pool2,W_conv3) + b_conv3)
    h_pool3 = max_pool_2X2(h_conv3)

    # Dense layer

    W_fc1 = weight_variable([16*16*64, 1024])
    b_fc1 = bias_variable([1024])

    h_pool2_flat = tf.reshape(h_pool3, [-1,16*16*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1)+ b_fc1)

    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1,keep_prob)
    h_fc1_drop = tf.nn.dropout(h_fc1,0.3)

    # readout layer
    W_fc2 = weight_variable([1024,200])
    b_fc2 = bias_variable([200])
    y_conv=tf.matmul(h_fc1_drop,W_fc2) + b_fc2

    # Define loss and optimizer
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(y_conv, label_batch)

    # for monitoring
    loss_mean = tf.reduce_mean(loss)

    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(y_conv, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


    init = tf.initialize_all_variables()
    sess.run(init)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)


    try:
        while not coord.should_stop():
            # Run training steps or whatever
            _, loss_val = sess.run([train_op, loss_mean])
            print loss_val
            assert not np.isnan(loss_val), 'NaN'
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

        #Wait for threads to finish.
        coord.join(threads)
        sess.close()

在Tensorflow中特定迭代次数之后的NaN值不是由于梯度不同引起的

相关问题