尝试在TensorFlow中实现端到端的CNN模型 . 输入是包含11779个图像的文件夹,输出是0-200的int标签 . 我've already created a .tfrecords file for both image and labels and I'm使用 tf.train.string_input_producer()
和 tf.train.shuffle_batch()
来启动培训 .
我在训练集中有11779个例子 . 当我设置batch_size = 128时,使用('tf.nn.sparse_softmax_cross_entropy_with_logits`计算的训练损失在训练了84个批次后准确地返回NaN(10496个例子) . 当我将batch_size设置为256时,损失在41个批次(10752个示例)被训练后返回NaN . 当我将learning_rate设置为0.0(并使用GradientDescent)时,我正在观察同样的问题 . 我非常肯定这不是由于梯度不同造成的 .
我的问题:
(1)我感觉训练不会无限期地循环遍历批次,导致在特定数量的批次循环后出现NaN值 . 我在这方面需要一些帮助 . 我究竟做错了什么 ?
感谢一些帮助 . 这是我的代码:
import numpy as np
import os
import os.path
import tensorflow as tf
filename = 'inputdata.tfrecords'
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([128*128*3])
image = tf.reshape(image,[128,128,3])
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
image = tf.cast(image, tf.float32)
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.int32)
return image,label
def weight_variable(shape):
initial = tf.truncated_normal(shape,stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1,shape=shape)
return tf.Variable(initial)
def conv2d(x,W):
return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')
def max_pool_2X2(x):
return tf.nn.max_pool(x,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
with tf.Session() as sess:
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=None,shuffle=True)
image,label = read_and_decode(filename_queue)
min_after_dequeue = 1000
batch_size = 128
capacity = min_after_dequeue + 3 * batch_size
images_batch, label_batch = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, capacity=capacity,
min_after_dequeue=min_after_dequeue)
# 1st layer
W_conv1 = weight_variable([5,5,3,32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(images_batch,W_conv1) + b_conv1)
h_pool1 = max_pool_2X2(h_conv1)
## 2nd layer
W_conv2 = weight_variable([5,5,32,64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1,W_conv2) + b_conv2)
h_pool2 = max_pool_2X2(h_conv2)
## 3rd layer
W_conv3 = weight_variable([5,5,64,64])
b_conv3 = bias_variable([64])
h_conv3 = tf.nn.relu(conv2d(h_pool2,W_conv3) + b_conv3)
h_pool3 = max_pool_2X2(h_conv3)
# Dense layer
W_fc1 = weight_variable([16*16*64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool3, [-1,16*16*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1)+ b_fc1)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1,keep_prob)
h_fc1_drop = tf.nn.dropout(h_fc1,0.3)
# readout layer
W_fc2 = weight_variable([1024,200])
b_fc2 = bias_variable([200])
y_conv=tf.matmul(h_fc1_drop,W_fc2) + b_fc2
# Define loss and optimizer
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(y_conv, label_batch)
# for monitoring
loss_mean = tf.reduce_mean(loss)
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(y_conv, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
init = tf.initialize_all_variables()
sess.run(init)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
try:
while not coord.should_stop():
# Run training steps or whatever
_, loss_val = sess.run([train_op, loss_mean])
print loss_val
assert not np.isnan(loss_val), 'NaN'
except tf.errors.OutOfRangeError:
print('Done training -- epoch limit reached')
finally:
# When done, ask the threads to stop.
coord.request_stop()
#Wait for threads to finish.
coord.join(threads)
sess.close()