我试图使用Theano训练一个非常标准的MLP模型有点困难 . 我的模型代码看起来像这样
class Layer(object):
def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
def weights(shape):
return np.array(np.random.uniform(size=shape), dtype='float64')
def biases(size):
return np.zeros((size), dtype='float64')
self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
self.output = activation(T.dot(inputs, self.W) + self.b)
self.pred = T.argmax(self.output, axis=1)
self.params = [self.W, self.b]
class MLP(object):
def __init__(self, inputs, n_in, n_hidden, n_out):
""" for now lets go with one hidden layer"""
self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
self._output = Layer(self._hidden.output, n_hidden, n_out) # softmax by default
def loss(self, one_hot):
return T.mean(T.sqr(one_hot - self._output.output)
def accuracy(self, y):
return T.mean(T.eq(self._output.pred, y))
def updates(self, loss, rate=0.01):
updates = []
updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
return updates
然后我试着像这样训练它
x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')
# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)
labels = T.extra_ops.to_one_hot(y, 10)
# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)
# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)
# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
# (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)
# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')
train = theano.function([index, size],
[loss, accuracy],
updates=updates,
givens={x: train_set[0][index * size: (index + 1) * size],
y: train_set[1][index * size: (index + 1) * size]})
valid = theano.function([index, size],
[loss, accuracy],
givens={x: valid_set[0][index * size: (index + 1) * size],
y: valid_set[1][index * size: (index + 1) * size]})
test = theano.function([index, size],
[accuracy],
givens={x: test_set[0][index * size: (index + 1) * size],
y: test_set[1][index * size: (index + 1) * size]})
n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size
losses = np.empty(0)
errors = np.empty(0)
for epoch in range(1, n_epochs + 1):
epoch_losses = np.empty(0)
epoch_errors = np.empty(0)
for batch_n in range(batches_in_epoch):
l, e = train(batch_n, batch_size)
epoch_losses = np.append(epoch_losses, l)
epoch_errors = np.append(epoch_errors, e)
print('[%s]' % time.ctime(),
'epoch: ', epoch,
'batch: ', batch_n,
'loss: ', np.round(l, 4),
'accuracy: ', np.round(e, 4))
# shuffle train set every epoch
shuffle = np.arange(datasets[0][1].shape[0])
np.random.shuffle(shuffle)
train_set[0] = train_set[0][shuffle]
train_set[1] = train_set[1][shuffle]
losses = np.concatenate([losses, epoch_losses])
errors = np.concatenate([errors, epoch_errors])
valid_l, valid_e = valid(0, datasets[1][0].shape[0])
print('[%s]' % time.ctime(), 'epoch: ', epoch, 'validation loss: ', valid_l, 'validation accuracy: ', valid_e)
acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])
现在,如果你看一下评论,我用一个基本的逻辑回归模型尝试了它并且它有效,我得到了80%的准确率 . 但是当我用我的MLP模型替换它时,它不起作用 . 它不会收敛到任何东西,我得到10%的准确性随机猜测 . 我究竟做错了什么?我使用的数据是按照Theano教程的方式加载到共享变量中的MNIST数据集 .
1 回答
问题似乎在于权重初始化 . 你是如何在tensorflow实现中做到这一点的?
我现在对基础数学不太了解,所以如果我错了就纠正我,但我喜欢解释它,就像所有权重都是正数一样,模型无法学习负面特征 .
您可以尝试将
low=-1, high=1
添加到初始化(默认值np.random.uniform
介于0和1之间) . 在我的测试中,这需要很长时间才能收敛(约100个时代),但至少它确实如此 .使用更智能的glorot initialization像这样:
使培训更快 . 在5个时代之后,我将大约90%的验证准确性添加到您的代码中 .
这也是theano MLP example中权重初始化的方式 .