
Training an MLP in Theano


I'm having some trouble training a fairly standard MLP model in Theano. My model code looks like this:

import numpy as np
import theano
import theano.tensor as T

class Layer(object):
    def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
        def weights(shape):
            return np.array(np.random.uniform(size=shape), dtype='float64')
        def biases(size):
            return np.zeros((size), dtype='float64')

        self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
        self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
        self.output = activation(T.dot(inputs, self.W) + self.b)
        self.pred = T.argmax(self.output, axis=1)
        self.params = [self.W, self.b]

class MLP(object):
    def __init__(self, inputs, n_in, n_hidden, n_out):
        """ for now lets go with one hidden layer"""
        self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
        self._output = Layer(self._hidden.output, n_hidden, n_out) # softmax by default        
    def loss(self, one_hot):
        return T.mean(T.sqr(one_hot - self._output.output))
    def accuracy(self, y):
        return T.mean(T.eq(self._output.pred, y))    
    def updates(self, loss, rate=0.01):
        updates = []
        updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
        updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
        updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
        updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
        return updates

Then I try to train it like this:

import time

x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')

# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)

labels = T.extra_ops.to_one_hot(y, 10)
# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)
# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)

# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
#           (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)

# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')

train = theano.function([index, size], 
                        [loss, accuracy],
                        updates=updates,
                        givens={x: train_set[0][index * size: (index + 1) * size],
                                y: train_set[1][index * size: (index + 1) * size]})

valid = theano.function([index, size], 
                        [loss, accuracy],
                        givens={x: valid_set[0][index * size: (index + 1) * size],
                                y: valid_set[1][index * size: (index + 1) * size]})

test = theano.function([index, size], 
                       [accuracy],
                       givens={x: test_set[0][index * size: (index + 1) * size],
                               y: test_set[1][index * size: (index + 1) * size]})

n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size

losses = np.empty(0)
errors = np.empty(0)

for epoch in range(1, n_epochs + 1):
    epoch_losses = np.empty(0)
    epoch_errors = np.empty(0)
    for batch_n in range(batches_in_epoch):
        l, e = train(batch_n, batch_size)
        epoch_losses = np.append(epoch_losses, l)
        epoch_errors = np.append(epoch_errors, e)
        print('[%s]' % time.ctime(), 
              'epoch: ', epoch, 
              'batch: ', batch_n, 
              'loss: ', np.round(l, 4), 
              'accuracy: ', np.round(e, 4))
    # shuffle train set every epoch
    shuffle = np.arange(datasets[0][1].shape[0])
    np.random.shuffle(shuffle)
    train_set[0] = train_set[0][shuffle]
    train_set[1] = train_set[1][shuffle]

    losses = np.concatenate([losses, epoch_losses])
    errors = np.concatenate([errors, epoch_errors])
    valid_l, valid_e = valid(0, datasets[1][0].shape[0])
    print('[%s]' % time.ctime(), 'epoch: ', epoch, 'validation loss: ', valid_l, 'validation accuracy: ', valid_e)

acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])

Now, if you look at the comments, I tried this with a basic logistic regression model and it works: I get about 80% accuracy. But when I replace it with my MLP model, it doesn't work. It doesn't converge to anything, and I get 10% accuracy, which is just random guessing. What am I doing wrong? The data I'm using is the MNIST dataset, loaded into shared variables the way the Theano tutorial does it.
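For reference, the loading follows the Theano tutorial pattern, roughly like this (a sketch from memory; the shared_dataset helper name and the mnist.pkl.gz path are just how I set it up, details may differ):

import gzip
import pickle

def shared_dataset(data_xy):
    # keep both features and labels in Theano shared variables so the
    # batch slicing in the givens stays symbolic
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype='float64'), borrow=True)
    shared_y = theano.shared(np.asarray(data_y, dtype='int32'), borrow=True)
    return [shared_x, shared_y]

with gzip.open('mnist.pkl.gz', 'rb') as f:
    # three (images, labels) pairs of plain numpy arrays
    datasets = pickle.load(f, encoding='latin1')

train_set = shared_dataset(datasets[0])
valid_set = shared_dataset(datasets[1])
test_set = shared_dataset(datasets[2])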

1 Answer


    The problem seems to be the weight initialization. How did you do it in your tensorflow implementation?

    I'm not really on top of the underlying math right now, so correct me if I'm wrong, but the way I like to explain it is: if all the weights are positive, the model cannot learn negative features.

    You can try adding low=-1, high=1 to the initialization (the default range of np.random.uniform is between 0 and 1). In my tests this took a very long time to converge (around 100 epochs), but at least it did.
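    In your weights helper that change would look roughly like this (a sketch, keeping your np import and dtype; only the bounds differ):

    def weights(shape):
        # symmetric uniform init instead of the default [0, 1) range
        return np.array(np.random.uniform(low=-1, high=1, size=shape), dtype='float64')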

    Using the smarter Glorot initialization, like this:

    def weights(shape):
        return np.random.uniform(low=-np.sqrt(6. / sum(shape)),
                                 high=np.sqrt(6. / sum(shape)),
                                 size=shape)
    

    makes training much faster. After adding it to your code I got about 90% validation accuracy after 5 epochs.

    This is also how the weights are initialized in the Theano MLP example.
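    For comparison, the hidden-layer initialization in that example looks roughly like this (paraphrased; it uses a seeded RandomState, theano.config.floatX, and scales the bounds by 4 for sigmoid activations):

    rng = np.random.RandomState(1234)
    W_values = np.asarray(
        rng.uniform(low=-np.sqrt(6. / (n_in + n_out)),
                    high=np.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)),
        dtype=theano.config.floatX)
    if activation == T.nnet.sigmoid:
        W_values *= 4
    W = theano.shared(value=W_values, name='W', borrow=True)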
