Home Articles

如何用pytorch中的SGD成功训练一个简单的线性回归模型?

Asked
Viewed 33 times
1

我试图用SGD在pytorch中训练一个简单的多项式线性回归模型 . 我写了一些自包含(我认为是非常简单的代码),然而,由于某种原因,我的模型没有按照我的想法进行训练 .

我从正弦曲线中采样了5个点,并尝试用4度多项式拟合它 . 这是一个凸问题,所以只要我们有足够的迭代和足够小的步长,GD或SGD应该找到零列车误差的解决方案 . 尺寸 . 出于某种原因,我的模型不能很好地训练(即使它似乎正在改变模型的参数 . 任何人都知道为什么?这是代码(我尝试使其自包含和最小):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

import torch
from torch.autograd import Variable

from maps import NamedDict

from plotting_utils import *

def index_batch(X,batch_indices,dtype):
    '''
    returns the batch indexed/sliced batch
    '''
    if len(X.shape) == 1: # i.e. dimension (M,) just a vector
        batch_xs = torch.FloatTensor(X[batch_indices]).type(dtype)
    else:
        batch_xs = torch.FloatTensor(X[batch_indices,:]).type(dtype)
    return batch_xs

def get_batch2(X,Y,M,dtype):
    '''
    get batch for pytorch model
    '''
    # TODO fix and make it nicer, there is pytorch forum question
    X,Y = X.data.numpy(), Y.data.numpy()
    N = len(Y)
    valid_indices = np.array( range(N) )
    batch_indices = np.random.choice(valid_indices,size=M,replace=False)
    batch_xs = index_batch(X,batch_indices,dtype)
    batch_ys = index_batch(Y,batch_indices,dtype)
    return Variable(batch_xs, requires_grad=False), Variable(batch_ys, requires_grad=False)

def get_sequential_lifted_mdl(nb_monomials,D_out, bias=False):
    return torch.nn.Sequential(torch.nn.Linear(nb_monomials,D_out,bias=bias))

def train_SGD(mdl, M,eta,nb_iter,logging_freq ,dtype, X_train,Y_train):
    ##
    N_train,_ = tuple( X_train.size() )
    #print(N_train)
    for i in range(nb_iter):
        # Forward pass: compute predicted Y using operations on Variables
        batch_xs, batch_ys = get_batch2(X_train,Y_train,M,dtype) # [M, D], [M, 1]
        ## FORWARD PASS
        y_pred = mdl.forward(batch_xs)
        ## LOSS + Regularization
        batch_loss = (1/M)*(y_pred - batch_ys).pow(2).sum()
        ## BACKARD PASS
        batch_loss.backward() # Use autograd to compute the backward pass. Now w will have gradients
        ## SGD update
        for W in mdl.parameters():
            delta = eta*W.grad.data
            W.data.copy_(W.data - delta)
        ## train stats
        if i % (nb_iter/10) == 0 or i == 0:
            current_train_loss = (1/N_train)*(mdl.forward(X_train) - Y_train).pow(2).sum().data.numpy()
            print('i = {}, current_loss = {}'.format(i, current_train_loss ) )
        ## Manually zero the gradients after updating weights
        mdl.zero_grad()
##
logging_freq = 100
dtype = torch.FloatTensor
## SGD params
M = 3
eta = 0.0002
nb_iter = 20*1000
##
lb,ub = 0,1
f_target = lambda x: np.sin(2*np.pi*x)
N_train = 5
X_train = np.linspace(lb,ub,N_train)
Y_train = f_target(X_train)
## degree of mdl
Degree_mdl = 4
## pseudo-inverse solution
c_pinv = np.polyfit( X_train, Y_train , Degree_mdl )[::-1]
## linear mdl to train with SGD
nb_terms = c_pinv.shape[0]
mdl_sgd = get_sequential_lifted_mdl(nb_monomials=nb_terms,D_out=1, bias=False)
## Make polynomial Kernel
poly_feat = PolynomialFeatures(degree=Degree_mdl)
Kern_train = poly_feat.fit_transform(X_train.reshape(N_train,1))
Kern_train_pt, Y_train_pt = Variable(torch.FloatTensor(Kern_train).type(dtype), requires_grad=False), Variable(torch.FloatTensor(Y_train).type(dtype), requires_grad=False)
train_SGD(mdl_sgd, M,eta,nb_iter,logging_freq ,dtype, Kern_train_pt,Y_train_pt)

错误似乎徘徊在2ish:

i = 0, current_loss = [ 2.08996224]
i = 2000, current_loss = [ 2.03536892]
i = 4000, current_loss = [ 2.02014995]
i = 6000, current_loss = [ 2.01307297]
i = 8000, current_loss = [ 2.01300406]
i = 10000, current_loss = [ 2.01125693]
i = 12000, current_loss = [ 2.01162267]
i = 14000, current_loss = [ 2.01296973]
i = 16000, current_loss = [ 2.00951076]
i = 18000, current_loss = [ 2.00967121]

这很奇怪,因为它应该能够达到零 .

我还绘制了学习函数:

enter image description here

绘图的代码:

##
x_horizontal = np.linspace(lb,ub,1000).reshape(1000,1)
X_plot = poly_feat.fit_transform(x_horizontal)
X_plot_pytorch = Variable( torch.FloatTensor(X_plot), requires_grad=False)
##
fig1 = plt.figure()
#plots objs
p_sgd, = plt.plot(x_horizontal, [ float(f_val) for f_val in mdl_sgd.forward(X_plot_pytorch).data.numpy() ])
p_pinv, = plt.plot(x_horizontal, np.dot(X_plot,c_pinv))
p_data, = plt.plot(X_train,Y_train,'ro')
## legend
nb_terms = c_pinv.shape[0]
legend_mdl = f'SGD solution standard parametrization, number of monomials={nb_terms}, batch-size={M}, iterations={nb_iter}, step size={eta}'
plt.legend(
        [p_sgd,p_pinv,p_data],
        [legend_mdl,f'linear algebra soln, number of monomials={nb_terms}',f'data points = {N_train}']
    )
##
plt.xlabel('x'), plt.ylabel('f(x)')
plt.show()

我实际上继续实施了TensorFlow版本 . 那似乎确实训练了模型 . 我尝试通过给它们相同的初始化来使它们都匹配:

mdl_sgd[0].weight.data.fill_(0)

但那仍然行不通 . Tensorflow代码:

graph = tf.Graph()
with graph.as_default():
    X = tf.placeholder(tf.float32, [None, nb_terms])
    Y = tf.placeholder(tf.float32, [None,1])
    w = tf.Variable( tf.zeros([nb_terms,1]) )
    #w = tf.Variable( tf.truncated_normal([Degree_mdl,1],mean=0.0,stddev=1.0) )
    #w = tf.Variable( 1000*tf.ones([Degree_mdl,1]) )
    ##
    f = tf.matmul(X,w) # [N,1] = [N,D] x [D,1]
    #loss = tf.reduce_sum(tf.square(Y - f))
    loss = tf.reduce_sum( tf.reduce_mean(tf.square(Y-f), 0))
    l2loss_tf = (1/N_train)*2*tf.nn.l2_loss(Y-f)
    ##
    learning_rate = eta
    #global_step = tf.Variable(0, trainable=False)
    #learning_rate = tf.train.exponential_decay(learning_rate=eta, global_step=global_step,decay_steps=nb_iter/2, decay_rate=1, staircase=True)
    train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
    with tf.Session(graph=graph) as sess:
        Y_train = Y_train.reshape(N_train,1)
        tf.global_variables_initializer().run()
        # Train
        for i in range(nb_iter):
            #if i % (nb_iter/10) == 0:
            if i % (nb_iter/10) == 0 or i == 0:
                current_loss = sess.run(fetches=loss, feed_dict={X: Kern_train, Y: Y_train})
                print(f'i = {i}, current_loss = {current_loss}')
            ## train
            batch_xs, batch_ys = get_batch(Kern_train,Y_train,M)
            sess.run(train_step, feed_dict={X: batch_xs, Y: batch_ys})

我也试过改变初始化,但它没有改变任何东西,这是有道理的,因为它不应该有很大的不同:

mdl_sgd[0].weight.data.normal_(mean=0,std=0.001)

原帖:

https://discuss.pytorch.org/t/how-to-train-a-simple-linear-regression-model-with-sgd-in-pytorch-successfully/9620


它应该是这样的:

enter image description here


解:

似乎结果作为向量而不是导致问题的数字返回存在问题 . 即以下代码固定的东西:

y_pred = model(batch_xs).view(-1) # change this to "y_pred = model(batch_xs)" to get the incorrect results
    loss = (y_pred - batch_ys).pow(2).mean()

这对我来说似乎完全神秘 . 有人知道为什么这解决了这个问题吗?它看起来就像魔术一样 .

1 Answer

  • 1

    这个bug真的很微妙,但实质上是因为pytorch正在使用numpy广播规则 . 所以当一个列向量 (3,1) 和一个数组(即dim是 (3,) )时,会发生的是广播产生一个 (3,3) 矩阵(注意当你用 (3,) 数组减去一个行向量 (1,3) 向量时不会发生这种情况,我猜数组会被处理掉作为行向量) . 这非常糟糕,因为这意味着我们计算每个标签和每个预测之间所有成对差异的矩阵 . 当然,这是荒谬的,并产生一个错误,因为我们不会产生任何明智的东西 .

    因此,答案似乎只是为了避免错误的numpy广播,无论是在培训期间还是在数据输入之前重塑事物 . 任何一个都应该工作 .


    为了避免错误,可以附加使用此代码:

    def check_vectors_have_same_dimensions(Y,Y_):
        '''
        Checks that vector Y and Y_ have the same dimensions. If they don't
        then there might be an error that could be caused due to wrong broadcasting.
        '''
        DY = tuple( Y.size() )
        DY_ = tuple( Y_.size() )
        if len(DY) != len(DY_):
            return True
        for i in range(len(DY)):
            if DY[i] != DY_[i]:
                return True
        return False
    

Related