EDIT:

The original title of this post was

"Loss jumps when the learning rate is reduced, then slowly comes back down"

but I now believe this happens whenever a saved model is restored, regardless of whether the learning rate changes.

After 10 epochs with a learning rate of 0.001, the loss had reached ~10. I then saved the model, restored it and resumed training, and the loss at epoch 11 was ~15. After another 5 epochs it still had not come back down to ~10.
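
One thing I can check is whether the checkpoint actually contains everything the optimizer needs. Below is a minimal sketch (the path SavedModels/Autoencoder.ckpt matches what the save() method further down uses); if the Adam slot variables were missing from the checkpoint, the optimizer would restart its moment estimates from scratch after a restore, which could produce exactly this kind of temporary jump.

import tensorflow as tf

# Sketch: list every variable stored in the checkpoint written by save().
# tf.train.Saver() should have included the Adam slot variables
# ("<var>/Adam", "<var>/Adam_1") and the "beta1_power"/"beta2_power"
# accumulators; if they are absent, restoring resets the optimizer state.
ckptPath = 'SavedModels/Autoencoder.ckpt'
for name, shape in tf.train.list_variables(ckptPath):
    print(name, shape)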


I have built an Autoencoder model, shown below. It uses tf.train.Saver() to save and restore the trained model. The learning rate is a tf.placeholder, so it can be specified in the feed_dict.

When training the model, I start with a relatively high learning rate and let it train until the loss begins to plateau. I then save the model, manually lower the learning rate, restore the model and resume training.

However, whenever I do this, the loss immediately after the change is always considerably higher than it was just before, and then decreases again slowly but steadily. I cannot understand why this jump happens, unless something is wrong with my implementation. (The save / lower-learning-rate / restore cycle is sketched again after the class definition below.)

import os
import pickle as pk

import tensorflow as tf


class Autoencoder:

    def __init__(self, encoderDims, sparseInput=False, tiedWeights=False, denoise=False):
        self.encoderDims = encoderDims
        self.decoderDims = list(reversed(encoderDims))
        self.sparseInput = sparseInput
        self.tiedWeights = tiedWeights
        self.denoise = denoise          # Only works for greyscale image data

        self.input = tf.placeholder(tf.float32, [None, encoderDims[0]])
        self.learningRate = tf.placeholder(tf.float32, [])

        self.activationFunction = tf.nn.sigmoid             # TO DO: Allow to be specified by user
        # self.activationFunction = tf.tanh
        # self.activationFunction = tf.nn.selu
        self.SGD = tf.train.AdamOptimizer(self.learningRate)

        if self.denoise:
            self.__addNoise()
        self.__buildNetwork()           # Constructs Encoder & Decoder
        self.__buildTensorFlowGraph()   # Creates sequential TensorFlow operations

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())  # Initialise weights & biases
        self.saver = tf.train.Saver()
        self.session.graph.finalize()   # Avoids memory leaks through duplicating graph nodes

    def __addNoise(self):
        # Create a tensor of random numbers from a standard normal distribution,
        # then set pixels to 255 wherever the random value is > 1.0
        # (i.e. the upper tail of the distribution -> ~16% of pixels)
        random = tf.random_normal(tf.shape(self.input))
        mask = tf.greater(random, 1.0)
        self.noisyInput = tf.where(
            mask, tf.ones_like(self.input) * 255, self.input)

    def __buildNetwork(self):
        # Lists of weights and biases per layer of encoder and decoder
        self.encoderWeights, self.encoderBiases = [], []
        self.decoderWeights, self.decoderBiases = [], []
        for layer in range(len(self.encoderDims) - 1):
            self.encoderWeights.append(
                tf.Variable(tf.random_normal(
                    [self.encoderDims[layer], self.encoderDims[layer + 1]]))
            )
            self.encoderBiases.append(
                tf.Variable(tf.zeros([self.encoderDims[layer + 1]]))
            )
            # if layer != len(self.decoderDims) - 2:  # BIAS IN OUTPUT LAYER????
            self.decoderBiases.append(
                tf.Variable(tf.zeros([self.decoderDims[layer + 1]]))
            )
            if not self.tiedWeights:
                self.decoderWeights.append(
                    tf.Variable(tf.random_normal(
                        [self.decoderDims[layer], self.decoderDims[layer + 1]]))
                )
        if self.tiedWeights:
            self.decoderWeights = [tf.transpose(
                i) for i in reversed(self.encoderWeights)]

    def __buildTensorFlowGraph(self):
        self.encoded = self.encode()        # Encoded/compressed data
        self.decoded = self.decode()        # Decoded/reconstructed data
        self.loss = self.__calculateLoss()
        self.train = self.SGD.minimize(self.loss)

    def encode(self):
        if self.denoise:
            encoded = self.noisyInput
        else:
            encoded = self.input
        for layer in range(len(self.encoderDims) - 1):
            encoded = tf.matmul(encoded, self.encoderWeights[layer])
            encoded = tf.add(encoded, self.encoderBiases[layer])
            # if layer != len(self.encoderDims) - 2:    # KEEP LAST LINEAR?
            encoded = self.activationFunction(encoded)
        return encoded

    def decode(self):
        decoded = self.encoded
        for layer in range(len(self.decoderDims) - 1):
            decoded = tf.matmul(decoded, self.decoderWeights[layer])
            # if layer != len(self.decoderDims) - 2:  # BIAS IN OUTPUT LAYER????
            decoded = tf.add(decoded, self.decoderBiases[layer])
            if layer != len(self.decoderDims) - 2:  # Keep output layer linear
                decoded = self.activationFunction(decoded)
        return decoded

    def __calculateLoss(self):
        # TO DO: ADD REGULARISATION
        if self.sparseInput:
            # tf.where on a 2-D tensor returns [row, col] index pairs,
            # so gather_nd (not gather) is needed to pick out the
            # individual non-zero entries
            nonZeros = tf.where(tf.greater(self.input, 0))
            input = tf.gather_nd(self.input, nonZeros)
            output = tf.gather_nd(self.decoded, nonZeros)
        else:
            input = self.input
            output = self.decoded

        return tf.sqrt(
            tf.losses.mean_squared_error(
                labels=input,
                predictions=output
            )
        )

    def setBatch(self, input, learningRate=0.0):
        self.batchDict = {
            self.input: input,
            self.learningRate: learningRate
        }

    def run(self, operations=None, train=False):
        # Returns values of specified list of operations
        # Trains network's parameters if specified
        if not isinstance(operations, list):
            operations = [operations]

        if train:
            ops = [self.train]
        else:
            ops = []

        if operations is not None:
            for op in operations:
                if op == 'input':
                    ops.append(self.input)
                if op == 'noisyInput':
                    ops.append(self.noisyInput)
                if op == 'encoded':
                    ops.append(self.encoded)
                if op == 'decoded':
                    ops.append(self.decoded)
                if op == 'loss':
                    ops.append(self.loss)

        if (train and len(ops) == 2) or (not train and len(ops) == 1):
            return self.session.run(ops, self.batchDict)[-1]
        elif train:
            return self.session.run(ops, self.batchDict)[1:]
        else:
            return self.session.run(ops, self.batchDict)

    def save(self, epoch, modelName="Autoencoder"):
        modelName += '.ckpt'
        dir = os.path.dirname(os.path.realpath(__file__)) + '/SavedModels/'
        self.saver.save(self.session, dir + modelName)
        loss = self.session.run(self.loss, self.batchDict)
        with open(dir + modelName + '_epoch.pk', 'wb') as epochFile:
            pk.dump(epoch, epochFile)
        with open(dir + modelName + '_loss.pk', 'wb') as lossFile:
            pk.dump(loss, lossFile)

    def restore(self, modelName="Autoencoder"):
        modelName += '.ckpt'
        dir = os.path.dirname(os.path.realpath(__file__)) + '/SavedModels/'
        self.saver.restore(self.session, dir + modelName)
        with open(dir + modelName + '_epoch.pk', 'rb') as epochFile:
            epoch = pk.load(epochFile)
        with open(dir + modelName + '_loss.pk', 'rb') as lossFile:
            loss = pk.load(lossFile)
        return epoch, loss

    def kill(self):
        self.session.close()
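
Before the full training script, this is the essence of the save / lower-learning-rate / restore cycle described above, reduced to a sketch. In my actual runs the restore happens in a fresh process (via loadModel=True in the script below); someBatch and the layer sizes here are just placeholders:

# Sketch of the manual learning-rate schedule described above.
ae = Autoencoder([9066, 4533], sparseInput=True)

# Phase 1: train at the higher learning rate until the loss plateaus, then save.
ae.setBatch(someBatch, learningRate=0.001)   # someBatch: any batch of training data
lossBefore = ae.run(['loss'], train=True)
ae.save(epoch=10)

# Phase 2: restore the saved weights and resume with a lower learning rate.
# Only the value fed to the learningRate placeholder changes.
ae.restore()
ae.setBatch(someBatch, learningRate=0.0001)
lossAfter = ae.run(['loss'], train=True)

# It is after this restore-and-resume step that I see the higher loss.
print(lossBefore, lossAfter)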

Here is how the model is trained:

import os

import numpy as np

import pandas as pd
from Autoencoder import Autoencoder

loadModel = True
# loadModel = False
learningRate = 0.001
numEpochs = 10000
batchSize = 1  # Divide into 670
printStep = 1

projectDir = os.path.dirname(os.path.realpath(__file__))

# (671, 9066)
original = pd.read_csv(projectDir + '/Data/ratings_small_pivoted.csv').drop('userId', axis=1)
numSamples = original.shape[0]
numFeatures = original.shape[1]
numBatches = numSamples // batchSize

encoderDims = [
    numFeatures,
    numFeatures // 2
]

ae = Autoencoder(encoderDims, sparseInput=True)

if loadModel:
    bestEpoch, bestLoss = ae.restore()
else:
    bestEpoch, bestLoss = 0, 9999

for epoch in range(1, numEpochs - bestEpoch + 1):
    epochLoss = 0
    for batch in range(numBatches):
        batchInput = original[batch * batchSize: (batch + 1) * batchSize]
        ae.setBatch(batchInput, learningRate)
        batchLoss = ae.run(['loss'], train=True)
        epochLoss += batchLoss
    epochLoss /= numBatches
    if epochLoss < bestLoss:
        bestLoss = epochLoss
        ae.save(epoch + bestEpoch)
    if epoch == 1 or epoch % printStep == 0:
        print("EPOCH: {} / {}".format(epoch + bestEpoch, numEpochs))
        print("LOSS:  {} ({})\n".format("%.4f" % epochLoss, "%.4f" % bestLoss))

Here is the data I am using.

With a learning rate of 0.001, the loss reached ~1.0 after 100 epochs. I then reduced the learning rate to 0.0001, and the loss immediately jumped to ~3.9 in the first epoch after the change; it kept decreasing after that, but very slowly.
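
One way to separate the effect of the restore from the effect of the learning-rate change would be to evaluate the loss on the same fixed batch immediately before saving and immediately after restoring, with no training step or learning-rate change in between. A sketch, assuming the class and data above (fixedBatch is a stand-in for one batch of the ratings data):

# Sketch: check whether restoring alone changes the loss, with no training step
# and no learning-rate change in between.
ae = Autoencoder(encoderDims, sparseInput=True)

# fixedBatch: a stand-in for one batch of the ratings data.
ae.setBatch(fixedBatch, learningRate=0.001)

lossBeforeSave = ae.run(['loss'])      # evaluation only, no training step
ae.save(epoch=0)

ae.restore()
lossAfterRestore = ae.run(['loss'])    # same weights -> should match exactly

print(lossBeforeSave, lossAfterRestore)

If these match within one process but the jump still appears when restoring in a separate run, the difference would have to come from something that changes between runs (for example the batches being fed) rather than from the checkpoint itself.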