用于在 caret 中并行运行时获得可重现结果的种子对象-Java 学习之路

我正在尝试用下面的代码在 caret 中以并行方式运行完全可重现的模型，但不明白如何设置 seeds 对象中各个向量的长度 . 对于 gbm，我有 4 个调优参数，共 11 个不同的取值，我的调优网格共有 54 行 . 如果我在下面 "for(i in 1:10)" 那一行中把最后一个值指定为任何 <18 的数，就会收到一个错误："Bad seeds: the seed object should be a list of length 11 with 10 integer vectors of size 18 and the last list element having a single integer."为什么是 18？对于 > 18（例如 54）的值，代码也能正常运行而不报错 - 为什么？非常感谢您的帮助 . 以下代码基于 http://topepo.github.io/caret/training.html ，并补充了一些内容 .

library(mlbench)
data(Sonar)
str(Sonar[, 1:10])
library(caret)
library(doParallel)

# Fix the RNG state so the split below is the same on every run.
set.seed(998)

# Row indices for the training portion (p = 0.75), returned as a matrix
# rather than a list.
train_rows <- createDataPartition(Sonar$Class, p = 0.75, list = FALSE)

# Selected rows form the training set; the remaining rows form the test set.
training <- Sonar[train_rows, ]
testing <- Sonar[-train_rows, ]

# Tuning grid for gbm: every combination of the four parameters below,
# i.e. 3 * 3 * 3 * 2 = 54 rows.
grid <- expand.grid(
  n.trees = seq(50, 150, by = 50),
  interaction.depth = seq(1, 3, by = 1),
  shrinkage = seq(0.09, 0.11, by = 0.01),
  n.minobsinnode = seq(8, 10, by = 2)
)

# Build the `seeds` list handed to trainControl() so that the parallel caret
# run is fully reproducible: one integer vector per resample, followed by a
# single integer for the final model.
set.seed(825)
seeds <- vector(mode = "list", length = 11) # length = (n_repeats * n_resamples) + 1; 10-fold CV gives 10 + 1
# NOTE(review): the last argument of sample.int() is the number of seeds drawn
# per resample. With this gbm grid train() demands at least 18 per resample, so
# the value 11 used here reproduces the "Bad seeds" error being asked about.
for(i in 1:10) seeds[[i]]<- sample.int(n=1000, 11) # one integer vector per resample
seeds[[11]]<-sample.int(1000, 1) # single seed for the last model

# Resampling setup: 10-fold cross-validation, using the user-supplied `seeds`
# list for reproducibility.
fitControl <- trainControl(method = "cv", number = 10, seeds = seeds)

# Run the model in parallel: start one worker per detected core and register
# the cluster as the parallel backend.
cl <- makeCluster(detectCores())
registerDoParallel(cl)

# Fit gbm over the tuning grid. The `finally` clause always shuts the workers
# down, even when train() stops with an error (e.g. "Bad seeds"), so no worker
# processes are left running after the script ends.
gbmFit1 <- tryCatch(
  train(
    Class ~ .,
    data = training, method = "gbm",
    trControl = fitControl, tuneGrid = grid, verbose = FALSE
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # drop the reference to the stopped cluster
  }
)
gbmFit1

1 回答

我将分两部分解答你的问题：

1 - Setting the seeds:

你所说的代码：

# Seed the RNG so that the generated seed list is itself reproducible.
set.seed(825)
# 11 slots: one per resample of the 10-fold CV plus one for the final model.
seeds <- vector(mode = "list", length = 11)
# 54 integers per resample: one per row of the tuning grid (3 * 3 * 3 * 2).
for(i in 1:10) seeds[[i]]<- sample.int(n=1000, 54)
# Single seed for the last model.
seeds[[11]]<-sample.int(1000, 1)

seeds <- vector(mode = "list", length = 11) 中的 11 是 (n_repeats*nresampling)+1 ，所以在你的情况下，你正在使用 10-fold CV ，所以 10+1 = 11 . 如果您使用 repeatedcv 和 number=10 and repeats = 5 ，则会将 11 替换为 (5*10)+1 = 51 .

for(i in 1:10) 中的 10 是 (n_repeats*nresampling) . 在你的情况下它是 10 因为你正在使用 10-fold CV . 同样，如果你使用 repeatedcv 与 number=10 and repeats = 5 ，它将是 for(i in 1:50) .

sample.int(n=1000, 54) 中的 54 是 number of tuning parameter combinations（调优参数组合的数量） . 在您的情况下，有 4 parameters ，分别有 3,3,3 and 2 values . 所以，它是 3*3*3*2 = 54 . 但是，我记得在某处读到过：对于 gbm，只针对网格中的 max(n.trees) 拟合模型，树木较少的模型则由它派生而来 . 这解释了为什么 caret 在你的情况下按 3 * 3 * 2 = 18 而不是 3*3*3*2 = 54 来计算 seeds 的数量，我们稍后会看到 .

但如果您使用带有网格 svmGrid <- expand.grid(sigma= 2^c(-25, -20, -15,-10, -5, 0), C= 2^c(0:5)) 的 SVM 模型，则您的值为 6 * 6 = 36

请记住，使用 seeds 的目的是为每次重采样迭代中拟合的模型设置种子，从而实现 reproducible research（可重现的研究） .

seeds[[11]]<-sample.int(1000, 1) 用于为在完整数据集上拟合的最后（最优）模型设置种子 .

2 - Why you get an error if you specify a value < 18, but no error with a value >= 18

我能够在我的机器上重现相同的错误：

Error in train.default(x, y, weights = w, ...) : 
  Bad seeds: the seed object should be a list of length 11 with 10 integer vectors of size 18 and the last list element having a single integer

因此，通过检查 train.default 的源码，我找到了它的来源 . 当第 4、5 行中的测试 badSeed 为真时，第 7 到 10 行中的 stop 就会触发该错误消息 .

else {
        if (!(length(trControl$seeds) == 1 && is.na(trControl$seeds))) {
            numSeeds <- unlist(lapply(trControl$seeds, length))
4            badSeed <- (length(trControl$seeds) < length(trControl$index) + 
5              1) || (any(numSeeds[-length(numSeeds)] < nrow(trainInfo$loop)))
            if (badSeed) 
7             stop(paste("Bad seeds: the seed object should be a list of length", 
8               length(trControl$index) + 1, "with", length(trControl$index), 
9                   "integer vectors of size", nrow(trainInfo$loop), 
10               "and the last list element having a", "single integer"))
        }
    }

数字 18 来自 nrow(trainInfo$loop) ，所以我们需要找到 trainInfo$loop 的值 . 对象 trainInfo 在第3行中分配了值 trainInfo <- models$loop(tuneGrid) ：

if (trControl$method != "none") {
        if (is.function(models$loop) && nrow(tuneGrid) > 1) {
 3          trainInfo <- models$loop(tuneGrid)
            if (!all(c("loop", "submodels") %in% names(trainInfo))) 
                stop("The 'loop' function should produce a list with elements 'loop' and 'submodels'")
    }

现在，我们需要找到对象 models . 它在第2行中被赋值为 models <- getModelInfo(method, regex = FALSE)[[1]] ：

else {
2       models <- getModelInfo(method, regex = FALSE)[[1]]
        if (length(models) == 0) 
            stop(paste("Model", method, "is not in caret's built-in library"))
    }

由于我们使用 method = "gbm" ，我们可以看到 getModelInfo("gbm", regex = FALSE)[[1]]$loop 的值并检查下面的结果：

> getModelInfo("gbm", regex = FALSE)[[1]]$loop
function(grid) {     
3               loop <- ddply(grid, c("shrinkage", "interaction.depth", "n.minobsinnode"),
                              function(x) c(n.trees = max(x$n.trees)))
                submodels <- vector(mode = "list", length = nrow(loop))
                for(i in seq(along = loop$n.trees)) {
                  index <- which(grid$interaction.depth == loop$interaction.depth[i] & 
                                   grid$shrinkage == loop$shrinkage[i] &
                                   grid$n.minobsinnode == loop$n.minobsinnode[i])
                  trees <- grid[index, "n.trees"] 
                  submodels[[i]] <- data.frame(n.trees = trees[trees != loop$n.trees[i]])
                }    
                list(loop = loop, submodels = submodels)
}
>

loop （上面第3行）被赋值：

loop <- ddply(grid, c("shrinkage", "interaction.depth", "n.minobsinnode"),
                              function(x) c(n.trees = max(x$n.trees)))

现在，让我们将 grid 与 54 rows 一起传递到上面的行并检查结果：

> nrow(grid)
[1] 54
> 
> loop <- ddply(grid, c("shrinkage", "interaction.depth", "n.minobsinnode"),
+               function(x) c(n.trees = max(x$n.trees)))
> loop
   shrinkage interaction.depth n.minobsinnode n.trees
1       0.09                 1              8     150
2       0.09                 1             10     150
3       0.09                 2              8     150
4       0.09                 2             10     150
5       0.09                 3              8     150
6       0.09                 3             10     150
7       0.10                 1              8     150
8       0.10                 1             10     150
9       0.10                 2              8     150
10      0.10                 2             10     150
11      0.10                 3              8     150
12      0.10                 3             10     150
13      0.11                 1              8     150
14      0.11                 1             10     150
15      0.11                 2              8     150
16      0.11                 2             10     150
17      0.11                 3              8     150
18      0.11                 3             10     150
>

ahh!, we found it . 值 18 来自 nrow(trainInfo$loop) ，它来自上面显示的 getModelInfo("gbm", regex = FALSE)[[1]]$loop ，只有 18 rows .

现在，回到触发错误的测试：

badSeed <- (length(trControl$seeds) < length(trControl$index) + 
              1) || (any(numSeeds[-length(numSeeds)] < nrow(trainInfo$loop)))

该测试的第一部分 (length(trControl$seeds) < length(trControl$index) + 1) 为 FALSE ；但第二部分 (any(numSeeds[-length(numSeeds)] < nrow(trainInfo$loop))) 对于所有小于 18 [来自 nrow(trainInfo$loop) ] 的值为 TRUE ，而对于所有大于或等于 18 的值为 FALSE . 这就是 <18 会触发错误而 >=18 不会的原因 . 正如我上面所说， caret 基于 interaction.depth * shrinkage * n.minobsinnode 计算 seeds 的数量，即 3 * 3 * 2 = 18 （只针对 max(n.trees) 拟合一个模型，其他模型由它派生，因此不需要 54 个整数） .

回复于 2024-04-28T08:59:14+08:00

用于在 caret 中并行运行时获得可重现结果的种子对象

1 回答

相关问题