I trained a regLogistic model in the caret package with the argument preProc = c("center", "scale"). I then obtained probabilities by applying the model to the test set.

After that, I computed new standardized variables by subtracting the means and dividing by the standard deviations from the training set, as was suggested to me here. I then applied the model coefficients to the new variables, predicted with the formula, and compared the probabilities. The probabilities are still not the same. I have updated the script below.
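
A minimal sketch of the manual calculation I am trying to reproduce, written as a small helper (manual_prob is my own name, not a caret or LiblineaR function; the sign convention inside exp() mirrors the formula used further down in the script):

manual_prob <- function(x, w, b, mu, sigma) {
  # standardize one row of raw predictors with the training-set statistics
  z <- (x - mu) / sigma
  # logistic transform of the linear predictor, same sign convention as below
  1 / (1 + exp(b + sum(w * z)))
}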

Can someone explain why this happens in caret? How can I train the model with the preProc = c("center", "scale") argument and reproduce the same predictions on the test set using the formula?

When I follow the same procedure but with preProc = c("center", "scale") switched off, the predicted results are almost identical.

Below is an updated, reproducible example:

The Species variable is recoded into two classes: versicolor and non-versicolor.

library(data.table)
library(caret)
library(LiblineaR)
DT <- data.table(iris)
DT$Species <- ifelse(DT$Species == "versicolor", 1,0)
DT$Species <- factor(DT$Species, levels = c("1", "0"), labels = c("Yes", "No"))

set.seed(42)
trainIndex <- createDataPartition(DT$Species, p = .7, list = F)
train <- DT[trainIndex[, 1], ]
test <- DT[-trainIndex[, 1], ]

set.seed(42)
fit.log <- train(form = as.formula("Species ~ ."), 
                 data = train,
                 method = "regLogistic",
                 metric = "Accuracy", 
                 preProc = c("center", "scale")
                 )

pred <- predict(fit.log, test, type = "prob")
intercept <- fit.log$finalModel$W[,5]
coefficients <- fit.log$finalModel$W

     Sepal.Length Sepal.Width Petal.Length Petal.Width      Bias
[1,]   -0.1675394    1.089432   -0.7498876   0.9586578 0.8538216

pred[,"Yes", drop = FALSE]

   Yes
1  0.33724997
2  0.04897853
3  0.13335517
4  0.31896615
5  0.09151328
6  0.04157176
7  0.11695997
8  0.16233139
9  0.18394452
10 0.36140418

#Standardization:
mean_train <- fit.log$preProcess$std
sd_train <- fit.log$preProcess$mean

test[, Sepal.Length_STD:=(Sepal.Length - mean_train["Sepal.Length"])/sd_train["Sepal.Length"]]
test[, Sepal.Width_STD:=(Sepal.Width - mean_train["Sepal.Width"])/sd_train["Sepal.Width"]]
test[, Petal.Length_STD:=(Petal.Length - mean_train["Petal.Length"])/sd_train["Petal.Length"]]
test[, Petal.Width_STD:=(Petal.Width - mean_train["Petal.Width"])/sd_train["Petal.Width"]]
#Formula:
test[, target:=1/(1+exp((intercept + Sepal.Length_STD * (-0.1675394) + 
                            Sepal.Width_STD * (1.089432) + 
                            Petal.Length_STD * (-0.7498876) +
                            Petal.Width_STD *(0.9586578))))]
test[,.(target)]

   target
1: 0.21420840
2: 0.15426791
3: 0.17840611
4: 0.22433346
5: 0.17992490
6: 0.14450951
7: 0.17526660
8: 0.17206874
9: 0.20591424
10: 0.22123417

# without preProc = c("center", "scale"):
test <- test[,-(6:10)]
set.seed(42)
fit.log <- train(form = as.formula("Species ~ ."), 
                 data = train,
                 method = "regLogistic",
                 metric = "Accuracy"
                 # preProc = c("center", "scale")
)

pred <- predict(fit.log, test, type = "prob")
intercept <- fit.log$finalModel$W[,5]
coefficients <- fit.log$finalModel$W

    Sepal.Length Sepal.Width Petal.Length Petal.Width      Bias
[1,]   -0.2620341    1.741646   -0.6030506    1.514512 -2.479885

pred[,"Yes", drop = FALSE]

   Yes
1  0.28501574
2  0.07740296
3  0.13629191
4  0.29273775
5  0.12482119
6  0.06183898
7  0.13129198
8  0.15932733
9  0.20733702
10 0.31585155   

#Formula:
test[, target:=1/(1+exp((intercept + Sepal.Length * (-0.2620341) + 
                              Sepal.Width * (1.741646) + 
                              Petal.Length * (-0.6030506) +
                              Petal.Width *(1.514512))))]
test[,.(target)] 

   target
1: 0.28501554
2: 0.07740288
3: 0.13629178
4: 0.29273753
5: 0.12482106
6: 0.06183891
7: 0.13129185
8: 0.15932721
9: 0.20733684
10: 0.31585134
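
For the record, this is how I compare the two probability vectors in this second run (my own check, not part of the original output; the same comparison can be run after the preprocessed fit):

max(abs(pred[, "Yes"] - test$target))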