glmnet ridge逻辑回归中的崩溃-Java 学习之路

我在软件包glmnet（版本2.0.10和2.0.13，至少）中获得随机崩溃，尝试使用ridge逻辑回归运行cv.glmnet . 下面提供了可再现的例子 . 正如您将看到的，行为取决于所选的随机种子 .

该错误发生在 cv.lognet() 中，因为有时 nlami==0 . 这是因为全局（非交叉验证）λ序列的范围（即下例中的[14.3; 20.7]）完全小于其中一个折叠上的λ范围（即折叠4， [32.5; 22.4]）

可能的解决方法是通过更改 which_lam 的定义强制 nlami>=1 ，如下所示：

which_lam = lambda >= min(mlami, max(lambda))

这样可以避免崩溃，但不能确定结果是否正确 . 有人可以确认或提出另一个修复方案吗？

NB：似乎与未解决的问题有关cv.glmnet fails for ridge, not lasso, for simulated data with coder error

可重复的例子

library(glmnet)

x=structure(c(0.294819653005975, -0.755878041644385, -0.460947383309942, 
  -1.25359210780316, -0.643969512320233, -0.146301489038128, -0.190235360501265, 
  -0.778418128295596, -0.659228201713315, -0.589987067456389, 1.33064976036166, 
  -0.232480434360983, -0.374383490492533, -0.504817187501063, -0.558531620483801, 
  2.16732105550181, 0.238948891919474, -0.857229316573454, -0.673919980092841, 
  1.17924306872964, 0.831719897152008, -1.15770770325374, 2.54984789196214, 
  -0.970167597835476, -0.557900637238063, -0.432268012373971, 1.15479761345536, 
  1.72197312745038, -0.460658453148444, -1.17746101934592, 0.411060691690596, 
  0.172735774511478, 0.328416881299735, 2.13514661730084, -0.498720272451663, 
  0.290967756655844, -0.87284566376257, -0.652533179632676, -0.89323787137697, 
  -0.566883371886824, -1.1794485033936, 0.821276174960557, -0.396480750015741, 
  -0.121609740429242, -0.464060359619162, 0.0396628676584573, -0.942871230138644, 
  0.160331360905244, -0.369955203694528, -0.192318421900764, -1.39309898491775, 
  -0.264395753844046, 2.25142560078458, -0.897873918532094, -0.159680604037913, 
  -0.918027468751383, 0.43181753901048, 1.56060286954228, -0.617456504201816, 
  1.73106033616784, -0.97099289786049, -1.09325650121771, -0.0407358272757967, 
  0.553103582991963, 1.15479545417553, 0.36144086171342, -1.35507249278068, 
  1.37684903500442, 0.755599287825675, 0.820363089698391, 1.65541232241803, 
  -0.692008406375665, 1.65484854848556, -1.14659093945895), .Dim = c(37L, 2L))

# NB: x is already standardized
print(apply(x,2,mean))
print(apply(x,2,sd))

y=c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, 
  FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, 
  TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, 
  TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)

# NB: y is moderately unbalanced
print(table(y))

# This works OK (with a warning):
set.seed(3)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)

# This crashes:
set.seed(1)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)
# Error in predmat[which, seq(nlami)] <- preds : 
#     replacement has length zero

编辑：数据的可视化显示没有特定的模式 . 预计线性分离器的性能较低：
Data plot: points positioned by 2 predictors, colored by label

1 回答

我认为问题在于，在交叉验证期间，有一个数据样本只有一个响应变量（y全部为TRUE，或全部为FALSE），因为您的观察结果很少 . 使用一些随机种子你很幸运，但这不会发生，但种子等于1 . 我对这么少观察的建议是跳过交叉验证并适合模型，然后观察变化的lambda如何改变系数：

lbs_fun <- function(fit, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L])
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels = labs, cex = 0.8, pos = 4)
}

m <- glmnet(x = x, y = y, alpha = 0, family = "binomial")
plot(m, xvar="lambda")
lbs_fun(m)

enter image description here

请注意，这适用于任何种子（我测试过）而没有错误 .

关于你对预测进行评估的愿望，我就是这样做的，请注意，对于glmnet软件包，请忽略一个交叉验证，因此必须在此处手动完成 .

y <- y * 1 # I prefer 1 and 0, rather than true and false:
set.seed(1111) # set aside a holdout
holdout <- sample.int(37, 10)

x_train <- x[-holdout,]
y_train <- y[-holdout]
x_holdout <- x[holdout,]
y_holdout <- y[holdout]

# leave one out cross validation
out_df <- c()
run_num = 1
for(lambda_val in seq(0.001, 5, 0.1)) {
  for(one in 1:nrow(x_train)) {
    new_x = x_train[-one,] # train data minus one
    new_y = y_train[-one] # train data minus one
    one_x = x_train[one,,drop=FALSE] # leave one out
    one_y = y_train[one] # leave one out
    fit <- glmnet(x = new_x, y = new_y, alpha = 0, family = "binomial", standardize = F, lambda = lambda_val)
    y_hat <- predict(fit, one_x, type = "response")
    row <- c(run_num, lambda_val, y_hat, one_y)
    out_df <- rbind(out_df, row)
  }
  run_num <- run_num + 1
}
row.names(out_df) <- NULL
out_df <- data.frame(out_df)
names(out_df) <- c("run_number", "lambda", "y_hat", "y_actual")

# choose an evaluation metric: Accuracy (TN + TP)/(N + P), you will need to tune this threshold to best align with your metric
out_df$y_hat2 <- ifelse(out_df$y_hat >= 0.3, 1, 0)

get_best_run <- c()
for (run in unique(out_df$run_number)) {
  sub <- out_df[out_df$run_number == run, c("y_hat2", "y_actual")]
  accuracy <- nrow(sub[sub$y_hat2 == sub$y_actual,])/nrow(sub)
  row <- c(run, accuracy)
  get_best_run <- rbind(get_best_run, row)
}
row.names(get_best_run) <- NULL
get_best_run <- data.frame(get_best_run)
names(get_best_run) <- c("run_num", "accuracy")

# find the run number with the best accuracy
keep <- get_best_run[get_best_run$accuracy == max(get_best_run$accuracy), "run_num"]
keep_lambda <- unique(out_df[out_df$run_number == keep, "lambda"])

# fit a model with all of the train data (no cv here), and use the keep_lambda
fit <- glmnet(x = x_train, y = y_train, alpha = 0, family = "binomial", standardize = F, lambda = keep_lambda)

# make a prediction for the holdout + apply the same threshold used earlier
preds <- predict(fit, x_holdout, type = "response")
preds2 <- ifelse(preds >= 0.3, 1, 0)

# how can we expect this model to perform?
conf_mat <- table(preds2, y_holdout)
(conf_mat[1,1] + conf_mat[2,2])/sum(conf_mat) # accuracy 0.3

conf_mat
#        y_holdout
# preds2 0 1
#      0 3 2
#      1 5 0

回复于 2024-05-13T00:43:43+08:00

glmnet ridge逻辑回归中的崩溃

可重复的例子

1 回答

相关问题