首页 文章

根据群体随机抽取样本

提问于
浏览
12

我有一个近50,000行的数据框 df,其中包含15个不同的ID(每个ID有数千个观测值)。df 看起来像这样:

ID  Year    Temp    ph
1       P1  1996    11.3    6.80
2       P1  1996    9.7     6.90
3       P1  1997    9.8     7.10
...
2000    P2  1997    10.5    6.90
2001    P2  1997    9.9     7.00
2002    P2  1997    10.0    6.93

我想为每个ID获取500个随机行(对于P1为500,对于P2为500,....)并创建一个新的df . 我尝试:

# Asker's attempt. sample() here draws from the unique ID values, not from the
# rows belonging to each ID, so it does not give 500 rows per ID.
new_df <- df[df$ID %in% sample(unique(df$ID), 500), ]

但它随机抽取的是ID,而我需要的是从每个ID中各随机抽取500行。

6 回答

  • 2

    试试这个:

    library(plyr)
    # Row sampler applied to each per-ID piece: keeps 500 randomly chosen rows.
    pick_500 <- function(piece) {
      piece[sample(nrow(piece), 500), ]
    }
    # Split df by ID, sample each piece, and bind the pieces back together.
    ddply(df, "ID", pick_500)
    
  • 6

    这可以通过 dplyr 中的 sample_n 函数实现:

    library(dplyr)
    # Draw 500 random rows within each ID. slice_sample() is the current
    # replacement for the superseded sample_n(); note that a group with fewer
    # than 500 rows contributes all of its rows instead of raising an error.
    # ungroup() returns an ungrouped result so later steps are not silently
    # applied per ID.
    new_df <- df %>%
      group_by(ID) %>%
      slice_sample(n = 500) %>%
      ungroup()
    
  • 17

    这是使用基础 R(base R)的一种方法。

    首先,准备要使用的示例数据:

    # Reproducible toy data: three IDs with five rows each, followed by three
    # columns of standard-normal draws (data.frame() names them X1, X2, X3).
    set.seed(1)
    noise <- matrix(rnorm(45), ncol = 3)
    mydf <- data.frame(ID = rep(1:3, each = 5), noise)
    mydf
    #    ID         X1          X2          X3
    # 1   1 -0.6264538 -0.04493361  1.35867955
    # 2   1  0.1836433 -0.01619026 -0.10278773
    # 3   1 -0.8356286  0.94383621  0.38767161
    # 4   1  1.5952808  0.82122120 -0.05380504
    # 5   1  0.3295078  0.59390132 -1.37705956
    # 6   2 -0.8204684  0.91897737 -0.41499456
    # 7   2  0.4874291  0.78213630 -0.39428995
    # 8   2  0.7383247  0.07456498 -0.05931340
    # 9   2  0.5757814 -1.98935170  1.10002537
    # 10  2 -0.3053884  0.61982575  0.76317575
    # 11  3  1.5117812 -0.05612874 -0.16452360
    # 12  3  0.3898432 -0.15579551 -0.25336168
    # 13  3 -0.6212406 -1.47075238  0.69696338
    # 14  3 -2.2146999 -0.47815006  0.55666320
    # 15  3  1.1249309  0.41794156 -0.68875569
    

    其次,进行抽样:

    # Keep three randomly chosen rows of one per-ID piece.
    take_three <- function(piece) {
      piece[sample(nrow(piece), 3), ]
    }
    # One data frame per ID value, in the order split() returns them.
    pieces <- split(mydf, mydf$ID)
    # Sample each piece, then stack the pieces; rbind prefixes each row name
    # with the name of the piece it came from (e.g. "1.2").
    do.call(rbind, lapply(pieces, take_three))
    #      ID         X1          X2         X3
    # 1.2   1  0.1836433 -0.01619026 -0.1027877
    # 1.1   1 -0.6264538 -0.04493361  1.3586796
    # 1.5   1  0.3295078  0.59390132 -1.3770596
    # 2.10  2 -0.3053884  0.61982575  0.7631757
    # 2.9   2  0.5757814 -1.98935170  1.1000254
    # 2.8   2  0.7383247  0.07456498 -0.0593134
    # 3.13  3 -0.6212406 -1.47075238  0.6969634
    # 3.12  3  0.3898432 -0.15579551 -0.2533617
    # 3.15  3  1.1249309  0.41794156 -0.6887557
    

    sampling 包中还有 strata 函数,当您想从每个组中抽取不同数量的样本时,这很方便:

    # install.packages("sampling")
    library(sampling)
    set.seed(1)
    # One sample size per ID stratum (2, 3 and 2 rows in the output below).
    strata_sizes <- c(2, 3, 2)
    # Select units within each ID stratum using the "srswor" method.
    x <- strata(mydf, stratanames = "ID", size = strata_sizes, method = "srswor")
    # Pull the selected rows out of mydf together with the sampling columns.
    getdata(mydf, x)
    #            X1          X2         X3 ID ID_unit Prob Stratum
    # 2   0.1836433 -0.01619026 -0.1027877  1       2  0.4       1
    # 5   0.3295078  0.59390132 -1.3770596  1       5  0.4       1
    # 6  -0.8204684  0.91897737 -0.4149946  2       6  0.6       2
    # 8   0.7383247  0.07456498 -0.0593134  2       8  0.6       2
    # 9   0.5757814 -1.98935170  1.1000254  2       9  0.6       2
    # 14 -2.2146999 -0.47815006  0.5566632  3      14  0.4       3
    # 15  1.1249309  0.41794156 -0.6887557  3      15  0.4       3
    
  • 0

    如果某个ID的行数可能少于500,可以使用下面的方法。这里我使用了 mtcars 数据集作为示例:

    # Maximum number of rows to keep per ID.
    n <- 8
    df <- mtcars
    df$ID <- df$cyl
    
    # Pick at most `n` elements of `x`, keeping them in their original order.
    #   x: vector of row indices belonging to one group
    #   n: maximum number of indices to keep
    # A group with `n` or fewer elements is returned unchanged, so sample()
    # is never asked for more elements than the group holds.
    FUN <- function(x, n) {
        if (length(x) <= n) return(x)
        x[x %in% sample(x, n)]
    }
    
    # Split the row numbers by ID, thin each group with FUN, and index df with
    # the surviving row numbers. The cap is passed as `n` (not a repeated
    # literal), and seq_len() stays correct even for a zero-row df.
    df[unlist(lapply(split(seq_len(nrow(df)), df$ID), FUN, n = n)), ]
    
  • 10
    mydata1 is your original data (not tested):
    
    # Split the original data into one data frame per ID.
    mydata2 <- split(mydata1, mydata1$ID)
    # Label the pieces mydata21, mydata22, ... Counting the pieces themselves
    # avoids referring to a free-standing `ID` object and works whether or not
    # the ID column is a factor.
    names(mydata2) <- paste0("mydata2", seq_along(mydata2))
    # Draw 500 distinct rows from each piece (errors if a piece has fewer rows).
    mysample <- Map(
      function(x) x[sample(nrow(x), size = 500, replace = FALSE), ],
      mydata2
    )
    
    library(plyr) # for rbinding the mysample
    # Stack the sampled pieces back into a single data frame.
    ldply(mysample)
    
  • 0

    虽然这不是很优雅的解决方案,但它可能会起作用 .

    library(data.table)
    df <- data.table(df)
    # Within each ID, .N is the group's row count and .I holds the group's row
    # numbers in the full table, so this collects 500 random row numbers per ID
    # (errors if an ID has fewer than 500 rows). The unnamed result column is
    # called V1.
    picked_rows <- df[, .I[sample(.N, 500)], by = ID]$V1
    # Subset once by row number instead of growing a list inside a loop.
    dfnew <- df[picked_rows]
    

相关问题