首页 文章

R中的自定义合并功能

提问于
浏览
1

我有一个大型数据集,我想编写一个自定义合并函数与apply一起使用,但我无法解决某个问题 . 我不能使用循环因为它需要太长时间 . 数据大致如下;

#     [ Name, Strand, Start, End ]

R1 = c( 'GeneA', '+', 1000, 1500 )
R2 = c( 'GeneA', '+', 1510, 2000 )
R3 = c( 'GeneA', '+', 2001, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )

数据是具有行R1:R4的data.frame

到目前为止,我可以得到一个比较Ri和Rj(j = i 1)的函数,如果Name相同则合并它们,Strand是相同的,它们之间的差距小于100 .

GAP = Ri[End] - Rj[Start]

如果我将函数应用于每一行并构建输出 . 然后应该创建函数输出

R1 = c( 'GeneA', '+', 1000, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )

我可以得到一个工作函数,可以使用apply将两个连续元素合并为一个,但我无法弄清楚如何合并三个连续元素 . 一个丑陋的解决方案是运行两个连续的元素合并函数,直到没有进行额外的合并,但这不是有效的 . 我对想法有点困惑,任何见解都会受到赞赏 .

编辑:澄清一下,数据按连续染色体上的起始位置排序(未显示),每个基因名称在不同位置的数据集中多次出现(即GeneA可以是1000-2500,然后是4000-5000,而且我不想合并这两个基因,只有连续的基因 .

编辑2:我在下面使用了Tim P解决方案 . 合并的效率出现了问题 . 还有一种方法可以将文本文件发布到堆栈溢出,这样我就可以显示数据的真实情况,然后发布我的脚本到目前为止?

# Let ValidMerge be a logical vector of MergeDown corresponding to the rows in the data
 # as defined by TimP below
 # The data.frame is called RMDB 
 # TeMerge function is previously defined to merge two rows Ri and Rj into one entry
 # which spans the start of Ri to the end of Rj (with same name and strand)

 COUNT = 0
 RMDB.OUT = 0

 while (COUNT < nrow(RMDB)) { # Cycle through all rows of RMDB
  COUNT = COUNT + 1

  # Is this position a merger start?
    # If yes, then returns position in startpt
    # which will be the end position in endpt
    # If no, returns 0

  Merge = match(COUNT,startpt,nomatch=0) 

  if ( Merge == 0 ){
    # No merge starts at this position
    RMDB.OUT = rbind(RMDB.OUT,RMDB[COUNT,]) # Append COUNT Row to output

  }else{
    # Merge COUNT row in RMDB to its endpoint 

      RMDB.OUT = rbind(RMDB.OUT,
                     TeMerge(RMDB[COUNT,],RMDB[endpt[Merge],]))

    #print(paste('Merging Row',COUNT,' and Row',endpt[Merge]))

    COUNT = endpt[Merge] # Move Count to the endpoint      
  }
}

这个脚本给出了我感兴趣的结果,唯一的问题是我的data.frame有5 000 000个条目,我想使用GAP大小的几个参数来运行分析来比较结果 . 有没有办法可以重写这部分代码以提高效率?到目前为止,所有事情都在合理的时间内运行(约几分钟) . 这一部分已经在70万个数据子集上进行了3个小时 .

编辑3:加标数据,所有案例都在顶部(MIR3) . 忽略第1:4,8,11:15栏 .

dput(RMDB)结构(列表(V1 = c(3612L,318L,318L,318L,318L,318L,318L,318L,318L,318L,318L,318L,318L,318L,741L,444L,741L,407L,2059L) ,407L,656L,2058L,257L,4051L,456L,351L,850L,335L,1000L,1566L,236L,588L,3877L,750L,2292L,783L,747L,666L,260L,1118L,341L,7010L,320L,7010L ,249L,458L,24L,631L,631L,875L),V2 = c(11.4,23,23,23,23,23,23,23,23,23,23,23,23,23,18.7,11.6, 24.9,28.8,14.1,28.8,23.6,18.3,25,7,8.2,23.6,24.9,29.5,13.5,19.4,34.8,17.4,22.9,27.6,12,26.6,30.4,12.9,38.5,35.4,27.8, 19.2,0,17.3,21.2,19.3,0,3.9,26.6,22.6),V3 = c(27,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8 ,4.5,0,0,7.3,0.3,7.3,12.2,4.9,0,0.4,0,1.8,15.1,12.7,0,3.6,3,7.5,14,9.4,0,14.1,4.1,4,1.4,1.4 ,9.4,5.9,2.7,0,3.3,1,0,0,0,1.8,9.5),V4 = c(1.3,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,3.1,1.1,3,3.3,3,3,0,0,0,3.6,0.8,2.7,0,2.8,0.4,0.8,1.6,6.4,0,3.8, 3.7,0,0,2 .6,1.5,2.5,2.4,1.4,2,5,0,0,0.6,0.5),V5 =结构(c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L), . 标签=“chr1”,等级=“因子”),V6 = c(10469L ,20001L,20210L,21000L,21201L,22000L,22201L,23000L,23201L,20000L,20001L,24001L,24205L,24405L,0L,30855L,30953L,31293L,31436L,31734L,32841L,33048L,33466L,33529L,34048L,34451L ,34565L,35217L,35367L,37045L,37733L,38059L,38256L,39465L,39624L,39925L,40333L,40629L,40736L,41380L,42370L,43243L,44836L,44877L,45887L,46079L,46217L,46416L,46553L,46893L), V7 = c(11447L,20200L,20400L,21200L,21400L,22200L,22400L,23200L,23400L,20001L,20200L,24200L,24400L,24600L,0L,30952L,31131L,31435L,31733L,31754L,33037L,33456L,33509L, 34041L,34108L,34560L,34921L,35366L,35499L,37431L,37861L,38191L,39464L,39623L,39924L,40294L,4062 6L,40729L,40878L,42285L,42504L,44835L,44876L,45753L,45987L,46198L,46240L,46493L,46722L,47092L),V8 =结构(c(38L,37L,37L,37L,37L,37L,37L,37L) ,37L,37L,37L,37L,37L,37L,36L,35L,34L,33L,32L,31L,30L,29L,28L,27L,26L,25L,24L,23L,22L,21L,20L,19L,18L ,17L,16L,15L,14L,13L,12L,11L,10L,9L,8L,7L,6L,5L,4L,3L,2L,1L),. Label = c(“(249203529)”,“(249203899) )“,”(249204128)“,”(249204381)“,”(249204423)“,”(249204634)“,“(249204868)”,“(249205745)”,“(249205786)”,“(249208117)”,“(249208336)”,“(249209743)”,“”(249209892)“,”(249209995)“,”( 249210327)“,”(249210697)“,”(249210998)“,”(249211157)“,”(249212430)“,”(249212760)“,”(249213190)“,”(249215122)“,”(249215255) “,”(249215700)“,”(249216061)“,”(249216513)“,”(249216580)“,”(249217112)“,”(249217165)“,”(249217584)“,”(249218867)“, “(249218888)”,“(249219186)”,“(249219490)”,“(249219669)”,“(249219773)”,“(249235266)”,“(249239174)”),class =“factor”), V9 =结构(c(2L,1L,1L,2L,2L,2L,2L,2L,1L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L, 1L,2L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L, 2L,1L,2L),. Label = c(“”,“C”),class =“factor”),V10 =结构(c(32L,23L,23L,23L,23L,23L,24L,23L,23L) ,23L,23L,23L,23L,23L,34L,33L,27L,26L,1L,26L,22L,12L,5L,14L,13L,16L,30L,25L,2L,7L,16L,15L,29L,28L ,3L,28L,15L,4L,18L,8L,19L,10L,31L,10L,9L,11L,6 L,17L,20L,21L), . Label = c(“AluJo”,“AluJr”,“AluSx”,“AluSz6”,“AluYc”,“AT_rich”,“Charlie5”,“ERVL-E-int”, “L1M5”,“L1MA8”,“L1MA9”,“L1MB5”,“L1P1”,“L1PA6”,“L2a”,“L2c”,“LTR12F”,“LTR16C”,“MamRep1527”,“MER45A”,“MER58A “,”MIR“,”MIR3“,”MIR3a“,”MIRb“,”MIRc“,”MLT1A“,”MLT1E1A“,”MLT1E1A-int“,”MLT1J2“,”(TAAA)n“,”TAR1“ ,“(TC)n”,“XXXXX”),class =“factor”),V11 =结构(c(10L,13L,13L,13L,13L,13L,13L,13L,13L,13L,13L,13L, 13L,13L,14L,11L,9L,13L,12L,13L,13L,3L,12L,3L,3L,4L,9L,13L,12L,1L,4L,4L,9L,9L,12L,9L,4L, 12L,8L,8L,6L,3L,11L,3L,3L,3L,5L,7L,2L,1L), . 标签= c(“DNA / hAT-Charlie”,“DNA / hAT-Tip100”,“LINE / L1“,”LINE / L2“,”Low_complexity“,”LTR“,”LTR / ERV1“,”LTR / ERVL“,”LTR / ERVL-MaLR“,”卫星/电视“,”Simple_repeat“,”SINE / Alu“,”SINE / MIR“,”XXXXXXXX“),class =”factor“),V12 =结构(c(19L,5L,5L,5L,5L,5L,5L,5L,5L,5L,5L, 5L,5L,5L,2L,9L,8L,25L,2L,10L,9L,23L,2L,1L,15L,12L,1L,6L,2L,11L,16L,13 L,7L,2L,2L,8L,14L,1L,3L,26L,17L,18L,9L,21L,22L,24L,2L,4L,27L,20L), . Label = c(“(0)”, “1”,“(113)”,“(115)”,“(119)”,“(13)”,“131”,“173”,“2”,“218”,“2234”,“( 231)“,”2705“,”2923“,”2970“,”3242“,”359“,”3715“,”(399)“,”(4)“,”5306“,”5334“,”5746 “,”6167“,”67“,”(685)“,”7“),class =”factor“),V13 = c(1712L,143L,143L,143L,143L,143L,143L,143L,143L, 143L,143L,143L,143L,143L,162L,96L,349L,217L,298L,238L,216L,6174L,44L,6154L,3030L,3156L,448L,255L,133L,2623L,3375L,2846L,1489L,172L, 301L,666L,3217L,312L,376L,4982L,499L,5305L,41L,6290L,5433L,6280L,24L,404L,178L,220L),V14 =结构(c(23L,24L,24L,24L,24L,24L) ,24L,24L,24L,24L,24L,24L,24L,24L,8L,1L,10L,25L,4L,13L,21L,1L,11L,26L,15L,14L,20L,30L,5L,2L,1L ,27L,1L,18L,3L,4L,7L,6L,9L,19L,22L,29L,1L,2L,28L,16L,1L,17L,1L,12L),.标签= c(“(0)” ,“(1)”,“(11)”,“(14)”,“(179)”,“208”,“(209)”,“(212)”,“232”,“(25)” ,“(255)”,“3”,“(3 0)“,”3049“,”(3116)“,”(32)“,”327“,”(388)“,”4016“,”41“,”(46)“,”(470)“, “483”,“49”,“(51)”,“5640”,“(573)”,“(691)”,“(838)”,“91”),class =“factor”),V15 = c(2L,5L,5L,5L,5L,5L,5L,5L,5L,5L,5L,5L,5L,5L,22L,23L,22L,24L,25L,24L,26L,27L,28L,29L, 30L,31L,32L,33L,34L,35L,36L,37L,38L,38L,39L,38L,37L,40L,41L,42L,43L,44L,45L,44L,46L,47L,48L,49L,50L, 51L)),. Name = c(“V1”,“V2”,“V3”,“V4”,“V5”,“V6”,“V7”,“V8”,“V9”,“V10”,“ V11“,”V12“,”V13“,”V14“,”V15“),class =”data.frame“,row.names = c(NA,-50L))

这是应用ValidMerge后的输出 .

dput(RMDB.OUT)结构(列表(V1 = c(3612L,NA,NA,318L,318L,318L,318L,318L,318L,NA,741L,444L,741L,407L,2059L,407L,656L,2058L) ,257L,4051L,456L,351L,850L,335L,1000L,1566L,236L,588L,3877L,750L,2292L,783L,747L,666L,260L,1118L,341L,7010L,320L,7010L,249L,458L,24L ,631L,631L,875L),V2 = c(11.4,NA,NA,23,23,23,23,23,23,NA,18.7,11.6,24.9,28.8,14.1,28.8,23.6,18.3,25, 7,8.2,23.6,24.9,29.5,13.5,19.4,34.8,17.4,22.9,27.6,12,26.6,30.4,12.9,38.5,35.4,27.8,19.2,0,17.3,21.2,19.3,0,3.9, 26.6,22.6),V3 = c(27,NA,NA,3.8,3.8,3.8,3.8,3.8,3.8,NA,4.5,0,0,7.3,0.3,7.3,12.2,4.9,0,0.4,0 ,1.8,15.1,12.7,0,3.6,3,7.5,14,9.4,0,14.1,4.1,4,1.4,9.4,5.9,2.7,0,3.3,1,0,0,0,1.8,9.5 ),V4 = c(1.3,NA,NA,0,0,0,0,0,0,NA,0,3.1,1.1,3,3.3,3,3,0,0,0,3,2,3, 0.8,2.7,0,2.8,0.4,0.8,1.6,6.4,0,3.8,3.7,0,0,2.6,1.5,2.5,2.4,1.4,2,5,0,0,0.6,0.5),V5 =结构(c(1L) ,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L ,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L), . 标签=“chr1”,类=“因子”),V6 = c(10469L,20001L,21000L,22000L,22201L,23000L,23201L,20000L,20001L,24001L,0L,30855L,30953L,31293L,31436L,31734L,32841L,33048L,33466L,33529L, 34048L,34451L,34565L,35217L,35367L,37045L,37733L,38059L,38256L,39465L,39624L,39925L,40333L,40629L,40736L,41380L,42370L,43243L,44836L,44877L,45887L,46079L,46217L,46416L,46553L, 46893L),V7 = c(11447L,20400L,21400L,22200L,22400L,23200L,23400L,20001L,20200L,24600L,0L,30952L,31131L,31435L,31733L,31754L,33037L,33456L,33509L,34041L,34108L,34560L ,34921L,35366L,35499L,37431L,37861L,38191L,39464L,39623L,39924L,40294L,40626L,40729L,40878L,42285L,42504L, 44835L,44876L,45753L,45987L,46198L,46240L,46493L,46722L,47092L),V8 =结构(c(38L,37L,37L,37L,37L,37L,37L,37L,37L,37L,36L,35L,34L) ,33L,32L,31L,30L,29L,28L,27L,26L,25L,24L,23L,22L,21L,20L,19L,18L,17L,16L,15L,14L,13L,12L,11L,10L,9L ,8L,7L,6L,5L,4L,3L,2L,1L),. Label = c(“(249203529)”,“(249203899)”,“(249204128)”,“(249204381)”,“(249204423) )“,”(249204634)“,”(249204868)“,”(249205745)“,”(249205786)“,”(249208117)“,”(249208336)“,”(249209743)“,”(249209892)“ ,“(249209995)”,“(249210327)”,“(249210697)”,“(249210998)”,“(249211157)”,“(249212430)”,“(249212760)”,“(249213190)”,“ (249215122)“,”(249215255)“,”(249215700)“,”(249216061)“,”(249216513)“,”(249216580)“,”(249217112)“,”(249217165)“,”(249217584) )“,”(249218867)“,”(249218888)“,”(249219186)“,”(249219490)“,”(249219669)“,”(249219773)“,”(249235266)“,”(249239174)“ ),class =“factor”),V9 =结构(c(2L,1L,2L,2L,2L,2L,1L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1 L,1L,2L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L, 1L,2L,1L,2L), . 标签= c(“”,“C”),类=“因子”),V10 =结构(c(32L,23L,23L,23L,24L,23L,23L,23L) ,23L,23L,34L,33L,27L,26L,1L,26L,22L,12L,5L,14L,13L,16L,30L,25L,2L,7L,16L,15L,29L,28L,3L,28L,15L ,4L,18L,8L,19L,10L,31L,10L,9L,11L,6L,17L,20L,21L), . 标签= c(“AluJo”,“AluJr”,“AluSx”,“AluSz6”,“ AluYc“,”AT_rich“,”Charlie5“,”ERVL-E-int“,”L1M5“,”L1MA8“,”L1MA9“,”L1MB5“,”L1P1“,”L1PA6“,”L2a“,”L2c“ ,“LTR12F”,“LTR16C”,“MamRep1527”,“MER45A”,“MER58A”,“MIR”,“MIR3”,“MIR3a”,“MIRb”,“MIRc”,“MLT1A”,“MLT1E1A”,“ MLT1E1A-int“,”MLT1J2“,”(TAAA)n“,”TAR1“,”(TC)n“,”XXXXX“),类=”因子“),V11 =结构(c(10L,13L,13L) ,13L,13L,13L,13L,13L,13L,13L,14L,11L,9L,13L,12L,13L,13L,3L,12L,3L,3L,4L,9L,13L,12L,1L,4L,4L ,9L,9L,12L,9L,4L,12L,8L,8L,6L,3L,11L,3L,3L,3L,5L,7L,2L,1L), . 标签= c(“DNA / hAT-Ch arlie“,”DNA / hAT-Tip100“,”LINE / L1“,”LINE / L2“,”Low_complexity“,”LTR“,”LTR / ERV1“,”LTR / ERVL“,”LTR / ERVL-MaLR“ ,“Satellite / telo”,“Simple_repeat”,“SINE / Alu”,“SINE / MIR”,“XXXXXXXX”),class =“factor”),V12 =结构(c(19L,NA,NA,5L,5L) ,5L,5L,5L,5L,NA,2L,9L,8L,25L,2L,10L,9L,23L,2L,1L,15L,12L,1L,6L,2L,11L,16L,13L,7L,2L ,2L,8L,14L,1L,3L,26L,17L,18L,9L,21L,22L,24L,2L,4L,27L,20L), . 标签= c(“(0)”,“1”,“ (113)“,”(115)“,”(119)“,”(13)“,”131“,”173“,”2“,”218“,”2234“,”(231)“,” 2705“,”2923“,”2970“,”3242“,”359“,”3715“,”(399)“,”(4)“,”“5306”,“5334”,“5746”,“6167” ,“67”,“(685)”,“7”),类=“因子”),V13 = c(1712L,NA,NA,143L,143L,143L,143L,143L,143L,NA,162L,96L ,349L,217L,298L,238L,216L,6174L,44L,6154L,3030L,3156L,448L,255L,133L,2623L,3375L,2846L,1489L,172L,301L,666L,3217L,312L,376L,4982L,499L ,5305L,41L,6290L,5433L,6280L,24L,404L,178L,220L),V14 =结构(c(23L,NA,NA,24L,24L,24L,24 L,24L,24L,NA,8L,1L,10L,25L,4L,13L,21L,1L,11L,26L,15L,14L,20L,30L,5L,2L,1L,27L,1L,18L,3L, 4L,7L,6L,9L,19L,22L,29L,1L,2L,28L,16L,1L,17L,1L,12L),.标签= c(“(0)”,“(1)”,“( 11)“,”(14)“,”(179)“,”208“,”(209)“,”(212)“,”232“,”(25)“,”(255)“,”3 “,”(30)“,”3049“,”(3116)“,”(32)“,”327“,”(388)“,”4016“,”41“,”(46)“,”( 470)“,”483“,”49“,”(51)“,”5640“,”(573)“,”(691)“,”(838)“,”91“),class =”factor“ ),V15 = c(2L,5L,5L,5L,5L,5L,5L,5L,5L,5L,22L,23L,22L,24L,25L,24L,26L,27L,28L,29L,30L,31L,32L,33L,34L,35L,36L,37L,38L,38L,39L,38L,37L,40L,41L, 42L,43L,44L,45L,44L,46L,47L,48L,49L,50L,51L)) . . 名称= c(“V1”,“V2”,“V3”,“V4”,“V5”,“ V6“,”V7“,”V8“,”V9“,”V10“,”V11“,”V12“,”V13“,”V14“,”V15“),class =”data.frame“,行 . names = c(NA,-46L))

编辑4:对不起,这是简化版:初始Data.frame

dput(RMDB.cut)结构(列表(染色体=结构(c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L) ,1L,1L,1L),. Label =“chr1”,class =“factor”),Start = c(10469L,20001L,20210L,21000L,21201L,22000L,22201L,23000L,23201L,20000L,20001L,24001L, 24205L,24405L,0L,30855L,30953L,31293L,31436L,31734L),End = c(11447L,20200L,20400L,21200L,21400L,22200L,22400L,23200L,23400L,20001L,20200L,24200L,24400L,24600L,0L ,30952L,31131L,31435L,31733L,31754L),(左)=结构(c(38L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,36L, 35L,34L,33L,32L,31L),. Label = c(“(249203529)”,“(249203899)”,“(249204128)”,“(249204381)”,“(249204423)”,“(249204634) “,”(249204868)“,”(249205745)“,”(249205786)“,”(249208117)“,”(249208336)“,”(249209743)“,”(249209892)“,”(249209995)“, “(249210327)”,“(249210697)”,“(249210998)”,“(249211157)”,“(249212430)”,“(249212760)”,“”(249213190)“,”(249215122)“,”( 249215255)“,”(249215700)“,” (249216061)“,”(249216513)“,”(249216580)“,”(249217112)“,”(249217165)“,”(249217584)“,”(249218867)“,”(249218888)“,”(249219186 )“,”(249219490)“,”(249219669)“,”(249219773)“,”(249235266)“,”(249239174)“),class =”factor“),钢绞线=结构(c(2L,1L) ,1L,2L,2L,2L,2L,2L,1L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L), . 标签= c(“”,“C”) ,class =“factor”),repName =结构(c(32L,23L,23L,23L,23L,23L,24L,23L,23L,23L,23L,23L,23L,23L,34L,33L,27L,26L, 1L,26L),. Label = c(“AluJo”,“AluJr”,“AluSx”,“AluSz6”,“AluYc”,“AT_rich”,“Charlie5”,“ERVL-E-int”,“L1M5”, “L1MA8”,“L1MA9”,“L1MB5”,“L1P1”,“L1PA6”,“L2a”,“L2c”,“LTR12F”,“LTR16C”,“MamRep1527”,“MER45A”,“MER58A”,“MIR “,”MIR3“,”MIR3a“,”MIRb“,”MIRc“,”MLT1A“,”MLT1E1A“,”MLT1E1A-int“,”MLT1J2“,”(TAAA)n“,”TAR1“,”(TC )n“,”XXXXX“),class =”factor“),ValidMerge = c(FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,错误,错误,错误,错误,错误) ), . Name = c(“Chromosome”,“Start”,“End”,“(Left)”,“Strand”,“repName”,“ValidMerge”),row.names = c(NA,20L),类=“data.frame”)

合并后的输出

dput(RMDB.out.cut)结构(列表(染色体=结构(c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L) ), . Label =“chr1”,class =“factor”),Start = c(“10469”,“20001”,“21000”,“22000”,“22201”,“23000”,“23201”,“20000 “,”20001“,”24001“,”0“,”30855“,”30953“,”31293“,”31436“,”31734“),结束= c(”11447“,”20400“,”21400“ ,“22200”,“22400”,“23200”,“23400”,“20001”,“20200”,“24600”,“0”,“30952”,“31131”,“31435”,“31733”,“ 31754“),(左)=结构(c(38L,37L,37L,37L,37L,37L,37L,37L,37L,37L,36L,35L,34L,33L,32L,31L), . 标签= c( “(249203529)”,“(249203899)”,(249204128)“,”(249204381)“,”(249204423)“,”(249204634)“,”(249204868)“,”(249205745)“,”( 249205786)“,”(249208117)“,”(249208336)“,”(249209743)“,”(249209892)“,”(249209995)“,”(249210327)“,”(249210697)“,”(249210998) “,”“(249211157)”,“(249212430)”,“(249212760)”,“(249213190)”,“(249215122)”,“(249215255)”,“(249215700)”,“(249216061)”, “(249216513)”,“(249216580)”,“(249217112)”,“(249 217165)“,”(249217584)“,”(249218867)“,”(249218888)“,”(249219186)“,”(249219490)“,”(249219669)“,”(249219773)“,”(249235266) “,”(249239174)“),class =”factor“),Strand =结构(c(2L,1L,2L,2L,2L,2L,1L,2L,1L,1L,1L,1L,1L,1L, 1L,1L),. Label = c(“”,“C”),class =“factor”),repName =结构(c(32L,23L,23L,23L,24L,23L,23L,23L,23L,23L) ,34L,33L,27L,26L,1L,26L), . 标签= c(“AluJo”,“AluJr”,“AluSx”,“AluSz6”,“AluYc”,“AT_rich”,“Charlie5”,“ERVL- E-int“,”L1M5“,”L1MA8“,”L1MA9“,”L1MB5“,”L1P1“,”L1PA6“,”L2a“,”L2c“,”LTR12F“,”LTR16C“,”MamRep1527“,” MER45A“,”MER58A“,”MIR“,”MIR3“,”MIR3a“,”MIRb“,”MIRc“,”MLT1A“,”MLT1E1A“,”MLT1E1A-int“,”MLT1J2“,”(TAAA)n “,”TAR1“,”(TC)n“,”XXXXX“),class =”factor“),ValidMerge = c(FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE ,TRUE,TRUE,FALSE,FALSE,FALSE)),. Name = c(“染色体”,“开始”,“结束”,“(左)”,“Strand”,“repName”,“ValidMerge”),row.names = c(“2”,“3”,“4”,“6”,“7”,“8”,“ 9“,”10“,”11“,”111“,”15“,”16“,”17“,”18“,”19“,”20“),class =”data.frame“)

3 回答

  • 0

    我认为策略应该是生成另一个名为DoMerge的列 - 并且对于每一行R_i(其中i的范围是1到n-1),如果Name和Strand在R_i和R_ {i 1}之间匹配,则DoMerge为TRUE,并且对于R_i为End足够接近于R_ {i 1}的开始(在100之内,如果这是正确的值) . 按惯例,行n的DoMerge为FALSE . 直观地,DoMerge为TRUE意味着将该行与下一行合并是有效的 .

    然后,我们将所有存在连续字符串TRUE的行合并在一起 . 如果我们同意这是最好的策略,我可以为此编写一些快速代码! :)

    UPDATE:

    这是任务的代码,假设mydf是包含Name,Strand,Start和End列的信息的数据框......下面的输出是你需要合并的起点和终点 - 尽管你知道了什么需要合并它应该是一个cinch :)

    distanceThresh = 100
    isSameName=(mydf$Name==c(mydf$Name[-1],"void"))
    isSameStrand=(mydf$Strand==c(mydf$Strand[-1],"void"))
    isWithinDistance=(c(mydf$Start[-1],max(mydf$End)+2*distanceThresh)
                      -mydf$End) <= distanceThresh
    validMerge = isSameName & isSameStrand & isWithinDistance
    
    fthent=which(!validMerge & c(validMerge[-1],FALSE))
    tthenf=which(validMerge & !c(validMerge[-1],TRUE))
    startpt = fthent+1; if (validMerge[1]) {startpt=c(1,startpt)}
    endpt = tthenf+1
    
    instructions=NULL
    for (kk in seq_along(startpt)) {
    instructions = c(instructions,
                   paste("Merge",startpt[kk],"to",endpt[kk],"inclusive",sep=" "))
    }
    

    如果这一切都有意义,请告诉我! :)

    MERGING METHOD (8 June):

    这样的事情怎么样(有一些测试,但没有真实的数据)...

    doMerge = unlist(mapply(function(x,y) {seq(x,y,1)},startpt,endpt))
    doNotMerge = setdiff(seq_along(validMerge),doMerge)
    dataMerged=data.frame(Name=RMDB$Name[startpt], Strand=RMDB$Strand[startpt],
                          Start=RMDB$Start[startpt], End=RMDB$End[endpt],
                          stringsAsFactors=FALSE)
    dataUnmerged=RMDB[doNotMerge,]
    

    基本上 doMerge 告诉你需要合并的行(只需设置一个从每个 startpt 运行到相应的 endpt 的序列) . 并且 doNotMerge 是所有其他行(假设 validMerge 的长度是数据的长度) .

    然后 dataMerged 只是直接构造合并数据应该是什么样子 - 显然 NameStrandStart 从行 startpt 继承而 End 从行 endpt 继承 . (如果还有其他感兴趣的列,你必须决定它们来自哪里,显然......)当然, dataMerged 中的行数与 startptendpt 的长度相匹配 . 最后, dataUnmerged 是所有不符合合并条件的行 .

    希望以上都是有道理的,而且很明显,如果你将 dataMergeddataUnmerged 组合在一起并重新排序以使原始序列中的所有内容都恢复原状(可能会有一个可用于此的索引列),那么您将得到所需的结果 .

    而且我希望上面的确能非常非常快地运行:)

  • 1

    这是GenomicRanges解决方案 . 第一步,仅一次性,是安装包及其依赖项

    source("http://bioconductor.org/biocLite.R")
    biocLite("GenomicRanges")
    

    然后加载包并从您的数据创建 GRanges 实例,我将其放入名为 df 的数据框中

    library(GenomicRanges)
    gr = with(df, GRanges(Chromosome, IRanges(Start, End), Strand, repName=repName))
    

    你的数据实际上有点复杂,一个'GRanges列表',其中每个列表元素都由一个基因定义,所以

    grl = split(gr, values(gr)$repName)
    

    你希望逐个元素 reduce 这个,当相邻元素之间的最小间隙宽度为100时允许减少 .

    merged = reduce(grl, min.gapwidth=100L)
    

    你可以用 as(merged, "data.frame") 强制回到 data.frame . 强制之前的结果看起来像

    > merged
    GRangesList of length 8:
    $AluJo
    GRanges with 1 range and 0 elementMetadata cols:
          seqnames         ranges strand
             <Rle>      <IRanges>  <Rle>
      [1]     chr1 [31436, 31733]      +
    
    $MIR3
    GRanges with 7 ranges and 0 elementMetadata cols:
          seqnames         ranges strand
      [1]     chr1 [20001, 20400]      +
      [2]     chr1 [23201, 23400]      +
      [3]     chr1 [24001, 24600]      +
      [4]     chr1 [20000, 20001]      -
      [5]     chr1 [21000, 21400]      -
      [6]     chr1 [22000, 22200]      -
      [7]     chr1 [23000, 23200]      -
    

    并作为 data.frame

    > as(merged, "data.frame")
       element seqnames start   end width strand
    1    AluJo     chr1 31436 31733   298      +
    2     MIR3     chr1 20001 20400   400      +
    3     MIR3     chr1 23201 23400   200      +
    4     MIR3     chr1 24001 24600   600      +
    5     MIR3     chr1 20000 20001     2      -
    6     MIR3     chr1 21000 21400   401      -
    7     MIR3     chr1 22000 22200   201      -
    8     MIR3     chr1 23000 23200   201      -
    

    对于一百万行,排列成100000个基因,我们有

    > length(grl)
    [1] 100000
    > table(elementLengths(grl))
        10
    100000
    > system.time(reduce(grl, min.gapwidth=100))
       user  system elapsed
      9.468   0.064   9.553
    
  • 1

    我首先会从数据框( df )重新渲染不同的列 . 执行如下所需的操作,然后将它们放入 data-frame

    name_strand=paste(df$Name,df$Strand,sep=",")  #for grouping
    st=df$Start
    en=df$End
    unq=unique(name_strand) #get the unique Name+Strand tags
    ls1=list()
    for(i in 1:length(unq)){
      index=which(name_strand %in% unq[i]) #get the index ids or group by unique Name+Strand 
      un_ls=unlist(strsplit(unq[i],split=","))
      ls1[[i]]=list(Name=un_ls[1],Strand=un_ls[2],Start=min(st[index]),End=max(en[index]))
    }
    df=as.data.frame(do.call(rbind, ls1))
    

    所以这也可以解决 NameStranddata-frame 其他地方发生的情况 .

    EDIT

    由于您的问题存在疑问,如果您想要 gap_distance <100并且还考虑到可能不会订购 data-frame 行,我会有一个经过修改的解决方案 . 所以我的解决方案考虑了这两个(如果行已按 StartEnd 的降序排序,则可以修改它:

    for(i in 1:length(unq)){
      index=which(name_strand %in% unq[i]) #get the index ids or group by unique Name+Strand 
      un_ls=unlist(strsplit(unq[i],split=","))
      #previous sol: ls1[[i]]=list(Name=un_ls[1],Strand=un_ls[2],Start=min(st[index]),End=max(en[index]))
    
      new_st=st[index] #new Start vector from the "st" vector for this group by name_strand
      new_en=en[index]
      new_st=new_st[order(new_st)] #when data-frame rows are not ordered:
      new_en=new_en[order(new_en)]  #"order" method gives the index in ascending order of START
    
      #using embed method to lag and taking the diff for checking with gap distance considerations
      gap_diff=embed(new_st,2)[,1]-new_en[-length(new_en)]  #subtract from End vector except the last element of End  
      a_ind_st=1 #index of new_st (Start) vector
      b_ind_en=1 #index of new_en (End) vector     
      ls_ind=1  #index for list
      for (j in 1:(length(new_en)-1)){  #also acts as the index for gap_diff
          if (gap_diff[j] < gap_distance){ #where gap_distance = 100 (consideration #2) < or <=
              b_ind_en=j+1
               if (j + 1 > length(new_en)-1){  #special case for last element in vector
                  ls1[[ls_ind]]=list(Name=un_ls[1],Strand=un_ls[2],Start=new_st[a_ind_st],End=new_en[b_ind_en]))
                  ls_ind=ls_ind+1 #increment list index
               }
           }else{
              ls1[[ls_ind]]=list(Name=un_ls[1],Strand=un_ls[2],Start=new_st[a_ind_st],End=new_en[b_ind_en]))
              ls_ind=ls_ind+1 #increment list index
              a_ind_st=b_ind_en+1
              b_ind_en=b_ind_en+1
          }      
       }
    }
    df=as.data.frame(do.call(rbind, ls1))
    

    此修改将考虑所有因素 . 如果您不需要任何限制,则可以修改解决方案 .

相关问题