首页 文章

如何删除两个CSV之间不共有的行?

提问于
浏览
0

我有2个CSV文件(从Android设备的加速度计和陀螺仪导出)。问题是加速度计的CSV中有一些额外的时间戳(这本身就很奇怪),这些时间戳在陀螺仪的CSV中并不存在。有什么方法可以找出并删除这些两个文件不共有的行(时间戳)?

提出这个问题的原因:我正在使用来自Android设备的传感器数据进行活动识别,因此陀螺仪和加速度计数据的时间戳必须一一对应,这一点非常重要。

1 回答

  • 2

    创建演示数据:

    # Demo input with extra rows: the "weird*" timestamps have no counterpart
    # in the reference string below.
    # NOTE: the last row of each string intentionally keeps its trailing space,
    # which is why 'd003 ' / 'f003 ' show up in the parsed output.
    data = """
    ts1,d001,d002,d003
    ts2,d001,d002,d003
    ts3,d001,d002,d003
    weird1,d001,d002,d003
    weird2,d001,d002,d003
    ts4,d001,d002,d003 
    """

    # Reference input: only the timestamps that should survive the cleanup.
    other = """
    ts1,f001,f002,f003
    ts2,f001,f002,f003
    ts3,f001,f002,f003
    ts4,f001,f002,f003 
    """

    # Write each demo string to its own CSV file.
    fn1, fn2 = "d1.csv", "d2.csv"
    for target, content in ((fn1, data), (fn2, other)):
        with open(target, "w") as out:
            out.write(content)
    

    现在解析:

    import csv
    
    def readFile(name):
        """Read a header-less, 4-column CSV file into a list of row dicts.

        Args:
            name: Path of the CSV file to read.

        Returns:
            A list with one dict per row, keyed by the fixed column names
            "ts", "dp1", "dp2" and "dp3". Values are kept exactly as they
            appear in the file (no whitespace stripping). Completely empty
            lines are skipped by csv.DictReader.
        """
        # newline="" leaves line-ending handling to the csv module, so line
        # breaks embedded in quoted fields are not rewritten while reading.
        with open(name, "r", newline="") as f:
            # The file has no header row, so the column names are supplied.
            reader = csv.DictReader(f, fieldnames=["ts", "dp1", "dp2", "dp3"])
            return list(reader)
    
    # Parse both demo files: fn1 yields the rows to be cleaned, fn2 the
    # reference rows.
    badData, goodData = (readFile(path) for path in (fn1, fn2))

    # Show the two parsed lists, one per line.
    print(badData, goodData, sep="\n")
    

    输出:

    # weird data
     [{'dp3': 'd003', 'ts': 'ts1', 'dp1': 'd001', 'dp2': 'd002'}, 
      {'dp3': 'd003', 'ts': 'ts2', 'dp1': 'd001', 'dp2': 'd002'}, 
      {'dp3': 'd003', 'ts': 'ts3', 'dp1': 'd001', 'dp2': 'd002'}, 
      {'dp3': 'd003', 'ts': 'weird1', 'dp1': 'd001', 'dp2': 'd002'}, 
      {'dp3': 'd003', 'ts': 'weird2', 'dp1': 'd001', 'dp2': 'd002'}, 
      {'dp3': 'd003 ', 'ts': 'ts4', 'dp1': 'd001', 'dp2': 'd002'}]
    
    # good data
    [{'dp3': 'f003', 'ts': 'ts1', 'dp1': 'f001', 'dp2': 'f002'}, 
     {'dp3': 'f003', 'ts': 'ts2', 'dp1': 'f001', 'dp2': 'f002'}, 
     {'dp3': 'f003', 'ts': 'ts3', 'dp1': 'f001', 'dp2': 'f002'}, 
     {'dp3': 'f003 ', 'ts': 'ts4', 'dp1': 'f001', 'dp2': 'f002'}]
    

    现在要消除不良数据点:

    # Collect every timestamp that occurs in the reference ("good") data.
    goodTs = {entry["ts"] for entry in goodData}

    # Keep only the rows whose timestamp also occurs in the reference data.
    cleanedData = [row for row in badData if row["ts"] in goodTs]

    print(cleanedData)
    

    输出:

    # filtered weird data
    [{'dp3': 'd003', 'ts': 'ts1', 'dp1': 'd001', 'dp2': 'd002'}, 
     {'dp3': 'd003', 'ts': 'ts2', 'dp1': 'd001', 'dp2': 'd002'}, 
     {'dp3': 'd003', 'ts': 'ts3', 'dp1': 'd001', 'dp2': 'd002'}, 
     {'dp3': 'd003 ', 'ts': 'ts4', 'dp1': 'd001', 'dp2': 'd002'}]
    

    完成。

相关问题