首页 文章

绘制地理坐标的方法,用于标记另一个拉特和多头的数据?

提问于
浏览
0

我有一个纬度和经度列表与位置,另一个列表只有拉特和长 . 我需要将这个另一组映射到第一个列表中的近似位置 . 我在R中尝试过geosphere,但是数据太大了,我最终收到一条错误消息“无法分配大小为718.5 GB的向量”!有任何想法吗?我们正在寻找要映射的数据非常庞大(接近100M行分为48个段,需要映射到lats和long的列表,大约是80k记录长...)

1 回答

  • 1

    脱离RomanLuštrik的想法,将其分成尽可能小的块,这将是您最理想的解决方案 . 让我们首先找到每行的最近点,而不是尝试将它们全部加载到内存中 . 此示例将是基于SAS的解决方案 .

    通过遍历哈希表也可以更有效地完成此示例,但这里解释起来会更复杂 . 这也可以并行化 . 这种方式效率中等,但更容易遵循 . 让我们使用两个示例数据集:

    1. Mobile_Activity_3months_scrambled.csv - http://js.cit.datalens.api.here.com/datasets/starter_pack/Mobile_activity_3months_scrambled.csv

    500k行 . 让我们将此视为您的大数据集 .

    2. sashelp.zipcode

    41k行 . 我们将此视为您的小数据集 .

    Goal: Map each data point to the closest city.

    为了保持尽可能简单,让我们只读一行并将其与最近的城市相匹配 . 首先,读入您的数据:

    proc import 
        file='CHANGE DIRECTORY HERE\Mobile_activity_3months_scrambled.csv'
        out=bigdata
        dbms=csv
        replace;
    run;
    

    接下来,我们将读取一行并计算其与所有其他纬度/长度对的地理距离 . 我们将使用SQL对这些数据进行笛卡尔积 .

    proc sql noprint;
        create table nearest_point as
            select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                 , t2.city as Nearest_City
            from bigdata(obs=1 firstobs=1) as t1
            CROSS JOIN
                 sashelp.zipcode as t2
            where NOT missing(t2.x)
            order by Distance
        ;
    quit;
    

    输出数据集中的第一个观察点是您的最近距离 .

    让's generalize this for multiple observations. Let'为它们中的10个做,但稍微提高效率 . 我们不需要输出所有41k观测值 . 我们只需输出最小距离的观测值并将其附加到主表格中 . 将 outobs=1 选项添加到SQL .

    %macro nearest_distance;
    
        %do i = 1 %to 10;
            proc sql outobs=1 noprint;
                create table nearest_point as
                    select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                         , t2.city as Nearest_City
                    from bigdata(obs=&i. firstobs=&i.) as t1
                    CROSS JOIN
                         sashelp.zipcode as t2
                    where NOT missing(t2.x)
                    order by Distance
                ;
            quit;
    
            proc append base=all_nearest_points
                        data=nearest_point
                        force;
            run;
    
        %end;
    
    %mend;
    %nearest_distance;
    

    让's generalize it even more, and remove writing to the log to make it faster. Let'甚至将我们的邮政编码数据预先加载到内存中,并对所有观察执行此操作 . 为了测试一个例子,我们首先强制 bigdata 最多为100个obs .

    data bigdata;
        set bigdata(obs=100);
    run;
    
    %macro nearest_distance;
    
        %let dsid   = %sysfunc(open(bigdata) );
        %let n      = %sysfunc(attrn(&dsid., nlobs) );
        %let rc     = %sysfunc(close(&dsid.) );
    
        proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
        run;
    
        %do i = 1 %to &n.;
            proc sql outobs=1 noprint;
                create table nearest_point as
                    select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                         , t2.city as Nearest_City
                    from bigdata(obs=&i. firstobs=&i.) as t1
                    CROSS JOIN
                         sashelp.zipcode as t2
                    where NOT missing(t2.x)
                    order by Distance
                ;
            quit;
    
            proc append base=all_nearest_points
                        data=nearest_point
                        force;
            run;
    
        %end;
    
        proc printto log=log;
        run;
    
    %mend;
    %nearest_distance;
    

    接下来,让它并行化,然后完成所有操作 . 您可以使用 threads 选项更改要使用的并行会话数 .

    %macro nearest_distance(threads=5);
    
        /* Parallel submit options */
        options 
            autosignon=yes
            sascmd='!sascmd'
        ;
    
        /* Current session work directory */
        %let workdir = %sysfunc(getoption(work) );
    
        /* Total obs in big data */
        %let dsid   = %sysfunc(open(bigdata) );
        %let n      = %sysfunc(attrn(&dsid., nlobs) );
        %let rc     = %sysfunc(close(&dsid.) );
    
        /* Load lookup table to memory */
        sasfile sashelp.zipcode load;
    
        /* Prevent writing to session log */
        proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
        run;
    
        /* Run in &threads parallel sessions */
        %do t = 1 %to &threads.;
    
            /* Divide up observations for each thread */
            %let firstobs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t+1)+1, floor);
            %let obs      = %sysevalf(&n-(&n/&threads.)*(&threads.-&t.),    floor);
    
            /* Transfer primary session macro variables to each worker session */
            %syslput _USER_ / remote=worker&t.;
    
            /* Parallel calculations for data in memory */
            rsubmit wait=no remote=worker&t.;
    
                /* We are in a specific session, and must define this as a macro within the session */
                %macro thread_loop;
                    %do i = &firstobs. %to &obs.;
    
                        /* Primary session library */
                        libname workdir "&workdir.";
    
                        proc sql outobs=1 noprint;
                            create table nearest_point as
                                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                                     , t2.city as Nearest_City
                                from workdir.bigdata(obs=&i. firstobs=&i.) as t1
                                CROSS JOIN
                                     sashelp.zipcode as t2
                                where NOT missing(t2.x)
                                order by Distance
                            ;
                        quit;
    
                        /* Save to primary session library */
                        proc append base=workdir._all_nearest_points_&t.
                                    data=nearest_point
                                    force;
                        run;
                    %end;
                %mend;
                %thread_loop;
    
            endrsubmit;
        %end;
    
        /* Wait for all workers to end */
        waitfor _ALL_;
    
        /* Unload zipcode data from memory */
        sasfile sashelp.zipcode close;
    
        /* Append all data to the master file */
        proc datasets nolist;
    
            /* Delete final appended output data if it already exists */
            delete work.all_nearest_points;
    
            %do t = 1 %to &threads.;
                append base =  all_nearest_points
                       data = _all_nearest_points_&t.
                       force
                ;
            %end;
    
            /* Remove tmp files */
            delete _all_nearest_points_:;
        quit;
    
        /* Restore log */
        proc printto log=log;
        run;
    
    %mend;
    %nearest_distance;
    

相关问题