绘制地理坐标的方法，用于标记另一个拉特和多头的数据？-Java 学习之路

我有一个纬度和经度列表与位置，另一个列表只有拉特和长 . 我需要将这个另一组映射到第一个列表中的近似位置 . 我在R中尝试过geosphere，但是数据太大了，我最终收到一条错误消息“无法分配大小为718.5 GB的向量”！有任何想法吗？我们正在寻找要映射的数据非常庞大（接近100M行分为48个段，需要映射到lats和long的列表，大约是80k记录长...）

1 回答

脱离RomanLuštrik的想法，将其分成尽可能小的块，这将是您最理想的解决方案 . 让我们首先找到每行的最近点，而不是尝试将它们全部加载到内存中 . 此示例将是基于SAS的解决方案 .

通过遍历哈希表也可以更有效地完成此示例，但这里解释起来会更复杂 . 这也可以并行化 . 这种方式效率中等，但更容易遵循 . 让我们使用两个示例数据集：

1. Mobile_Activity_3months_scrambled.csv - http://js.cit.datalens.api.here.com/datasets/starter_pack/Mobile_activity_3months_scrambled.csv

500k行 . 让我们将此视为您的大数据集 .

2. sashelp.zipcode

41k行 . 我们将此视为您的小数据集 .

Goal: Map each data point to the closest city.

为了保持尽可能简单，让我们只读一行并将其与最近的城市相匹配 . 首先，读入您的数据：

proc import 
    file='CHANGE DIRECTORY HERE\Mobile_activity_3months_scrambled.csv'
    out=bigdata
    dbms=csv
    replace;
run;

接下来，我们将读取一行并计算其与所有其他纬度/长度对的地理距离 . 我们将使用SQL对这些数据进行笛卡尔积 .

proc sql noprint;
    create table nearest_point as
        select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
             , t2.city as Nearest_City
        from bigdata(obs=1 firstobs=1) as t1
        CROSS JOIN
             sashelp.zipcode as t2
        where NOT missing(t2.x)
        order by Distance
    ;
quit;

输出数据集中的第一个观察点是您的最近距离 .

让's generalize this for multiple observations. Let'为它们中的10个做，但稍微提高效率 . 我们不需要输出所有41k观测值 . 我们只需输出最小距离的观测值并将其附加到主表格中 . 将 outobs=1 选项添加到SQL .

%macro nearest_distance;

    %do i = 1 %to 10;
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                CROSS JOIN
                     sashelp.zipcode as t2
                where NOT missing(t2.x)
                order by Distance
            ;
        quit;

        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;

    %end;

%mend;
%nearest_distance;

让's generalize it even more, and remove writing to the log to make it faster. Let'甚至将我们的邮政编码数据预先加载到内存中，并对所有观察执行此操作 . 为了测试一个例子，我们首先强制 bigdata 最多为100个obs .

data bigdata;
    set bigdata(obs=100);
run;

%macro nearest_distance;

    %let dsid   = %sysfunc(open(bigdata) );
    %let n      = %sysfunc(attrn(&dsid., nlobs) );
    %let rc     = %sysfunc(close(&dsid.) );

    proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
    run;

    %do i = 1 %to &n.;
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                CROSS JOIN
                     sashelp.zipcode as t2
                where NOT missing(t2.x)
                order by Distance
            ;
        quit;

        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;

    %end;

    proc printto log=log;
    run;

%mend;
%nearest_distance;

接下来，让它并行化，然后完成所有操作 . 您可以使用 threads 选项更改要使用的并行会话数 .

%macro nearest_distance(threads=5);

    /* Parallel submit options */
    options 
        autosignon=yes
        sascmd='!sascmd'
    ;

    /* Current session work directory */
    %let workdir = %sysfunc(getoption(work) );

    /* Total obs in big data */
    %let dsid   = %sysfunc(open(bigdata) );
    %let n      = %sysfunc(attrn(&dsid., nlobs) );
    %let rc     = %sysfunc(close(&dsid.) );

    /* Load lookup table to memory */
    sasfile sashelp.zipcode load;

    /* Prevent writing to session log */
    proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
    run;

    /* Run in &threads parallel sessions */
    %do t = 1 %to &threads.;

        /* Divide up observations for each thread */
        %let firstobs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t+1)+1, floor);
        %let obs      = %sysevalf(&n-(&n/&threads.)*(&threads.-&t.),    floor);

        /* Transfer primary session macro variables to each worker session */
        %syslput _USER_ / remote=worker&t.;

        /* Parallel calculations for data in memory */
        rsubmit wait=no remote=worker&t.;

            /* We are in a specific session, and must define this as a macro within the session */
            %macro thread_loop;
                %do i = &firstobs. %to &obs.;

                    /* Primary session library */
                    libname workdir "&workdir.";

                    proc sql outobs=1 noprint;
                        create table nearest_point as
                            select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                                 , t2.city as Nearest_City
                            from workdir.bigdata(obs=&i. firstobs=&i.) as t1
                            CROSS JOIN
                                 sashelp.zipcode as t2
                            where NOT missing(t2.x)
                            order by Distance
                        ;
                    quit;

                    /* Save to primary session library */
                    proc append base=workdir._all_nearest_points_&t.
                                data=nearest_point
                                force;
                    run;
                %end;
            %mend;
            %thread_loop;

        endrsubmit;
    %end;

    /* Wait for all workers to end */
    waitfor _ALL_;

    /* Unload zipcode data from memory */
    sasfile sashelp.zipcode close;

    /* Append all data to the master file */
    proc datasets nolist;

        /* Delete final appended output data if it already exists */
        delete work.all_nearest_points;

        %do t = 1 %to &threads.;
            append base =  all_nearest_points
                   data = _all_nearest_points_&t.
                   force
            ;
        %end;

        /* Remove tmp files */
        delete _all_nearest_points_:;
    quit;

    /* Restore log */
    proc printto log=log;
    run;

%mend;
%nearest_distance;

回复于 2024-05-14T10:42:34+08:00

绘制地理坐标的方法，用于标记另一个拉特和多头的数据？

1 回答

相关问题