proc sql noprint;
create table nearest_point as
select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
, t2.city as Nearest_City
from bigdata(obs=1 firstobs=1) as t1
CROSS JOIN
sashelp.zipcode as t2
where NOT missing(t2.x)
order by Distance
;
quit;
输出数据集中的第一个观察点是您的最近距离 .
让's generalize this for multiple observations. Let'为它们中的10个做,但稍微提高效率 . 我们不需要输出所有41k观测值 . 我们只需输出最小距离的观测值并将其附加到主表格中 . 将 outobs=1 选项添加到SQL .
%macro nearest_distance;
%do i = 1 %to 10;
proc sql outobs=1 noprint;
create table nearest_point as
select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
, t2.city as Nearest_City
from bigdata(obs=&i. firstobs=&i.) as t1
CROSS JOIN
sashelp.zipcode as t2
where NOT missing(t2.x)
order by Distance
;
quit;
proc append base=all_nearest_points
data=nearest_point
force;
run;
%end;
%mend;
%nearest_distance;
让's generalize it even more, and remove writing to the log to make it faster. Let'甚至将我们的邮政编码数据预先加载到内存中,并对所有观察执行此操作 . 为了测试一个例子,我们首先强制 bigdata 最多为100个obs .
data bigdata;
set bigdata(obs=100);
run;
%macro nearest_distance;
%let dsid = %sysfunc(open(bigdata) );
%let n = %sysfunc(attrn(&dsid., nlobs) );
%let rc = %sysfunc(close(&dsid.) );
proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
run;
%do i = 1 %to &n.;
proc sql outobs=1 noprint;
create table nearest_point as
select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
, t2.city as Nearest_City
from bigdata(obs=&i. firstobs=&i.) as t1
CROSS JOIN
sashelp.zipcode as t2
where NOT missing(t2.x)
order by Distance
;
quit;
proc append base=all_nearest_points
data=nearest_point
force;
run;
%end;
proc printto log=log;
run;
%mend;
%nearest_distance;
%macro nearest_distance(threads=5);
/* Parallel submit options */
options
autosignon=yes
sascmd='!sascmd'
;
/* Current session work directory */
%let workdir = %sysfunc(getoption(work) );
/* Total obs in big data */
%let dsid = %sysfunc(open(bigdata) );
%let n = %sysfunc(attrn(&dsid., nlobs) );
%let rc = %sysfunc(close(&dsid.) );
/* Load lookup table to memory */
sasfile sashelp.zipcode load;
/* Prevent writing to session log */
proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
run;
/* Run in &threads parallel sessions */
%do t = 1 %to &threads.;
/* Divide up observations for each thread */
%let firstobs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t+1)+1, floor);
%let obs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t.), floor);
/* Transfer primary session macro variables to each worker session */
%syslput _USER_ / remote=worker&t.;
/* Parallel calculations for data in memory */
rsubmit wait=no remote=worker&t.;
/* We are in a specific session, and must define this as a macro within the session */
%macro thread_loop;
%do i = &firstobs. %to &obs.;
/* Primary session library */
libname workdir "&workdir.";
proc sql outobs=1 noprint;
create table nearest_point as
select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
, t2.city as Nearest_City
from workdir.bigdata(obs=&i. firstobs=&i.) as t1
CROSS JOIN
sashelp.zipcode as t2
where NOT missing(t2.x)
order by Distance
;
quit;
/* Save to primary session library */
proc append base=workdir._all_nearest_points_&t.
data=nearest_point
force;
run;
%end;
%mend;
%thread_loop;
endrsubmit;
%end;
/* Wait for all workers to end */
waitfor _ALL_;
/* Unload zipcode data from memory */
sasfile sashelp.zipcode close;
/* Append all data to the master file */
proc datasets nolist;
/* Delete final appended output data if it already exists */
delete work.all_nearest_points;
%do t = 1 %to &threads.;
append base = all_nearest_points
data = _all_nearest_points_&t.
force
;
%end;
/* Remove tmp files */
delete _all_nearest_points_:;
quit;
/* Restore log */
proc printto log=log;
run;
%mend;
%nearest_distance;
1 回答
脱离RomanLuštrik的想法,将其分成尽可能小的块,这将是您最理想的解决方案 . 让我们首先找到每行的最近点,而不是尝试将它们全部加载到内存中 . 此示例将是基于SAS的解决方案 .
通过遍历哈希表也可以更有效地完成此示例,但这里解释起来会更复杂 . 这也可以并行化 . 这种方式效率中等,但更容易遵循 . 让我们使用两个示例数据集:
1. Mobile_Activity_3months_scrambled.csv - http://js.cit.datalens.api.here.com/datasets/starter_pack/Mobile_activity_3months_scrambled.csv
500k行 . 让我们将此视为您的大数据集 .
2. sashelp.zipcode
41k行 . 我们将此视为您的小数据集 .
Goal: Map each data point to the closest city.
为了保持尽可能简单,让我们只读一行并将其与最近的城市相匹配 . 首先,读入您的数据:
接下来,我们将读取一行并计算其与所有其他纬度/长度对的地理距离 . 我们将使用SQL对这些数据进行笛卡尔积 .
输出数据集中的第一个观察点是您的最近距离 .
让's generalize this for multiple observations. Let'为它们中的10个做,但稍微提高效率 . 我们不需要输出所有41k观测值 . 我们只需输出最小距离的观测值并将其附加到主表格中 . 将
outobs=1
选项添加到SQL .让's generalize it even more, and remove writing to the log to make it faster. Let'甚至将我们的邮政编码数据预先加载到内存中,并对所有观察执行此操作 . 为了测试一个例子,我们首先强制
bigdata
最多为100个obs .接下来,让它并行化,然后完成所有操作 . 您可以使用
threads
选项更改要使用的并行会话数 .