MQL5中的OpenCL代码不会将分布式作业提供给每个GPU核心-Java 学习之路

我使用OpenCL和MQL5为MetaTrader终端平台创建了一个基于GPU的指标 .

我花了很多精力，想把 [MetaTrader终端：策略测试器] 的优化工作尽可能多地转移到GPU上执行 . 大多数计算都是由指标完成的 . 因此，我修改了指标，把它的计算完全转移到了GPU上 .

但是当我尝试在策略测试器部分进行优化过程时，真正的问题出现了 .
我观察到该过程同时使用了GPU和CPU，但整体执行速度并没有任何提升 .

我怀疑这项工作并没有被分配到各个GPU核心上分别处理，而是所有GPU核心都在执行同一个进程或函数 .

那么，请告诉我需要做些什么，才能让每一个GPU核心各自负责一次函数执行，从而更快地得到输出 .

我的代码的核心是：

/*
 * calSMA - extend a simple moving average with the running-sum recurrence
 *
 *   out[i] = out[i-1] + (price[i] - price[i - InpMAPeriod]) / InpMAPeriod
 *
 * for every i in [limit, rates_total).
 *
 * limit          first index to compute; ExtLineBuffer[limit-1] must already
 *                hold a valid average because the recurrence reads it
 * rates_total    one past the last index to compute
 * price          input series (read only by this kernel)
 * ExtLineBuffer  output series, read at i-1 and written at i
 * InpMAPeriod    averaging period
 *
 * Each iteration depends on the previous one, so the loop is inherently
 * sequential inside a work-item.
 *
 * NOTE(review): the index is taken from dimension 2, while the host code
 * shown with this kernel launches with work_dim = 2 (dimensions 0 and 1
 * only). The OpenCL specification makes get_global_id() return 0 for an
 * out-of-range dimension, so `len` is presumably always 0 and every
 * work-item repeats the same work - confirm the intended decomposition
 * before increasing the global work size.
 */
__kernel void calSMA(
                     int limit, 
                     int rates_total, 
                     __global double *price, 
                     __global double *ExtLineBuffer,
                     int InpMAPeriod
                   )
{
    int len = get_global_id(2);

    for (int i = limit; i < rates_total; i++) {
        ExtLineBuffer[len + i] = ExtLineBuffer[len + i - 1]
                               + (price[len + i] - price[len + i - InpMAPeriod]) / InpMAPeriod;
    }
}


 /*
  * calcSMALoop - compute the seed value of a moving average: the sum of
  * price[begin .. limit-1] divided by InpMAPeriod, stored in firstValue[len].
  *
  * The sum is accumulated in a private variable and the result overwrites
  * firstValue[len]. The previous version added onto whatever firstValue[len]
  * already contained, which is only correct if the host zeroes the buffer
  * before the launch; the result no longer depends on that.
  *
  * NOTE(review): `len` comes from dimension 2; with a 2-dimensional launch
  * the OpenCL specification makes get_global_id(2) return 0 - confirm intent.
  */
 __kernel void calcSMALoop(int begin, int limit, __global double *price, __global double *firstValue, int InpMAPeriod)
 {
     int len = get_global_id(2);
     double sum = 0.0;

     for (int i = begin; i < limit; i++) {
         sum += price[i];
     }

     firstValue[len] = sum / InpMAPeriod;
 }

/*
 * calcEMA - exponential moving average over indices [begin, limit):
 *
 *   out[i] = price[i] * SmoothFactor + out[i-1] * (1.0 - SmoothFactor)
 *
 * ExtLineBuffer[begin-1] must already be populated, since the first
 * iteration reads it. Indices are shifted by get_global_id(2).
 */
__kernel void calcEMA(int begin, int limit, __global double *price, __global double *ExtLineBuffer, double SmoothFactor)
{
    const int base = get_global_id(2);
    int bar = begin;

    while (bar < limit) {
        const int pos = base + bar;
        ExtLineBuffer[pos] = price[pos] * SmoothFactor + ExtLineBuffer[pos - 1] * (1.0 - SmoothFactor);
        ++bar;
    }
}

/*
 * calcSSMA - smoothed moving average over indices [limit, rates_total):
 *
 *   out[i] = (out[i-1] * (InpMAPeriod - 1) + price[i]) / InpMAPeriod
 *
 * ExtLineBuffer[limit-1] must already be populated, since the first
 * iteration reads it. Indices are shifted by get_global_id(2).
 */
__kernel void calcSSMA(int limit, int rates_total, __global double *price, __global double *ExtLineBuffer, int InpMAPeriod)
{
    const int base = get_global_id(2);
    int bar = limit;

    while (bar < rates_total) {
        const int pos = base + bar;
        ExtLineBuffer[pos] = (ExtLineBuffer[pos - 1] * (InpMAPeriod - 1) + price[pos]) / InpMAPeriod;
        ++bar;
    }
}

/*
 * calcLWMALoop - compute the seed value of a linear-weighted moving average.
 *
 * For i in [begin, limit) the weight is (i - begin + 1). The kernel stores
 *   firstValue[len] = sum(weight * price[i]) / sum(weight)
 * and writes sum(weight) to weightreturn[0].
 *
 * The incoming `weightsum` argument is reset to 0 before use, so its value
 * on entry is ignored; the parameter is kept for interface compatibility.
 *
 * The weighted sum is accumulated privately and overwrites firstValue[len].
 * The previous version added onto the existing content of firstValue[len],
 * which is only correct if the host zeroes the buffer before the launch.
 *
 * If begin >= limit no weight is accumulated and the division is 0.0 / 0.0.
 *
 * NOTE(review): `len` comes from dimension 2; with a 2-dimensional launch
 * the OpenCL specification makes get_global_id(2) return 0 - confirm intent.
 */
__kernel void calcLWMALoop(int begin, int limit, __global double *price, __global double *firstValue, int weightsum, __global int *weightreturn)
{
    int len = get_global_id(2);
    double weighted = 0.0;

    weightsum = 0;
    for (int i = begin; i < limit; i++) {
        int weight = i - begin + 1;
        weightsum += weight;
        weighted += weight * price[i];
    }

    firstValue[len] = weighted / (double)weightsum;
    weightreturn[0] = weightsum;
}
//__global int counter = 0;
/*
 * returnCalculation - one term of a linear-weighted sum.
 *
 * Returns price multiplied by the weight (InpMAPeriod - j).
 */
double returnCalculation(int InpMAPeriod, double price, int j)
{
    const int weight = InpMAPeriod - j;

    return weight * price;
}
/*
 * calcLWMA - linear-weighted moving average over indices [limit, rates_total).
 *
 * For each index i the kernel sums returnCalculation(InpMAPeriod, price[i-j], j)
 * for j in [0, InpMAPeriod) and stores that sum divided by `weightsum`.
 * Indices are shifted by get_global_id(2).
 */
__kernel void calcLWMA(int limit, int rates_total, __global double *price, __global double *ExtLineBuffer, int InpMAPeriod, int weightsum)
{
    const int base = get_global_id(2);
    int bar = limit;

    while (bar < rates_total) {
        const int pos = base + bar;
        double sum = 0;
        int back = 0;

        while (back < InpMAPeriod) {
            sum += returnCalculation(InpMAPeriod, price[pos - back], back);
            ++back;
        }

        ExtLineBuffer[pos] = sum / weightsum;
        ++bar;
    }
}

请建议我在OpenCL上使用GPU在MQL5中使用不同的值或框架分发函数的方法 .

EDITED

这对挑战者来说是一个巨大的挑战......即使我很想知道OpenCL和MQL5是否可以用于优化任务 . 我希望我会得到我正在寻求的答案 .

EDITED AGAIN MAGPU.mqh 文件

// NOTE(review): this header includes CHECKMA.mq5 while CHECKMA.mq5 includes
// this header - confirm the mutual include is intentional.
#include "CHECKMA.mq5"
// Passed as the second argument of every CLExecute() call in this file and
// used as the length of the offset[] / work[] arrays declared in CHECKMA.mq5.
// NOTE(review): despite the name, that CLExecute() argument is presumably the
// number of work dimensions, not a count of GPU cores - confirm against the
// CLExecute documentation.
#define CUDA_CORE 2

//+------------------------------------------------------------------+
//| Runs the kernel held in cl_CommonKernel1 with the argument       |
//| layout (limit, rates_total, price, ExtLineBuffer, InpMAPeriod)   |
//| and copies the device result back into ExtLineBuffer.            |
//|                                                                  |
//| price       - input series uploaded to the device                |
//| rates_total - passed to the kernel as argument 1                 |
//| limit       - passed to the kernel as argument 0                 |
//|                                                                  |
//| Returns 1 after the launch sequence has run, 0 when the device   |
//| buffers could not be created.                                    |
//+------------------------------------------------------------------+
int Execute_SMA(
                 const double &price[],                 
                 int rates_total,
                 int limit
                 )
 {
   int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE), 
       cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);

   //--- On failure Check_Memory_Initialization() has already released the
   //--- kernel, program and context, so none of the calls below could work.
   //--- The return value used to be ignored.
   if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_SMA function error")==0)
      return(0);

   //--- bind the device buffers to kernel arguments 2 (input) and 3 (output)
   if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
      Print("Input Bufer Not Set");
   if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
      Print("Output Bufer Not Set");

   //--- upload prices and the current indicator values; the kernel reads
   //--- ExtLineBuffer[limit-1] as the starting point of its recurrence
   if(!CLBufferWrite(cl_price, price))
      Print("Could not copy Input buffer"); 
   if(!CLBufferWrite(cl_mem, ExtLineBuffer))
      Print("Could not copy Input buffer"); 

   //--- scalar arguments
   if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
      Print("Could Not Set Arg 0");
   if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
      Print("Could Not Set Arg 1");
   if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
      Print("Could Not Set Arg3: ",GetLastError());

   if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
      Print("Kernel not executed",GetLastError());

   //--- download the result
   if(!CLBufferRead(cl_mem,ExtLineBuffer))
      Print("Buffer Copy Nothing: ", GetLastError());

   CLBufferFree(cl_price);
   CLBufferFree(cl_mem);
   return(1);
 } 

 //+------------------------------------------------------------------+
 //| Runs the kernel held in cl_CommonKernel2 with the argument       |
 //| layout (begin, limit, price, firstValue, InpMAPeriod) and        |
 //| returns the single value it produces.                            |
 //|                                                                  |
 //| begin, limit - index range passed to the kernel (args 0 and 1)   |
 //| price        - input series uploaded to the device               |
 //|                                                                  |
 //| Returns the first element of the one-element output buffer, or   |
 //| 0.0 when the device buffers could not be created.                |
 //+------------------------------------------------------------------+
 double ExecuteLoop(
                 int begin,
                 int limit,
                 const double &price[]
                 )
 {
   int cl_mem = CLBufferCreate(cl_ctx,sizeof(double),CL_MEM_READ_WRITE), 
       cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);
   double temp[];
   ArrayResize(temp,1);
   temp[0] = 0;

   //--- On failure Check_Memory_Initialization() has already released the
   //--- kernel, program and context; the return value used to be ignored.
   if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel2, "ExecuteLoop function error")==0)
      return(0.0);

   if(!CLSetKernelArgMem(cl_CommonKernel2,2,cl_price))
      Print("Input Bufer Not Set 2");
   if(!CLSetKernelArgMem(cl_CommonKernel2,3,cl_mem))
      Print("Output Bufer Not Set 2");

   if(!CLBufferWrite(cl_price, price))
      Print("Could not copy Input buffer 2"); 
   //--- The seed kernels in this project accumulate into the output element
   //--- with "+=", so it has to start at zero; the buffer was previously
   //--- left with whatever the device memory happened to contain.
   if(!CLBufferWrite(cl_mem, temp))
      Print("Could not copy Output buffer 2");

   if(!CLSetKernelArg(cl_CommonKernel2,0,begin))
      Print("Could Not Set Arg 0");
   if(!CLSetKernelArg(cl_CommonKernel2,1,limit))
      Print("Could Not Set Arg 1");
   if(!CLSetKernelArg(cl_CommonKernel2,4,InpMAPeriod))
      Print("Could Not Set Arg3: ",GetLastError());

   if(!CLExecute(cl_CommonKernel2,CUDA_CORE,offset,work))
      Print("Kernel not executed",GetLastError());

   if(!CLBufferRead(cl_mem,temp))
      Print("Buffer Copy Nothing: ", GetLastError());

   CLBufferFree(cl_price);
   CLBufferFree(cl_mem);
   return(temp[0]);
 } 
//+------------------------------------------------------------------+
//| Runs the kernel held in cl_CommonKernel1 with the argument       |
//| layout (begin, limit, price, ExtLineBuffer, SmoothFactor) and    |
//| copies the device result back into ExtLineBuffer.                |
//|                                                                  |
//| begin, limit - index range passed to the kernel (args 0 and 1)   |
//| price        - input series uploaded to the device               |
//| SmoothFactor - smoothing coefficient passed as argument 4        |
//|                                                                  |
//| Returns 1 after the launch sequence has run, 0 when the device   |
//| buffers could not be created.                                    |
//+------------------------------------------------------------------+
int ExecuteEMA(int begin, int limit, const double &price[], double SmoothFactor)
{
   int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE), 
       cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);

   //--- On failure Check_Memory_Initialization() has already released the
   //--- kernel, program and context; the return value used to be ignored.
   if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "ExecuteEMA function error")==0)
      return(0);

   if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
      Print("Input Bufer Not Set");
   if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
      Print("Output Bufer Not Set");

   //--- upload prices and the current indicator values; the kernel reads
   //--- ExtLineBuffer[begin-1] as the starting point of its recurrence
   if(!CLBufferWrite(cl_price, price))
      Print("Could not copy Input buffer"); 
   if(!CLBufferWrite(cl_mem, ExtLineBuffer))
      Print("Could not copy Input buffer"); 

   if(!CLSetKernelArg(cl_CommonKernel1,0,begin))
      Print("Could Not Set Arg 0");
   if(!CLSetKernelArg(cl_CommonKernel1,1,limit))
      Print("Could Not Set Arg 1");
   if(!CLSetKernelArg(cl_CommonKernel1,4,SmoothFactor))
      Print("Could Not Set Arg3: ",GetLastError());

   if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
      Print("Kernel not executed",GetLastError());

   if(!CLBufferRead(cl_mem,ExtLineBuffer))
      Print("Buffer Copy Nothing: ", GetLastError());

   CLBufferFree(cl_price);
   CLBufferFree(cl_mem);

   return(1);
 }   
//+------------------------------------------------------------------+
//| Runs the kernel held in cl_CommonKernel1 with the argument       |
//| layout (limit, rates_total, price, ExtLineBuffer, InpMAPeriod)   |
//| and copies the device result back into ExtLineBuffer.            |
//|                                                                  |
//| price       - input series uploaded to the device                |
//| rates_total - passed to the kernel as argument 1                 |
//| limit       - passed to the kernel as argument 0                 |
//|                                                                  |
//| Returns 1 after the launch sequence has run, 0 when the device   |
//| buffers could not be created.                                    |
//+------------------------------------------------------------------+
int Execute_SSMA(
                 const double &price[],                 
                 int rates_total,
                 int limit
                 )
 {
   int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE), 
       cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);

   //--- On failure Check_Memory_Initialization() has already released the
   //--- kernel, program and context; the return value used to be ignored.
   if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_SSMA function error")==0)
      return(0);

   if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
      Print("Input Bufer Not Set");
   if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
      Print("Output Bufer Not Set");

   //--- upload prices and the current indicator values; the kernel reads
   //--- ExtLineBuffer[limit-1] as the starting point of its recurrence
   if(!CLBufferWrite(cl_price, price))
      Print("Could not copy Input buffer"); 
   if(!CLBufferWrite(cl_mem, ExtLineBuffer))
      Print("Could not copy Input buffer"); 

   if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
      Print("Could Not Set Arg 0");
   if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
      Print("Could Not Set Arg 1");
   if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
      Print("Could Not Set Arg3: ",GetLastError());

   if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
      Print("Kernel not executed",GetLastError());
   if(!CLBufferRead(cl_mem,ExtLineBuffer))
      Print("Buffer Copy Nothing: ", GetLastError());

   CLBufferFree(cl_price);
   CLBufferFree(cl_mem);
   return(1);
 } 

 //+------------------------------------------------------------------+
 //| Runs the kernel held in cl_CommonKernel2 with the argument       |
 //| layout (begin, limit, price, firstValue, weightsum, weightreturn)|
 //| and returns the single value it produces.                        |
 //|                                                                  |
 //| begin, limit   - index range passed to the kernel (args 0 and 1) |
 //| price          - input series uploaded to the device             |
 //| weightsumlocal - passed to the kernel as argument 4              |
 //|                                                                  |
 //| Side effect: the global `weightsum` is overwritten with the      |
 //| value the kernel stores in its weightreturn buffer.              |
 //|                                                                  |
 //| Returns the first element of the one-element output buffer, or   |
 //| 0.0 when a device buffer could not be created.                   |
 //+------------------------------------------------------------------+
 double ExecuteLWMALoop(
                 int begin,
                 int limit,
                 const double &price[],
                 int weightsumlocal
                 )
 {
   int cl_mem = CLBufferCreate(cl_ctx,sizeof(double),CL_MEM_READ_WRITE), 
       cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE),
       cl_weightsumlocal = CLBufferCreate(cl_ctx,sizeof(int),CL_MEM_READ_WRITE);
   double temp[];
   int weight[];
   ArrayResize(temp,1);
   ArrayResize(weight,1);
   weight[0] = 0;
   temp[0] = 0;

   //--- On failure Check_Memory_Initialization() has already released the
   //--- kernel, program and context; the return value used to be ignored.
   //--- It does not know about the third buffer, so release that here
   //--- (best effort).
   if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel2, "ExecuteLWMALoop function error")==0)
     {
      CLBufferFree(cl_weightsumlocal);
      return(0.0);
     }
   //--- the weight buffer was never validated before
   if(cl_weightsumlocal==INVALID_HANDLE)
     {
      Print("OpenCL buffer create failed: cl_weightsumlocal: ExecuteLWMALoop function error");
      CLBufferFree(cl_price);
      CLBufferFree(cl_mem);
      return(0.0);
     }

   if(!CLSetKernelArgMem(cl_CommonKernel2,2,cl_price))
      Print("Input Bufer Not Set 2");
   if(!CLSetKernelArgMem(cl_CommonKernel2,3,cl_mem))
      Print("Output Bufer Not Set 2");
   if(!CLSetKernelArgMem(cl_CommonKernel2,5,cl_weightsumlocal))
      Print("Output Bufer Not Set 2");

   if(!CLBufferWrite(cl_price, price))
      Print("Could not copy Input buffer 2"); 
   //--- The seed kernel accumulates into the output element with "+=", so
   //--- it has to start at zero; the buffer was previously left with
   //--- whatever the device memory happened to contain.
   if(!CLBufferWrite(cl_mem, temp))
      Print("Could not copy Output buffer 2");

   if(!CLSetKernelArg(cl_CommonKernel2,0,begin))
      Print("Could Not Set Arg 0");
   if(!CLSetKernelArg(cl_CommonKernel2,1,limit))
      Print("Could Not Set Arg 1");
   if(!CLSetKernelArg(cl_CommonKernel2,4,weightsumlocal))
      Print("Could Not Set Arg3: ",GetLastError());

   if(!CLExecute(cl_CommonKernel2,CUDA_CORE,offset,work))
      Print("Kernel not executed",GetLastError());

   if(!CLBufferRead(cl_mem,temp))
      Print("Buffer Copy Nothing: ", GetLastError());
   if(!CLBufferRead(cl_weightsumlocal,weight))
      Print("Buffer Copy Nothing: ", GetLastError());
   //--- publish the weight total for the later Execute_LWMA() call
   weightsum = weight[0];

   CLBufferFree(cl_weightsumlocal);
   CLBufferFree(cl_price);
   CLBufferFree(cl_mem);
   return(temp[0]);
 } 
//+------------------------------------------------------------------+
//| Runs the kernel held in cl_CommonKernel1 with the argument       |
//| layout (limit, rates_total, price, ExtLineBuffer, InpMAPeriod,   |
//| weightsum) and copies the device result back into ExtLineBuffer. |
//|                                                                  |
//| price       - input series uploaded to the device                |
//| rates_total - passed to the kernel as argument 1                 |
//| limit       - passed to the kernel as argument 0                 |
//| weightsum1  - passed to the kernel as argument 5                 |
//|                                                                  |
//| Returns 1 after the launch sequence has run, 0 when the device   |
//| buffers could not be created.                                    |
//+------------------------------------------------------------------+
int Execute_LWMA(const double &price[], int rates_total, int limit, int weightsum1)
       {
         int cl_mem = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE), 
             cl_price = CLBufferCreate(cl_ctx,ArraySize(price)*sizeof(double),CL_MEM_READ_WRITE);

         //--- The diagnostic label used to read "Execute_SSMA" (copy-paste),
         //--- which pointed at the wrong function. On failure
         //--- Check_Memory_Initialization() has already released the kernel,
         //--- program and context; its return value used to be ignored.
         if(Check_Memory_Initialization(cl_mem, cl_price, cl_CommonKernel1, "Execute_LWMA function error")==0)
            return(0);

         if(!CLSetKernelArgMem(cl_CommonKernel1,2,cl_price))
            Print("Input Bufer Not Set");
         if(!CLSetKernelArgMem(cl_CommonKernel1,3,cl_mem))
            Print("Output Bufer Not Set");

         if(!CLBufferWrite(cl_price, price))
            Print("Could not copy Input buffer"); 
         if(!CLBufferWrite(cl_mem, ExtLineBuffer))
            Print("Could not copy Input buffer"); 

         if(!CLSetKernelArg(cl_CommonKernel1,0,limit))
            Print("Could Not Set Arg 0");
         if(!CLSetKernelArg(cl_CommonKernel1,1,rates_total))
            Print("Could Not Set Arg 1");
         if(!CLSetKernelArg(cl_CommonKernel1,4,InpMAPeriod))
            Print("Could Not Set Arg4: ",GetLastError());
         if(!CLSetKernelArg(cl_CommonKernel1,5,weightsum1))
            Print("Could Not Set Arg5: ",GetLastError());

         if(!CLExecute(cl_CommonKernel1,CUDA_CORE,offset,work))
            Print("Kernel not executed",GetLastError());
         if(!CLBufferRead(cl_mem,ExtLineBuffer))
            Print("Buffer Copy Nothing: ", GetLastError());

         CLBufferFree(cl_price);
         CLBufferFree(cl_mem);
         return(1);
       }
//+------------------------------------------------------------------+
//| Validates a handle returned by CLKernelCreate().                 |
//|                                                                  |
//| cl_kernel - handle to validate                                   |
//| var_name  - label appended to the diagnostic message             |
//|                                                                  |
//| When the handle is INVALID_HANDLE the program and context are    |
//| released and the reason reported by GetLastError() is printed.   |
//| Call this immediately after CLKernelCreate() so the error code   |
//| still belongs to that call.                                      |
//|                                                                  |
//| The previous version compared the handle itself with ERR_*       |
//| constants. Those are GetLastError() codes, not return values,    |
//| so the comparisons never described the real failure and could    |
//| tear the context down for a valid handle that happened to equal  |
//| one of them.                                                     |
//+------------------------------------------------------------------+
void checkKernel(int cl_kernel, string var_name)
{
   if(cl_kernel!=INVALID_HANDLE)
      return;

   //--- read the code before the frees below can overwrite it
   int err = GetLastError();

   CLProgramFree(cl_prg);
   CLContextFree(cl_ctx);

   switch(err)
     {
      case ERR_OPENCL_INVALID_HANDLE:
         Print("OpenCL kernel create failed: ERR_OPENCL_INVALID_HANDLE ", var_name);
         break;
      case ERR_INVALID_PARAMETER:
         Print("OpenCL kernel create failed: ERR_INVALID_PARAMETER ", var_name);
         break;
      case ERR_OPENCL_TOO_LONG_KERNEL_NAME:
         Print("OpenCL kernel create failed: ERR_OPENCL_TOO_LONG_KERNEL_NAME ", var_name);
         break;
      case ERR_OPENCL_KERNEL_CREATE:
         Print("OpenCL kernel create failed 1: ERR_OPENCL_KERNEL_CREATE ", var_name);
         break;
      default:
         Print("OpenCL kernel create failed: error ", err, " ", var_name);
         break;
     }
}

 //+------------------------------------------------------------------+
 //| Validates the two buffer handles a launcher has just created.    |
 //|                                                                  |
 //| cl_mem            - output buffer handle                         |
 //| cl_price          - input buffer handle                          |
 //| cl_ker            - kernel released when a handle is rejected    |
 //| name_process_call - label appended to the diagnostic message     |
 //|                                                                  |
 //| cl_mem is tested first, then cl_price, each against              |
 //| INVALID_HANDLE, ERR_NOT_ENOUGH_MEMORY and                        |
 //| ERR_OPENCL_BUFFER_CREATE. On the first match the kernel, the     |
 //| program and the context are released (plus cl_mem when the       |
 //| rejected handle is cl_price), a message is printed and 0 is      |
 //| returned. Returns 1 when neither handle matches.                 |
 //+------------------------------------------------------------------+
 int Check_Memory_Initialization(int cl_mem, int cl_price, int cl_ker, string name_process_call)
      {
         string reason;
         bool   price_rejected = false;

         //--- pick the diagnostic text for the first offending handle
         if(cl_mem==INVALID_HANDLE)
            reason = "OpenCL buffer create failed: cl_mem INVALID_HANDLE: ";
         else if(cl_mem==ERR_NOT_ENOUGH_MEMORY)
            reason = "OpenCL buffer create failed: cl_mem ERR_NOT_ENOUGH_MEMORY: ";
         else if(cl_mem==ERR_OPENCL_BUFFER_CREATE)
            reason = "OpenCL buffer create failed: cl_mem ERR_OPENCL_BUFFER_CREATE: ";
         else
           {
            if(cl_price==INVALID_HANDLE)
               reason = "OpenCL buffer create failed: cl_price: ";
            else if(cl_price==ERR_NOT_ENOUGH_MEMORY)
               reason = "OpenCL buffer create failed: cl_price ERR_NOT_ENOUGH_MEMORY: ";
            else if(cl_price==ERR_OPENCL_BUFFER_CREATE)
               reason = "OpenCL buffer create failed: cl_price ERR_OPENCL_BUFFER_CREATE: ";
            else
               return(1);   // both handles accepted
            price_rejected = true;
           }

         //--- common teardown, in the same order for every failure
         CLKernelFree(cl_ker);
         CLProgramFree(cl_prg);
         CLContextFree(cl_ctx);
         if(price_rejected)
            CLBufferFree(cl_mem);   // the output buffer was accepted, so give it back
         Print(reason, name_process_call);
         return(0);
      }

MAIN INDICATOR FILE CHECKMA.mq5 文件

// OpenCL source embedded as the string cl_program; OnInit() passes it to
// CLProgramCreate().
#resource "program_MA_GPU.cl" as string cl_program
    // NOTE(review): MAGPU.mqh in turn includes this file - confirm the mutual
    // include is intentional.
    #include "MAGPU.mqh"
    #property indicator_chart_window
    #property indicator_buffers 1
    #property indicator_plots   1
    #property indicator_type1   DRAW_LINE
    #property indicator_color1  Red
    input int            InpMAPeriod=13;         // Period
    input int            InpMAShift=0;           // Shift
    input ENUM_MA_METHOD InpMAMethod=MODE_SMA;  // Method
    //--- indicator buffers
    double               ExtLineBuffer[];
    // Arrays passed as the last two arguments of every CLExecute() call;
    // both are filled in OnInit().
    int  offset[CUDA_CORE], work[CUDA_CORE];//={0,19,38,57,76,95,114,123};
    // Receives the third argument of CLProgramCreate() and is printed when
    // program creation fails.
    string str;   
       // OpenCL handles: context, program, and the two kernels selected in
       // OnInit() according to InpMAMethod.
       int cl_ctx, cl_prg, cl_CommonKernel1, cl_CommonKernel2;
    // Weight total for the LWMA method: reset in CalculateLWMA(), overwritten
    // by ExecuteLWMALoop(), then passed to Execute_LWMA().
    static int weightsum;    
    //+------------------------------------------------------------------+
    //| Simple moving average.                                           |
    //| On the first call (prev_calculated==0) the leading bars are      |
    //| zeroed and the seed value is obtained from ExecuteLoop(); on     |
    //| later calls only the start index is recomputed. Execute_SMA()    |
    //| runs in both cases.                                              |
    //+------------------------------------------------------------------+
    void CalculateSimpleMA(int rates_total,int prev_calculated,int begin,const double &price[])
      {
       int start;

       if(prev_calculated!=0)
         {
          //--- NOTE(review): the +17 is an unexplained constant - confirm
          start=prev_calculated-ArraySize(price)+InpMAPeriod+17;
         }
       else
         {
          start=InpMAPeriod+begin;
          int seed=start-1;
          //--- clear everything before the seed bar, then store the seed
          ArrayFill(ExtLineBuffer,0,seed,0.0);
          ExtLineBuffer[seed]=ExecuteLoop(begin,start,price);
         }

       //--- main pass, executed on every call
       Execute_SMA(price,rates_total,start);
      }
    //+------------------------------------------------------------------+
    //| Exponential moving average.                                      |
    //|                                                                  |
    //| First call (prev_calculated==0): seeds ExtLineBuffer[begin] with |
    //| the price and computes bars begin+1 .. InpMAPeriod+begin-1.      |
    //| Every call: computes bars limit .. rates_total-1.                |
    //|                                                                  |
    //| The main pass previously ran ExecuteEMA(begin+99900, limit, ..), |
    //| i.e. it started at a hard-coded bar and ended at `limit`, so on  |
    //| histories shorter than begin+99900 bars nothing after the seed   |
    //| window was ever computed. It now covers [limit, rates_total), as |
    //| the other Calculate* routines in this file do.                   |
    //+------------------------------------------------------------------+
    void CalculateEMA(int rates_total,int prev_calculated,int begin,const double &price[])
      {
       int    limit;
       double SmoothFactor=2.0/(1.0+InpMAPeriod);

       if(prev_calculated==0)
         {
          limit=InpMAPeriod+begin;
          ExtLineBuffer[begin]=price[begin];
          ExecuteEMA(begin+1,limit,price,SmoothFactor);      
         }
       else
         {
          //--- restart one bar back so the still-forming last bar is
          //--- refreshed; never go below begin+1 because the kernel reads
          //--- ExtLineBuffer[limit-1]
          limit=MathMax(prev_calculated-1,begin+1);
         }

       //--- main pass
       ExecuteEMA(limit,rates_total,price,SmoothFactor);
      }
    //+------------------------------------------------------------------+
    //| Linear-weighted moving average.                                  |
    //| On the first call (prev_calculated==0) the global weightsum is   |
    //| reset, the leading bars are zeroed and the seed value is         |
    //| obtained from ExecuteLWMALoop(), which also updates weightsum.   |
    //| Execute_LWMA() runs on every call with the current weightsum.    |
    //+------------------------------------------------------------------+
    void CalculateLWMA(int rates_total,int prev_calculated,int begin,const double &price[])
      {
       int start;

       if(prev_calculated!=0)
         {
          //--- NOTE(review): the +17 is an unexplained constant - confirm
          start=prev_calculated-ArraySize(price)+InpMAPeriod+17;
         }
       else
         {
          weightsum=0;
          start=InpMAPeriod+begin;
          //--- zero the first `start` bars
          ArrayFill(ExtLineBuffer,0,start,0.0);
          //--- first visible value
          ExtLineBuffer[start-1]=ExecuteLWMALoop(begin,start,price,weightsum);
         }

       //--- main pass, executed on every call
       Execute_LWMA(price,rates_total,start,weightsum);
    }

    //+------------------------------------------------------------------+
    //| Smoothed moving average.                                         |
    //| On the first call (prev_calculated==0) the leading bars are      |
    //| zeroed and the seed value is obtained from ExecuteLoop(); on     |
    //| later calls only the start index is recomputed. Execute_SSMA()   |
    //| runs in both cases.                                              |
    //+------------------------------------------------------------------+
    void CalculateSmoothedMA(int rates_total,int prev_calculated,int begin,const double &price[])
      {
       int start;

       if(prev_calculated!=0)
         {
          //--- NOTE(review): the +17 is an unexplained constant - confirm
          start=prev_calculated-ArraySize(price)+InpMAPeriod+17;
         }
       else
         {
          //--- first calculation or number of bars was changed
          start=InpMAPeriod+begin;
          int seed=start-1;
          //--- clear everything before the seed bar, then store the seed
          ArrayFill(ExtLineBuffer,0,seed,0.0);
          ExtLineBuffer[seed]=ExecuteLoop(begin,start,price);
         }

       //--- main pass, executed on every call
       Execute_SSMA(price,rates_total,start);
      }

    //+------------------------------------------------------------------+
    //| Indicator initialisation.                                        |
    //| Configures the plot, creates the OpenCL context and program,     |
    //| fills the work[] / offset[] arrays used by every CLExecute()     |
    //| call, and creates the kernel(s) for the selected InpMAMethod.    |
    //|                                                                  |
    //| Changes from the previous version:                               |
    //|  - the context is validated BEFORE CLProgramCreate() is called;  |
    //|    previously the program was built on a possibly invalid        |
    //|    context and the GetLastError() printed for the context        |
    //|    failure could belong to the program call instead;             |
    //|  - the program handle is only compared with INVALID_HANDLE; the  |
    //|    ERR_* constants are GetLastError() codes, not handle values.  |
    //+------------------------------------------------------------------+
    void OnInit()
      {
    //--- indicator buffers mapping
       SetIndexBuffer(0,ExtLineBuffer,INDICATOR_DATA);
    //--- set accuracy
       IndicatorSetInteger(INDICATOR_DIGITS,_Digits+1);
    //--- sets first bar from what index will be drawn
       PlotIndexSetInteger(0,PLOT_DRAW_BEGIN,InpMAPeriod);
    //---- line shifts when drawing
       PlotIndexSetInteger(0,PLOT_SHIFT,InpMAShift);
    //---- sets drawing line empty value--
       PlotIndexSetDouble(0,PLOT_EMPTY_VALUE,0.0);

    //--- OpenCL context (GPU devices only)
       cl_ctx = CLContextCreate(CL_USE_GPU_ONLY);
       if(cl_ctx==INVALID_HANDLE)
         {
          Print("OpenCL not found: ", GetLastError() );
          return;
         }

    //--- OpenCL program; str receives the third CLProgramCreate() argument
       cl_prg=CLProgramCreate(cl_ctx,cl_program,str);
       if(cl_prg==INVALID_HANDLE)
         {
          CLContextFree(cl_ctx);
          Print("OpenCL program create failed: ", str);
          return;
         }

    //--- launch geometry: every work[] entry is c, offset[i] is i*c
       int c = 1;
       ArrayFill(work,0,CUDA_CORE,c);
       for(int i=0; i<CUDA_CORE; i++)
          offset[i] = i*c;

    //--- kernels for the selected averaging method
       switch(InpMAMethod)
         {
          case MODE_SMA : cl_CommonKernel1 = CLKernelCreate(cl_prg,"calSMA");
                          checkKernel(cl_CommonKernel1,"cl_CommonKernel1 SMA");
                          cl_CommonKernel2 = CLKernelCreate(cl_prg,"calcSMALoop");  
                          checkKernel(cl_CommonKernel2,"cl_CommonKernel2 SMA");     
                          break;
          case MODE_EMA : cl_CommonKernel1 = CLKernelCreate(cl_prg,"calcEMA");
                          checkKernel(cl_CommonKernel1,"cl_CommonKernel1 EMA");                          
                          break;       
          case MODE_LWMA : cl_CommonKernel1 = CLKernelCreate(cl_prg,"calcLWMA");
                           checkKernel(cl_CommonKernel1,"cl_CommonKernel1 LWMA");
                           cl_CommonKernel2 = CLKernelCreate(cl_prg,"calcLWMALoop"); 
                           checkKernel(cl_CommonKernel2,"cl_CommonKernel2 LWMA");      
                          break;       
          case MODE_SMMA : cl_CommonKernel1 = CLKernelCreate(cl_prg,"calcSSMA");
                           checkKernel(cl_CommonKernel1,"cl_CommonKernel1 SSMA");
                           cl_CommonKernel2 = CLKernelCreate(cl_prg,"calcSMALoop"); 
                           checkKernel(cl_CommonKernel2,"cl_CommonKernel2 SSMA");    
                           break;  
         }
      }
//+------------------------------------------------------------------+
//| Indicator iteration.                                             |
//| Returns 0 while there are fewer than InpMAPeriod-1+begin bars;   |
//| otherwise clears the buffer on the first call, updates the draw  |
//| start, dispatches to the routine for InpMAMethod and returns     |
//| rates_total.                                                     |
//+------------------------------------------------------------------+
int OnCalculate(const int rates_total,
                    const int prev_calculated,
                    const int begin,
                    const double &price[])
      {
       int first_drawn=InpMAPeriod-1+begin;

       //--- not enough bars yet
       if(rates_total<first_drawn)
         {
          return(0);
         }

       //--- first call: start from an all-zero buffer
       if(prev_calculated==0)
         {
          ArrayInitialize(ExtLineBuffer,0);
         }
       PlotIndexSetInteger(0,PLOT_DRAW_BEGIN,first_drawn);

       //--- dispatch on the selected averaging method
       if(InpMAMethod==MODE_EMA)
          CalculateEMA(rates_total,prev_calculated,begin,price);
       else if(InpMAMethod==MODE_LWMA)
          CalculateLWMA(rates_total,prev_calculated,begin,price);
       else if(InpMAMethod==MODE_SMMA)
          CalculateSmoothedMA(rates_total,prev_calculated,begin,price);
       else if(InpMAMethod==MODE_SMA)
          CalculateSimpleMA(rates_total,prev_calculated,begin,price);

    //--- return value of prev_calculated for next call
       return(rates_total);
      }
 // Releases the OpenCL objects held in the global handles: both kernels
 // first, then the program, then the context.
 // NOTE(review): OnInit() assigns cl_CommonKernel2 only for the SMA, LWMA and
 // SMMA methods, so in MODE_EMA this frees a handle that was never created -
 // confirm that CLKernelFree() tolerates that.
 void OnDeinit(const int reason)
      {
       CLKernelFree(cl_CommonKernel1);
       CLKernelFree(cl_CommonKernel2);
       CLProgramFree(cl_prg);
       CLContextFree(cl_ctx);
      }

1 回答

6
帮助我以正确的方式编写此代码，以便我的进程变得更快，并且可以为我的GPU提供正确的结果 .

事实上，自4月2日你已经知道，MQL5 CustomIndicator将无法以这种方式工作......

Facts matter - 如果MQL4 / 5代码执行架构 has explicitly documented 没有任何地方可以将任何扩展延迟/异步/阻塞操作放在任何MQL4 / 5 CustomIndicator代码执行单元中，那么可能是合理的时间停止所有这些S / O社区成员的攻击和 start simply respect the documented & published fact .

MQL5 文档非常明确，并警告其结构对性能的不利影响和/或警告整个系统完全无意中死锁的风险：

在一个符号上计算的所有指标，即使它们附加到不同的图表，也在同一个线程中工作 . 因此，一个符号上的所有指示符共享一个线程的资源 .

一个指标中的无限循环（阻塞/增加的延迟/意外延迟）将停止此符号上的所有其他指示符 .

当然，如果他们决定进一步花费宝贵的时间来赞助和扩展这些努力，可能会要求S / O社区中的许多OpenCL专业人士提供帮助 .

所有这些仍然必须满足现实并且最好地尊重已知的事实，在任何合理的努力（无论表达的任何力量）之前至少可以 start to work in the proper direction.

CPU：GPU拓扑如何在内部工作？

使用异步，无序交付操作，根据定义异步， zero-to-many zero-to-many 的GPU设备目标 computing job(s) ，所有都具有从开始到结束的主要不确定性RTT持续时间 .

可以指示GPU设备访问的CPU端端 Queue 将作业发送到GPU（任务〜要执行的程序）：

GPU任务管理工作流的有限状态自动机（Finite State Automaton）具有以下拓扑图：
```
< START____________> s = GetMicrosecondCount();
( MQL5 RQSTs )
          |
          |
          |
       T0:+---+--> CL_QUEUED            : 3 == "queued", i.e. waiting for its turn ( submission )
          ?   |    |  |
          ?   +----+  |
          ?           v
       T0+?      +--> CL_SUBMITTED      : 2 == "submitted" for an OpenCL-device execution
          ?      |    |  |        
          ?      +----+  |        
          ?              v        
       T0+?         +--> CL_RUNNING     : 1 == "running" the kernel-code on a mapped OpenCL resource pool
          ?         |    |  |     
          ?         +----+  |     
          ?                 v     
       T0+?                 CL_COMPLETE : 0 == "program complete", processing has finished its remote outputs
          ?                 |  
       T0+?-----------------+
          |
( MQL5 FREEs )
< END_____________>  e = GetMicrosecondCount();
                     PrintFormat( "RTT-COST WAS ~ %9d [us] ( CLES==0 ? %d )",
                                   ( e - s ),
                                   CLExecutionStatus( _gpuKernelHANDLE )
                                   );
```
接下来，让我们尊重GPU架构的领域：

GPU计算设备与任何通用CPU CISC / RISC计算设备不同，具有其他硅硬连线架构 .

WHY 的原因在这里非常重要 .

GPU设备使用 Streaming Multiprocessor eXecution 单元（SMX units），某些硬件检查工具中会引用这一名称 .

虽然S M X缩写中的字母 M 强调，可以在SMX单元上加载多个执行，但是，所有这些情况实际上都会执行（确定，只有在以这种方式指示时才会执行，这超出了此范围主题，覆盖/跨越每个SMX-present SM-cores）完全相同的计算指令 - 这是它们可以运行的唯一方式 - 它被称为 SIMT/SIMD 类型的有限并行范围可实现（共同本地））仅在SMX的周边， s ingle- i nstruction- m ultiple- { t hreads | d ata}可以在一个内执行提供SIMT / SIMD-（WARP-wide | half-WARP-wide | WARP-ignoring-GreedyMode）-scheduler功能 . 需要注意的是，计划的 SIMT/SIMD -execution的宽度越窄，SMX / SM内核实际上执行全局作业执行的任何有用部分的次数就越少，浪费的时间就会因为 N-(CPUs) 的数量下降而导致性能上的冲突 . 实际上，如下所述 .

列出上面列出的384个核心意味着硬件限制，超过这个限制，这种共同本地协调的SIMT / SIMD类型的有限范围并行性不能增长，并且所有朝这个方向的尝试将导致纯粹的内部调度GPU作业（是的，即一个接一个） .

理解这些基础知识至关重要：如果不了解这些架构特征，人们可能会期望某种行为，而这种行为在任何类型的GPGPU系统中实际上根本无法编排——这类系统在形式上是由自主、异步的节点组成的星形分布式系统（distributed-system star-of-nodes） .

从CPU主机加载到GPU上的任何GPU内核都将映射到一组非空的SMX单元，其中指定数量的核心（再次应用另一个更精细的粒子几何计算资源）超出本篇文章的范围）加载SIMT / SIMD指令流，而不是违反GPU设备限制：
```
...
+----------------------------------------------------------------------------------------
 Max work items dimensions:          3       // 3D-geometry grids possible
    Max work items[0]:               1024    // 1st dimension max.
    Max work items[1]:               1024
    Max work items[2]:               64      // theoretical max. 1024 x 1024 x 64 BUT...
+----------------------------------------------------------------------------------------
 Max work group size:                1024    // actual      max. "geometry"-size
+----------------------------------------------------------------------------------------
 ...
```
所以，
- 如果 1-SM-core 在内部被指示执行一些GPU任务单元（GPU作业），那么这一个SM-core将一个接一个地获取一个GPU-RISC指令（为了简单起见忽略任何可能的ILP）并执行一个一次，踩过所述GPU作业的SIMD指令流 . 存在于同一SMX单元上的所有其他SM内核通常在此期间不执行任何操作，直到此GPU作业完成并且内部GPU进程管理系统决定为此SMX映射其他一些工作 .
- 如果 2-SM-cores 被指示执行一些GPU作业，那么这对SM内核将一个接一个地（和非常 same ）GPU-RISC指令一个接一个（为了简单起见忽略任何可能的ILP）并且 both 执行它一次，逐步完成所述GPU作业的SIMT / SIMD指令流 . 在这种情况下，如果一个SM核进入一个条件，其中 if -ed或类似分支的执行流使一个SM核进入另一个代码执行流路径， SIMT/SIMD -parallelism进入不同的场景，其中一个SM核获取下一个SIMT / SIMD指令，属于它的代码执行路径，而另一个没有做任何事情（得到 GPU_NOP （s）），直到第一个完成整个工作（或被强制停止在某个同步障碍中陷入无法屏蔽的延迟等待状态，当等待一段数据从"far"（慢）非本地内存位置取出时，细节再次超出了范围这篇文章） - 只有在其中任何一个发生之后，发散路径，到目前为止只是 GPU_NOP -ed SM-core可以接收任何下一个SIMT / SIMD指令，属于其（发散）代码执行路径以移动任何向前 . 存在于同一SMX单元上的所有其他SM内核通常在此期间不执行任何操作，直到此GPU作业完成并且内部GPU进程管理系统决定为此SMX映射其他一些工作 .
- 如果 16-SM-cores 被指示通过任务特定的"geometry"执行某些GPU作业，那么这个"herd"的SM内核将接一个（和非常 same ）GPU-RISC SIMT / SIMD指令接下来（忽略任何可能的ILP）这里的简单性和 all 一次执行一个，逐步执行所述GPU作业的SIMT / SIMD指令流 . "herd"内部的任何分歧都会降低SIMT / SIMD效应，并且 GPU_NOP -blocked核心仍然在等待"herd"的主要部分完成作业（与此点之上的草图相同） .
- 如果 more-SIMT/SIMD-threads-than-SM-cores-available 被指示通过任务特定的"geometry"执行某些GPU作业，则GPU设备芯片将对此进行操作以作为 [SERIAL] -sequence的多个 { WARP-wide | half-WARP-wide } -SIMT / SIMD-线程包，直到这样的序列完成所有指示映射到SMX的SIMT / SIMD线程数 . 因此，这种包统一定型的时间一致性通常是不可能的，因为它们在特定的WARP调度程序中到达它们各自的末端 . 时尚，但从不同步（是的， your CPU-side code here will have to wait till the very last ( the laziest （由于任何原因，无论是容量受限的调度原因，还是代码分歧调度原因或不良的相互（重新）同步原因） code-execution flow ) will eventually, in some unknown time in the future, finish the __kernel-code processing and the OpenCL-operated device will allow for "remote"-detection of CL_COMPLETE state, before being able to fetch any meaningful results （如你所惊讶地问的那样）在你的其他一个问题） .
无论如何，所有其他SM核心，没有被相应GPU设备的SMX单元上的任务特定“几何”映射，通常根本没有任何用处 - 因此了解正确任务的硬件细节的重要性特定的“几何”确实很重要，分析可能有助于确定任何此类GPU任务星座的峰值性能（差异可能在几个数量级范围内 - 从最佳到普通到更差 - 在所有可能的任务特定“几何”设置中） .

其次，当我有很多核心时，openCL如何分配任务，它是在每个核心相同的进程和相同的数据上，还是不同的核心与不同的数据？

正如上面简要解释的那样 - SIMT/SIMD 型器件硅架构不允许任何SMX SM内核执行除了整个"herd"-of-SM内核上完全相同的SIMT / SIMD指令之外的任何内容，由任务"geometry"映射到SMX单元（不计算 GPU_NOP （s）做“别的东西”，因为它只是浪费CPU：GPU系统时间） .

所以， yes, “..在每个核心 same 进程..”（最好，如果在 if 或 while 或任何其他类型的代码执行路径分支之后，其内部代码执行路径从不发散），所以如果算法，基于数据 - 驱动值导致不同的内部状态，每个核心可能具有不同的线程局部状态，基于此处理 may 不同（如上面的 if -驱动的发散代码执行路径所示） . 有关SM本地寄存器，SM本地缓存，受限共享内存使用（和延迟成本），GPU设备全局内存使用（以及延迟成本和缓存行长度以及关联性的最佳合并访问模式的关联性）的更多详细信息屏蔽选项 - 许多与硬件相关的编程生态系统细节都涉及数千页的硬件软件特定文档，并且超出了本文的范围 simplified for clarity ）

相同的数据或不同的核心与不同的数据？

这是最后但并非最不重要的两难 - 任何参数化良好的GPU内核激活也可能会在GPU内核之外传递一些外部世界数据，这可能使SMX线程本地数据不同于SM-core到SM-核心 . 这样做的映射实践和最佳性能主要是设备特定的（{SMX | SM-registers | GPU_GDDR gloMEM：shaMEM：constMEM | GPU SMX-本地缓存 - 层次结构} - 详细信息和容量
```
...
 +---------------------------------------------------------
  ...                                               901 MHz
  Cache type:                            Read/Write
  Cache line size:                     128
  Cache size:                        32768
  Global memory size:           4294967296
  Constant buffer size:              65536
  Max number of constant args:           9
  Local memory size:                 49152
 +---------------------------------------------------------
  ...                                              4000 MHz
  Cache type:                            Read/Write
  Cache line size:                      64
  Cache size:                       262144
  Global memory size:            536838144
  Constant buffer size:             131072
  Max number of constant args:         480
  Local memory size:                 32768
 +---------------------------------------------------------
  ...                                              1300 MHz
  Cache type:                            Read/Write
  Cache line size:                      64
  Cache size:                       262144
  Global memory size:           1561123226
  Constant buffer size:              65536
  Max number of constant args:           8
  Local memory size:                 65536
 +---------------------------------------------------------
  ...                                              4000 MHz
  Cache type:                            Read/Write
  Cache line size:                      64
  Cache size:                       262144
  Global memory size:           2147352576
  Constant buffer size:             131072
  Max number of constant args:         480
  Local memory size:                 32768
```
主要是设备与设备的不同，每个高性能代码项目主要可以分析其各自的GPU设备任务 - “实际部署设备的几何和资源使用图组合 . 在一个GPU设备/ GPU上可以更快地工作 - 驱动器堆栈，不需要在另一个上工作（或者在GPU驱动程序外部编程生态系统更新/升级之后），只需要现实生活中的基准测试就能证明（因为理论可以轻松打印，但很难轻松实现）执行时，因为许多特定于设备和工作负载的注入限制将适用于实际部署） .

建议我在OpenCL上使用GPU在MQL5中使用不同的值或框架分发函数的方法 .

诚实和最好的建议与4月2日已经呈现给你的一样 .

不要尝试用任何高延迟/异步/阻塞的代码来阻塞/延迟任何MQL5 CustomIndicator类型代码执行单元的执行流程。永远不要（Never），除非MetaTrader终端平台文档明确删除此类警告（2018 / Q2 时仍然存在），并明确建议使用避免延迟的非阻塞分布式代理通信工具，来协调MQL5端和GPU设备端之间处理数据/结果的（几乎）同步交换（由于现有GPU设备类中GPU作业无序调度的性质，这不会很快出现）。

这是针对 natural flow-of-time （自然时间流）记录的情况，它由外部外汇市场（经纪人广播传播的）事件流驱动，事件与事件之间的节奏约为几百 [us] 。

如果进入 synthetic flow-of-time （合成时间流），也就是终端的 [ Strategy Tester ] 模拟器生态系统中编排的那种情形，上面记录的 the problem goes many orders of magnitude worse, 因为模拟器实际上加速了时间流/节奏，任何跟不上节奏的东西都会阻碍加速（这在上面自然时间流的速度下就已经很糟糕了）。所以，不，这是一个非常糟糕的方向，不值得再为它投入哪怕一步的努力（同样， at least until both platforms will have changed their architectural limits ）。

...这样我的过程变得更快......

这部分问题定义has been decided already ~ 60 years back, by Dr. Gene AMDAHL.

他的（当时还是简化版的） Law of Diminishing Returns 解释了 WHY 任何过程加速的主要上限都受制于仍然是 [SERIAL] 的部分——前提是明确区分纯 [SERIAL] 部分和潜在可加速 N（CPU 数）倍的 true-[PARALLEL] 部分。

这有助于预估流程重新设计的成本/收益效果 .

所以，在这里，你的GPU内核代码是（几乎）true-[PARALLEL] 的处理部分。所有其余的仍然是纯 [SERIAL] 的处理部分。

这足以猜测尝试进入OpenCL包装过程重新设计的效果的极限 .

但是，魔鬼隐藏在细节中......

实际成本更高 .
- [SERIAL] -part will never get faster 本身 .
- [SERIAL] -part will actually get "slower" and "extended" ，因为在有效载荷的第一条 SIMT / SIMD 指令被 "remotely" 传递之前，将有 many more steps to execute （→ OpenCL-Queue → OpenCL-Data-Transfer(s) → OpenCL-Queue 任务管理等待 → OpenCL-Queue 任务管理提交到设备上）……之后任务才会开始执行（→ 设备任务被调度 → OpenCL-Device WARP 调度 / SIMT / SIMD 执行）……然后还要从这个远程马戏团一路返回（→ OpenCL-Device 任务完成开销 → MQL5 端异步完成检测及其附加延迟 → OpenCL-Data-Transfer(s)）。
- [PARALLEL] -part 将 only "after" or "at" all the add-on costs were accrued 才会被执行（上图中没有描绘这一点，以免图示过于复杂，使忽略开销的理论加速（非）缩放极限更难理解），而且 even worse, 因为它运行在大约慢 4 倍的 GPU_CLOCK -rate 上（更不用说访问内存和缓存的延迟还要 ~ 10x ~ 1000x slower ），并且所交付的算法仍然只是线性卷积式的 TimeSeries 数据处理，因此改善因子只能 << 1.0 ，对理论处理加速产生不利的净效应（最终实现的性能比不做这种 "improve" 尝试还要 worse ）。
有关这些净效应的完整参考，请阅读 Criticism 一节，其中更详细地给出了 Overhead-strict re-formulation of the Amdahl's Law speedup 和 Overhead-strict and resources-aware re-formulation 两种表述：
```
                  1
S =  __________________________; where s, ( 1 - s ), N were defined above
                ( 1 - s )            pSO:= [PAR]-Setup-Overhead     add-on
     s  + pSO + _________ + pTO      pTO:= [PAR]-Terminate-Overhead add-on
                    N
```
```
                           1                         where s, ( 1 - s ), N
S =  ______________________________________________ ;      pSO, pTO
                    / ( 1 - s )           \                were defined above
     s  + pSO + max|  _________ , atomicP  |  + pTO        atomicP:= further indivisible duration of atomic-process-block
                    \     N               /
```
在这篇文章顶部引用的标题图片提供了一个链接，指向带有交互式输入和动画输出的实时GUI，在那里 one may test impacts of values for p == ( 1 - s ) going anywhere under 1.00 （1.00 只是一个理论上的、绝对 100％ [PARALLEL] 的调度，在任何现实场景中在技术上都是不可能的），还可以在可编辑的 ~ < 0.0 ~ 0.0001 > 取值范围内调整 o 中所有附加开销的影响（为简单起见仅表示为一个标量分数），以便 better sense the principal limits of real-world behaviour of many-core devices ，并在考虑任何编码步骤之前就能做出更好的工程决策。

再考虑到附加开销和处理原子性的已知数值（在代码执行的MQL5端调用 GetMicrosecondCount() 即可轻松测量，分辨率低至单个 [us] ），就可以估算尝试继续用OpenCL包装 Simple Moving Average 的净效应，如下面的GPU内核代码所示：
```
/*
 * One incremental Simple-Moving-Average update per work-item.
 *
 * Evaluates  ExtLineBufferi_1 + ( price1 - price2 ) / InpMAPeriod  and stores
 * the result in output[] at the slot given by get_global_id( 1 ).
 *
 *   ExtLineBufferi_1  first addend (presumably the previous SMA value)
 *   price1, price2    values whose difference is divided by InpMAPeriod
 *   InpMAPeriod       divisor (int, converted to float for the division)
 *   output            __global destination buffer
 *
 * NOTE(review): every input is a by-value scalar argument, so each work-item
 * computes the same value; only the destination slot differs.
 * NOTE(review): the slot comes from NDRange dimension 1 - presumably the host
 * enqueues at least a 2-D range; confirm against the launch code.
 */
kernel void SMA_executeSMA(          float  ExtLineBufferi_1,
                                     float  price1,
                                     float  price2,
                                     int    InpMAPeriod,
                            __global float *output
                            )
{
   const int   slot  = get_global_id( 1 );         // destination index of this work-item
   const float delta = price1 - price2;            // one subtract ...
   const float step  = delta / InpMAPeriod;        // ... one divide ...

   output[slot] = ExtLineBufferi_1 + step;         // ... one add and one store
}
```
它只有几条指令，耗时约几个 900 MHz 时钟周期——也就是说，动画图形可视化中的 p = ( 1 - s ) 因子将接近 p == 0 一端，使整个局面最终由 CPU:GPU 组成的 distributed-computing 系统中的纯 [SERIAL] 部分所主导（〜几个、最少几十个 [ns] ，再加上裸露的（不可掩蔽的，因为这里没有数据重用）GPU设备内存访问延迟 ~ 350 - 700+ [ns] ）。

对于任何这样做的尝试，如此低的 p 都是性能调优上的坏兆头（如果还算不上 ANTI-PATTERN 的话）。

因为即使 N-(CPUs) ~ +INF ，它仍然 never make the wished-for speedup （参考：可以尝试在上面提供的交互式图形中修改这些因素，直观地看到效果——数字会有多低）—— while 同样的结果用一条可矢量化的CPU指令大约 ~ 0.5 [ns] 就能算出，而且在这里同样没有任何附加成本。

这些是“经济成本”的原因（除了主要的MQL5之外）为什么最好不要这样做

that will never pay back the sum of all the [SERIAL] add-on costs ——这些成本是在整个OpenCL重新包装 - 发送到那里 - 在那里计算 - 发送回来 - 以及在CPU代码/MQL5端检测完成后接收结果的马戏团过程中引入的（而这一切只是为了让这寥寥几条 GPU_INSTR-s 得以执行），上面只是简要提到了这一点， even if an infinite number of GPU-cores were used .

你只是试着 pay way more than one will ever receive back.
回复于 2024-05-12T17:14:01+08:00

MQL5中的OpenCL代码不会将分布式作业提供给每个GPU核心

1 回答

事实上，自4月2日你已经知道，MQL5 CustomIndicator将无法以这种方式工作......

CPU：GPU拓扑如何在内部工作？

接下来，让我们尊重GPU架构的领域：

但是，魔鬼隐藏在细节中......

这些是“经济成本”的原因（除了主要的MQL5之外）为什么最好不要这样做

相关问题