我使用 pthreads 编写了一个简单的平流方程求解器,它在主处理器上运行正常。然而,当我使用 -mmic 标志编译它并在协处理器上运行(使用 micnativeloadex)时,它只使用了一个线程(我在代码中把线程数硬编码为 200)。据我所知,代码应该无需修改就能运行。我是不是遗漏了什么?

代码写得比较乱,但它是完整的。

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

/*
 * Returns the current wall-clock time reported by gettimeofday(), expressed
 * as microseconds since the Epoch.
 *
 * The seconds field is widened to int64_t before scaling so the
 * multiplication cannot overflow on platforms where time_t is 32 bits wide.
 */
int64_t TimeInMicros() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (int64_t)tv.tv_sec * INT64_C(1000000) + (int64_t)tv.tv_usec;
}


/*
 * Shared problem description for the 1-D advection solver.
 * A single instance is built by main() and read by every worker thread.
 */
struct Grid {
    int nx;          /* number of cells */
    double *u;       /* cell values; main() allocates nx entries */
    double *u_new;   /* main() points this at the same storage as u; never dereferenced by solver() */
    double *f;       /* face fluxes; main() allocates nx + 1 entries */
    double *res;     /* per-cell residual -(f[i+1] - f[i]) / dx; nx entries */
    double a;        /* advection speed (flux is a * u) */
    double cfl;      /* CFL number; the time step is cfl * dx / a */
    double dx;       /* cell width */
    double tf;       /* final time: stepping continues while t < tf */
};


/*
 * Per-thread argument bundle: main() fills one of these for each worker and
 * passes its address to solver() through pthread_create().
 */
struct ThreadData {
    int                tid;         /* zero-based worker index; selects the slice of cells */
    struct Grid       *grid;        /* problem description shared by all workers */
    int                maxthreads;  /* total number of workers; nx / maxthreads cells each */
    pthread_barrier_t *barr;        /* barrier shared by all workers */
};

/*
 * Waits on the shared barrier. Any result other than success (0) or
 * PTHREAD_BARRIER_SERIAL_THREAD is treated as fatal and ends the process.
 */
static void barrier_sync(pthread_barrier_t *barr){
  int rc = pthread_barrier_wait(barr);
  if(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD)
  {
      printf("Could not wait on barrier\n");
      exit(-1);
  }
}

/*
 * Worker thread entry point for the 1-D advection solver.
 *
 * args points to a struct ThreadData. The worker owns the contiguous cells
 * [start, start + chunk), with chunk = nx / maxthreads and
 * start = tid * chunk, and advances them from t = 0 while t < tf using the
 * time step dt = cfl * dx / a.
 *
 * Each step has two phases separated by barriers:
 *   1. flux phase   - f[i+1] = a * u[i] for the worker's own cells
 *                     (the upwind flux when a > 0);
 *   2. update phase - res[i] = -(f[i+1] - f[i]) / dx and u[i] += res[i] * dt.
 *
 * Every worker must run the same number of steps, because each step waits
 * twice on a barrier sized for all workers.
 *
 * Always returns NULL. A barrier failure terminates the process.
 */
void *solver(void *args){
  struct ThreadData *td = (struct ThreadData *)args;
  int tid = td->tid;

  struct Grid *grid = td->grid;
  pthread_barrier_t *barr = td->barr;

  double *u = grid->u;
  double *f = grid->f;
  double *res = grid->res;

  double a = grid->a;
  double dx = grid->dx;
  double tf = grid->tf;
  int nx = grid->nx;
  double dt = grid->cfl*dx/a;

  int chunk = nx/(td->maxthreads);
  int start = tid*chunk;

  double t = 0.0;
  while(t < tf){
    // Flux phase: each worker writes only the right-hand faces of its own
    // cells, so f[1..nx] is covered exactly once and no two workers ever
    // store to the same element.
    f[start+1:chunk] = a*u[start:chunk];

    // All fluxes must be in place before any residual reads a neighbour's face.
    barrier_sync(barr);

    // Periodic boundary: the inflow face of cell 0 carries the flux leaving
    // the last cell, which is stored in f[nx]. Only worker 0 reads f[0].
    if(tid == 0){
      f[0] = f[nx];
    }

    res[start:chunk] = -(f[start+1:chunk] - f[start:chunk])/dx;
    // Updating u in place is safe: during this phase a worker touches only
    // its own cells, and nobody reads u again until after the next barrier.
    u[start:chunk] += res[start:chunk]*dt;

    // Keep the next flux phase from overwriting f while residuals are still
    // being computed from it.
    barrier_sync(barr);

    t+=dt;
  }
  return NULL;
}


int main(int argc, char*argv[]){
  int nx=100000;//atoi(argv[1]);
  int nthreads=200;//atoi(argv[2]);

  if(nx%nthreads != 0){
    printf("ERROR: Number of cells should be integral multiple of number of threads \n");
    exit(1);
  }

  pthread_t *threads = new pthread_t[nthreads]();
  struct ThreadData td[nthreads];
  pthread_barrier_t barr;
  pthread_barrier_init(&barr, NULL, nthreads);

  double *u = new double[nx]();
  double *res = new double[nx]();
  double *f = new double[nx+1]();

  double dx = 1.0/nx;
  double cfl = 0.9;
  double a = 1.0;
  double tf = 1.0;
  int i;
  // initialize
  u[0:nx] = 0.0;
  u[nx/4:nx/2] = 1.0;
  f[0:nx+1] = 0.0;
  res[0:nx] = 0.0;
  struct Grid grid;
  grid.nx = nx;
  grid.a = a;
  grid.cfl = cfl;
  grid.dx = dx;
  grid.u = u;
  grid.u_new = u;
  grid.res = res;
  grid.f = f;
  grid.tf = tf;

  for(i=0;i<nthreads;i++){
    td[i].tid = i;
    td[i].grid = &grid;
    td[i].maxthreads = nthreads;
    td[i].barr = &barr;
  }




  int64_t t1 = TimeInMicros();

  for(i=0;i<nthreads;i++){
    pthread_create(&threads[i],NULL,solver,&(td[i]));
  }

  for(i=0;i<nthreads;i++){
    pthread_join(threads[i],NULL);
  }

  int64_t t2 = TimeInMicros();
  printf("Execution time: %.10f\n", (t2-t1)*1e-6);

  FILE * outfile;
  outfile = fopen("results.txt", "w+");
  for(i = 0; i < nx; i++){
    fprintf(outfile, "%.10f %.10f\n", i*dx, grid.u[i]);
  }
  fclose(outfile);



  delete[] threads;
  delete[] u;
  delete[] res;
  delete[] f;



}