首页 文章

为什么我的自定义操作(custom op)选用的是CPU实现?

提问于
浏览
8

为了学习如何编写自定义TensorFlow操作,我按照Adding a New Op教程制作了一个"add_b"操作,为每个输入值添加一个标量 b .

add_b_op.cc

#define EIGEN_USE_THREADS

#include <limits>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"

using namespace tensorflow;

// Op definition for "AddB": takes `input` and `b`, both of element type T
// (float or double), and produces `output` of the same element type.
REGISTER_OP("AddB")
    .Attr("T: {float, double}")
    .Input("input: T")
    .Input("b: T")
    .Output("output: T")
    .SetShapeFn([](shape_inference::InferenceContext* ctx) -> Status {
      // Shape inference fails unless `b` (input 1) has rank 0. The resulting
      // handle is only needed as an out-parameter and is otherwise unused.
      shape_inference::ShapeHandle scalar_b_shape;
      TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 0, &scalar_b_shape));
      // The output shape itself is delegated to UnchangedShape().
      return shape_inference::UnchangedShape(ctx);
    })
    // User-facing documentation string for the op and its inputs.
    .Doc(R"doc(
Adds `b` to each input.

input: The input values.
b: A number to add to each input value.
)doc");


template <typename T>
class AddBCpuOp : public OpKernel {
 public:
  explicit AddBCpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    const Eigen::ThreadPoolDevice& d = context->eigen_device<Eigen::ThreadPoolDevice>();

    // Note: The mistake of adding 1 instead of `b` is intentional to be able to distinguish
    // the CPU and GPU implementations.
    output.device(d) = input + static_cast<T>(1);
  }
};

// Registers AddBCpuOp<type> as the CPU kernel of "AddB" for attr T == type.
#define REGISTER_ADD_B_CPU_KERNEL(type)                            \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddB").Device(DEVICE_CPU).TypeConstraint<type>("T"),   \
      AddBCpuOp<type>)

REGISTER_ADD_B_CPU_KERNEL(float);
REGISTER_ADD_B_CPU_KERNEL(double);

#undef REGISTER_ADD_B_CPU_KERNEL


#if GOOGLE_CUDA

// Forward declaration of the GPU launcher. Its definition and the explicit
// instantiations for float and double are in add_b_op.cu.cc.
//
//   d_input   elements to read (n of them).
//   n         number of elements to process.
//   d_b       pointer to the single value added to every element.
//   d_output  elements to write (n of them).
//
// AddBGpuOp::Compute treats a `false` return value as an internal error.
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output);

// GPU kernel for the "AddB" op, templated on the element type T.
template <typename T>
class AddBGpuOp : public OpKernel {
 public:
  explicit AddBGpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  // Allocates output 0 with the shape of input 0 and asks LaunchAddBKernel()
  // to fill it with `input + b`.
  //
  // Sets an error status on `context` and returns early when:
  //   - `b` (input 1) is not a scalar                      -> InvalidArgument
  //   - `input` has more elements than an `int` can hold   -> InvalidArgument
  //   - the output cannot be allocated                     -> allocation status
  //   - LaunchAddBKernel() returns false                   -> Internal
  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    const Tensor& b_tensor = context->input(1);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(b_tensor.shape()),
                errors::InvalidArgument("add_b expects a scalar for `b`."));
    const auto b = b_tensor.scalar<T>();

    // LaunchAddBKernel() takes the element count as an `int`. Reject larger
    // inputs explicitly instead of letting the count be silently truncated.
    const auto num_elements = input_tensor.NumElements();
    OP_REQUIRES(context, num_elements <= std::numeric_limits<int>::max(),
                errors::InvalidArgument(
                    "add_b: `input` has too many elements: ", num_elements));

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    // `b` is handed over as a pointer, not as a value; the kernel in
    // add_b_op.cu.cc dereferences it on the device.
    OP_REQUIRES(context,
                LaunchAddBKernel(input.data(), static_cast<int>(num_elements),
                                 b.data(), output.data()),
                errors::Internal("add_b: LaunchAddBKernel() failed."));
  }
};

// Registers AddBGpuOp<type> as the GPU kernel of "AddB" for attr T == type.
#define REGISTER_ADD_B_GPU_KERNEL(type)                            \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddB").Device(DEVICE_GPU).TypeConstraint<type>("T"),   \
      AddBGpuOp<type>)

REGISTER_ADD_B_GPU_KERNEL(float);
REGISTER_ADD_B_GPU_KERNEL(double);

#undef REGISTER_ADD_B_GPU_KERNEL

#endif // if GOOGLE_CUDA

add_b_op.cu.cc

// CUDA kernel: each thread handles at most one element, writing
// d_input[idx] + *d_b to d_output[idx] for its index idx < n.
//
// The index is computed from the BLOCK_DIM_X template constant rather than
// from blockDim.x, so the launch is expected to use BLOCK_DIM_X threads per
// block.
template <typename T, int BLOCK_DIM_X>
__global__ void AddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  const int idx = blockIdx.x * BLOCK_DIM_X + threadIdx.x;

  // Threads whose index falls past the last element have nothing to do.
  if (idx >= n) {
    return;
  }

  d_output[idx] = d_input[idx] + *d_b;
}

template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  if (n <= 0) return true;

  constexpr int BLOCK_DIM_X = 256;
  AddBKernel<T, BLOCK_DIM_X><<<n / BLOCK_DIM_X + (n % BLOCK_DIM_X != 0), BLOCK_DIM_X>>>(d_input, n, d_b, d_output);
  return true;
}

// Explicit instantiations for float and double, the two element types the
// "AddB" GPU kernels are registered with. add_b_op.cc only sees the
// declaration of LaunchAddBKernel(), so the definitions it calls must be
// instantiated in this file.
template bool LaunchAddBKernel<float>(const float *__restrict__, int, const float *__restrict__, float *__restrict__);
template bool LaunchAddBKernel<double>(const double *__restrict__, int, const double *__restrict__, double *__restrict__);

我故意在CPU实现中引入了一个错误,以便能够区分是使用CPU还是GPU实现 .

当我测试我的自定义操作时:

from __future__ import print_function

import tensorflow as tf

# Load the shared library that provides the custom `add_b` op.
custom_ops = tf.load_op_library('custom_ops.so')

# Ask TensorFlow to log the device every op is placed on.
session_config = tf.ConfigProto(log_device_placement=True)

with tf.Session(config=session_config):
  result = custom_ops.add_b([5., 4., 3., 2., 1.], 8.)
  print(result.eval())

我得到以下输出:

I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] OS X does not support NUMA - returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_device.cc:951] Found device 0 with properties: 
name: GeForce GT 750M
major: 3 minor: 0 memoryClockRate (GHz) 0.9255
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.80GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:972] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:1041] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0)
Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0
I tensorflow/core/common_runtime/direct_session.cc:252] Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0

AddB: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB: /job:localhost/replica:0/task:0/gpu:0
AddB/b: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/b: /job:localhost/replica:0/task:0/gpu:0
AddB/input: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/input: /job:localhost/replica:0/task:0/gpu:0
[ 6.  5.  4.  3.  2.]

“设备放置日志”似乎表示正在GPU上执行操作,但输出表明正在使用CPU实现 .

当我注释掉 DEVICE_CPU 实现的两个REGISTER_KERNEL_BUILDER()注册,重新编译和重新测试时,我得到 [ 13. 12. 11. 10. 9.] 的预期输出,但是有一个错误:

E tensorflow/core/common_runtime/executor.cc:334] Executor failed to create kernel. Not found: No registered 'AddB' OpKernel for CPU devices compatible with node AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)
    .  Registered:  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

     [[Node: AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)]]

在我看来,这条错误消息本身像是一个 bug:虽然它声称“Executor无法创建内核”,但显然已经创建了一个内核并在GPU上运行了该op .

为什么使用的是CPU实现而不是GPU实现?

如果这很重要,这里有关于我的开发设置的详细信息:

  • 我正在使用内置NVIDIA GeForce GT 750M(CUDA Compute Capability 3.0)的MacBook Pro .

  • macOS Sierra版本10.12.1(16B2555)

  • cuda_8.0.47_mac,cudnn-8.0-osx-x64-v5.1

  • TensorFlow 0.11.0rc2通过以下方式安装: export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc2-py2-none-any.whl

UPDATE 我发现选择CPU实现还是GPU实现取决于输入的大小 . 使用下面的测试脚本:

from __future__ import print_function

from time import time

import numpy as np
import tensorflow as tf

# Number of float64 input values to feed to the op.
NUM_VALUES = 1310720

values = np.arange(0, NUM_VALUES, dtype=float)

# Load the shared library that provides the custom `add_b` op.
custom_ops = tf.load_op_library('custom_ops.so')

# Ask TensorFlow to log the device every op is placed on.
session_config = tf.ConfigProto(log_device_placement=True)

with tf.Session(config=session_config):
  # The measured interval covers building the op, evaluating it and printing
  # the result.
  start = time()
  print(custom_ops.add_b(values, 8.).eval())
  end = time()
  print(end - start)

..当 NUM_VALUES 为1310720或更小时,则使用CPU实现 . 当 NUM_VALUES 为1310721或更高时,则使用GPU实现 .

是否存在(1310720 * 8字节/double =)10 MiB的阈值?如果是这样,我该如何覆盖它呢? AddB()操作很简单,但对于更复杂的自定义操作,10 MiB的阈值可能太大,导致无法选用GPU实现 .

3 回答

  • 1

    我认为模板实例化可能不正确:

    template <typename Device, typename T>
    class AddBOp : public OpKernel {
    ...
    }
    
    REGISTER_KERNEL_BUILDER(
        Name("AddB")
        .Device(DEVICE_CPU)
        .TypeConstraint<float>("T"),
        AddBOp<CPUDevice, float>);
    

    然后:

    template <typename T>
    class AddBOp<GPUDevice, T> : public OpKernel {
    ...
    }
    
    REGISTER_KERNEL_BUILDER(
        Name("AddB")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
        AddBOp<GPUDevice, float>);
    

    我认为用于GPU的AddB注册所实例化的是与第一个实现(而不是第二个实现)相匹配的对象(第一个实现有两个模板参数,第二个实现只有一个) .

    您可以通过在第二次注册中调用AddBOp <float>来解决此问题,但我会建议更好的名称以避免混淆 .

  • 1

    我刚刚阅读了TensorFlow issue #2054 - Manual placement on GPU of a custom operator with both CPU and GPU implementation will always run the CPU version,运行CPU实现的行为似乎源自TensorFlow的一个名为"constant folding"(常量折叠)的功能 . 当TensorFlow在第一次运行之前优化计算图时,涉及常量的操作通常会在CPU上求值,其出发点是CPU和GPU实现应该产生相同的结果 . 这说得通 .

    禁用此行为的两种方法是:

    • 禁用图优化(graph optimization):
    from __future__ import print_function
    
    from time import time
    
    import numpy as np
    import tensorflow as tf
    
    # Number of float64 input values to feed to the op.
    NUM_VALUES = 10
    
    values = np.arange(0, NUM_VALUES, dtype=float)
    
    custom_ops_module = tf.load_op_library('custom_ops.so')
    
    # Log device placement and set the optimizer level to -1, which is how this
    # answer disables graph optimization.
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.graph_options.optimizer_options.opt_level = -1
    
    with tf.Session(config=session_config):
      # The measured interval covers building the op, evaluating it and
      # printing the result.
      start = time()
      print(custom_ops_module.add_b(values, 8.).eval())
      end = time()
      print(end - start)
    
    • 不使用常量,例如,将值提供给占位符:
    from __future__ import print_function
    
    from time import time
    
    import numpy as np
    import tensorflow as tf
    
    # Number of float64 input values to feed to the op.
    NUM_VALUES = 10
    
    custom_ops_module = tf.load_op_library('custom_ops.so')
    
    # Build the graph with placeholders so that neither operand of add_b is a
    # constant.
    graph = tf.Graph()
    with graph.as_default():
      input_ph = tf.placeholder(tf.float64, shape=(NUM_VALUES,))
      b_ph = tf.placeholder(tf.float64, shape=())
      result = custom_ops_module.add_b(input_ph, b_ph)
    
    session_config = tf.ConfigProto(log_device_placement=True)
    with tf.Session(graph=graph, config=session_config) as session:
      feeds = {
        input_ph: np.arange(0, NUM_VALUES, dtype=float),
        b_ph: 8.,
      }
      # Only running the graph and printing the result is timed.
      start = time()
      print(session.run([result], feed_dict=feeds))
      end = time()
      print(end - start)
    
  • 0

    根据this可能是由于内存碎片管理,请尝试:

    with tf.device('/gpu:0'):
    

    或者链接页面中的片段用于内存碎片选项调整 .

    EDIT: 要查看是否是这种情况,请尝试:

    from __future__ import print_function
    
    from time import time
    
    import numpy as np
    import tensorflow as tf
    
    # Number of float64 input values to feed to the op.
    NUM_VALUES = 10
    
    values = np.arange(0, NUM_VALUES, dtype=float)
    
    custom_ops_module = tf.load_op_library('custom_ops.so')
    
    # Log device placement and enable the allow_growth GPU option that this
    # answer suggests trying.
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.allow_growth = True
    
    with tf.Session(config=session_config):
        # The measured interval covers building the op, evaluating it and
        # printing the result.
        start = time()
        print(custom_ops_module.add_b(values, 8.).eval())
        end = time()
        print(end - start)
    

相关问题