API 21以上的部分Android设备支持OpenCL,具体是否支持取决于芯片厂商是否提供了相应的驱动。不仅如此,OpenCL在x86体系上也基本都有集成,可用于Mac、Ubuntu、Windows、Android等操作系统。详细信息可以参考官方网站:

对应的官方文档中都有详细的介绍。如果是Android设备,想要查看是否支持OpenCL GPU运算,可以使用GPU测试工具OpenCL-Z,它可以显示当前手机GPU的型号和厂商信息,包括有几个GPU核心也都可以看到。






  • 自己编写Android.mk或者新的bp,然后去引用系统的so、头文件。官方的源码例程也是这样编写的。(只是我用的是Mac,而且Intel的介绍英文太多,还需要下载Studio和Docker,感觉那是一堆IoT的东西,比较烦)。

  • 在cmake中包含库、头文件,打包成so,和普通集成的AS环境类似。(这里可以用省力的方式,直接提取手机中的libOpenCL.so)


  • 例子(我们现在要实现的测试功能是A数组和B数组进行累加计算,并且把结果保存到result数组中)

         第一种方式比较麻烦,这里重点介绍一下第二种方式的实际运行过程。首先新建一个工程叫imageDemo,并添加C++支持(link C++),然后新建一个Java类,比如叫ImageNativeInterface,接着开始编写我们的Demo接口:

package com.genesis.imageNative;import;/*** 项目名称:ImageDemo* 类描述:* 创建人:genesis* 创建时间:2019/4/10 5:45 PM* 修改人:genesis* 修改时间:2019/4/10 5:45 PM* 修改备注:*/public class ImageNativeInterface {    private static volatile ImageNativeInterface mInterface;    public static ImageNativeInterface getInstance () {        if (mInterface == null) {            mInterface = new ImageNativeInterface();        }        return mInterface;    }    private ImageNativeInterface () {    }    public int openclDemo (int[] arrayA, int[] arrayB, int[] result, String kernelCode,                           float[] runningTime) {        return nOpenCLDemo(arrayA, arrayB, result, kernelCode, runningTime);    }    public int nativeAdd (int[] arrayA, int[] arrayB, int[] result) {        return nNativeAdd(arrayA, arrayB, result);    }    //fixme 接口native    private static native int nOpenCLDemo (int[] arrayA, int[] arrayB, int[] result,                                           String kernelCode, float[] runningTime);    //fixme 本地C++ CPU计算数组    private static native int nNativeAdd(int[] arrayA, int[] arrayB, int[] result);    static {        try {            System.loadLibrary("gimage");        } catch (Exception e) {            System.out.println("LoadLib error");        }    }}


#include #include #include #include #include #include #include #include #include #include #include #include "CL/cl.h"#define    EXIT_FAILURE    1#define    EXIT_SUCCESS    0#define    RAND_MAX    0x7fffffff#define  LOG_TAG    "libGenesisBitmap"#define  LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)#define  LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)#define MAX_PLATFORMS_COUNT     16#define CL_SUCCEEDED(clErr) CL_SUCCESS==clErr#define CL_FAILED(clErr) CL_SUCCESS!=clErrvoid addArrays(const int* arrayA, const int* arrayB, const int* Result, int length,               const char* kernelCode, float* runTime){    cl_platform_id     platform     = 0;    cl_device_type     clDEviceType = CL_DEVICE_TYPE_CPU; // default    cl_kernel          kernel       = 0;    cl_command_queue   cmd_queue    = 0;    cl_context         context      = 0;    cl_mem             memobjs[3];    cl_program         program      = 0;    cl_int             clErr;    unsigned long long startTime    = 0, endTime = 0;    // get current platform id, assuming there are no more than 16 platforms in the system    cl_platform_id pPlatforms[MAX_PLATFORMS_COUNT] = {0};    cl_uint        uiPlatformsCount                = 0;    clErr = clGetPlatformIDs(MAX_PLATFORMS_COUNT, pPlatforms, &uiPlatformsCount);    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d uiPlatformsCount: %d", clErr,                        uiPlatformsCount);    if (CL_FAILED(clErr) || 0 == uiPlatformsCount)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "ERROR: Failed to find any platform.");        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d uiPlatformsCount: %d", clErr,                            uiPlatformsCount);        return;    }    // go through the available platform and select our (vendor = "Intel Corporation")    cl_uint num_device;    cl_uint num_platform;    cl_platform_id* platformtag;    cl_device_id  * devices;    clErr       = 
clGetPlatformIDs(0, 0, &num_platform);    platformtag = (cl_platform_id*) malloc(sizeof(cl_platform_id) * num_platform);    clErr       = clGetPlatformIDs(num_platform, platformtag, NULL);    clErr   = clGetDeviceIDs(platformtag[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * num_device);    clErr   = clGetDeviceIDs(platformtag[0], CL_DEVICE_TYPE_GPU, num_device, devices, NULL);    //create context    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "create context");    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platform,                                          (cl_context_properties) NULL};    context = clCreateContext(NULL, num_device, devices, NULL, NULL, &clErr);;    if (0 == context)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to create context",                            clErr);        return;    }    // get context's devices    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "get context's devices");    cl_device_id device = 0;    clErr = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL);    if (CL_FAILED(clErr) || 0 == device)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to get context info",                            clErr);        clReleaseContext(context);        return;    }    // create a command-queue    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "create a command-queue");    cmd_queue = clCreateCommandQueue(context, device, 0, NULL);    if (cmd_queue == (cl_command_queue) 0)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG,                            "clErr: %d - Failed to create command queue", clErr);        goto release_context;    }    size_t global_work_size[1];    size_t local_work_size[1];    // allocate the buffer memory objects    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "allocate the buffer memory objects");    
memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,                                sizeof(int) * length, (void*) arrayA, NULL);    if (memobjs[0] == (cl_mem) 0)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "Failed to create memobjs[0]");        goto release_queue;    }    memobjs[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,                                sizeof(int) * length, (void*) arrayB, NULL);    if (memobjs[1] == (cl_mem) 0)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "Failed to create memobjs[1]");        goto release_mem0;    }    memobjs[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, NULL);    if (memobjs[1] == (cl_mem) 0)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "Failed to create memobjs[2]");        goto release_mem1;    }    // create program    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "create program");    program = clCreateProgramWithSource(context, 1, (const char**) &kernelCode, NULL, &clErr);    if (CL_FAILED(clErr) || 0 == program)    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to create program",                            clErr);        goto release_mem2;    }    // build program    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "build program");    clErr = clBuildProgram(program, 1, &device, NULL, NULL, NULL);    if (CL_FAILED(clErr))    {        size_t len;        char   buffer[2048];        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG,                            "clErr: %d - Failed to build program\n Log: %s", clErr, buffer);        goto release_program;    }    // create the kernel    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "create kernel");    kernel = clCreateKernel(program, "vadd", NULL);    if (kernel == (cl_kernel) 0)    {        
__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to create kernel",                            clErr);        goto release_program;    }    // set the args values    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "set the args values");    clErr = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &memobjs[0]);    clErr |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &memobjs[1]);    clErr |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &memobjs[2]);    if (CL_FAILED(clErr))    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG,                            "clErr: %d - Failed to set kernel arguments", clErr);        goto release_all;    }    // set work-item dimensions    global_work_size[0] = length;    local_work_size[0]  = 512;    // execute kernel    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "execute kernel");    struct timespec tp;    clock_gettime(CLOCK_MONOTONIC, &tp);    startTime = (unsigned long long) (tp.tv_sec * 1000000000 + tp.tv_nsec);    clErr     = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size,                                       local_work_size, 0, NULL, NULL);    if (CL_FAILED(clErr))    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to execute kernel",                            clErr);        goto release_all;    }    clErr = clFinish(cmd_queue);    if (CL_FAILED(clErr))    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to finish queue",                            clErr);        goto release_all;    }    clock_gettime(CLOCK_MONOTONIC, &tp);    endTime = (unsigned long long) (tp.tv_sec * 1000000000 + tp.tv_nsec);    *runTime = (endTime - startTime) / 1000000.0f;    // read output Buffer    __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "read output Buffer");    clErr = clEnqueueReadBuffer(cmd_queue, memobjs[2], CL_TRUE, 0, length * sizeof(int),                                (void*) Result, 0, NULL, NULL);    if 
(CL_FAILED(clErr))    {        __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, "clErr: %d - Failed to read output Buffer",                            clErr);        goto release_all;    }    __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "Done!");    //release kernel, program, and memory objects    release_all:    clReleaseKernel(kernel);    release_program:    clReleaseProgram(program);    release_mem2:    clReleaseMemObject(memobjs[2]);    release_mem1:    clReleaseMemObject(memobjs[1]);    release_mem0:    clReleaseMemObject(memobjs[0]);    release_queue:    clReleaseCommandQueue(cmd_queue);    release_context:    clReleaseContext(context);    return;}extern "C"JNIEXPORT jintJNICALLJava_com_genesis_imageNative_ImageNativeInterface_nOpenCLDemo(JNIEnv* env, jclass type,                                                              jintArray arrayA_, jintArray arrayB_,                                                              jintArray result_,                                                              jstring kernelCode_,                                                              jfloatArray runningTime){    int  * c_arrayA  = env->GetIntArrayElements(arrayA_, NULL);    int  * c_arrayB  = env->GetIntArrayElements(arrayB_, NULL);    int  * c_Result  = env->GetIntArrayElements(result_, NULL);    float* c_runTime = env->GetFloatArrayElements(runningTime, NULL);    int length = env->GetArrayLength(arrayA_);    const char* nativeKernelCode = env->GetStringUTFChars(kernelCode_, 0);    addArrays(c_arrayA, c_arrayB, c_Result, length, nativeKernelCode, c_runTime);    env->ReleaseIntArrayElements(arrayA_, c_arrayA, 0);    env->ReleaseIntArrayElements(arrayB_, c_arrayB, 0);    env->ReleaseIntArrayElements(result_, c_Result, 0);    env->ReleaseFloatArrayElements(runningTime, c_runTime, 0);    return 1;}extern "C"JNIEXPORT jint JNICALLJava_com_genesis_imageNative_ImageNativeInterface_nNativeAdd(JNIEnv* env, jclass type,                                                     
        jintArray arrayA_, jintArray arrayB_,                                                             jintArray result_){    jint* arrayA = env->GetIntArrayElements(arrayA_, NULL);    jint* arrayB = env->GetIntArrayElements(arrayB_, NULL);    jint* result = env->GetIntArrayElements(result_, NULL);    int length = env->GetArrayLength(arrayA_);    for (int i = 0; i < length; ++i)    {        result[i] = arrayA[i] + arrayB[i];    }    env->ReleaseIntArrayElements(arrayA_, arrayA, 0);    env->ReleaseIntArrayElements(arrayB_, arrayB, 0);    env->ReleaseIntArrayElements(result_, result, 0);    return 1;}


/*
 * Element-wise integer addition: each work-item writes c[i] = a[i] + b[i], where i is
 * the work-item's global id in dimension 0.
 *
 * There is no bounds check; every global id is used directly as an index into all
 * three buffers.
 */
__kernel void vadd(__global const int *a,
                   __global const int *b,
                   __global int *c)
{
    const int idx = get_global_id(0);

    const int lhs = a[idx];
    const int rhs = b[idx];
    c[idx] = lhs + rhs;
}


# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html

# Sets the minimum version of CMake required to build the native library.
cmake_minimum_required(VERSION 3.4.1)

# Header search path for the native sources (used for the "CL/cl.h" include).
include_directories(${CMAKE_SOURCE_DIR}/src/main/cpp/include)

# Disabled alternative: import a prebuilt OpenCL library per ABI.
# NOTE(review): the file name at the end of IMPORTED_LOCATION was truncated in the
# original; "libOpenCL.so" is assumed - confirm before enabling.
#add_library(libOpenCL SHARED IMPORTED)
#set_target_properties(libOpenCL PROPERTIES IMPORTED_LOCATION
#        ${CMAKE_SOURCE_DIR}/src/main/jniLibs/opencl/${ANDROID_ABI}/libOpenCL.so)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
add_library( # Sets the name of the library.
        gimage

        # Sets the library as a shared library.
        SHARED

        # Provides a relative path to your source file(s).
        src/main/cpp/native-lib.cpp)

# Locates the NDK log library and stores its path in log-lib. Without this call
# ${log-lib} below expands to nothing.
find_library( # Sets the name of the path variable.
        log-lib

        # Specifies the name of the NDK library that CMake should locate.
        log)

# Specifies libraries CMake should link to your target library. You
# can link multiple libraries, such as libraries you define in this
# build script, prebuilt third-party libraries, or system libraries.
# NOTE(review): -lOpenCL needs libOpenCL.so on the linker search path; this script does
# not add one - confirm how the project provides it.
target_link_libraries( # Specifies the target library.
        gimage

        -ljnigraphics
        -lOpenCL

        # Links the target library to the log library
        # included in the NDK.
        ${log-lib})


/** Number of int elements in each of the three arrays used by calcVectors (262144 = 2^18). */
private static final int ARRAY_SIZE = 262144;

/**
 * Allocates three int arrays of ARRAY_SIZE elements, passes them together with a kernel
 * source string to ImageNativeInterface#openclDemo, and then displays execTime[0]
 * followed by " (ms)" in a TextView.
 *
 * NOTE(review): two expressions in this snippet are truncated - the right-hand side of
 * the InputStream assignment and the argument of findViewById - so the block does not
 * compile as shown. The missing asset name and view id cannot be recovered from here.
 *
 * @param view not read by the method body; presumably the clicked view of an onClick
 *             handler - confirm against the layout
 */
public void calcVectors (View view) {
    int[] arrayA = new int[ARRAY_SIZE];
    int[] arrayB = new int[ARRAY_SIZE];
    int[] arrayC = new int[ARRAY_SIZE];
    // One-element array used as an out-parameter for the native call.
    float[] execTime = new float[1];
    execTime[0] = 0;
    AssetManager am = getAssets();
    try {
        // initArrays is defined elsewhere; presumably it fills the inputs - verify.
        initArrays(arrayA, arrayB, arrayC, ARRAY_SIZE);
        // NOTE(review): truncated expression; `am` is otherwise unused, so this was
        // presumably an am.open(...) call - TODO restore.
        InputStream is ="");
        // convertInputStreamToString is defined elsewhere; its result is passed on as
        // the kernel source text.
        String kernelCode = convertInputStreamToString(is);
        // The int returned by openclDemo is ignored.
        ImageNativeInterface.getInstance().openclDemo(arrayA, arrayB, arrayC, kernelCode,
                execTime);
        //fixme call other functions to obtain the computed data
    } catch (IOException e) {
        // Only logged; execution continues and the text below shows the initial 0 value.
        Log.d("oclDebug", e.toString());
    }
    String print = String.valueOf(execTime[0]);
    print += " (ms)";
    // NOTE(review): truncated expression; the view id argument is missing - TODO restore.
    TextView myTextField = (TextView) findViewById(;
    myTextField.setText(print);
}





  • 总结

        以上仅对Android上单纯的单核心运算做了介绍。事实上,在Android 8.0之后的版本中,ART做了优化,特别是对循环运算、浮点数运算等做了优化。实际运行大数据量的固定算法时,第一次可能会比较慢,但是在第二次执行同一段Java算法代码时,ART会对相应的代码做优化处理,结果会直接反映在性能上。8.0的指令优化点包括:

  • 消除边界检查

    • 静态:在编译时证明范围位于边界内

    • 动态:运行时测试确保循环始终位于边界内(否则不进行优化)

  • 消除归纳变量

    • 移除无用归纳

    • 用封闭式表达式替换仅在循环后使用的归纳

  • 消除循环主体内的无用代码,移除整个死循环

  • 强度降低

  • 循环转换:逆转、交换、拆分、展开、单模等

  • SIMDization(也称为矢量化)


