文件目录:

cudaTest

|--utils.cu

|--utils.h

|--squaresum.cu

|--squaresum.h

|--test.cpp

|--CMakeLists.txt

编译命令:

$cd /root/cudaTest

$mkdir build

$cd build

$cmake ..

$make

调用关系:

utils:提供常用工具,这里提供查询设备信息功能;

squaresum:计算平方和功能,为cuda运行的核心函数实现

test:调用平方和函数

CMakeLists.txt:组织所有文件编译生成可执行文件

注意:在 .cpp 文件中调用 .cu 文件里的函数时,要在头文件中把该函数声明成 extern "C"

文件内容:

CMakeLists.txt

# CMakeLists.txt to build the squaresum executable from test.cpp,
# squaresum.cu and utils.cu.
cmake_minimum_required(VERSION 2.8)
find_package(CUDA QUIET REQUIRED)

# utils.cu is compiled directly into the executable below. The project
# directory has no utils.cpp, so a separate `utils` library target is not
# defined: add_library(utils utils.cpp) stops configuration with a
# missing-source error.
#add_library(utils utils.cpp)

# Specify binary name and the source files to build it from.
cuda_add_executable(
    squaresum
    test.cpp squaresum.cu utils.cu)
#target_link_libraries(squaresum utils)

test.cpp

#include <iostream>
#include
"squaresum.h"

//extern "C" int squaresum();

// Entry point: runs the GPU/CPU sum-of-squares comparison and uses the status
// returned by squaresum() as the process exit code instead of discarding it.
int main() {
    return squaresum();
}

squaresum.h

// Public interface of the sum-of-squares demo implemented in squaresum.cu.
#ifndef SQUARESUM_H
#define SQUARESUM_H

#include "utils.h"
#include <cuda_runtime.h>

// C linkage keeps the symbol name identical for every translation unit that
// includes this header (test.cpp and squaresum.cu).
extern "C" {
// Runs the sum-of-squares computation on the GPU and on the CPU and prints
// both results with their timings. Defined in squaresum.cu.
int squaresum();
}

#endif  // SQUARESUM_H

squaresum.cu

#include <stdio.h>
#include <stdlib.h>
//#include "utils.h"
#include <iostream>
#include "squaresum.h"
// ======== define area ========
// Number of int elements processed by the demo (2^20).
#define DATA_SIZE 1048576 // 1M

// ======== global area ========
// Host-side input buffer: filled by generateData() and copied to the GPU by
// squaresum().
int data[DATA_SIZE];

// Kernel: adds up the squares of the first DATA_SIZE ints of `data`.
//   data - input array; elements [0, DATA_SIZE) are read
//   sum  - receives the accumulated total
//   time - receives the difference between the clock() readings taken before
//          the loop and after the total has been stored
// The loop does not use threadIdx/blockIdx: every thread that executes this
// kernel walks the whole array and writes both outputs. squaresum() launches
// it as <<<1, 1, 0>>>.
__global__
static void squaresSum(int *data, int *sum, clock_t *time)
{
    int acc = 0;
    const clock_t begin = clock();
    int idx = 0;
    while (idx < DATA_SIZE) {
        acc += data[idx] * data[idx];
        ++idx;
    }
    *sum = acc;
    *time = clock() - begin;
}

// ======== random input generation ========
// Stores rand() % 10 into each of the first `size` elements of `data`, in
// index order. Nothing is written when `size` is zero or negative.
void generateData(int *data, int size)
{
    int *out = data;
    for (int left = size; left > 0; --left) {
        *out++ = rand() % 10;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess and false otherwise; `what` names the
// call in the error message.
static bool cudaSucceeded(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Computes the sum of squares of DATA_SIZE random ints twice -- once on the
// GPU with the squaresSum kernel and once on the CPU -- and prints both sums
// together with their timings.
//
// Returns 0 when the run completes, and also when InitCUDA() reports that no
// usable device exists (unchanged from the previous behaviour). Returns -1
// when a CUDA runtime call or the kernel launch fails; in that case nothing is
// printed to stdout after "CUDA initialized.".
int squaresum()
{
    // init CUDA device
    if (!InitCUDA()) {
        return 0;
    }
    printf("CUDA initialized.\n");

    // generate the random input
    generateData(data, DATA_SIZE);

    // Device buffers start as NULL so the cudaFree calls below are safe even
    // when an allocation was never reached.
    int *gpuData = NULL;
    int *sum = NULL;
    clock_t *time = NULL;
    int result = 0;
    clock_t time_used = 0;

    // allocate device memory and upload the input; && stops at the first failure
    bool ok =
        cudaSucceeded(cudaMalloc((void**) &gpuData, sizeof(int) * DATA_SIZE),
                      "cudaMalloc(gpuData)")
        && cudaSucceeded(cudaMalloc((void**) &sum, sizeof(int)),
                         "cudaMalloc(sum)")
        && cudaSucceeded(cudaMalloc((void**) &time, sizeof(clock_t)),
                         "cudaMalloc(time)")
        && cudaSucceeded(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE,
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy(host to device)");

    if (ok) {
        // calculate the sum of squares with one block of one thread
        squaresSum<<<1, 1, 0>>>(gpuData, sum, time);

        // A launch does not return a status; cudaGetLastError() reports launch
        // failures, and the blocking copies below report execution failures.
        ok = cudaSucceeded(cudaGetLastError(), "squaresSum launch")
            && cudaSucceeded(cudaMemcpy(&result, sum, sizeof(int),
                                        cudaMemcpyDeviceToHost),
                             "cudaMemcpy(sum)")
            && cudaSucceeded(cudaMemcpy(&time_used, time, sizeof(clock_t),
                                        cudaMemcpyDeviceToHost),
                             "cudaMemcpy(time)");
    }

    // free GPU buffers on both the success and the failure path
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);

    if (!ok) {
        return -1;
    }

    // print result; the time is the clock() difference measured inside the kernel
    printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);

    // CPU calculation of the same sum, timed with the host clock()
    result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);

    return 0;
}

utils.h

#include <stdio.h>
#include
<cuda_runtime.h>

extern "C" {
bool InitCUDA();
}

utils.cu

#include "utils.h"
#include <cuda_runtime.h>
#include <iostream>

// Prints a selection of fields of `prop` to stdout, one per line.
// The size_t fields (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; passing them to %d
// is a format/argument mismatch and truncates values that exceed int.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Selects the first CUDA device whose compute capability major version is at
// least 1 and prints its properties.
//
// Returns true once that device has been made current with cudaSetDevice().
// Returns false, after writing a message to stderr, when the device count
// cannot be queried, when there is no device, when no device qualifies, or
// when cudaSetDevice() fails.
bool InitCUDA()
{
    // number of CUDA devices; initialised so it is never read uninitialised
    int count = 0;

    // get the cuda device count
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s.\n",
                cudaGetErrorString(err));
        return false;
    }
    std::cout << count << std::endl;
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the first device >= 1.X; i == count afterwards means none matched
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }

    // if can't find the device
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // set cuda device
    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s.\n", i,
                cudaGetErrorString(err));
        return false;
    }

    return true;
}

//int main(){
// InitCUDA();
//}

更多相关文章

  1. Linux学习总结(十五)文件查找 which whereis locate find
  2. 嵌入式Linux文件系统及其存储机制分析
  3. Oracle表按字段和|分格符导出文件
  4. linux使用rz和sz命令,实现小文件上传下载
  5. linux 服务器间文件传输
  6. 拆分gzip压缩日志文件而不将未压缩的拆分存储在磁盘上
  7. Linux 环境变量与文件查找
  8. 为什么Linux不需要磁盘碎片整理?——借以复习文件系统方面的知识
  9. Linux中的文件权限和用户组

随机推荐

  1. JQuery实现的 checkbox 全选、反选。
  2. 执行Django数据库值的计算:视图。py或Jav
  3. 通过AJAX加载内容和预加载图像?
  4. 使用JQuery从外部文件中通过id选择器获取
  5. 使用jQuery更新textarea值更改的文本
  6. 是否每次都可能分离内容、表示和行为?
  7. jquery向Django后台发送数组
  8. 将JavaScript对象作为Dictionary 传递给C
  9. 如何防止缓存.NET JSON源
  10. 如何在用户选择操作后获取当前日期和时间