CUDA中的cusparse库

VS2013+CUDA7.5

项目设置

1.新建cuda项目

Paste_Image.png

选择x64平台,并根据自己的需要选择Debug或Release配置


Paste_Image.png

2.项目属性-配置属性-VC++目录

Paste_Image.png

  设置包含目录


Paste_Image.png

  设置库目录


Paste_Image.png

3.项目属性-配置属性-链接器

常规-附加库目录:$(CUDA_PATH_V7_5)\lib\$(Platform)


Paste_Image.png

  输入-附加依赖项:
  cublas.lib
  cublas_device.lib
  cuda.lib
  cudadevrt.lib
  cudart.lib
  cudart_static.lib
  cufft.lib
  cufftw.lib
  curand.lib
  cusolver.lib
  cusparse.lib
  nppc.lib
  nppi.lib
  npps.lib
  nvblas.lib
  nvcuvid.lib
  nvrtc.lib
  OpenCL.lib


Paste_Image.png

4.设置Windows调试器为x64位

由于cusparse.lib是X64位下的库,所以要设置成X64运行模式,如果不设置就会出现如下的错误。


Paste_Image.png

修改:


Paste_Image.png

实验结果:
Paste_Image.png

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include"cusparse.h"

/*
 * CLEANUP(s): print the message `s`, then release everything main() may have
 * acquired so far.
 *
 * The macro expands inside main() and refers to main()'s local variables by
 * name. Every resource is guarded by a non-null test, so it can be invoked at
 * any point after those locals have been initialised to 0.
 *
 * Order: host buffers, device buffers, matrix descriptor, cuSPARSE handle,
 * then cudaDeviceReset() and a flush of stdout.
 */
#define CLEANUP(s)                                                          \
do {                                                                        \
    printf("%s\n", s);                                                      \
    /* host-side buffers */                                                 \
    if (yHostPtr != 0)           { free(yHostPtr); }                        \
    if (zHostPtr != 0)           { free(zHostPtr); }                        \
    if (xIndHostPtr != 0)        { free(xIndHostPtr); }                     \
    if (xValHostPtr != 0)        { free(xValHostPtr); }                     \
    if (cooRowIndexHostPtr != 0) { free(cooRowIndexHostPtr); }              \
    if (cooColIndexHostPtr != 0) { free(cooColIndexHostPtr); }              \
    if (cooValHostPtr != 0)      { free(cooValHostPtr); }                   \
    /* device-side buffers */                                               \
    if (y != 0)                  { cudaFree(y); }                           \
    if (z != 0)                  { cudaFree(z); }                           \
    if (xInd != 0)               { cudaFree(xInd); }                        \
    if (xVal != 0)               { cudaFree(xVal); }                        \
    if (csrRowPtr != 0)          { cudaFree(csrRowPtr); }                   \
    if (cooRowIndex != 0)        { cudaFree(cooRowIndex); }                 \
    if (cooColIndex != 0)        { cudaFree(cooColIndex); }                 \
    if (cooVal != 0)             { cudaFree(cooVal); }                      \
    /* cuSPARSE objects */                                                  \
    if (descr != 0)              { cusparseDestroyMatDescr(descr); }        \
    if (handle != 0)             { cusparseDestroy(handle); }               \
    cudaDeviceReset();                                                      \
    fflush(stdout);                                                         \
} while (0)

    /*
     * cuSPARSE walkthrough on a 4x4 sparse matrix with 9 non-zeros.
     *
     * Steps:
     *   1. build the matrix in COO format on the host and copy it to the device;
     *   2. convert the COO row indices to a CSR row pointer (cusparseXcoo2csr);
     *   3. Level 1: scatter a sparse vector into the second half of y (cusparseDsctr);
     *   4. Level 2: y[n..2n) = 2*A*y[0..n) + 3*y[n..2n)   (cusparseDcsrmv);
     *   5. Level 3: z = 5*A*Y with Y = y viewed as an n x 2 matrix (cusparseDcsrmm);
     *   6. copy the results back and compare them with the expected values.
     *
     * Returns 0 when the test passes, 1 on any failure, and 2 when the test is
     * waived because the device lacks double-precision support.
     *
     * NOTE(review): cusparseDcsrmv / cusparseDcsrmm / cusparseDsctr are the
     * legacy API of the CUDA 7.5 toolkit this example targets; newer toolkits
     * replace them with the generic API - confirm before porting.
     */
    int main(){
        cudaError_t cudaStat1, cudaStat2, cudaStat3, cudaStat4, cudaStat5, cudaStat6;
        cusparseStatus_t status;
        cusparseHandle_t handle = 0;
        cusparseMatDescr_t descr = 0;

        /* Every pointer starts at 0 so CLEANUP() can tell what was allocated. */
        int *    cooRowIndexHostPtr = 0;
        int *    cooColIndexHostPtr = 0;
        double * cooValHostPtr = 0;

        int *    cooRowIndex = 0;
        int *    cooColIndex = 0;
        double * cooVal = 0;

        int *    xIndHostPtr = 0;
        double * xValHostPtr = 0;
        double * yHostPtr = 0;
        int *    xInd = 0;
        double * xVal = 0;
        double * y = 0;
        int *    csrRowPtr = 0;
        double * zHostPtr = 0;
        double * z = 0;
        int      n, nnz, nnz_vector;
        /* Scalars passed by address to the cuSPARSE routines (host pointers). */
        double dzero = 0.0;
        double dtwo = 2.0;
        double dthree = 3.0;
        double dfive = 5.0;
        printf("testing example\n");
        /* create the following sparse test matrix in COO format */
        /*
        |1.0     2.0 3.0|
        |    4.0        |
        |5.0     6.0 7.0|
        |    8.0     9.0| */
        n = 4; nnz = 9;
        cooRowIndexHostPtr = (int *)malloc(nnz*sizeof(cooRowIndexHostPtr[0]));
        cooColIndexHostPtr = (int *)malloc(nnz*sizeof(cooColIndexHostPtr[0]));
        cooValHostPtr = (double *)malloc(nnz*sizeof(cooValHostPtr[0]));
        if ((!cooRowIndexHostPtr) || (!cooColIndexHostPtr) || (!cooValHostPtr)){
            CLEANUP("Host malloc failed (matrix)");
            return 1;
        }
        /* Entries are listed row by row (zero-based indices). */
        cooRowIndexHostPtr[0] = 0; cooColIndexHostPtr[0] = 0; cooValHostPtr[0] = 1.0;
        cooRowIndexHostPtr[1] = 0; cooColIndexHostPtr[1] = 2; cooValHostPtr[1] = 2.0;
        cooRowIndexHostPtr[2] = 0; cooColIndexHostPtr[2] = 3; cooValHostPtr[2] = 3.0;
        cooRowIndexHostPtr[3] = 1; cooColIndexHostPtr[3] = 1; cooValHostPtr[3] = 4.0;
        cooRowIndexHostPtr[4] = 2; cooColIndexHostPtr[4] = 0; cooValHostPtr[4] = 5.0;
        cooRowIndexHostPtr[5] = 2; cooColIndexHostPtr[5] = 2; cooValHostPtr[5] = 6.0;
        cooRowIndexHostPtr[6] = 2; cooColIndexHostPtr[6] = 3; cooValHostPtr[6] = 7.0;
        cooRowIndexHostPtr[7] = 3; cooColIndexHostPtr[7] = 1; cooValHostPtr[7] = 8.0;
        cooRowIndexHostPtr[8] = 3; cooColIndexHostPtr[8] = 3; cooValHostPtr[8] = 9.0;

        /* echo the COO triplets */
        printf("Input data:\n");
        for (int i = 0; i < nnz; i++){
            printf("cooRowIndexHostPtr[%d]=%d  ", i, cooRowIndexHostPtr[i]);
            printf("cooColIndexHostPtr[%d]=%d  ", i, cooColIndexHostPtr[i]);
            printf("cooValHostPtr[%d]=%f     \n", i, cooValHostPtr[i]);
        }


        /* create a sparse and dense vector */
        /*
        xVal= [100.0 200.0 400.0]   (sparse)
        xInd= [0     1     3    ]
        y = [10.0 20.0 30.0 40.0 | 50.0 60.0 70.0 80.0] (dense)
        */
        nnz_vector = 3;
        xIndHostPtr = (int *)malloc(nnz_vector*sizeof(xIndHostPtr[0]));
        xValHostPtr = (double *)malloc(nnz_vector*sizeof(xValHostPtr[0]));
        yHostPtr = (double *)malloc(2 * n*sizeof(yHostPtr[0]));
        /* z holds two columns with leading dimension n + 1 (one padding row each) */
        zHostPtr = (double *)malloc(2 * (n + 1)*sizeof(zHostPtr[0]));
        if ((!xIndHostPtr) || (!xValHostPtr) || (!yHostPtr) || (!zHostPtr)){
            CLEANUP("Host malloc failed (vectors)");
            return 1;
        }
        yHostPtr[0] = 10.0; xIndHostPtr[0] = 0; xValHostPtr[0] = 100.0;
        yHostPtr[1] = 20.0; xIndHostPtr[1] = 1; xValHostPtr[1] = 200.0;
        yHostPtr[2] = 30.0;
        yHostPtr[3] = 40.0; xIndHostPtr[2] = 3; xValHostPtr[2] = 400.0;
        yHostPtr[4] = 50.0;
        yHostPtr[5] = 60.0;
        yHostPtr[6] = 70.0;
        yHostPtr[7] = 80.0;
        /* print the vectors; y is shown as two columns of length n */
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n; i++){
                printf("yHostPtr[%d,%d]=%f\n", i, j, yHostPtr[i + n*j]);
            }
        }
        for (int i = 0; i < nnz_vector; i++){
            printf("xIndHostPtr[%d]=%d ", i, xIndHostPtr[i]);
            printf("xValHostPtr[%d]=%f\n", i, xValHostPtr[i]);
        }

        /* allocate GPU memory and copy the matrix and vectors into it */
        cudaStat1 = cudaMalloc((void**)&cooRowIndex, nnz*sizeof(cooRowIndex[0]));
        cudaStat2 = cudaMalloc((void**)&cooColIndex, nnz*sizeof(cooColIndex[0]));
        cudaStat3 = cudaMalloc((void**)&cooVal, nnz*sizeof(cooVal[0]));
        cudaStat4 = cudaMalloc((void**)&y, 2 * n*sizeof(y[0]));
        cudaStat5 = cudaMalloc((void**)&xInd, nnz_vector*sizeof(xInd[0]));
        cudaStat6 = cudaMalloc((void**)&xVal, nnz_vector*sizeof(xVal[0]));
        if ((cudaStat1 != cudaSuccess) || (cudaStat2 != cudaSuccess)
            || (cudaStat3 != cudaSuccess) || (cudaStat4 != cudaSuccess)
            || (cudaStat5 != cudaSuccess) || (cudaStat6 != cudaSuccess)) {
            CLEANUP("Device malloc failed");
            return 1;
        }
        cudaStat1 = cudaMemcpy(cooRowIndex, cooRowIndexHostPtr,
            (size_t)(nnz*sizeof(cooRowIndex[0])),
            cudaMemcpyHostToDevice);
        cudaStat2 = cudaMemcpy(cooColIndex, cooColIndexHostPtr,
            (size_t)(nnz*sizeof(cooColIndexHostPtr[0])),
            cudaMemcpyHostToDevice);
        cudaStat3 = cudaMemcpy(cooVal, cooValHostPtr,
            (size_t)(nnz*sizeof(cooValHostPtr[0])),
            cudaMemcpyHostToDevice);
        cudaStat4 = cudaMemcpy(y, yHostPtr,
            (size_t)(2 * n*sizeof(y[0])),
            cudaMemcpyHostToDevice);
        cudaStat5 = cudaMemcpy(xInd, xIndHostPtr,
            (size_t)(nnz_vector*sizeof(xInd[0])),
            cudaMemcpyHostToDevice);
        cudaStat6 = cudaMemcpy(xVal, xValHostPtr,
            (size_t)(nnz_vector*sizeof(xVal[0])),
            cudaMemcpyHostToDevice);

        if ((cudaStat1 != cudaSuccess) ||
            (cudaStat2 != cudaSuccess) ||
            (cudaStat3 != cudaSuccess) ||
            (cudaStat4 != cudaSuccess) ||
            (cudaStat5 != cudaSuccess) ||
            (cudaStat6 != cudaSuccess)) {
            CLEANUP("Memcpy from Host to Device failed");
            return 1;
        }

        /* initialize cusparse library */
        status = cusparseCreate(&handle);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("CUSPARSE Library initialization failed");
            return 1;
        }
        /* create and setup matrix descriptor */
        status = cusparseCreateMatDescr(&descr);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix descriptor initialization failed");
            return 1;
        }
        /* The setters report a status as well; do not let a bad descriptor slip through. */
        status = cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
        if (status == CUSPARSE_STATUS_SUCCESS) {
            status = cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
        }
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix descriptor setup failed");
            return 1;
        }
        /* exercise conversion routines (convert matrix from COO 2 CSR format) */
        cudaStat1 = cudaMalloc((void**)&csrRowPtr, (n + 1)*sizeof(csrRowPtr[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Device malloc failed (csrRowPtr)");
            return 1;
        }
        status = cusparseXcoo2csr(handle, cooRowIndex, nnz, n,
            csrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Conversion from COO to CSR format failed");
            return 1;
        }
        /* expected: csrRowPtr = [0 3 4 7 9] */

        /* The following test only works for compute capability 1.3 and above
           because it needs double precision. */
        int devId;
        cudaDeviceProp prop; /* filled in by cudaGetDeviceProperties for the current device */
        cudaError_t cudaStat;
        cudaStat = cudaGetDevice(&devId);
        if (cudaStat != cudaSuccess){
            CLEANUP("cudaGetDevice failed");
            printf("Error: cudaStat %d, %s\n", cudaStat, cudaGetErrorString(cudaStat));
            return 1;
        }
        cudaStat = cudaGetDeviceProperties(&prop, devId);
        if (cudaStat != cudaSuccess){
            CLEANUP("cudaGetDeviceProperties failed");
            printf("Error: cudaStat %d, %s\n", cudaStat, cudaGetErrorString(cudaStat));
            return 1;
        }
        /* Encode compute capability major.minor as major*100 + minor*10 (1.3 -> 130). */
        int cc = 100 * prop.major + 10 * prop.minor;
        if (cc < 130){
            CLEANUP("waive the test because only sm13 and above are supported\n");
            printf("the device has compute capability %d\n", cc);
            printf("example test WAIVED");
            return 2;
        }

        /* exercise Level 1 routines (scatter vector elements) */
        /* scatters xVal into the second half of y at the positions given by xInd */
        status = cusparseDsctr(handle, nnz_vector, xVal, xInd,
            &y[n], CUSPARSE_INDEX_BASE_ZERO);

        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Scatter from sparse to dense vector failed");
            return 1;
        }
        /* expected: y = [10 20 30 40 | 100 200 70 400] */


        /* exercise Level 2 routines (csrmv) */
        /* y[n..2n) = dtwo * A * y[0..n) + dthree * y[n..2n) */
        status = cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, n, nnz,
            &dtwo, descr, cooVal, csrRowPtr, cooColIndex,
            &y[0], &dthree, &y[n]);

        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix-vector multiplication failed");
            return 1;
        }
        /* expected: y = [10 20 30 40 | 680 760 1230 2240] */
        /* Check the copy: a silent failure here would print stale host data and
           make the final comparison fail with a misleading message. */
        cudaStat1 = cudaMemcpy(yHostPtr, y, (size_t)(2 * n*sizeof(y[0])), cudaMemcpyDeviceToHost);
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Memcpy from Device to Host failed");
            return 1;
        }
        printf("Intermediate results:\n");
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n; i++){
                printf("yHostPtr[%d,%d]=%f\n", i, j, yHostPtr[i + n*j]);
            }
        }


        /* exercise Level 3 routines (csrmm) */
        cudaStat1 = cudaMalloc((void**)&z, 2 * (n + 1)*sizeof(z[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Device malloc failed(z)");
            return 1;
        }
        /* zero z so the padding row of each column reads back as 0 */
        cudaStat1 = cudaMemset((void *)z, 0, 2 * (n + 1)*sizeof(z[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Memset on Device failed");
            return 1;
        }
        /* z = dfive * A * Y + dzero * z, with Y = y (leading dim n) and z leading dim n + 1 */
        status = cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, 2, n, nnz,
            &dfive, descr, cooVal, csrRowPtr, cooColIndex,
            y, n, &dzero, z, n + 1);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix-matrix multiplication failed");
            return 1;
        }
        /* expected: z = [950 400 2550 2600 0 | 49300 15200 132300 131200 0] */
        /* print final results (z) */
        cudaStat1 = cudaMemcpy(zHostPtr, z,
            (size_t)(2 * (n + 1)*sizeof(z[0])),
            cudaMemcpyDeviceToHost);
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Memcpy from Device to Host failed");
            return 1;
        }
        printf("Final results:\n");
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n + 1; i++){
                printf("z[%d,%d]=%f\n", i, j, zHostPtr[i + (n + 1)*j]);
            }
        }
        /* destroy matrix descriptor */
        status = cusparseDestroyMatDescr(descr);
        descr = 0; /* cleared so CLEANUP() does not destroy it a second time */
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix descriptor destruction failed");
            return 1;
        }
        /* destroy handle */
        status = cusparseDestroy(handle);
        handle = 0; /* cleared so CLEANUP() does not destroy it a second time */
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("CUSPARSE Library release of resources failed");
            return 1;
        }


        /* check the results */
        /* Notice that CLEANUP() contains a call to cusparseDestroy(handle) */
        /* Exact comparison is intended: the expected values are the integer-valued
           doubles listed in the "expected" comments above. */
        if ((zHostPtr[0] != 950.0) ||
            (zHostPtr[1] != 400.0) ||
            (zHostPtr[2] != 2550.0) ||
            (zHostPtr[3] != 2600.0) ||
            (zHostPtr[4] != 0.0) ||
            (zHostPtr[5] != 49300.0) ||
            (zHostPtr[6] != 15200.0) ||
            (zHostPtr[7] != 132300.0) ||
            (zHostPtr[8] != 131200.0) ||
            (zHostPtr[9] != 0.0) ||
            (yHostPtr[0] != 10.0) ||
            (yHostPtr[1] != 20.0) ||
            (yHostPtr[2] != 30.0) ||
            (yHostPtr[3] != 40.0) ||
            (yHostPtr[4] != 680.0) ||
            (yHostPtr[5] != 760.0) ||
            (yHostPtr[6] != 1230.0) ||
            (yHostPtr[7] != 2240.0)){
            CLEANUP("example test FAILED");
            return 1;
        }
        CLEANUP("example test PASSED");
        /* keep the console window open until a key is pressed */
        getchar();
        return 0;
    }
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 204,732评论 6 478
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 87,496评论 2 381
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 151,264评论 0 338
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 54,807评论 1 277
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 63,806评论 5 368
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,675评论 1 281
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 38,029评论 3 399
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,683评论 0 258
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 41,704评论 1 299
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,666评论 2 321
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,773评论 1 332
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,413评论 4 321
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 39,016评论 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,978评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,204评论 1 260
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 45,083评论 2 350
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,503评论 2 343

推荐阅读更多精彩内容

  • Spring Cloud为开发人员提供了快速构建分布式系统中一些常见模式的工具(例如配置管理,服务发现,断路器,智...
    卡卡罗2017阅读 134,596评论 18 139
  • Android 自定义View的各种姿势1 Activity的显示之ViewRootImpl详解 Activity...
    passiontim阅读 171,451评论 25 707
  • Spring Boot 参考指南 介绍 转载自:https://www.gitbook.com/book/qbgb...
    毛宇鹏阅读 46,733评论 6 342
  • 朋友在于精不在于多。 我一直都在坚持这一点。 曾经我也曾试过跟身边所有的人打好关系,后来发现我做不到,你聊动...
    人圭雨路阅读 940评论 0 4
  • 今天记录一下浏览器无法访问个别网站的原因,重装系统后,将需要的软件都安装了最新版本的,今天,打开浏览器在茫茫...
    泰泰博主阅读 18,415评论 1 3