1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
| #include <cuda_runtime.h> #include <cublas.h> #include <cublas_api.h> #include <cublas_v2.h>
/*
 * Computes every pairwise dot product between two sets of feature vectors
 * on the GPU with a single cuBLAS SGEMM call.
 *
 * Parameters:
 *   featureM  host buffer, count_m vectors of `size` floats stored back to back
 *   featureN  host buffer, count_n vectors of `size` floats stored back to back
 *   result    host output buffer with room for count_m * count_n floats;
 *             on success result[m * count_n + n] holds dot(featureM[m], featureN[n])
 *   count_m   number of vectors in featureM
 *   count_n   number of vectors in featureN
 *   size      number of floats per vector
 *   gpu_id    CUDA device ordinal passed to cudaSetDevice
 *
 * Returns true when every CUDA and cuBLAS call succeeded and `result` was
 * filled. Returns false on invalid arguments (negative dimension, size == 0,
 * null buffer) or on any device failure; a diagnostic is printed to stdout.
 * When count_m or count_n is 0 there is nothing to compute: the function
 * returns true without touching the device or `result`.
 *
 * Device buffers and the cuBLAS handle are created and released on every
 * call.
 */
bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result, int count_m, int count_n, int size, int gpu_id)
{
    /* Everything is declared up front so that no `goto out` jumps past an
     * initialisation. */
    float *dev_featureM = 0;
    float *dev_featureN = 0;
    float *dev_result = 0;
    const float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle = 0;
    bool handle_created = false;   /* cublasDestroy must only see a live handle */
    bool ok = false;
    cudaError_t cudaStatus = cudaSuccess;
    cublasStatus_t blasStatus = CUBLAS_STATUS_SUCCESS;
    size_t bytes_m = 0;
    size_t bytes_n = 0;
    size_t bytes_result = 0;

    if (count_m < 0 || count_n < 0 || size < 0) {
        printf("%s, line %d, negative dimension (count_m=%d, count_n=%d, size=%d)\n",
               __func__, __LINE__, count_m, count_n, size);
        return false;
    }
    if (count_m == 0 || count_n == 0) {
        /* Empty result matrix: nothing to compute. */
        return true;
    }
    if (size == 0 || !featureM || !featureN || !result) {
        printf("%s, line %d, invalid argument (null buffer or size == 0)\n",
               __func__, __LINE__);
        return false;
    }

    /* Widen before multiplying: int * int overflows for large inputs. */
    bytes_m = (size_t)count_m * (size_t)size * sizeof(float);
    bytes_n = (size_t)count_n * (size_t)size * sizeof(float);
    bytes_result = (size_t)count_m * (size_t)count_n * sizeof(float);

    cudaStatus = cudaSetDevice(gpu_id);
    if (cudaStatus != cudaSuccess) {
        printf("cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto out;
    }

    blasStatus = cublasCreate(&handle);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasCreate failed! status %d\n", __func__, __LINE__, (int)blasStatus);
        goto out;
    }
    handle_created = true;

    cudaStatus = cudaMalloc((void**)&dev_featureM, bytes_m);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_featureN, bytes_n);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_result, bytes_result);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(dev_featureM, featureM, bytes_m, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMemcpy(dev_featureN, featureN, bytes_n, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    /* cuBLAS is column-major. Each row-major (count x size) host buffer is
     * therefore seen as a (size x count) column-major matrix with leading
     * dimension `size`. C = N^T * M is (count_n x count_m) column-major with
     * ldc = count_n, which is exactly the row-major (count_m x count_n)
     * layout expected in `result`. */
    blasStatus = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                             count_n, count_m, size,
                             &alpha,
                             dev_featureN, size,
                             dev_featureM, size,
                             &beta,
                             dev_result, count_n);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasSgemm failed! status %d\n", __func__, __LINE__, (int)blasStatus);
        goto out;
    }

    /* Surface asynchronous execution errors here, attributed to the GEMM,
     * instead of letting the following copy report (or mask) them. */
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaDeviceSynchronize failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(result, dev_result, bytes_result, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! %s\n", __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    ok = true;

out:
    if (dev_featureM) cudaFree(dev_featureM);
    if (dev_featureN) cudaFree(dev_featureN);
    if (dev_result) cudaFree(dev_result);
    if (handle_created) cublasDestroy(handle);
    return ok;
}
|