0%

cublasSgemm for large matrix multiplication on the GPU in C++

Guide

code

demo.cu

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <cstddef>
#include <cstdio>
#include <vector>

// NOTE(review): NVIDIA's headers reject including both the legacy <cublas.h>
// and <cublas_v2.h> in the same translation unit. The code below uses the v2
// API (cublasHandle_t), so <cublas.h> can likely be dropped - confirm against
// the installed toolkit before removing it.
#include <cuda_runtime.h>
#include <cublas.h>
#include <cublas_api.h>
#include <cublas_v2.h>

/**
 * Computes every pairwise dot product between two sets of feature vectors
 * with a single cuBLAS SGEMM call.
 *
 * Host-side layout (row-major, one feature vector per row):
 *   featureM : count_m x size
 *   featureN : count_n x size
 *   result   : count_m x count_n, where
 *              result[i * count_n + j] = dot(featureM row i, featureN row j)
 *
 * @param featureM  host buffer holding count_m * size floats (read only).
 * @param featureN  host buffer holding count_n * size floats (read only).
 * @param result    host buffer with room for count_m * count_n floats (written).
 * @param count_m   number of vectors in featureM; must be > 0.
 * @param count_n   number of vectors in featureN; must be > 0.
 * @param size      length of each feature vector; must be > 0.
 * @param gpu_id    CUDA device ordinal passed to cudaSetDevice. The function
 *                  does not restore the previously selected device.
 * @return true when every CUDA and cuBLAS call succeeded and result was
 *         filled; false on invalid arguments or on any CUDA/cuBLAS failure,
 *         in which case the contents of result are unspecified.
 */
bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result,
                            int count_m, int count_n, int size, int gpu_id) {
    float *dev_featureM = 0;
    float *dev_featureN = 0;
    float *dev_result = 0;
    const float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle = 0;   // stays null until cublasCreate succeeds
    cublasStatus_t blasStatus;
    cudaError_t cudaStatus;
    bool ok = false;

    // Reject bad input before touching the GPU: a zero leading dimension is
    // not a valid SGEMM argument, and null host pointers would fault in cudaMemcpy.
    if (!featureM || !featureN || !result ||
        count_m <= 0 || count_n <= 0 || size <= 0) {
        printf("%s, line %d, invalid argument!\n", __func__, __LINE__);
        return false;
    }

    // Widen to size_t *before* multiplying so large matrices do not overflow int.
    const size_t bytes_m      = (size_t)count_m * (size_t)size    * sizeof(float);
    const size_t bytes_n      = (size_t)count_n * (size_t)size    * sizeof(float);
    const size_t bytes_result = (size_t)count_m * (size_t)count_n * sizeof(float);

    cudaStatus = cudaSetDevice(gpu_id);
    if (cudaStatus != cudaSuccess) {
        printf("cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto out;
    }

    blasStatus = cublasCreate(&handle);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        handle = 0;   // make sure cleanup does not destroy a half-created handle
        printf("%s, line %d, cublasCreate failed! status = %d\n",
               __func__, __LINE__, (int)blasStatus);
        goto out;
    }

    cudaStatus = cudaMalloc((void**)&dev_featureM, bytes_m);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_featureN, bytes_n);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_result, bytes_result);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }

    cudaStatus = cudaMemcpy(dev_featureM, featureM, bytes_m, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMemcpy(dev_featureN, featureN, bytes_n, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed!\n", __func__, __LINE__);
        goto out;
    }

    /*
     * cuBLAS assumes column-major storage:
     *
     *   C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n)
     *   cublasSgemm(handle, opA, opB, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
     *
     * A row-major (rows x size) buffer read as column-major is the
     * (size x rows) transpose, so each feature vector becomes one column:
     *
     *   A = dev_featureN  seen as size x count_n (lda = size), used transposed
     *   B = dev_featureM  seen as size x count_m (ldb = size), used as is
     *   C = A^T * B       is count_n x count_m column-major (ldc = count_n)
     *
     * Read back as row-major, C is exactly the count_m x count_n result
     * matrix, so no explicit transpose is needed on either side.
     */
    blasStatus = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size,
                             &alpha, dev_featureN, size, dev_featureM, size,
                             &beta, dev_result, count_n);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasSgemm failed! status = %d\n",
               __func__, __LINE__, (int)blasStatus);
        goto out;
    }

    // The GEMM runs asynchronously; synchronize so an execution error is
    // reported here rather than being attributed to the copy below.
    // (cudaThreadSynchronize is deprecated in favour of cudaDeviceSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaDeviceSynchronize failed!\n", __func__, __LINE__);
        goto out;
    }

    cudaStatus = cudaMemcpy(result, dev_result, bytes_result, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed!\n", __func__, __LINE__);
        goto out;
    }

    ok = true;

out:
    // Single cleanup path: release only what was actually acquired.
    if (dev_featureM) cudaFree(dev_featureM);
    if (dev_featureN) cudaFree(dev_featureN);
    if (dev_result) cudaFree(dev_result);
    if (handle) cublasDestroy(handle);
    return ok;
}

usage

demo.cu

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/**
 * Smoke test for CompareFeatureMtoN_gpu.
 *
 * Builds M as three copies of the vector a = [0..9] (3 x 10) and N as two
 * indicator vectors b1 = [1,1,1,1,1,0,0,0,0,0] and b2 = [0,0,0,0,0,1,1,1,1,1]
 * (2 x 10), then prints the 3 x 2 matrix of dot products in row-major order:
 *
 *   [a]     [b1]^T     [10, 35]
 *   [a]  *  [b2]    =  [10, 35]
 *   [a]                [10, 35]
 *
 * since dot(a, b1) = 0+1+2+3+4 = 10 and dot(a, b2) = 5+6+7+8+9 = 35.
 * Prints a failure message instead of the matrix if the GPU call fails.
 */
void test_feature_compare()
{
    const int m = 3;
    const int n = 2;
    const int dim = 10;
    const int gpu_id = 0;

    const std::vector<float> a{0,1,2,3,4,5,6,7,8,9};
    const std::vector<float> b1{1,1,1,1,1,0,0,0,0,0};
    const std::vector<float> b2{0,0,0,0,0,1,1,1,1,1};

    // M: m x dim, row-major (m copies of a).
    std::vector<float> feature_m;
    feature_m.reserve(m * dim);
    for (int i = 0; i < m; i++)
        feature_m.insert(feature_m.end(), a.begin(), a.end());

    // N: n x dim, row-major (one indicator vector per row).
    std::vector<float> feature_n;
    feature_n.reserve(n * dim);
    feature_n.insert(feature_n.end(), b1.begin(), b1.end());
    feature_n.insert(feature_n.end(), b2.begin(), b2.end());

    printf("m = %d, n= %d, size= %d \n", m, n, dim); // 3, 2, 10

    // std::vector owns the buffers, so nothing leaks on the early return below.
    std::vector<float> result(m * n, 0.0f);
    if (!CompareFeatureMtoN_gpu(feature_m.data(), feature_n.data(), result.data(),
                                m, n, dim, gpu_id)) {
        // Do not print the buffer: it was not filled by the GPU.
        printf("CompareFeatureMtoN_gpu failed!\n");
        return;
    }

    // Print all m*n entries (not a hard-coded count) so the loop tracks m and n.
    for (size_t i = 0; i < result.size(); i++)
        printf("%f ", result[i]);
    printf("\n");
}

output

m = 3, n= 2, size= 10 
10.000000 35.000000 10.000000 35.000000 10.000000 35.000000

Reference

History

  • 20191015: created.