0%

caffe failed to use gpu in multiple threads, how to fix it ?

Guide

set_mode

Caffe fails to use GPU in a new thread ???
see here

the `Caffe::mode_` variable that controls this is thread-local,
so ensure you’re calling `caffe.set_mode_gpu()` in each thread
before running any Caffe functions. That should solve your issue.

Caffe set_mode GPU 在多线程下失效
在main thread中设置GPU模式,在worker thread中调用网络进行检测,
GPU模式不起效,默认仍然使用CPU模式,所以速度很慢,和GPU相比慢了
10倍左右。

解决方案:在子线程中set_mode,然后调用网络进行检测。
(1)创建网络在main thread。static 网络存储在全局静态数据区。
worker thread可以直接使用。
(2) 在worker thread中检测,需要在子线程中set_mode,然后调用网络进行检测。

结论:
(1)caffe的set_mode所在的线程必须和使用nets进行forward的线程相同。否则默认使用CPU模式,速度会很慢。
(2)caffe的nets初始化可以在main thread也可以在worker thread。

code example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include <iostream>
#include <string>
#include <thread>

#include <gtest/gtest.h>
#include <glog/logging.h>

#include <boost/date_time/posix_time/posix_time.hpp>

// opencv
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

using namespace std;

#include "algorithm/algorithm.h"
using namespace kezunlin::algorithm;

#pragma region net-demo

void topwire_demo(bool run_in_worker_thread)
{
if (run_in_worker_thread) {
CaffeApi::set_mode(true, 0, 1234);// set in worker thread-1, use GPU-0
}

// do net detect
// ...
}

void railway_demo(bool run_in_worker_thread)
{
if (run_in_worker_thread) {
CaffeApi::set_mode(true, 0, 1234);// set in worker thread-1, use GPU-0
}

// do net detect
// ...
}

void sidewall_demo(bool run_in_worker_thread)
{
if (run_in_worker_thread) {
CaffeApi::set_mode(true, 0, 1234);// set in worker thread-1, use GPU-0
}

// do net detect
// ...
}

void lockcatch_demo(bool run_in_worker_thread)
{
if (run_in_worker_thread) {
CaffeApi::set_mode(true, 0, 1234);// set in worker thread-1, use GPU-0
}

// do net detect
// ...
}

#pragma endregion


#pragma region worker-thread-demo

void worker_thread_topwire_demo(bool run_in_worker_thread)
{
std::thread thr(topwire_demo, run_in_worker_thread);
thr.join();
}

void worker_thread_railway_demo(bool run_in_worker_thread)
{
std::thread thr(railway_demo, run_in_worker_thread);
thr.join();
}

void worker_thread_sidewall_demo(bool run_in_worker_thread)
{
std::thread thr(sidewall_demo, run_in_worker_thread);
thr.join();
}

void worker_thread_lockcatch_demo(bool run_in_worker_thread)
{
std::thread thr(lockcatch_demo, run_in_worker_thread);
thr.join();
}

#pragma endregion

enum DETECT_TYPE {
SET_IN_MAIN_DETECT_IN_MAIN, // 主线程set_mode,主线程检测,40ms左右,使用GPU
SET_IN_WORKER_DETECT_IN_WORKER, // 子线程set_mode,子线程检测,40ms左右,使用GPU
SET_IN_MAIN_DETECT_IN_WORKER // 主线程set_mode,子线程检测,400ms左右,慢了10倍左右,没有使用GPU
};

void thread_demo()
{
DETECT_TYPE detect_type = SET_IN_MAIN_DETECT_IN_MAIN;
detect_type = SET_IN_WORKER_DETECT_IN_WORKER;
detect_type = SET_IN_MAIN_DETECT_IN_WORKER;

init_algorithm_api();

switch (detect_type)
{
case SET_IN_MAIN_DETECT_IN_MAIN:
topwire_demo(false);
railway_demo(false);
sidewall_demo(false);
lockcatch_demo(false);
break;
case SET_IN_WORKER_DETECT_IN_WORKER:
worker_thread_topwire_demo(true);
worker_thread_railway_demo(true);
worker_thread_sidewall_demo(true);
worker_thread_lockcatch_demo(true);
break;
case SET_IN_MAIN_DETECT_IN_WORKER:
worker_thread_topwire_demo(false);
worker_thread_railway_demo(false);
worker_thread_sidewall_demo(false);
worker_thread_lockcatch_demo(false);
break;
default:
break;
}

free_algorithm_api();
}

void test_algorithm_api()
{
thread_demo();
}

TEST(algorithn_test, test_algorithm_api) {
test_algorithm_api();
}

conclusions

  • SET_IN_MAIN_DETECT_IN_MAIN, // 主线程set_mode,主线程检测,40ms左右,使用GPU
  • SET_IN_WORKER_DETECT_IN_WORKER, // 子线程set_mode,子线程检测,40ms左右,使用GPU
  • SET_IN_MAIN_DETECT_IN_WORKER // 主线程set_mode,子线程检测,400ms左右,慢了10倍左右,没有使用GPU

Reference

History

  • 20180712: created.