code example

#!/usr/bin/python
# -*- coding: utf-8 -*-

# https://blogs.msdn.microsoft.com/lukassteindl/2015/12/13/linear-regression-example-with-python-and-theano/
# http://blog.csdn.net/vins_napoleon/article/details/38057927

from numpy import *
import numpy as np
import theano
import theano.tensor as T
import time

class Linear_Reg(object):
    def __init__(self, x):
        # x,y can be scalar, vector (n,) or matrix (m,n)
        self.w = theano.shared(value=0.0, name='w')
        self.b = theano.shared(value=0.0, name='b')
        # w,b are scalars, so x can be a vector
        self.y_pred = x * self.w + self.b
        self.params = [self.w, self.b]

    def msl(self, y):
        return T.sum((y - self.y_pred)**2)

class Linear_Reg2(object):
    def __init__(self, x):
        # x,y can be scalar, vector (1,) or matrix (m,1)
        self.w = theano.shared(value=np.zeros((1,), dtype=theano.config.floatX), name='w')
        self.b = theano.shared(value=np.zeros((1,), dtype=theano.config.floatX), name='b')
        # w,b are vectors, so x can be a scalar
        self.y_pred = x * self.w + self.b
        self.params = [self.w, self.b]

    def msl(self, y):
        return T.sum((y - self.y_pred)**2)

def test_type():
    # x,y are scalars
    points_x = [1.1, 2.2]
    points_y = [3.3, 4.4]
    X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
    Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)
    x = T.dscalar('x')
    print x
    print x.type
    print X[0]
    print X[0].type

    # x
    # TensorType(float64, scalar)
    # Subtensor{int64}.0
    # TensorType(float64, scalar)

def test_type2():
    # x,y are vectors
    points_x = [1.1, 2.2]
    points_y = [3.3, 4.4]
    X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
    Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)
    x = T.dvector('x')
    print x
    print x.type
    print X[0:2]
    print X[0:2].type

    # x
    # TensorType(float64, vector)
    # Subtensor{int64:int64:}.0
    # TensorType(float64, vector)


def run_model1(mode):
    """
    [w,b are scalars]

    mode = scalar(),    set mini_batch_size = 1
    mode = vector(m,),  reshape X,Y to vectors
    mode = matrix(m,n), reshape X,Y to matrices
    """
    eta = 0.000001
    epochs = 1000
    points = genfromtxt("data.csv", delimiter=",")  # (100,2)
    points_x = points[:, 0]  # (100,) numpy.float64
    points_y = points[:, 1]  # (100,) numpy.float64
    N = points_x.shape[0]

    if mode == "scalar":
        mini_batch_size = 1  # must be 1 so that all X[i] are used
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)

        # so that we get X[0], X[1], ...
        x = T.dscalar('tx')
        y = T.dscalar('ty')
    elif mode == "vector":
        mini_batch_size = 5
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)

        # so that we get X[0:5], X[5:10], ...
        x = T.dvector('tx')
        y = T.dvector('ty')
    elif mode == "matrix":
        mini_batch_size = 5
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX).reshape(N, 1), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX).reshape(N, 1), borrow=True)

        # so that we get X[0:5], X[5:10], ...
        x = T.dmatrix('tx')
        y = T.dmatrix('ty')

    num_batches = N / mini_batch_size

    # index = T.lscalar()  # 'l' means int64

    reg = Linear_Reg(x=x)
    cost = reg.msl(y)

    w_g = T.grad(cost=cost, wrt=reg.w)
    b_g = T.grad(cost=cost, wrt=reg.b)

    updates = [(reg.w, reg.w - eta * w_g),
               (reg.b, reg.b - eta * b_g)]

    # use x,y as inputs (when calling the compiled function, the arguments
    # bound to x,y must be Python/numpy values, not theano variables)
    train_model = theano.function(inputs=[x, y],
                                  outputs=cost,
                                  updates=updates)

    cost_t = 0.0
    costs = []
    start_time = time.clock()

    for epoch in xrange(epochs):
        # one epoch: all N samples take part in training; with mini-batch
        # size m we update the parameters N/m times
        cost_l = []
        for index in range(num_batches):
            if mode == "scalar":
                x = X.get_value()[index]
                y = Y.get_value()[index]
            else:
                x = X.get_value()[index * mini_batch_size:(index + 1) * mini_batch_size]
                y = Y.get_value()[index * mini_batch_size:(index + 1) * mini_batch_size]
            cost_l.append(train_model(x, y))

        cost_t = np.mean(cost_l)
        costs.append(cost_t)

    end_time = time.clock()
    print '\nTotal time is :', end_time - start_time, ' s'
    print 'last cost :', cost_t
    print 'w value : ', reg.w.get_value()
    print 'b value : ', reg.b.get_value()


def run_model2(mode):
    """
    [w,b are vectors (1,)]

    mode = scalar(),    set mini_batch_size = 1
    mode = vector(1,),  set mini_batch_size = 1, reshape X,Y to vectors
    mode = matrix(m,1), reshape X,Y to matrices
    """
    eta = 0.000001
    epochs = 1000
    points = genfromtxt("data.csv", delimiter=",")  # (100,2)
    points_x = points[:, 0]  # (100,) numpy.float64
    points_y = points[:, 1]  # (100,) numpy.float64
    N = points_x.shape[0]

    if mode == "scalar":
        mini_batch_size = 1
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)

        # so that we get X[0], X[1], ...
        x = T.dscalar('tx')
        y = T.dscalar('ty')
    elif mode == "vector":
        mini_batch_size = 1
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX), borrow=True)

        # so that we get X[0:1], X[1:2], ...
        x = T.dvector('tx')
        y = T.dvector('ty')
    elif mode == "matrix":
        mini_batch_size = 5
        X = theano.shared(np.asarray(points_x, dtype=theano.config.floatX).reshape(N, 1), borrow=True)
        Y = theano.shared(np.asarray(points_y, dtype=theano.config.floatX).reshape(N, 1), borrow=True)

        # so that we get X[0:5], X[5:10], ...
        x = T.dmatrix('tx')
        y = T.dmatrix('ty')

    num_batches = N / mini_batch_size

    index = T.lscalar()  # 'l' means int64
    reg = Linear_Reg2(x=x)
    cost = reg.msl(y)

    #w_g,b_g = T.grad(cost,[reg.w,reg.b])
    #updates=[(reg.w, reg.w - eta * w_g), (reg.b, reg.b - eta * b_g)]

    #==========================================================================
    # use params and updates
    #==========================================================================
    params = [reg.w, reg.b]       # list of [w,b]
    grads = T.grad(cost, params)  # list of [w_g,b_g]
    updates = [(param, param - eta * grad)
               for param, grad in zip(params, grads)]
    # list of [ (w, w-eta*w_g), (b, b-eta*b_g) ]

    # the left-hand sides in updates must be shared variables;
    # use index as input (when calling the compiled function, the argument
    # bound to index must be a Python value); givens substitutes
    # X[index],Y[index] for x,y, and the substituted expressions must have
    # the same type as x,y (all of them theano variables). Here x,y and
    # X[index],Y[index] are all TensorType(float64, scalar)
    # (or float64 vector/matrix types).
    if mode == "scalar":
        train_model = theano.function(inputs=[index],
                                      outputs=cost,
                                      updates=updates,
                                      givens={x: X[index],
                                              y: Y[index]})
    else:
        train_model = theano.function(inputs=[index],
                                      outputs=cost,
                                      updates=updates,
                                      givens={x: X[index * mini_batch_size:(index + 1) * mini_batch_size],
                                              y: Y[index * mini_batch_size:(index + 1) * mini_batch_size]})

    cost_t = 0.0
    costs = []
    start_time = time.clock()

    for epoch in xrange(epochs):
        # one epoch: all N samples take part in training; with mini-batch
        # size m we update the parameters N/m times
        cost_l = []
        for index in range(num_batches):
            cost_l.append(train_model(index))

        cost_t = np.mean(cost_l)
        costs.append(cost_t)

    end_time = time.clock()
    print '\nTotal time is :', end_time - start_time, ' s'
    print 'last cost :', cost_t
    print 'w value : ', reg.w.get_value()
    print 'b value : ', reg.b.get_value()


"""
run:
Total time is : 2.796644 s
last cost : 113.178407322
w value : 1.48497718432
b value : 0.0890071567283

run2:
Total time is : 2.127487 s
last cost : 113.178407322
w value : [ 1.48497718]
b value : [ 0.08900716]

run3:
Total time is : 0.480144 s
avg cost : 565.419900755
w value : [ 1.48492605]
b value : [ 0.08896923]
"""

def test():
    test_type()
    test_type2()

if __name__ == '__main__':
    #test()

    #run_model1("scalar")
    #run_model1("vector")
    #run_model1("matrix")

    run_model2("scalar")
    run_model2("vector")
    run_model2("matrix")
Total time is : 1.929041  s
last cost : 113.178407322
w value :  [ 1.48497718]
b value :  [ 0.08900716]

Total time is : 2.040197  s
last cost : 113.178407322
w value :  [ 1.48497718]
b value :  [ 0.08900716]

Total time is : 0.553913  s
last cost : 565.419900755
w value :  [ 1.48492605]
b value :  [ 0.08896923]

History

  • 20180807: created.

code example

#!/usr/bin/python
# -*- coding: utf-8 -*-

# https://blogs.msdn.microsoft.com/lukassteindl/2015/12/13/linear-regression-example-with-python-and-theano/
# http://jakevdp.github.io/blog/2013/05/12/embedding-matplotlib-animations/

"""
在Python Notebook中无法展现动画,只能展示静态的图像。
The problem is that so far the integration of IPython with matplotlib is entirely static,
while animations are by their nature dynamic.
"""

import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import theano
from theano import tensor as T
import numpy as np

style.use('fivethirtyeight')
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)

trX = np.linspace(-1,1,101) # (101,)
trY = 2 * trX + np.random.randn(*trX.shape) * 0.33 # (101,)

X = T.scalar()
Y = T.scalar()

def model(X, w):
    return X * w

w = theano.shared(np.asarray(0., dtype=theano.config.floatX)) #scalar
y = model(X,w)

cost = T.mean(T.sqr(y-Y))
gradient = T.grad(cost=cost, wrt = w)
updates = [[w,w-gradient * 0.001]]

#train = theano.function(inputs=[X,Y], outputs=cost, updates = updates, allow_input_downcast= True)
train = theano.function(inputs=[X,Y], outputs=cost, updates = updates)

def run():
    for i in range(100):
        for x, y in zip(trX, trY):
            train(x, y)
    print(w.eval())

def animate(i):
    # i: frame index of the animation
    #print i
    ax1.clear()
    plt.scatter(trX, trY, label='Gradient Descent on GPU',
                alpha=0.3, edgecolors='none')
    plt.legend()
    plt.grid(True)
    for x, y in zip(trX, trY):
        train(x, y)
        #print (w.eval())

    xs = [-1, 1]
    ys = [-1 * w.eval(), w.eval()]
    ax1.plot(xs, ys)

def show_animate():
    ani = animation.FuncAnimation(fig, animate, interval=250)
    plt.show()

def main():
    #run()
    show_animate()

main()

History

  • 20180807: created.

Random and seed

All the "random" functions provided by standard libraries are actually pseudo-random; a truly random function would need no seed.

Pseudo-random means that the returned numbers are a deterministic result sequence produced by a deterministic algorithm, not a truly random sequence. The seed is simply the first value the algorithm starts computing from. As a result, given the same seed, every subsequent "random" result, and its order, is exactly the same.

In practice you can seed with the current clock's millisecond (e.g. DateTime.Now.Millisecond), which from your point of view is a random number below 1000. This greatly improves the randomness of the standard library's output, but it is still not truly random: the probability of a repeated seed is still one in a thousand.
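
The same idea in Python, as a minimal sketch (the millisecond-derived seed below is only illustrative; note that calling random.seed() with no argument already seeds from OS entropy or the system time):

import time
import random

ms = int(time.time() * 1000) % 1000  # current clock's milliseconds, 0..999
random.seed(ms)
print random.random()  # varies across runs, unless the millisecond repeats

random.seed()          # no argument: seeded from OS entropy / current time
print random.random()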

import numpy as np
import random
# random.seed(1)
# np.random.seed(1)
# with the same seed, the generated random numbers are identical on every run
np.random.seed(1)
print np.random.rand(2,3)
print np.random.rand(3,1)
[[  4.17022005e-01   7.20324493e-01   1.14374817e-04]
 [  3.02332573e-01   1.46755891e-01   9.23385948e-02]]
[[ 0.18626021]
 [ 0.34556073]
 [ 0.39676747]]
# with the same seed, the generated random numbers are identical on every run
np.random.seed(1)
print np.random.rand(2,3)
print np.random.rand(3,1)
[[  4.17022005e-01   7.20324493e-01   1.14374817e-04]
 [  3.02332573e-01   1.46755891e-01   9.23385948e-02]]
[[ 0.18626021]
 [ 0.34556073]
 [ 0.39676747]]
import random
random.seed(1)
random.random()
0.13436424411240122
random.seed(1)
random.random()
0.13436424411240122
np.random.seed(1)
np.random.randint(0,6, size=(4,5))
array([[5, 3, 4, 0, 1],
       [3, 5, 0, 0, 1],
       [4, 5, 4, 1, 2],
       [4, 5, 2, 4, 3]])
np.random.seed(1)
np.random.randint(0,6, size=(4,5))
array([[5, 3, 4, 0, 1],
       [3, 5, 0, 0, 1],
       [4, 5, 4, 1, 2],
       [4, 5, 2, 4, 3]])

History

  • 20180806: created.

Tutorial

  • Goal
    Given N 2-D coordinate pairs (x1,y1), …, (xn,yn), fit the line y = mx + b: find the best m and b that minimize the total error.

  • Data
    points

  • Error formula (reconstructed below)
    errors

  • Visualizing the error
    The error is a function of m and b.
    visual errors of m and b

  • Partial derivative formulas (reconstructed below)
    partial derivatives

  • Learning process
    learning

  • Error curve
    errors curve

  • Animated demo
    demo link
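
The figures for the error formula and the partial derivatives did not survive extraction; the standard formulas they illustrated, matching the gradient code in the Code Example section below, are:

$$E(m,b) = \frac{1}{N}\sum_{i=1}^{N}\bigl(y_i - (m x_i + b)\bigr)^2$$

$$\frac{\partial E}{\partial m} = \frac{2}{N}\sum_{i=1}^{N} -x_i\bigl(y_i - (m x_i + b)\bigr), \qquad \frac{\partial E}{\partial b} = \frac{2}{N}\sum_{i=1}^{N} -\bigl(y_i - (m x_i + b)\bigr)$$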

Gradient Descent

Stochastic gradient example

The horizontal axis X represents the parameter; the vertical axis Y represents the error.
Gradient descent should force us to move in the direction of the minimum.
hilly landscape

Moving along the slope of the curve

The best guess the algorithm can make is to move in the direction of the slope, also called the gradient of the function.
move in the slope

Single variable

  • derivative: the derivative is a limit, an exact value.
    derivative of single variable

  • estimated gradient: the gradient is an estimate; it depends on the sampling distance dx.
    estimated gradient

  • how to update the parameter
    Once we have found our estimated derivative, we need to move in its opposite direction to climb down the function. This means that we have to update our parameter p like this:
    update parameter
    The constant L is often referred to as the learning rate, and it dictates how fast we move against the gradient.
    Once the gradient is found, move in the direction opposite to it (formulas reconstructed below).
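
The three figures above did not survive extraction; for a function f of a single parameter p, the standard formulas they illustrated are:

$$f'(p) = \lim_{dx \to 0} \frac{f(p+dx) - f(p)}{dx} \quad \text{(derivative: a limit, exact)}$$

$$f'(p) \approx \frac{f(p+dx) - f(p)}{dx} \quad \text{(estimated gradient, for a small sampling distance } dx\text{)}$$

$$p \leftarrow p - L \cdot f'(p) \quad \text{(parameter update with learning rate } L\text{)}$$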

Multiple variables

  • partial derivatives: the partial derivatives of a multi-variable function
    partial derivatives

  • estimated gradient: the estimate depends on the sampling distances dx, dy, dz.
    estimated gradient

  • gradient vector (formulas reconstructed below)
    gradient vector
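
Again the figures are missing; for a function f(x,y,z) the standard formulas they illustrated are:

$$\frac{\partial f}{\partial x} \approx \frac{f(x+dx,\,y,\,z) - f(x,y,z)}{dx}$$

(and analogously for $\partial f/\partial y$ and $\partial f/\partial z$ with sampling distances dy, dz), with the gradient vector and update

$$\nabla f = \left(\frac{\partial f}{\partial x},\ \frac{\partial f}{\partial y},\ \frac{\partial f}{\partial z}\right), \qquad (x,y,z) \leftarrow (x,y,z) - L\,\nabla f$$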

Code Example

#!/usr/bin/python
# -*- coding: utf-8 -*-
from numpy import *
import numpy as np

# y = mx + b
# m is slope, b is y-intercept
def compute_error_for_line_given_points(b, m, points):
    totalError = 0
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        totalError += (y - (m * x + b)) ** 2
    return totalError / float(len(points))

def update_mini_batch(b, m, mini_batch, learningRate):
    # one gradient step; this function is called once per update
    # (full batch: the gradients of m and b are computed from all N samples,
    # so one epoch allows only one update;
    # mini-batch: each update needs only m samples, so one epoch allows
    # n/m updates)
    N = float(len(mini_batch))  # was float(len(points)): points is not in scope here
    b_gradient, m_gradient = compute_cost_gradient_of_all(mini_batch, b, m)
    new_b = b - (learningRate / N) * b_gradient
    new_m = m - (learningRate / N) * m_gradient
    return [new_b, new_m]

def compute_cost_gradient_of_all(points, b, m):
    # total gradient over all samples in the batch
    b_gradient = 0
    m_gradient = 0
    for i in range(0, len(points)):
        delta_b, delta_m = compute_cost_gradient_of_one(points[i], b, m)
        b_gradient += delta_b
        m_gradient += delta_m
    return [b_gradient, m_gradient]

def compute_cost_gradient_of_one(point, b, m):
    # gradient contributed by a single sample
    x = point[0]
    y = point[1]
    b_gradient = -2 * (y - ((m * x) + b))
    m_gradient = -2 * x * (y - ((m * x) + b))
    return [b_gradient, m_gradient]

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, epoches):
    b = starting_b
    m = starting_m
    for i in range(epoches):
        # every update uses all N samples (here mini_batch is the full
        # dataset), so each epoch performs exactly one update
        b, m = update_mini_batch(b, m, array(points), learning_rate)
        #print "{0} iterations b = {1}, m = {2}, error = {3}".format(i, b, m, compute_error_for_line_given_points(b, m, points))
    return [b, m]

def gradient_descent_runner_with_mini_batch(points, starting_b, starting_m, learning_rate, epoches, mini_batch_size):
    b = starting_b
    m = starting_m
    n = len(points)
    num_batches = n / mini_batch_size
    for i in range(epoches):
        # mini-batch approach: each update needs only mini_batch_size
        # samples, so each epoch performs n/m updates

        # shuffle data and get N/mini_batch_size mini batches
        np.random.shuffle(points)

        # iterate over all mini batches to learn (updating m and b);
        # now in one epoch we learn N/mini times instead of just once
        for k in xrange(0, num_batches):
            mini_batch = points[k * mini_batch_size:(k + 1) * mini_batch_size]
            b, m = update_mini_batch(b, m, mini_batch, learning_rate)
        #print "{0} iterations b = {1}, m = {2}, error = {3}".format(i, b, m, compute_error_for_line_given_points(b, m, points))
    return [b, m]

def run():
    points = genfromtxt("data.csv", delimiter=",")
    learning_rate = 0.0001
    initial_b = 0  # initial y-intercept guess
    initial_m = 0  # initial slope guess
    epoches = 1000
    print "Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, compute_error_for_line_given_points(initial_b, initial_m, points))
    print "Running..."
    [b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, epoches)
    print "After {0} iterations b = {1}, m = {2}, error = {3}".format(epoches, b, m, compute_error_for_line_given_points(b, m, points))

def run2():
    points = genfromtxt("data.csv", delimiter=",")
    learning_rate = 0.0001
    initial_b = 0  # initial y-intercept guess
    initial_m = 0  # initial slope guess
    epoches = 1000
    mini_batch_size = 10  # mini batch size
    print "Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, compute_error_for_line_given_points(initial_b, initial_m, points))
    print "Running..."
    [b, m] = gradient_descent_runner_with_mini_batch(points, initial_b, initial_m, learning_rate, epoches, mini_batch_size)
    print "After {0} iterations b = {1}, m = {2}, error = {3}".format(epoches, b, m, compute_error_for_line_given_points(b, m, points))

if __name__ == '__main__':
    run()
    #run2()
Starting gradient descent at b = 0, m = 0, error = 5565.10783448
Running...
After 1000 iterations b = 0.0889365199374, m = 1.47774408519, error = 112.614810116

History

  • 20180806: created.

Guide

requirements:

  • windows: 10
  • opencv: 3.1.0
  • nvidia driver: gtx 1060 382.05 (gtx 970m)
  • GPU arch(s): sm_61 (sm_52)
  • cuda: 8.0
  • cudnn: 5.0.5
  • cmake: 3.10.0
  • vs: vs2015 64

nvidia cuda CC

see cuda compute capability

cuda-enabled nvidia GeForce cc

cpu vs gpu

for opencv functions

speed for cpu and gpu

get source

Get opencv 3.1.0 from git and fix some bugs

git clone https://github.com/opencv/opencv.git
cd opencv
git checkout -b v3.1.0 3.1.0

# fix bugs for 3.1.0
git cherry-pick 10896
git cherry-pick cdb9c
git cherry-pick 24dbb

git branch

master
* v3.1.0

compile

mkdir build && cd build && cmake-gui ..

config

configure with VS 2015 win64 with options

BUILD_SHARED_LIBS  ON
CMAKE_CONFIGURATION_TYPES Release # Release
CMAKE_CXX_FLAGS_RELEASE /MD /O2 /Ob2 /DNDEBUG /MP # for multiple processor

WITH_VTK OFF
BUILD_PERF_TESTS OFF # if ON, build errors occur

WITH_CUDA ON
CUDA_TOOLKIT_ROOT_DIR  C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0
#CUDA_ARCH_BIN  3.0 3.5 5.0 5.2 6.0 6.1 # very time-consuming 
CUDA_ARCH_PTX 3.0

for opencv
opencv cuda arch

CUDA_ARCH_BIN 3.0 3.5 5.0 5.2 6.0 6.1 corresponds to the flags

-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;

CUDA_ARCH_PTX 3.0 corresponds to

    -gencode;arch=compute_30,code=compute_30;

for caffe
caffe cuda arch

the CUDA_ARCH_BIN parameter specifies multiple architectures so as to support a variety of GPU boards; otherwise, the CUDA programs will not run on other types of GPU boards.
To run one executable on GPUs with different compute capabilities, opencv/caffe must be built for multiple architectures (e.g. CUDA_ARCH_BIN 3.0 3.5 5.0 5.2 6.0 6.1), which makes the build very time-consuming. When building, configure only the GPU architectures you actually need to release for.

configure and output:

Selecting Windows SDK version 10.0.14393.0 to target Windows 10.0.17134.
found IPP (ICV version): 9.0.1 [9.0.1]
at: C:/compile/opencv/3rdparty/ippicv/unpack/ippicv_win
CUDA detected: 8.0
CUDA NVCC target flags: -gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30

General configuration for OpenCV 3.1.0 =====================================
  Version control:               3.1.0-3-g5e9beb8

  Platform:
    Host:                        Windows 10.0.17134 AMD64
    CMake:                       3.10.0
    CMake generator:             Visual Studio 14 2015 Win64
    CMake build tool:            C:/Program Files (x86)/MSBuild/14.0/bin/MSBuild.exe
    MSVC:                        1900

  C/C++:
    Built as dynamic libs?:      YES
    C++ Compiler:                C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/x86_amd64/cl.exe  (ver 19.0.24215.1)
    C++ flags (Release):         /DWIN32 /D_WINDOWS /W4 /GR /EHa  /D _CRT_SECURE_NO_DEPRECATE /D _CRT_NONSTDC_NO_DEPRECATE /D _SCL_SECURE_NO_WARNINGS /Gy /bigobj /Oi  /wd4251 /wd4324 /wd4275 /wd4589 /MP8  /MD /O2 /Ob2 /DNDEBUG /MP  /Zi
    C++ flags (Debug):           /DWIN32 /D_WINDOWS /W4 /GR /EHa  /D _CRT_SECURE_NO_DEPRECATE /D _CRT_NONSTDC_NO_DEPRECATE /D _SCL_SECURE_NO_WARNINGS /Gy /bigobj /Oi  /wd4251 /wd4324 /wd4275 /wd4589 /MP8  /MDd /Zi /Ob0 /Od /RTC1 
    C Compiler:                  C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/x86_amd64/cl.exe
    C flags (Release):           /DWIN32 /D_WINDOWS /W3  /D _CRT_SECURE_NO_DEPRECATE /D _CRT_NONSTDC_NO_DEPRECATE /D _SCL_SECURE_NO_WARNINGS /Gy /bigobj /Oi    /MP8  /MD /O2 /Ob2 /DNDEBUG  /Zi
    C flags (Debug):             /DWIN32 /D_WINDOWS /W3  /D _CRT_SECURE_NO_DEPRECATE /D _CRT_NONSTDC_NO_DEPRECATE /D _SCL_SECURE_NO_WARNINGS /Gy /bigobj /Oi    /MP8  /MDd /Zi /Ob0 /Od /RTC1 
    Linker flags (Release):      /machine:x64  /INCREMENTAL:NO  /debug
    Linker flags (Debug):        /machine:x64  /debug /INCREMENTAL 
    Precompiled headers:         YES
    Extra dependencies:          comctl32 gdi32 ole32 setupapi ws2_32 vfw32 cudart nppc nppi npps cufft -LC:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/lib/x64
    3rdparty dependencies:       zlib libjpeg libwebp libpng libtiff libjasper IlmImf

  OpenCV modules:
    To be built:                 cudev core cudaarithm flann imgproc ml video cudabgsegm cudafilters cudaimgproc cudawarping imgcodecs photo shape videoio cudacodec highgui objdetect ts features2d calib3d cudafeatures2d cudalegacy cudaobjdetect cudaoptflow cudastereo stitching superres videostab python2
    Disabled:                    world
    Disabled by dependency:      -
    Unavailable:                 java python3 viz

  Windows RT support:            NO

  GUI: 
    QT:                          NO
    Win32 UI:                    YES
    OpenGL support:              NO
    VTK support:                 NO

  Media I/O: 
    ZLib:                        build (ver 1.2.8)
    JPEG:                        build (ver 90)
    WEBP:                        build (ver 0.3.1)
    PNG:                         build (ver 1.6.19)
    TIFF:                        build (ver 42 - 4.0.2)
    JPEG 2000:                   build (ver 1.900.1)
    OpenEXR:                     build (ver 1.7.1)
    GDAL:                        NO

  Video I/O:
    Video for Windows:           YES
    DC1394 1.x:                  NO
    DC1394 2.x:                  NO
    FFMPEG:                      YES (prebuilt binaries)
      codec:                     YES (ver 56.41.100)
      format:                    YES (ver 56.36.101)
      util:                      YES (ver 54.27.100)
      swscale:                   YES (ver 3.1.101)
      resample:                  NO
      gentoo-style:              YES
    GStreamer:                   NO
    OpenNI:                      NO
    OpenNI PrimeSensor Modules:  NO
    OpenNI2:                     NO
    PvAPI:                       NO
    GigEVisionSDK:               NO
    DirectShow:                  YES
    Media Foundation:            NO
    XIMEA:                       NO
    Intel PerC:                  NO

  Parallel framework:            Concurrency

  Other third-party libraries:
    Use IPP:                     9.0.1 [9.0.1]
         at:                     C:/compile/opencv/3rdparty/ippicv/unpack/ippicv_win
    Use IPP Async:               NO
    Use Eigen:                   NO
    Use Cuda:                    YES (ver 8.0)
    Use OpenCL:                  YES
    Use custom HAL:              NO

  NVIDIA CUDA
    Use CUFFT:                   YES
    Use CUBLAS:                  NO
    USE NVCUVID:                 NO
    NVIDIA GPU arch:             30 35 50 52 60 61
    NVIDIA PTX archs:            30
    Use fast math:               NO

  OpenCL:
    Version:                     dynamic
    Include path:                C:/compile/opencv/3rdparty/include/opencl/1.2
    Use AMDFFT:                  NO
    Use AMDBLAS:                 NO

  Python 2:
    Interpreter:                 C:/Python27/python.exe (ver 2.7.13)
    Libraries:                   C:/Python27/libs/python27.lib (ver 2.7.13)
    numpy:                       C:/Python27/lib/site-packages/numpy/core/include (ver 1.11.3)
    packages path:               C:/Python27/Lib/site-packages

  Python 3:
    Interpreter:                 NO

  Python (for build):            C:/Python27/python.exe

  Java:
    ant:                         NO
    JNI:                         C:/Program Files/Java/jdk1.8.0_161/include C:/Program Files/Java/jdk1.8.0_161/include/win32 C:/Program Files/Java/jdk1.8.0_161/include
    Java wrappers:               NO
    Java tests:                  NO

  Matlab:                        Matlab not found or implicitly disabled

  Documentation:
    Doxygen:                     NO
    PlantUML:                    NO

  Tests and samples:
    Tests:                       YES
    Performance tests:           NO
    C/C++ Examples:              NO

  Install path:                  C:/compile/opencv/build/install

  cvconfig.h is in:              C:/compile/opencv/build
-----------------------------------------------------------------

Configuring done
Generating done

Notice for gencode

CUDA NVCC target flags: -gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_30,code=compute_30

build

Open OpenCV.sln with VS 2015 and build release version.

this may take hours to finish.

build success

errors

opencv build errors

possible solutions

With BUILD_PERF_TESTS and BUILD_TESTS disabled, I managed to build OpenCV 3.1 with CUDA 8.0 on Windows 10 with VS2015 x64 arch target. Without building test/performance modules, the build process costs less time as well : )

I actually got it to work both on my laptop and my desktop (GTX960M and GTX970 respectively) running with OpenCV 3.2 and the latest version of CUDA 8.0 for Win10 in Visual Studio 2015 Community! What I did was to enable WITH_CUBLAS as well as WITH_CUDA. I also turned off BUILD_PERF_TESTS and BUILD_TESTS. The configuration was built using the Visual Studio 14 2015 C++ compiler.

my solution:

disable `BUILD_PERF_TESTS`

opencv disable build perf tests

Configure and build again; this time it took only about 1 minute.

Build results after the errors are fixed:
after errors fixed

demo

cuda-module

The OpenCV GPU module is written using CUDA, and therefore benefits from the CUDA ecosystem.

The GPU module includes the class cv::cuda::GpuMat, the primary container for data kept in GPU memory. Its interface is very similar to cv::Mat, its CPU counterpart. All GPU functions receive GpuMat as input and output arguments, which allows invoking several GPU algorithms without downloading the data in between. The GPU module API is kept as similar to the CPU interface as possible, so developers familiar with OpenCV on the CPU can start using the GPU straight away.

The GPU module is designed as a host-level API. This means that if you have pre-compiled OpenCV GPU binaries, you are not required to have the CUDA Toolkit installed or to write any extra code to make use of the GPU.

CMakeLists.txt

find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc features2d calib3d
    cudaarithm cudabgsegm cudafilters cudaimgproc cudawarping cudafeatures2d # cuda-enabled modules
)
MESSAGE( [Main] " OpenCV_INCLUDE_DIRS = ${OpenCV_INCLUDE_DIRS}")
MESSAGE( [Main] " OpenCV_LIBS = ${OpenCV_LIBS}")

demo.cpp

In the sample below, an image is loaded from a local file, uploaded to the GPU, thresholded, then downloaded and displayed.

#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudabgsegm.hpp>
#include <opencv2/cudafilters.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudafeatures2d.hpp>

int test_opencv_gpu()
{
    try
    {
        cv::Mat src_host = cv::imread("file.png", CV_LOAD_IMAGE_GRAYSCALE);
        cv::cuda::GpuMat dst, src;
        src.upload(src_host);

        cv::cuda::threshold(src, dst, 128.0, 255.0, CV_THRESH_BINARY);

        cv::Mat result_host;
        dst.download(result_host);

        cv::imshow("Result", result_host);
        cv::waitKey();
    }
    catch (const cv::Exception& ex)
    {
        std::cout << "Error: " << ex.what() << std::endl;
    }
    return 0;
}

cpu vs gpu time cost

  • (1) For ORB feature matching between images of moderate resolution, the CPU version runs faster than the GPU version (uploading the images to the GPU costs time).
  • (2) For higher-resolution images, or on machines whose GPU outclasses the CPU (e.g. the Nvidia Jetson series), the GPU version of ORB is more efficient than the CPU version.

problems

(1) With the CUDA build of opencv/caffe, creating the first network is very time-consuming; subsequent networks are created quickly.
(2) The OpenCV GPU code seemed slower than the CPU code, costing about 20 s extra at first start. (In fact this was caused by a mismatch between the compiled caffe architectures and the GPU's compute capability.)

reasons

Your problem is that CUDA needs to initialize! This generally takes several seconds.

Why is the first function call slow?
Because of initialization overhead: on the first GPU function call, the CUDA Runtime API is initialized implicitly.

The first GPU function call always takes more time, because CUDA initializes a context for the device.
The following calls will be faster.

These are NOT the reasons:
(1) CPU clock speed is 10x faster than GPU clock speed.
(2) Memory transfer times between host (CPU) and device (GPU) (uploading/downloading data).

deploy

runtime errors

opencv/caffe compiled on a gtx 1060 fails at runtime on a gtx 970m:

im2col.cu Check failed: error == cudaSuccess (8 vs. 0) invalid device function

    gtx 1060   sm_61
    gtx 970m   sm_52

im2col is a caffe source file; the error means the executable contains no code for the gtx 970m's compute capability.

reasons

see what-is-the-purpose-of-using-multiple-arch-flags-in-nvidias-nvcc-compiler

Roughly speaking, the code compilation flow goes like this: CUDA C/C++ device code source --> PTX --> SASS
The virtual architecture (e.g. compute_20, whatever is specified by -arch compute…) determines what type of PTX code will be generated. The additional switches (e.g. -code sm_21) determine what type of SASS code will be generated. SASS is actually executable object code for a GPU (machine language). An executable can contain multiple versions of SASS and/or PTX, and there is a runtime loader mechanism that will pick appropriate versions based on the GPU actually being used.

win7/win10 deploy

  • compile opencv caffe on windows 10 for GTX 1060
  • deploy on windows 7 for GTX 1080 Ti successfully

For win7, if we install 398.82-desktop-win8-win7-64bit-international-whql.exe, errors may occur:

> nvidia-smi.exe 
Failed to initialize NVML: Unknown error

Solution: use the older driver 385.69.

linux/windows performance

(1) The API averages 3 ms on Linux; the same code averages 14 ms on Windows.
(2) Enabling code optimization in the VS build makes nearly a 5x difference: 125 ms vs 25 ms.
(3) The CMake RELEASE build type already enables code optimization (-O3) by default.

History

  • 20180713: created.

Guide

set_mode

Caffe fails to use GPU in a new thread ???
see here

the `Caffe::mode_` variable that controls this is thread-local,
so ensure you’re calling `caffe.set_mode_gpu()` in each thread
before running any Caffe functions. That should solve your issue.
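
A minimal pycaffe sketch of that advice (the prototxt/caffemodel paths are hypothetical placeholders; the point is that caffe.set_mode_gpu() runs inside the worker thread, before any forward pass):

import threading
import caffe

def worker():
    # Caffe::mode_ is thread-local: select GPU mode in *this* thread
    caffe.set_mode_gpu()
    caffe.set_device(0)
    # hypothetical model files, for illustration only
    net = caffe.Net('deploy.prototxt', 'weights.caffemodel', caffe.TEST)
    net.forward()

t = threading.Thread(target=worker)
t.start()
t.join()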

Caffe set_mode GPU does not take effect across threads:
if GPU mode is set in the main thread while a worker thread runs the
network for detection, GPU mode does not apply there; the worker thread
silently falls back to CPU mode, which is about 10x slower than the GPU.

Solution: call set_mode in the worker thread, then run the network there.
(1) Networks can be created in the main thread; static networks live in
global static storage and worker threads can use them directly.
(2) To detect in a worker thread, call set_mode in that worker thread
and then run the network.

Conclusions:
(1) caffe's set_mode must be called in the same thread that runs forward
on the nets; otherwise CPU mode is used by default and detection is very slow.
(2) caffe's nets can be initialized either in the main thread or in a worker thread.

code example

#include <iostream>
#include <string>
#include <thread>

#include <gtest/gtest.h>
#include <glog/logging.h>

#include <boost/date_time/posix_time/posix_time.hpp>

// opencv
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

using namespace std;

#include "algorithm/algorithm.h"
using namespace kezunlin::algorithm;

#pragma region net-demo

void topwire_demo(bool run_in_worker_thread)
{
    if (run_in_worker_thread) {
        CaffeApi::set_mode(true, 0, 1234); // set in worker thread-1, use GPU-0
    }

    // do net detect
    // ...
}

void railway_demo(bool run_in_worker_thread)
{
    if (run_in_worker_thread) {
        CaffeApi::set_mode(true, 0, 1234); // set in worker thread-1, use GPU-0
    }

    // do net detect
    // ...
}

void sidewall_demo(bool run_in_worker_thread)
{
    if (run_in_worker_thread) {
        CaffeApi::set_mode(true, 0, 1234); // set in worker thread-1, use GPU-0
    }

    // do net detect
    // ...
}

void lockcatch_demo(bool run_in_worker_thread)
{
    if (run_in_worker_thread) {
        CaffeApi::set_mode(true, 0, 1234); // set in worker thread-1, use GPU-0
    }

    // do net detect
    // ...
}

#pragma endregion


#pragma region worker-thread-demo

void worker_thread_topwire_demo(bool run_in_worker_thread)
{
    std::thread thr(topwire_demo, run_in_worker_thread);
    thr.join();
}

void worker_thread_railway_demo(bool run_in_worker_thread)
{
    std::thread thr(railway_demo, run_in_worker_thread);
    thr.join();
}

void worker_thread_sidewall_demo(bool run_in_worker_thread)
{
    std::thread thr(sidewall_demo, run_in_worker_thread);
    thr.join();
}

void worker_thread_lockcatch_demo(bool run_in_worker_thread)
{
    std::thread thr(lockcatch_demo, run_in_worker_thread);
    thr.join();
}

#pragma endregion

enum DETECT_TYPE {
    SET_IN_MAIN_DETECT_IN_MAIN,     // set_mode in main thread, detect in main thread: ~40 ms, GPU used
    SET_IN_WORKER_DETECT_IN_WORKER, // set_mode in worker thread, detect in worker thread: ~40 ms, GPU used
    SET_IN_MAIN_DETECT_IN_WORKER    // set_mode in main thread, detect in worker thread: ~400 ms, ~10x slower, GPU not used
};

void thread_demo()
{
    DETECT_TYPE detect_type = SET_IN_MAIN_DETECT_IN_MAIN;
    detect_type = SET_IN_WORKER_DETECT_IN_WORKER;
    detect_type = SET_IN_MAIN_DETECT_IN_WORKER;

    init_algorithm_api();

    switch (detect_type)
    {
    case SET_IN_MAIN_DETECT_IN_MAIN:
        topwire_demo(false);
        railway_demo(false);
        sidewall_demo(false);
        lockcatch_demo(false);
        break;
    case SET_IN_WORKER_DETECT_IN_WORKER:
        worker_thread_topwire_demo(true);
        worker_thread_railway_demo(true);
        worker_thread_sidewall_demo(true);
        worker_thread_lockcatch_demo(true);
        break;
    case SET_IN_MAIN_DETECT_IN_WORKER:
        worker_thread_topwire_demo(false);
        worker_thread_railway_demo(false);
        worker_thread_sidewall_demo(false);
        worker_thread_lockcatch_demo(false);
        break;
    default:
        break;
    }

    free_algorithm_api();
}

void test_algorithm_api()
{
    thread_demo();
}

TEST(algorithn_test, test_algorithm_api) {
    test_algorithm_api();
}

conclusions

  • SET_IN_MAIN_DETECT_IN_MAIN: set_mode in the main thread, detect in the main thread, ~40 ms, GPU used
  • SET_IN_WORKER_DETECT_IN_WORKER: set_mode in the worker thread, detect in the worker thread, ~40 ms, GPU used
  • SET_IN_MAIN_DETECT_IN_WORKER: set_mode in the main thread, detect in a worker thread, ~400 ms (about 10x slower), GPU not used

History

  • 20180712: created.

Code Example

#include <iostream>
#include <string>
#include <boost/shared_ptr.hpp>
#include <boost/enable_shared_from_this.hpp>

using namespace std;
using namespace boost;

/*
enable_shared_from_this has become part of C++ 11 standard.

https://blog.csdn.net/csfreebird/article/details/8282518
https://stackoverflow.com/questions/712279/what-is-the-usefulness-of-enable-shared-from-this
*/

class Y : public boost::enable_shared_from_this<Y>
{
public:

    Y()
    {
        cout << "Y::Y()" << endl;
    }

    ~Y()
    {
        cout << "Y::~Y()" << endl;
    }

    boost::shared_ptr<Y> f()
    {
        return shared_from_this();
    }

    boost::shared_ptr<Y> f_dangerous()
    {
        // leads to multiple releases of the same resource
        return boost::shared_ptr<Y>(this); // don't do this!
    }
};

void test_safe()
{
    boost::shared_ptr<Y> p(new Y);
    boost::shared_ptr<Y> q = p->f();
    assert(p == q);
    assert(!(p < q || q < p)); // p and q must share ownership

    /*
    Y::Y()
    Y::~Y()
    */
}

void test_dangerous()
{
    boost::shared_ptr<Y> p(new Y);
    boost::shared_ptr<Y> q = p->f_dangerous();
    assert(p == q);
    assert(!(p < q || q < p)); // p and q must share ownership

    /*
    Y::Y()
    Y::~Y()
    Y::~Y()   <- the destructor runs twice: double release
    */
}

int main()
{
    //test_safe();
    test_dangerous();
    return 0;
}

History

  • 20180523: created.

Code Example

#include <iostream>
#include <boost/thread.hpp>
#include <boost/bind.hpp>
#include <boost/asio.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>

/*
https://mmoaay.gitbooks.io/boost-asio-cpp-network-programming-chinese/content/Chapter3.html
https://www.boost.org/doc/libs/1_64_0/doc/html/boost_asio/tutorial/tuttimer2/src.html
sync: blocking, returns when the job is done
async: non-blocking, returns immediately
*/

// Timer.1 - Using a timer synchronously
int snyc_timer()
{
    boost::asio::io_service io;

    boost::asio::deadline_timer t(io, boost::posix_time::seconds(2));
    t.wait(); // blocking wait on the timer

    std::cout << "Hello, world!" << std::endl;

    return 0;
}

// Timer.2 - Using a timer asynchronously
void print2(const boost::system::error_code& /*e*/)
{
    std::cout << "thread #" << boost::this_thread::get_id() << std::endl;
    std::cout << "Hello, world!" << std::endl;
}

int asnyc_timer()
{
    boost::asio::io_service io;

    boost::asio::deadline_timer t(io, boost::posix_time::seconds(2));
    t.async_wait(&print2); // async, non-blocking, returns immediately
    /*
    The asio library provides a guarantee that callback handlers will only
    be called from threads that are currently calling io_service::run(),
    i.e. the handler runs in the thread that calls io_service::run().
    */

    std::cout << "[main thread] #" << boost::this_thread::get_id() << std::endl;
    std::cout << "here" << std::endl;

    io.run(); // sync: blocking, returns when the work is done

    return 0;
}

// Timer.3 - Binding arguments to a handler
void print(const boost::system::error_code& /*e*/,
           boost::asio::deadline_timer* t, int* count)
{
    if (*count < 5)
    {
        std::cout << *count << std::endl;
        ++(*count);

        t->expires_at(t->expires_at() + boost::posix_time::seconds(1));
        t->async_wait(boost::bind(print,
            boost::asio::placeholders::error, t, count));
    }
}

int asnyc_timer_with_params()
{
    boost::asio::io_service io;

    int count = 0;
    boost::asio::deadline_timer t(io, boost::posix_time::seconds(1));
    t.async_wait(boost::bind(print,
        boost::asio::placeholders::error, &t, &count));

    io.run();

    std::cout << "Final count is " << count << std::endl;

    return 0;
}

// Timer.4 - Using a member function as a handler
class printer
{
public:
    printer(boost::asio::io_service& io)
        : timer_(io, boost::posix_time::seconds(1)),
          count_(0)
    {
        timer_.async_wait(boost::bind(&printer::print, this));
    }

    ~printer()
    {
        std::cout << "Final count is " << count_ << std::endl;
    }

    void print()
    {
        if (count_ < 5)
        {
            std::cout << count_ << std::endl;
            ++count_;

            timer_.expires_at(timer_.expires_at() + boost::posix_time::seconds(1));
            timer_.async_wait(boost::bind(&printer::print, this));
        }
        else {
            std::cout << "print do nothing..." << std::endl;
        }
    }

private:
    boost::asio::deadline_timer timer_;
    int count_;
};

int asnyc_timer_with_class_method()
{
    boost::asio::io_service io;
    printer p(io);
    io.run();

    return 0;
}

// Timer.5 - Synchronising handlers in multithreaded programs

/*
The previous four tutorials avoided the issue of handler synchronisation
by calling the io_service::run() function from one thread only. As you
already know, the asio library provides a guarantee that callback handlers
will only be called from threads that are currently calling io_service::run().

Consequently, calling io_service::run() from only one thread ensures that
callback handlers cannot run concurrently.

But if threads A and B both call io_service::run(), A's handlers and B's
handlers may run in parallel (handlers are not thread-safe).

How do we keep handlers from running concurrently when io_service::run()
is called from multiple threads? Use a boost::asio::io_service::strand.

A boost::asio::strand guarantees that, for those handlers that are dispatched
through it, an executing handler will be allowed to complete before the next
one is started.

By wrapping the handlers using the same boost::asio::strand, we ensure that
they cannot execute concurrently: the same strand object serialises the
handlers it wraps, i.e. print1 and print2 below never run in parallel even
across two threads.
*/
class printer_sync
{
public:
    printer_sync(boost::asio::io_service& io)
        : strand_(io),
          timer1_(io, boost::posix_time::seconds(1)),
          timer2_(io, boost::posix_time::seconds(1)),
          count_(0)
    {
        timer1_.async_wait(strand_.wrap(boost::bind(&printer_sync::print1, this)));
        timer2_.async_wait(strand_.wrap(boost::bind(&printer_sync::print2, this)));
    }

    ~printer_sync()
    {
        std::cout << "Final count is " << count_ << std::endl;
    }

    void print1()
    {
        if (count_ < 10)
        {
            std::cout << "Timer 1: " << count_ << ",thread #" << boost::this_thread::get_id() << std::endl;
            ++count_;

            timer1_.expires_at(timer1_.expires_at() + boost::posix_time::seconds(1));
            timer1_.async_wait(strand_.wrap(boost::bind(&printer_sync::print1, this)));
        }
        else {
            std::cout << "Timer 1: else " << count_ << std::endl;
        }
    }

    void print2()
    {
        if (count_ < 10)
        {
            std::cout << "Timer 2: " << count_ << ",thread #" << boost::this_thread::get_id() << std::endl;
            ++count_;

            timer2_.expires_at(timer2_.expires_at() + boost::posix_time::seconds(1));
            timer2_.async_wait(strand_.wrap(boost::bind(&printer_sync::print2, this)));
        }
        else {
            std::cout << "Timer 2: else " << count_ << std::endl;
        }
    }

private:
    boost::asio::io_service::strand strand_; // wraps handlers
    boost::asio::deadline_timer timer1_;
    boost::asio::deadline_timer timer2_;
    int count_;
};

int sync_handlers()
{
    boost::asio::io_service io;
    printer_sync p(io);
    boost::thread t(boost::bind(&boost::asio::io_service::run, &io));
    std::cout << "[main thread] #" << boost::this_thread::get_id() << std::endl;

    std::cout << "[1] begin to run" << std::endl;
    io.run(); // blocking until the work is done
    std::cout << "[2] after run" << std::endl;
    t.join();
    std::cout << "[3] after join" << std::endl;
    return 0;
}

int main(int argc, char* argv[])
{
    //snyc_timer();
    //asnyc_timer();
    //asnyc_timer_with_params();
    //asnyc_timer_with_class_method();
    sync_handlers();
    return 0;
}

History

  • 20180523: created.

Thread Pool with Boost

#include <iostream>    //std::cout std::endl
#include <thread> //std::thread
#include <future> //std::future std::promise
#include <utility> //std::ref
#include <chrono> //std::chrono::seconds

#include <boost/thread.hpp>
#include <boost/bind.hpp>
#include <boost/asio.hpp>

class ThreadPool {
public:
    explicit ThreadPool(size_t size) : work_(io_service_) {
        for (size_t i = 0; i < size; ++i) {
            workers_.create_thread(
                boost::bind(&boost::asio::io_service::run, &io_service_));
        }
    }

    ~ThreadPool() {
        std::cout << "~ThreadPool" << std::endl;
        io_service_.stop(); // stop before join_all
        workers_.join_all();
    }

    // Add a new work item to the pool.
    template<class F>
    void Enqueue(F f) {
        io_service_.post(f); // async: returns immediately
    }

private:
    boost::thread_group workers_;
    boost::asio::io_service io_service_;
    boost::asio::io_service::work work_;
};

boost::mutex io_mutex;

void count(int id)
{
    for (int i = 0; i < 10; i++)
    {
        boost::mutex::scoped_lock lock(io_mutex);
        std::cout << id << ":" << i << std::endl;
    }
}

void test_thread()
{
    boost::thread thrd1(boost::bind(&count, 1));
    boost::thread thrd2(boost::bind(&count, 2));
    thrd1.join();
    thrd2.join();
}

void print(int i)
{
    boost::mutex::scoped_lock lock(io_mutex);
    std::cout << "print() #" << boost::this_thread::get_id() << std::endl;
    std::cout << "hello " << i << std::endl;
    boost::this_thread::sleep(boost::posix_time::seconds(1));
    std::cout << "world " << i << std::endl;
}

void test_thread_pool()
{
    // Create a thread pool of 4 worker threads.
    ThreadPool pool(4);

    // Queue a bunch of work items.
    for (int i = 0; i < 8; ++i) {
        pool.Enqueue(boost::bind(&print, i));
    }
}

void do_task(std::promise<int>& promiseObj) {
    boost::mutex::scoped_lock lock(io_mutex);
    std::cout << "Inside thread: " << std::this_thread::get_id() << std::endl;
    std::cout << "Inside thread: sleep 2 seconds... " << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(2));
    promiseObj.set_value(35);
}

void test_future()
{
    std::promise<int> promiseObj;
    std::future<int> futureObj = promiseObj.get_future();
    std::cout << "create thread..." << std::endl;
    std::thread th(do_task, std::ref(promiseObj));

    std::cout << "futureObj.get() block main thread." << std::endl;
    std::cout << futureObj.get() << std::endl;

    th.join();
    std::cout << "after join" << std::endl;
}

/*
std::bind
Arguments bound in advance must be passed as concrete variables or values,
and they are passed by value (use std::ref() to pass by reference).
Arguments not bound in advance are passed as std::placeholders, starting
from _1 and increasing; placeholders are pass-by-reference.
*/

int main(int argc, char* argv[])
{
    //test_thread();
    //test_thread_pool();
    test_future();
    return 0;
}

History

  • 20180523: created.

Guide

basic

ownership

  • shared ownership
  • exclusive ownership
  • upgrade ownership -> exclusive ownership

mutex

  • boost::mutex enable exclusive access to shared data.
  • boost::shared_mutex enable shared access to shared data.

lock

  • boost::shared_lock
  • boost::unique_lock
  • boost::upgrade_lock
  • boost::upgrade_to_unique_lock

Tips from difference-between-boostunique-lock-and-boostupgrade-lock

The difference between upgrade_lock and unique_lock is simple. An instance of unique_lock is acquiring a full exclusive ownership of a shared_mutex. This means that no one else can get any type of ownership while the unique_lock is alive.

Unlike the unique_lock, an instance of upgrade_lock acquires an upgrade ownership that is exclusive only amongst threads trying to get the same upgrade ownership. All other threads that try to get a shared ownership can acquire it without conflict until the upgrade_lock is upgraded to unique (with an instance of upgrade_to_unique_lock).

The upgrade_lock is useful when some of threads can be readers only and will not try to promote itself to writers. Otherwise (all readers may try to become writers at some point) upgrade_lock will operate as unique_lock.

conclusions

  • When thread-A holds a shared_lock, other threads cannot get a unique_lock or an upgrade_lock, but can get a shared_lock. (multiple readers)

  • When thread-A holds an upgrade_lock, other threads cannot get a unique_lock or an upgrade_lock, but can get a shared_lock.

  • An upgrade_lock can be upgraded via upgrade_to_unique_lock, which is then the same as thread-A holding a unique_lock.

  • When thread-A holds a unique_lock, other threads cannot get a unique_lock, upgrade_lock, or shared_lock. (single writer)

code example

#include <iostream>
#include <boost/thread.hpp>
#include <boost/bind.hpp>

typedef boost::shared_lock<boost::shared_mutex> read_lock_t;
typedef boost::unique_lock<boost::shared_mutex> write_lock_t;

typedef boost::upgrade_lock<boost::shared_mutex> upgrade_lock_t;
typedef boost::upgrade_to_unique_lock<boost::shared_mutex> upgrade_to_unique_lock_t;

boost::shared_mutex _access; // read-write access

typedef boost::unique_lock<boost::mutex> exclusive_lock_t;

int data = 0;

void readOnly()
{
    read_lock_t rdlock(_access);
    std::cout << data << std::endl;
}

void writeOnly()
{
    write_lock_t wtlock(_access);
    data = 100;
}

//=============================================
// reader and writer
//=============================================
void reader()
{
    read_lock_t lock(_access);
    // do work here, without anyone having exclusive access

    std::cout << data << std::endl;
}

void conditional_writer()
{
    upgrade_lock_t lock(_access);
    // do work here, without anyone having exclusive access

    bool conditional = true; // true/false
    if (conditional) {
        upgrade_to_unique_lock_t uniqueLock(lock);
        // do work here, but now you have exclusive access
        data = 100;
    }

    // do more work here, without anyone having exclusive access
}

void unconditional_writer()
{
    write_lock_t lock(_access);
    // do work here, with exclusive access
    data = 100;
}

int main(int argc, char* argv[])
{
    return 0;
}

History

  • 20180523: created.