network3.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""network3.py
~~~~~~~~~~~~~~

A Theano-based program for training and running simple neural
networks.

Supports several layer types (fully connected, convolutional, max
pooling, softmax), and activation functions (sigmoid, tanh, and
rectified linear units, with more easily added).

When run on a CPU, this program is much faster than network.py and
network2.py.  However, unlike network.py and network2.py it can also
be run on a GPU, which makes it faster still.

Because the code is based on Theano, the code is different in many
ways from network.py and network2.py.  However, where possible I have
tried to maintain consistency with the earlier programs.  In
particular, the API is similar to network2.py.  Note that I have
focused on making the code simple, easily readable, and easily
modifiable.  It is not optimized, and omits many desirable features.

This program incorporates ideas from the Theano documentation on
convolutional neural nets (notably,
http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's
implementation of dropout (https://github.com/mdenil/dropout ), and
from Chris Olah (http://colah.github.io ).

Written for Theano 0.6 and 0.7, needs some changes for more recent
versions of Theano.

With all N=50000 samples used for training: time(python) = 7 minutes; time(theano) = 1 minute.


But the big win is the ability to do fast symbolic differentiation, 
using a very general form of the backpropagation algorithm. 
This is extremely useful for applying stochastic gradient
descent to a wide variety of network architectures.
"""

#### Libraries
# Standard library
import cPickle
import gzip
import time
import copy

# Third-party libraries
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
#from theano.tensor.signal.downsample import max_pool_2d  # for version theano-0.7
from theano.tensor.signal.pool import pool_2d # for version theano-0.9

# Activation functions for neurons
def linear(z):
    """Identity activation: hand the weighted input back unchanged."""
    return z
def ReLU(z):
    """Rectified linear activation: ``T.maximum(0.0, z)``."""
    floor = 0.0
    return T.maximum(floor, z)
from theano.tensor.nnet import sigmoid
from theano.tensor import tanh

#### Load the MNIST data
def load_data_shared(filename="../data/mnist.pkl.gz",training_set_size=1000):
    """Load the pickled MNIST splits and wrap them in Theano shared variables.

    `filename` is a gzip-compressed pickle holding the tuple
    ``(training_data, validation_data, test_data)``; each split is an
    ``(x, y)`` pair.  According to the original author's notes x is
    float32 of shape (N, 784) and y is int64 of shape (N,).

    `training_set_size` is the number of leading examples kept.  Note that
    the same slice is applied to *all three* splits, not only to the
    training split.

    Returns a list ``[train, validation, test]`` in which every element is
    a pair ``(shared_x, y)``: ``shared_x`` is a shared variable of dtype
    ``theano.config.floatX`` and ``y`` is the shared label vector cast to
    int8 (a symbolic expression, not itself a shared variable).
    """
    print('loading data from {0} of #{1}'.format(filename, training_set_size))
    # The context manager closes the archive even if unpickling raises.
    with gzip.open(filename, 'rb') as f:
        training_data, validation_data, test_data = cPickle.load(f)

    def shared(data):
        """Place one ``(x, y)`` split into shared variables.

        This allows Theano to copy the data to the GPU, if one is available.
        """
        # floatX defaults to float64; set floatX = float32 when running
        # (original author's note).
        shared_x = theano.shared(
            np.asarray(data[0][:training_set_size], dtype=theano.config.floatX),
            borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1][:training_set_size], dtype=theano.config.floatX),
            borrow=True)

        # When storing data on the GPU it has to be stored as floats,
        # therefore the labels are stored as ``floatX`` as well.  During the
        # computations they are needed as ints (they are used as indices),
        # so the cast expression is returned instead of ``shared_y``.  The
        # int8 dtype matches the ``T.bvector("y")`` input used by Network.
        return shared_x, T.cast(shared_y, 'int8')

    return [shared(training_data), shared(validation_data), shared(test_data)]

def load_data_expanded(filename="../data/mnist_expanded.pkl.gz",training_set_size=1000):
    """Same as `load_data_shared`, but defaulting to the expanded MNIST archive."""
    expanded_splits = load_data_shared(filename=filename,
                                       training_set_size=training_set_size)
    return expanded_splits

#### Main class used to construct and train networks
class Network(object):
    """Feed-forward network built from a list of layer objects, trained by SGD.

    The layers are chained symbolically at construction time: every layer's
    ``set_inpt`` receives the previous layer's ``output`` and
    ``output_dropout`` expressions.
    """

    def __init__(self, layers, mini_batch_size):
        """Takes a list of `layers`, describing the network architecture, and
        a value for the `mini_batch_size` to be used during training
        by stochastic gradient descent.

        At least two layers are required.
        """
        self.layers = layers
        assert len(self.layers)>=2
        self.mini_batch_size = mini_batch_size
        self.params = [param for layer in self.layers for param in layer.params]
        self.x = T.matrix("x")   # symbolic mini-batch inputs
        self.y = T.bvector("y")  # symbolic mini-batch labels (int8 vector)

        # The first layer receives the raw input on both the plain and the
        # dropout path.
        self.layers[0].set_inpt(self.x, self.x, self.mini_batch_size)
        for prev_layer, layer in zip(self.layers[:-1], self.layers[1:]):
            layer.set_inpt(prev_layer.output, prev_layer.output_dropout,
                           self.mini_batch_size)

        self.output = self.layers[-1].output
        self.output_dropout = self.layers[-1].output_dropout

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            validation_data, test_data, lmbda=0.0,
            no_improvement_in_n=20,use_constant_eta=True, # by default eta is not varied
            eta_shrink_times=10,eta_descrease_factor = 0.0001):
        """Train the network using mini-batch stochastic gradient descent.

        `training_data`, `validation_data` and `test_data` are ``(x, y)``
        pairs as returned by `load_data_shared`.  `epochs` is the maximum
        number of passes over the training data, `eta` the initial learning
        rate and `lmbda` the L2 regularization strength.

        Early stopping is checked on the last epoch of every block of
        `no_improvement_in_n` epochs: when validation accuracy has not
        improved for at least `no_improvement_in_n` epochs, training stops
        if `use_constant_eta` is true.  Otherwise the learning rate is set
        to ``eta / (1 + eta_descrease_factor * (epoch + 1))`` and training
        continues; after `eta_shrink_times` such adjustments training stops.

        Whenever validation accuracy improves, a deep copy of the network is
        kept, and after training it is pickled to ``best_model.pkl``.

        Returns ``(evaluation_costs, evaluation_accuracys, training_costs,
        training_accuracys, best_epoch)``; the four lists hold one entry per
        completed epoch.
        """
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        test_x, test_y = test_data

        # compute number of minibatches for training, validation and testing.
        # NOTE: the counts use the `mini_batch_size` argument while the
        # slices below use `self.mini_batch_size`; callers are expected to
        # pass the same value for both.
        num_training_batches = size(training_data)//mini_batch_size
        num_validation_batches = size(validation_data)//mini_batch_size
        num_test_batches = size(test_data)//mini_batch_size

        # define the (regularized) cost function, symbolic gradients, and updates
        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
        cost0 = self.layers[-1].cost(self)  # the last layer's cost reads net.y
        # The regularization term is divided by the number of training examples.
        cost = cost0 + 0.5*lmbda*l2_norm_squared/size(training_data)
        grads = T.grad(cost, self.params)

        # The learning rate lives in a shared variable so that it can be
        # changed after the training function has been compiled.
        shared_eta = theano.shared(eta,borrow=True)

        # The cast keeps every update in floatX (original author's note:
        # without floatX=float32 the product eta*grad comes out as float64).
        updates = [(param, param-T.cast(shared_eta*grad,dtype=theano.config.floatX))
                   for param, grad in zip(self.params, grads)]

        # define functions to train a mini-batch, and to compute the
        # cost and accuracy on training, validation and test mini-batches.
        i = T.lscalar() # mini-batch index

        def batch_givens(data_x, data_y):
            """Substitute the i-th mini-batch of (data_x, data_y) for x and y."""
            start = i*self.mini_batch_size
            stop = (i+1)*self.mini_batch_size
            return {self.x: data_x[start:stop], self.y: data_y[start:stop]}

        def compile_cost(data_x, data_y):
            """Compile a function: batch index -> cost on that batch."""
            return theano.function([i], cost,
                                   givens=batch_givens(data_x, data_y))

        def compile_accuracy(data_x, data_y):
            """Compile a function: batch index -> accuracy on that batch."""
            return theano.function([i], self.layers[-1].accuracy(self.y),
                                   givens=batch_givens(data_x, data_y))

        train_mb = theano.function(
            [i], cost, updates=updates,
            givens=batch_givens(training_x, training_y))
        train_mb_cost = compile_cost(training_x, training_y)
        train_mb_accuracy = compile_accuracy(training_x, training_y)
        validate_mb_cost = compile_cost(validation_x, validation_y)
        validate_mb_accuracy = compile_accuracy(validation_x, validation_y)
        test_mb_accuracy = compile_accuracy(test_x, test_y)
        self.test_mb_predictions = theano.function(
            [i], self.layers[-1].y_out,   # predicted labels for one mini-batch
            givens={
                self.x:
                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })

        evaluation_costs, evaluation_accuracys = [], []
        training_costs, training_accuracys = [], []

        # State for "no improvement in n epochs" early stopping.  Everything
        # read after the loop is initialised here so that a run with zero
        # epochs, or one that never beats the initial accuracy, does not
        # fail with a NameError.
        best_epoch = 0
        best_iteration = 0
        best_validation_accuracy = 0.0
        best_net = None
        test_accuracy = None
        cur_eta_shrink_times = 0
        iteration = 0

        # NOTE: the training data is not reshuffled between epochs (the
        # original author reports that an in-place shuffle through
        # get_value(borrow=True) worked on CPU but failed on GPU).
        for epoch in xrange(epochs):
            for minibatch_index in xrange(num_training_batches):
                # running count of training steps, reported every 1000 steps
                iteration = num_training_batches*epoch+minibatch_index
                if iteration % 1000 == 0:
                    print("Training mini-batch number {0}".format(iteration))
                train_mb(minibatch_index)

            # end of epoch: evaluate with the current weights
            validation_cost = np.mean(
                [validate_mb_cost(j) for j in xrange(num_validation_batches)])
            validation_accuracy = np.mean(
                [validate_mb_accuracy(j) for j in xrange(num_validation_batches)])
            print("\nEpoch {0}: validation accuracy {1:.2%}".format(epoch, validation_accuracy))

            train_cost = np.mean(
                [train_mb_cost(j) for j in xrange(num_training_batches)])
            train_accuracy = np.mean(
                [train_mb_accuracy(j) for j in xrange(num_training_batches)])

            # save the four per-epoch series that are returned for plotting
            evaluation_costs.append(validation_cost)
            evaluation_accuracys.append(validation_accuracy)
            training_costs.append(train_cost)
            training_accuracys.append(train_accuracy)

            # Strict improvement only: a tie does not reset best_epoch.
            if best_validation_accuracy - validation_accuracy < 0.0:
                print("This is the best validation accuracy to date.")
                best_validation_accuracy = validation_accuracy
                best_epoch = epoch
                best_iteration = iteration

                # keep a snapshot of the best network seen so far
                best_net = copy.deepcopy(self)

                # test accuracy that goes with the best validation accuracy
                if test_data:
                    test_accuracy = np.mean(
                        [test_mb_accuracy(j) for j in xrange(num_test_batches)])
                    print('The corresponding test accuracy is {0:.2%}'.format(test_accuracy))

            # Early stopping, checked on the last epoch of each block of
            # `no_improvement_in_n` epochs.
            end_of_stage = (epoch+1) % no_improvement_in_n == 0
            stalled = (epoch - best_epoch) >= no_improvement_in_n
            if end_of_stage and stalled:
                if use_constant_eta:
                    print('!'*100)
                    print('[HIT] Early stopping at epoch #{0},best_epoch #{1},iteration #{2},validation accuracy {3:.2%}'.format(epoch,best_epoch,best_iteration,best_validation_accuracy))
                    print('!'*100)
                    break

                # Variable learning rate: adjust eta instead of stopping,
                # until the allowed number of adjustments is used up.
                print('cur_eta_shrink_times = {0}'.format(cur_eta_shrink_times))
                if cur_eta_shrink_times >= eta_shrink_times:
                    print('+'*100)
                    print('[HIT] Eta shrink OK. at epoch #{0},best_epoch #{1},iteration #{2},validation accuracy {3:.2%}'.format(epoch,best_epoch,best_iteration,best_validation_accuracy))
                    print('+'*100)
                    break

                cur_eta_shrink_times += 1
                new_eta = eta/(1.0+eta_descrease_factor*(epoch+1))
                # Use the shared variable's own dtype so that set_value never
                # has to downcast.
                shared_eta.set_value(np.asarray(new_eta, dtype=shared_eta.dtype),
                                     borrow=True)

        # Persist the best snapshot.  The file is only opened when there is
        # something to write, so an existing model is never truncated.
        if best_net is not None:
            with open('best_model.pkl', 'wb') as fp:
                print('Saving best mode to best_model.pkl...')
                cPickle.dump(best_net, fp)
        else:
            print('No best network was recorded; best_model.pkl not written.')

        print("\nFinished training network.")
        print("Best validation accuracy of {0:.2%} obtained at best_epoch {1}".format(best_validation_accuracy, best_epoch))
        if test_accuracy is not None:
            print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))

        return evaluation_costs, evaluation_accuracys, training_costs, training_accuracys,best_epoch # for plot


#********************************************************
# load model and predict on test data
#********************************************************
def load_network_and_predict():
    """
    An example of how to load a trained model and use it
    to predict labels.

    Reads ``best_model.pkl`` from the current directory, loads the MNIST
    test split through `load_data_shared`, prints the true and predicted
    labels of the first mini-batch, then prints the mean accuracy over all
    test mini-batches.  Returns nothing.
    """
    # SECURITY: unpickling can execute arbitrary code; only load model files
    # that you produced yourself.
    # The model is written with mode 'wb', so it is read back in binary
    # mode, and the handle is closed as soon as loading is done.
    with open('best_model.pkl', 'rb') as fp:
        net = cPickle.load(fp)

    # predict
    training_set_size = 50000
    _, _, test_data = load_data_shared(training_set_size=training_set_size)
    test_x,test_y = test_data

    # The layers were wired with the batch size stored on the network, so the
    # slices fed to it must have exactly that many rows.
    mini_batch_size = net.mini_batch_size
    num_test_batches = size(test_data)//mini_batch_size

    i = T.lscalar()
    # predicted labels for mini-batch i
    test_mb_predictions = theano.function(
        [i], net.layers[-1].y_out,
        givens={
            net.x:
            test_x[i*mini_batch_size: (i+1)*mini_batch_size]
        })
    # accuracy on mini-batch i
    test_mb_accuracy = theano.function(
        [i], net.layers[-1].accuracy(net.y),
        givens={
            net.x:
            test_x[i*mini_batch_size: (i+1)*mini_batch_size],
            net.y:
            test_y[i*mini_batch_size: (i+1)*mini_batch_size]
        })

    test_predictions = test_mb_predictions(0)
    # Single-argument prints keep the output identical to the former
    # two-argument print statements (hence the double space).
    print('real values of first 10:  %s' % (test_y[:10].eval(),))
    print('predictions of first 10:  %s' % (test_predictions,))

    test_accuracy = np.mean( [test_mb_accuracy(j) for j in xrange(num_test_batches)] )
    print('test_accuracy  %s' % (test_accuracy,))
    
#********************************************************
# end of predict
#********************************************************


#### Define layer types

class ConvPoolLayer(object):
    """A convolutional layer fused with the max-pooling layer that follows it.

    Keeping the two separate would be more flexible, but here they are
    always used as a pair, and combining them keeps the code short.
    """

    def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
                 activation_fn=sigmoid):
        """Store the layer geometry and create the weights and biases.

        `filter_shape` is a 4-tuple: (number of filters, number of input
        feature maps, filter height, filter width).

        `image_shape` is a 4-tuple: (mini-batch size, number of input
        feature maps, image height, image width).

        `poolsize` is a 2-tuple with the y and x pooling sizes.

        Worked example for two stacked layers (m = mini-batch size):

            layer 1: image_shape=(m,1,28,28), filter_shape=(20,1,5,5)
                     conv -> (m,20,24,24), pool (2,2) -> (m,20,12,12)
                     w has shape (20,1,5,5), b has shape (20,)
            layer 2: image_shape=(m,20,12,12), filter_shape=(40,20,5,5)
                     conv -> (m,40,8,8), pool (2,2) -> (m,40,4,4)
                     w has shape (40,20,5,5), b has shape (40,)

        The first entry of `filter_shape` is the number of feature maps the
        layer produces; the remaining three describe the local receptive
        field of one output pixel (input feature maps x height x width).
        """
        assert image_shape[1] == filter_shape[1] # input feature maps must agree
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        self.poolsize = poolsize
        self.activation_fn=activation_fn

        # Each hidden unit has "input feature maps * filter height * filter
        # width" inputs.
        fan_in = np.prod(filter_shape[1:])
        # Each unit in the lower layer receives a gradient from "output
        # feature maps * filter height * filter width" / pooling size.
        fan_out = (filter_shape[0] * np.prod(filter_shape[2:]) // np.prod(poolsize))

        # Uniform bound sqrt(6/(fan_in+fan_out)); four times wider for sigmoid.
        limit = np.sqrt(6./(fan_in+fan_out))
        if activation_fn == sigmoid:
            limit *= 4

        # Weights are drawn before biases.
        initial_w = np.random.uniform(low=-limit, high=limit, size=filter_shape)
        self.w = theano.shared(
            np.asarray(initial_w, dtype=theano.config.floatX),
            borrow=True)

        # One bias per output feature map, drawn from a standard normal.
        initial_b = np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],))
        self.b = theano.shared(
            np.asarray(initial_b, dtype=theano.config.floatX),
            borrow=True)

        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Build the symbolic output of the layer from its input `inpt`.

        `inpt` is reshaped to `image_shape`, convolved with `w`, max-pooled
        with `poolsize`, shifted by the per-feature-map bias and passed
        through the activation function.  `inpt_dropout` and
        `mini_batch_size` are accepted for interface compatibility with the
        other layer types and are not used here.

        Example shapes (m = mini-batch size):

            layer 1: inpt (m,784) -> (m,1,28,28) -> conv (m,20,24,24)
                     -> pooled (m,20,12,12) -> output (m,20,12,12)
            layer 2: inpt (m,20,12,12) -> conv (m,40,8,8)
                     -> pooled (m,40,4,4) -> output (m,40,4,4)
        """
        self.inpt = inpt.reshape(self.image_shape)

        feature_maps = conv.conv2d(
            input=self.inpt, filters=self.w,
            filter_shape=self.filter_shape, image_shape=self.image_shape)
        pooled = pool_2d(input=feature_maps, ws=self.poolsize, ignore_border=True)

        # Turn b from shape (filters,) into a broadcastable
        # (1, filters, 1, 1) pattern so that each feature map gets its own bias.
        bias = self.b.dimshuffle('x', 0, 'x', 'x')

        self.output = self.activation_fn(pooled + bias)
        self.output_dropout = self.output # no dropout in the convolutional layers

class FullyConnectedLayer(object):
    """Dense layer computing ``activation_fn(inpt . w + b)``, with optional
    dropout applied to its input on the training path."""

    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
        """Create a layer mapping `n_in` inputs to `n_out` outputs.

        `activation_fn` is applied to the weighted input and `p_dropout` is
        the dropout probability handed to `dropout_layer`.
        """
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout

        # Uniform bound sqrt(6/(n_in+n_out)); four times wider for sigmoid.
        limit = np.sqrt(6./(n_in+n_out))
        if activation_fn == sigmoid:
            limit *= 4

        # Initialize weights (uniform) first, then biases (standard normal).
        w_values = np.random.uniform(low=-limit, high=limit, size=(n_in, n_out))
        self.w = theano.shared(
            np.asarray(w_values, dtype=theano.config.floatX),
            name='w', borrow=True)

        b_values = np.random.normal(loc=0.0, scale=1.0, size=(n_out,))
        self.b = theano.shared(
            np.asarray(b_values, dtype=theano.config.floatX),
            name='b', borrow=True)

        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Build the two symbolic outputs of the layer.

        (1) `inpt` -> ``output``: the plain path, used for validation and
            testing.
        (2) `inpt_dropout` -> ``output_dropout``: the dropout path, used for
            training (output_dropout -> cost -> gradients -> params).

        Both inputs are reshaped to ``(mini_batch_size, n_in)`` first.

        Example with two conv-pool layers followed by [640, 30, 10]
        (m = mini-batch size):

            the incoming expression has shape (m,40,4,4) and is flattened
            to (m,640), i.e. m samples with 640 activations each;
            a2(m,30) = sigmoid( a1(m,640) . w(640,30) + b(30,) )

        The following softmax layer then takes a2(m,30) as its input.
        """
        flat_shape = (mini_batch_size, self.n_in)

        # plain path
        self.inpt = inpt.reshape(flat_shape)
        weighted = T.dot(self.inpt, self.w) + self.b
        self.output = self.activation_fn(weighted)

        # dropout path
        self.inpt_dropout = dropout_layer(inpt_dropout.reshape(flat_shape),
                                          self.p_dropout)
        weighted_dropout = T.dot(self.inpt_dropout, self.w) + self.b
        self.output_dropout = self.activation_fn(weighted_dropout)

class SoftmaxLayer(object):
    """Softmax output layer that also provides the cost and accuracy
    expressions used by the network."""

    def __init__(self, n_in, n_out, p_dropout=0.0):
        """Create a softmax layer mapping `n_in` inputs to `n_out` classes.

        Weights and biases both start at zero.
        """
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = softmax # always softmax for this layer
        self.p_dropout = p_dropout

        # Initialize weights and biases with zeros.
        zero_w = np.zeros((n_in, n_out), dtype=theano.config.floatX)
        zero_b = np.zeros((n_out,), dtype=theano.config.floatX)
        self.w = theano.shared(zero_w, name='w', borrow=True)
        self.b = theano.shared(zero_b, name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Build the symbolic outputs of the layer.

        (1) `inpt` -> ``output`` / ``y_out``: the plain path, used for
            validation and testing.
        (2) `inpt_dropout` -> ``output_dropout``: the dropout path, used for
            training (output_dropout -> cost -> gradients -> params).

        Both inputs are reshaped to ``(mini_batch_size, n_in)``.  A whole
        mini-batch is processed at once as ``a . w + b`` with one sample
        per row.

        Example for a [784, 30, 10] network (m = mini-batch size):

            hidden:  a2(m,30) = sigmoid( a1(m,784) . w(784,30) + b(30,) )
            softmax: a3(m,10) = softmax( a2(m,30) . w(30,10) + b(10,) )

        ``output`` holds one row of class probabilities per sample and
        ``y_out`` the index of the largest entry in each row, i.e. the
        predicted label, as a vector of length m.
        """
        flat_shape = (mini_batch_size, self.n_in)

        # plain path
        self.inpt = inpt.reshape(flat_shape)
        self.output = self.activation_fn(T.dot(self.inpt, self.w) + self.b)

        # predicted class per row: argmax along axis 1
        self.y_out = T.argmax(self.output, axis=1)

        # dropout path
        self.inpt_dropout = dropout_layer(inpt_dropout.reshape(flat_shape),
                                          self.p_dropout)
        self.output_dropout = self.activation_fn(
            T.dot(self.inpt_dropout, self.w) + self.b)

    def cost(self, net):
        """Return the log-likelihood cost.

        The cost is built from ``output_dropout`` (the training path).  The
        network is passed in so that its label vector ``net.y`` can be read.
        For each row r of the mini-batch the probability in column
        ``net.y[r]`` is picked and the result is the negated mean of the
        logarithms of those probabilities.

        Numpy analogue of the row/column indexing:

            a = np.array([[0,   0.8, 0  ],
                          [0.9, 0,   0  ],
                          [0,   0,   0.7]])
            y = [1, 0, 2]
            a[[0, 1, 2], y]   # -> array([0.8, 0.9, 0.7])
        """
        batch_rows = T.arange(net.y.shape[0])
        picked = self.output_dropout[batch_rows, net.y]
        return -T.mean(T.log(picked))

    def accuracy(self, y):
        """Return the accuracy for the mini-batch.

        Compares the labels `y` with the predictions ``y_out`` (plain path)
        and returns the mean of the element-wise equality.

        Numpy analogue for a mini-batch of five:

            y     = np.array([2, 1, 7, 8, 9])
            y_out = np.array([2, 1, 7, 6, 9])
            np.mean(np.equal(y, y_out))   # [1,1,1,0,1] -> 0.8
        """
        hits = T.eq(y, self.y_out)
        return T.mean(hits)


#### Miscellanea
def size(data):
    """Return the number of examples in the dataset `data`.

    `data[0]` must offer ``get_value`` (as a Theano shared variable does);
    the answer is the length of the first axis of the array it returns.
    """
    inputs = data[0]
    values = inputs.get_value(borrow=True)
    return values.shape[0]

def dropout_layer(layer, p_dropout):
    """Apply inverted dropout to the symbolic expression `layer`.

    Every element of `layer` is multiplied by a 0/1 mask drawn from a
    binomial with success probability ``1 - p_dropout``, and the result is
    divided by that same retain probability.  Because the scaling happens
    here, on the training path, the test-time outputs of a layer need no
    adjustment ("inverted dropout"; see
    http://www.jianshu.com/p/ba9ca3b07922).

    The random stream is seeded from ``np.random.RandomState(0)``, so every
    call starts from the same seed.
    """
    seed = np.random.RandomState(0).randint(999999)
    srng = shared_randomstreams.RandomStreams(seed)

    keep_prob = 1. - p_dropout
    # the mask is generated in floatX so that the product keeps the dtype
    keep_mask = srng.binomial(n=1, p=keep_prob, size=layer.shape,
                              dtype=theano.config.floatX)
    dropped = layer*keep_mask
    return dropped/keep_prob

Test Network3

# Smoke test: build a conv/pool x2 + fully connected + softmax network and
# train it for a few epochs on a very small training subset.
import random
import numpy as np
# Fix the seeds of Python's and NumPy's global random generators.
random.seed(12345678)
np.random.seed(12345678)

#from ke_network3 import *
epochs = 3
training_set_size = 100
mini_batch_size = 10
train_data,val_data,test_data = load_data_shared(training_set_size=training_set_size)

# for conv pool layer
# First stage: 1 input feature map of 28x28, 20 filters of 5x5, 2x2 pooling.
image_shape=(mini_batch_size,1,28,28)
filter_shape=(20,1,5,5)
poolsize=(2,2)
convpool_layer1 = ConvPoolLayer(image_shape=image_shape,filter_shape=filter_shape, poolsize=poolsize)
# Flattened size after the first stage; overwritten below and therefore only
# relevant to the single-conv-layer variant (see the commented-out Network).
# Presumably (28-5+1)/2 = 12 per side -- TODO confirm against ConvPoolLayer.
n_in = 20*12*12


# Second stage: 20 input feature maps of 12x12, 40 filters of 5x5, 2x2 pooling.
image_shape=(mini_batch_size,20,12,12)
filter_shape=(40,20,5,5)
poolsize=(2,2)
# Flattened size fed to the fully connected layer;
# presumably (12-5+1)/2 = 4 per side -- TODO confirm against ConvPoolLayer.
n_in = 40*4*4
convpool_layer2 = ConvPoolLayer(image_shape=image_shape,filter_shape=filter_shape, poolsize=poolsize)


full_layer = FullyConnectedLayer(n_in=n_in,n_out=30)
softmax_layer = SoftmaxLayer(n_in=30,n_out=10)
#net = Network([convpool_layer1,full_layer,softmax_layer],10)
# NOTE(review): the literal 10 presumably is the mini-batch size and duplicates
# `mini_batch_size` above -- confirm against Network's signature.
net = Network([convpool_layer1,convpool_layer2,full_layer,softmax_layer],10)
# NOTE(review): 0.3 is presumably the learning rate -- confirm against
# Network.SGD.  lmbda=0 is passed explicitly.
net.SGD(train_data,epochs,mini_batch_size,0.3,val_data,test_data,lmbda=0)

updates TensorType(float32, 4D) TensorType(float32, 4D)
updates TensorType(float32, vector) TensorType(float32, vector)
updates TensorType(float32, 4D) TensorType(float32, 4D)
updates TensorType(float32, vector) TensorType(float32, vector)
updates TensorType(float32, matrix) TensorType(float32, matrix)
updates TensorType(float32, vector) TensorType(float32, vector)
updates TensorType(float32, matrix) TensorType(float32, matrix)
updates TensorType(float32, vector) TensorType(float32, vector)
Training mini-batch number 0
Epoch 0: validation accuracy 10.00%

This is the best validation accuracy to date.
The corresponding test accuracy is 8.00%
Epoch 1: validation accuracy 10.00%

This is the best validation accuracy to date.
The corresponding test accuracy is 8.00%
Epoch 2: validation accuracy 10.00%

This is the best validation accuracy to date.
The corresponding test accuracy is 8.00%

Finished training network.
Best validation accuracy of 10.00% obtained at iteration 29
Corresponding test accuracy of 8.00%





([2.2949765, 2.2951121, 2.2958748],
 [0.10000000000000001, 0.10000000000000001, 0.10000000000000001],
 [2.2682509, 2.2655275, 2.2644706],
 [0.13, 0.13, 0.13])

Basic Test of Network3.py

(1) load data

from ke_network3 import *
filename="../data/mnist.pkl.gz"
filename="../data/mnist_expanded.pkl.gz"
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = cPickle.load(f)
f.close()

x = training_data[0] # (m,784)
y = training_data[1] # (m,)
print type(x),type(y)
print type(x[0]),type(y[0])
print x.shape,y.shape
print x[0].shape,y[0].shape
x2 = x[:10,]

set_size = 10
x = training_data[0] # float32  (50000, 784)
y = training_data[1] # int64  (50000,)

training_x = theano.shared( training_data[0][:set_size,],  borrow=True) #float32
training_y = theano.shared( np.asarray(training_data[0][:set_size,],dtype='int8'),  borrow=True) # int8
#training_x2 = theano.shared(np.asarray(training_data[0], dtype=theano.config.floatX), borrow=True) # float64
print training_x.type
print training_y.type
#print training_x2.type

# 乘法可能会改变TensorVariable的类型
new_x = training_x*0.1 # float32--->float64
print training_x.type,new_x.type

<type 'numpy.ndarray'> <type 'numpy.ndarray'>
<type 'numpy.ndarray'> <type 'numpy.int64'>
(50, 784) (50,)
(784,) ()
TensorType(float32, matrix)
TensorType(int8, matrix)
TensorType(float32, matrix) TensorType(float64, matrix)

(2) dimshuffle b to match pooled_out

# Demonstrates how a per-feature-map bias vector is reshaped with dimshuffle
# so that it broadcasts against a 4D (batch, feature map, row, col) tensor.
pooled_out = np.arange(18).reshape(1,2,3,3)
print pooled_out
b = np.array([0.0,1.0],dtype='float32') # [0,1]

# shuffle b to match pooled_out
sb = theano.shared(np.asarray(b,dtype='float32')) 
# 'x' inserts a broadcastable axis: (2,) -> (1, 2, 1, 1)
y = sb.dimshuffle('x', 0, 'x', 'x')  # TensorVariable TensorType(float32, (True, False, True, True))
# Each of the 2 feature maps gets its own bias (b0, b1) added.
print type(y),y.type,y.shape.eval()

# Evaluate the symbolic expression to a concrete NumPy array.
b_value = y.eval()
print b_value
# Bare expression: shown as the cell's result when run interactively.
pooled_out + b_value

[[[[ 0  1  2]
   [ 3  4  5]
   [ 6  7  8]]

  [[ 9 10 11]
   [12 13 14]
   [15 16 17]]]]
<class 'theano.tensor.var.TensorVariable'> TensorType(float32, (True, False, True, True)) [1 2 1 1]
[[[[ 0.]]

  [[ 1.]]]]





array([[[[  0.,   1.,   2.],
         [  3.,   4.,   5.],
         [  6.,   7.,   8.]],

        [[ 10.,  11.,  12.],
         [ 13.,  14.,  15.],
         [ 16.,  17.,  18.]]]])

Reference

History

20180807: created.