# quacknet.convulationalBackpropagation

  1import numpy as np
  2
  3class CNNbackpropagation:
  4    def _ConvolutionDerivative(self, errorPatch, kernals, inputTensor, stride):
  5        """
  6        Compute gradients for conolutional layer weights, biases and input errors during backpropagation.
  7
  8        Args:
  9            errorPatch (ndarray): Error gradient from the next layer.
 10            kernals (ndarray): Kernals used during forward propagatation, shape (input channels, num kernels, kernel height, kernel width).
 11            inputTensor (ndarray): Input to the convolutional layer during forward propagation.
 12            stride (int): Stride length used during convolution.
 13        
 14        Returns:
 15            weightGradients (ndarray): Gradients of the loss with respect to kernels.
 16            biasGradients (ndarray): Gradients of the loss with respect to biases for each kernel.
 17            inputErrorTerms (ndarray): Error terms propagated to the previous layer.
 18        """
 19        ###################################        
 20        # gets the error gradient from the layer infront and it is a error patch
 21        # this error patch is the same size as what the convolutional layer outputed during forward propgation
 22        # get the kernal (as in a patch of the image) again, but this time you are multipling each value in the kernal by 1 value that is inside the error patch
 23        # this makes the gradient of the loss of one kernal's weight
 24        
 25        # the gradient of the loss of one kernal's bias is the summ of all the error terms
 26        # because bias is applied to every input in forward propgation
 27        
 28        # the gradient of the loss of the input, which is the error terms for the layer behind it
 29        # firstly the kernal has to be flipped, meaning flip the kernal left to right and then top to bottom, but not flipping the layers,
 30        # the gradient of one pixel, is the summ of each error term multiplied by the flipped kernal 
 31        ###################################     
 32        
 33        kernalSize = self.kernalSize # all kernals are the same shape and squares
 34        weightGradients = np.zeros((len(inputTensor), len(kernals), kernalSize, kernalSize)) #kernals are the same size
 35        outputHeight, outputWidth = errorPatch.shape[1], errorPatch.shape[2]
 36        for output in range(len(kernals)):
 37            for layer in range(len(inputTensor)):
 38                for i in range(outputHeight):
 39                    for j in range(outputWidth):
 40                        startI = i * stride
 41                        startJ = j * stride
 42                        if(startI + kernalSize > inputTensor.shape[1] or startJ + kernalSize > inputTensor.shape[2]):
 43                            continue
 44                        kernal = inputTensor[layer, startI: startI + kernalSize, startJ : startJ + kernalSize]
 45                        weightGradients[layer, output] += kernal * errorPatch[output, i, j]
 46    
 47        biasGradients = np.sum(errorPatch, axis=(1, 2))
 48
 49        inputErrorTerms = np.zeros_like(inputTensor)
 50        flipped = kernals[:, :, ::-1, ::-1]
 51        for output in range(len(errorPatch)):
 52            for layer in range(len(inputTensor)):
 53                for i in range(outputHeight):
 54                    inputI = i * stride
 55                    for j in range(outputWidth):
 56                        inputJ = j * stride
 57                        if(inputI + kernalSize > inputTensor.shape[1] or inputJ + kernalSize > inputTensor.shape[2]):
 58                            continue
 59                        errorKernal = errorPatch[output, i, j]
 60                        inputErrorTerms[layer, inputI: inputI + kernalSize, inputJ: inputJ + kernalSize] += errorKernal * flipped[output, layer]
 61        
 62        weightGradients = np.transpose(weightGradients, (1, 0, 2, 3))
 63        return weightGradients, biasGradients, inputErrorTerms
 64            
 65            
 66    def _MaxPoolingDerivative(self, errorPatch, inputTensor, sizeOfGrid, strideLength):
 67        """
 68        Compute the gradient of the loss with respect to the input of the max pooling layer during backpropagation.
 69
 70        Args:
 71            errorPatch (ndarray): Error gradient from the next layer.
 72            inputTensor (ndarray): Input to the max pooling layer during forward propagation.
 73            sizeOfGrid (int): Size of the pooling window.
 74            strideLength (int): Stride length used during pooling.
 75        
 76        Returns:
 77            inputGradient (ndarray): Gradient of the loss with respect to the inputTensor
 78        """
 79        inputGradient = np.zeros_like(inputTensor, dtype=np.float64)
 80        outputHeight = (inputTensor.shape[1] - sizeOfGrid) // strideLength + 1
 81        outputWidth = (inputTensor.shape[2] - sizeOfGrid) // strideLength + 1
 82        for image in range(len(inputTensor)): # tensor is a 3d structures, so it is turning it into a 2d array (eg. an layer or image)
 83            for x in range(outputHeight):
 84                for y in range(outputWidth):
 85                    indexX = x * strideLength
 86                    indexY = y * strideLength
 87
 88                    gridOfValues = inputTensor[image, indexX: indexX + sizeOfGrid, indexY: indexY + sizeOfGrid]
 89                    indexMax = np.argmax(gridOfValues)
 90                    maxX, maxY = divmod(indexMax, sizeOfGrid)
 91
 92                    #newValues = np.zeros((sizeOfGrid, sizeOfGrid))
 93                    #newValues[maxX, maxY] = 1
 94                    #inputGradient[image, indexX: indexX + sizeOfGrid, indexY: indexY + sizeOfGrid] += newValues * errorPatch[image, x, y]
 95
 96                    inputGradient[image, indexX + maxX, indexY + maxY] += errorPatch[image, x, y]
 97        return inputGradient
 98
 99    def _AveragePoolingDerivative(self, errorPatch, inputTensor, sizeOfGrid, strideLength):
100        """
101        Compute the gradient of the loss with respect to the input of the average pooling layer during backpropagation.
102
103        Args:
104            errorPatch (ndarray): Error gradient from the next layer.
105            inputTensor (ndarray): Input to the average pooling layer during forward propagation.
106            sizeOfGrid (int): Size of the pooling window.
107            strideLength (int): Stride length used during pooling.
108        
109        Returns:
110            inputGradient (ndarray): Gradient of the loss with respect to the inputTensor
111        """       
112        inputGradient = np.zeros_like(inputTensor, dtype=np.float32)
113        outputHeight = (inputTensor.shape[1] - sizeOfGrid) // strideLength + 1
114        outputWidth = (inputTensor.shape[2] - sizeOfGrid) // strideLength + 1
115        avgMultiplier = 1 / (sizeOfGrid ** 2)
116        for image in range(len(inputTensor)): # tensor is a 3d structures, so it is turning it into a 2d array (eg. an layer or image)
117            for x in range(outputHeight):
118                for y in range(outputWidth):
119                    indexX = x * strideLength
120                    indexY = y * strideLength
121                    #newValues = np.ones((sizeOfGrid, sizeOfGrid)) * errorPatch[image, x, y] / (sizeOfGrid ** 2)
122                    newValues = errorPatch[image, x, y] * avgMultiplier
123                    inputGradient[image, indexX: indexX + sizeOfGrid, indexY: indexY + sizeOfGrid] += newValues 
124        return inputGradient
125    
126    def _GlobalAveragePoolingDerivative(self, inputTensor):
127        """
128        Compute the gradient of the loss with respect to the input of the global average pooling layer during backpropagation.
129
130        Args:
131            inputTensor (ndarray): Input to the global average pooling layer during forward propagation.
132        
133        Returns:
134            inputGradient (ndarray): Gradient of the loss with respect to the inputTensor
135        """     
136        return np.ones_like(inputTensor) * (1 / (inputTensor.shape[1] * inputTensor.shape[2]))
137    
138    def _ActivationLayerDerivative(self, errorPatch, activationDerivative, inputTensor):
139        """
140        Compute the gradient of the loss with respect to the input of the activation layer during backpropagation.
141
142        Args:
143            errorPatch (ndarray): Error gradient from the next layer.
144            activationDerivative (function): Derivative function of the activation function.
145            inputTensor (ndarray): Input to the activation layer during forward propagation.
146        
147        Returns:
148            inputGradient (ndarray): Gradient of the loss with respect to the inputTensor
149        """  
150        return errorPatch * activationDerivative(inputTensor)
class CNNbackpropagation:
    """Backpropagation for CNN layers: convolution, max/average/global average pooling and activation.

    Expects ``self.kernalSize`` (int) to be set by the owning network — the side
    length of the square convolution kernels.
    """

    def _ConvolutionDerivative(self, errorPatch, kernals, inputTensor, stride):
        """
        Compute gradients for convolutional layer weights, biases and input errors during backpropagation.

        Args:
            errorPatch (ndarray): Error gradient from the next layer, shape (num kernels, output height, output width).
            kernals (ndarray): Kernels used during forward propagation, shape (num kernels, input channels, kernel height, kernel width).
            inputTensor (ndarray): Input to the convolutional layer during forward propagation, shape (input channels, height, width).
            stride (int): Stride length used during convolution.

        Returns:
            weightGradients (ndarray): Gradients of the loss with respect to the kernels, same layout as ``kernals``.
            biasGradients (ndarray): Gradient of the loss with respect to each kernel's bias.
            inputErrorTerms (ndarray): Error terms propagated to the previous layer, same shape as ``inputTensor``.
        """
        kernalSize = self.kernalSize  # all kernels are square and share one size
        numKernels = len(kernals)
        numChannels = len(inputTensor)
        outputHeight, outputWidth = errorPatch.shape[1], errorPatch.shape[2]

        # dL/dW: each kernel weight gradient is the sum of the input patches seen
        # during forward propagation, each scaled by the matching error term.
        # Accumulated as (channel, kernel, kH, kW), transposed to kernel layout below.
        weightGradients = np.zeros((numChannels, numKernels, kernalSize, kernalSize))
        for out in range(numKernels):
            for channel in range(numChannels):
                for i in range(outputHeight):
                    for j in range(outputWidth):
                        rowStart = i * stride
                        colStart = j * stride
                        # guard against windows that would run past the input edge
                        if rowStart + kernalSize > inputTensor.shape[1] or colStart + kernalSize > inputTensor.shape[2]:
                            continue
                        patch = inputTensor[channel, rowStart: rowStart + kernalSize, colStart: colStart + kernalSize]
                        weightGradients[channel, out] += patch * errorPatch[out, i, j]

        # dL/db: the bias is added at every output position during forward
        # propagation, so its gradient is the sum of that kernel's error terms.
        biasGradients = np.sum(errorPatch, axis=(1, 2))

        # dL/dX: distribute each error term back over the input window it came
        # from, weighted by the kernel flipped left-right and top-bottom (180°).
        # Float accumulator so integer-typed inputs do not truncate gradients.
        inputErrorTerms = np.zeros(inputTensor.shape, dtype=np.float64)
        flipped = kernals[:, :, ::-1, ::-1]
        for out in range(numKernels):
            for channel in range(numChannels):
                for i in range(outputHeight):
                    rowStart = i * stride
                    for j in range(outputWidth):
                        colStart = j * stride
                        if rowStart + kernalSize > inputTensor.shape[1] or colStart + kernalSize > inputTensor.shape[2]:
                            continue
                        inputErrorTerms[channel, rowStart: rowStart + kernalSize, colStart: colStart + kernalSize] += errorPatch[out, i, j] * flipped[out, channel]

        # reorder to (num kernels, input channels, kH, kW) to match the kernel layout
        weightGradients = np.transpose(weightGradients, (1, 0, 2, 3))
        return weightGradients, biasGradients, inputErrorTerms

    def _MaxPoolingDerivative(self, errorPatch, inputTensor, sizeOfGrid, strideLength):
        """
        Compute the gradient of the loss with respect to the input of the max pooling layer.

        Only the element that produced each window's maximum receives gradient;
        every other element contributed nothing during forward propagation.

        Args:
            errorPatch (ndarray): Error gradient from the next layer, shape (channels, output height, output width).
            inputTensor (ndarray): Input to the max pooling layer during forward propagation.
            sizeOfGrid (int): Size of the pooling window.
            strideLength (int): Stride length used during pooling.

        Returns:
            inputGradient (ndarray): Gradient of the loss with respect to ``inputTensor``.
        """
        inputGradient = np.zeros_like(inputTensor, dtype=np.float64)
        outputHeight = (inputTensor.shape[1] - sizeOfGrid) // strideLength + 1
        outputWidth = (inputTensor.shape[2] - sizeOfGrid) // strideLength + 1
        for channel in range(len(inputTensor)):
            for x in range(outputHeight):
                for y in range(outputWidth):
                    rowStart = x * strideLength
                    colStart = y * strideLength
                    window = inputTensor[channel, rowStart: rowStart + sizeOfGrid, colStart: colStart + sizeOfGrid]
                    # argmax returns a flattened index; unravel it against the
                    # window's actual width so partial edge windows stay correct
                    maxRow, maxCol = divmod(int(np.argmax(window)), window.shape[1])
                    inputGradient[channel, rowStart + maxRow, colStart + maxCol] += errorPatch[channel, x, y]
        return inputGradient

    def _AveragePoolingDerivative(self, errorPatch, inputTensor, sizeOfGrid, strideLength):
        """
        Compute the gradient of the loss with respect to the input of the average pooling layer.

        Each element of a pooling window contributed equally to the forward
        average, so each receives an equal 1 / (window area) share of the error.

        Args:
            errorPatch (ndarray): Error gradient from the next layer, shape (channels, output height, output width).
            inputTensor (ndarray): Input to the average pooling layer during forward propagation.
            sizeOfGrid (int): Size of the pooling window.
            strideLength (int): Stride length used during pooling.

        Returns:
            inputGradient (ndarray): Gradient of the loss with respect to ``inputTensor``.
        """
        # float64 for precision and consistency with _MaxPoolingDerivative
        inputGradient = np.zeros_like(inputTensor, dtype=np.float64)
        outputHeight = (inputTensor.shape[1] - sizeOfGrid) // strideLength + 1
        outputWidth = (inputTensor.shape[2] - sizeOfGrid) // strideLength + 1
        share = 1.0 / (sizeOfGrid ** 2)  # hoisted: equal share of each error term
        for channel in range(len(inputTensor)):
            for x in range(outputHeight):
                for y in range(outputWidth):
                    rowStart = x * strideLength
                    colStart = y * strideLength
                    inputGradient[channel, rowStart: rowStart + sizeOfGrid, colStart: colStart + sizeOfGrid] += errorPatch[channel, x, y] * share
        return inputGradient

    def _GlobalAveragePoolingDerivative(self, inputTensor):
        """
        Compute the local derivative of global average pooling with respect to its input.

        Every element contributes 1 / (height * width) to the pooled mean.
        NOTE(review): the upstream error term is not applied here — presumably
        the caller multiplies it in; confirm against the call site.

        Args:
            inputTensor (ndarray): Input to the global average pooling layer during forward propagation.

        Returns:
            inputGradient (ndarray): Array of 1 / (H * W), same shape as ``inputTensor``.
        """
        return np.ones_like(inputTensor) * (1.0 / (inputTensor.shape[1] * inputTensor.shape[2]))

    def _ActivationLayerDerivative(self, errorPatch, activationDerivative, inputTensor):
        """
        Compute the gradient of the loss with respect to the input of the activation layer.

        Chain rule for an element-wise activation: upstream error times the
        activation derivative evaluated at the forward-pass input.

        Args:
            errorPatch (ndarray): Error gradient from the next layer.
            activationDerivative (callable): Derivative of the activation function; applied element-wise.
            inputTensor (ndarray): Input to the activation layer during forward propagation.

        Returns:
            inputGradient (ndarray): Gradient of the loss with respect to ``inputTensor``.
        """
        return errorPatch * activationDerivative(inputTensor)