$\textbf{Problem}$: Write functions that take an arbitrary grayscale image and convolve it with the horizontal and vertical Sobel edge detection kernels. Apply both functions to a grayscale image of your choice.
$\textbf{Solution}$:
import numpy as np
import matplotlib.pyplot as plt
# 3x3 Sobel kernels: the "horizontal" kernel responds to intensity changes
# along x (vertical edges), the "vertical" kernel to changes along y.
sobel_horizontal_kernel = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
sobel_vertical_kernel = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])

def sobel_horizontal(img):
    """Valid (unpadded) convolution of a 2-D grayscale image with the
    horizontal Sobel kernel; the result has shape (rows-2, cols-2)."""
    out_rows, out_cols = img.shape[0] - 2, img.shape[1] - 2
    edges = np.zeros((out_rows, out_cols))
    for r in range(out_rows):
        for c in range(out_cols):
            window = img[r:r + 3, c:c + 3]
            edges[r, c] = np.sum(window * sobel_horizontal_kernel)
    return edges

def sobel_vertical(img):
    """Valid (unpadded) convolution of a 2-D grayscale image with the
    vertical Sobel kernel; the result has shape (rows-2, cols-2)."""
    out_rows, out_cols = img.shape[0] - 2, img.shape[1] - 2
    edges = np.zeros((out_rows, out_cols))
    for r in range(out_rows):
        for c in range(out_cols):
            window = img[r:r + 3, c:c + 3]
            edges[r, c] = np.sum(window * sobel_vertical_kernel)
    return edges
# Demo: build a reproducible random 10x10 grayscale image and display it
# alongside its horizontal and vertical Sobel edge maps.
np.random.seed(42)
random_grayscale_img = np.random.randint(0, 256, size=(10, 10))
for title, image in [
    ("Original Image", random_grayscale_img),
    ("Horizontal Sobel", sobel_horizontal(random_grayscale_img)),
    ("Vertical Sobel", sobel_vertical(random_grayscale_img)),
]:
    print(title)
    plt.imshow(image, cmap="gray")
    plt.show()
Original Image
Horizontal Sobel
Vertical Sobel
$\textbf{Problem}$: Above, the $3\times 3$ Sobel edge detection kernels were convolved with an $n\times m$ grayscale image, leading to an $(n-2)\times (m-2)$ edge map. However, although for $n, m \gg 1$ this $-2$ shrinking effect is not particularly noticeable, over many layers of convolutions it can compound undesirably. Thus, one solution is to use $\textit{padding}$ to prevent this shrinking effect. Show how this works.
$\textbf{Solution}$: (by the way, the above unpadded version is sometimes called a $\textit{valid}$ convolution, whereas the padded version is sometimes called a $\textit{same}$ convolution).
# Zero-pad by one pixel on every side so the 3x3 "valid" Sobel convolution
# returns an edge map with the original 10x10 shape (a "same" convolution).
padded_random_grayscale_img = np.pad(random_grayscale_img, ((1, 1), (1, 1)), 'constant')
print("Padded random grayscale image shape: ", padded_random_grayscale_img.shape)
plt.imshow(padded_random_grayscale_img, cmap='gray')
plt.show()
for title, edge_fn in [("Horizontal Sobel", sobel_horizontal),
                       ("Vertical Sobel", sobel_vertical)]:
    print(title)
    plt.imshow(edge_fn(padded_random_grayscale_img), cmap="gray")
    plt.show()
Padded random grayscale image shape: (12, 12)
Horizontal Sobel
Vertical Sobel
$\textbf{Problem}$: Generate a random $15\times 15$ grayscale image, a random $5\times 5$ kernel, and perform a $\textit{stride}$ convolution of the image with the kernel using a stride size of $2$.
$\textbf{Solution}$: The dimension of the output image in this (unpadded) case is $(15-5)/2+1=6$.
# Reproducible random 15x15 image and 5x5 kernel for the strided-convolution
# demo; show both as grayscale images.
np.random.seed(42)
rnd_gray_img = np.random.randint(0, 256, size=(15, 15))
rnd_kernel = np.random.randint(0, 10, size=(5, 5))
for picture in (rnd_gray_img, rnd_kernel):
    plt.imshow(picture, cmap='gray')
    plt.show()
def stride_convolution(img, kernel, stride_size, padding):
    """Strided convolution of a 2-D image with a 2-D kernel.

    Parameters
    ----------
    img : 2-D array
        Input grayscale image.
    kernel : 2-D array
        Convolution kernel (applied as a cross-correlation, as is
        conventional in deep learning).
    stride_size : int
        Step between successive window positions.
    padding : int
        Number of zero rows/columns added on each side before convolving.

    Returns
    -------
    2-D array with shape given by the standard formula
    (dim + 2*padding - kernel_dim) // stride_size + 1 per axis.
    """
    # Bug fix: the original sized the output for the padded image but never
    # actually padded it, so padding > 0 silently produced clipped windows.
    if padding > 0:
        img = np.pad(img, ((padding, padding), (padding, padding)), 'constant')
    out_h = (img.shape[0] - kernel.shape[0]) // stride_size + 1
    out_w = (img.shape[1] - kernel.shape[1]) // stride_size + 1
    output_img = np.zeros(shape=(out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            window = img[i * stride_size:i * stride_size + kernel.shape[0],
                         j * stride_size:j * stride_size + kernel.shape[1]]
            output_img[i, j] = np.sum(window * kernel)
    return output_img
# Unpadded stride-2 convolution of the 15x15 image with the 5x5 kernel (6x6 result).
strided_result = stride_convolution(rnd_gray_img, rnd_kernel, 2, 0)
plt.imshow(strided_result, cmap='gray')
<matplotlib.image.AxesImage at 0x164ca230b90>
$\textbf{Problem}$: Suppose one has a $28\times 28\times 192$ volume at some point inside a convolutional neural network. Explain how a same, unpadded, unit-stride convolution can be performed on this volume to obtain an output volume of dimensions $28\times 28\times 32$.
$\textbf{Solution}$: Use $32$ filters of size $1\times 1\times 192$ with stride $1$ and no padding; this is a common technique to reduce channel number while preserving spatial dimensions.
$\textbf{Problem}$: Implement simplified versions of the LeNet-5, AlexNet, and VGG-16 convolutional neural networks with randomly initialized weights and biases (here simplified means that we will not implement batch normalization, dropout, or any activation functions). Run a forward pass through it without bothering to train the parameters.
$\textbf{Solution}$:
class ConvLayer():
    """2-D convolutional layer (cross-correlation) with random parameters.

    Weights have shape (out_channels, in_channels, kernel_size, kernel_size);
    there is one bias per output channel. No activation is applied.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        # Randomly initialized parameters (no training in this exercise).
        self.weights = np.random.randn(out_channels, in_channels, kernel_size, kernel_size)
        self.biases = np.random.randn(out_channels)

    def forward(self, x):
        """Forward pass over x of shape (batch, in_channels, height, width).

        Returns an array of shape (batch, out_channels, out_h, out_w) with
        out_dim = (dim + 2*padding - kernel_size) // stride + 1.
        """
        n_batch, _, in_h, in_w = x.shape
        k, s, p = self.kernel_size, self.stride, self.padding
        out_h = (in_h + 2 * p - k) // s + 1
        out_w = (in_w + 2 * p - k) // s + 1
        # Zero-pad the spatial dimensions only.
        if p > 0:
            x = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), 'constant')
        result = np.zeros((n_batch, self.out_channels, out_h, out_w))
        for b in range(n_batch):
            for oc in range(self.out_channels):
                kernel = self.weights[oc]
                bias = self.biases[oc]
                for r in range(out_h):
                    for c in range(out_w):
                        # Window spans all input channels at this position.
                        patch = x[b, :, r * s:r * s + k, c * s:c * s + k]
                        result[b, oc, r, c] = np.sum(patch * kernel) + bias
        return result
class PoolingLayer():
    """2-D pooling layer over square windows.

    Parameters
    ----------
    kernel_size : int
        Side length of the pooling window.
    stride : int
        Step between windows (default 2).
    mode : str
        "max" for max pooling or "avg" for average pooling.
    """
    def __init__(self, kernel_size, stride=2, mode="max"):
        self.kernel_size = kernel_size
        self.stride = stride
        self.mode = mode

    def forward(self, x):
        """Pool x of shape (batch, channels, height, width).

        Bug fix: the original sized the output as height // stride, which
        ignores kernel_size; whenever kernel_size > stride the final window
        could run past the input edge and was silently clipped by numpy
        slicing. The standard formula (dim - kernel_size) // stride + 1
        keeps every window fully inside the input (it yields identical
        shapes for every pooling configuration used in this file).
        """
        batch_size, channels, height, width = x.shape
        out_height = (height - self.kernel_size) // self.stride + 1
        out_width = (width - self.kernel_size) // self.stride + 1
        # Pick the reduction once instead of branching per window; fail loudly
        # on an unknown mode (the original silently returned all zeros).
        if self.mode == "max":
            reduce_fn = np.max
        elif self.mode == "avg":
            reduce_fn = np.mean
        else:
            raise ValueError(f"unknown pooling mode: {self.mode!r}")
        out = np.zeros((batch_size, channels, out_height, out_width))
        for b in range(batch_size):
            for c in range(channels):
                for h in range(out_height):
                    for w in range(out_width):
                        window = x[b, c, h*self.stride:h*self.stride+self.kernel_size,
                                   w*self.stride:w*self.stride+self.kernel_size]
                        out[b, c, h, w] = reduce_fn(window)
        return out
class FCLayer():
    """Fully connected (dense) layer with randomly initialized parameters.

    weights has shape (out_features, in_features); biases has one entry per
    output feature. No activation is applied.
    """
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features
        # Randomly initialized parameters (no training in this exercise).
        self.weights = np.random.randn(out_features, in_features)
        self.biases = np.random.randn(out_features)

    def forward(self, x):
        """Return x @ W.T + b for x of shape (batch_size, in_features).

        Idiom/performance fix: a single vectorized matrix product replaces
        the original per-sample Python loop of np.dot calls — same result,
        one BLAS call, and the batch dimension is broadcast automatically.
        """
        return x @ self.weights.T + self.biases
# LeNet-5 (simplified): two conv/avg-pool stages followed by three fully
# connected layers, run on random 1x32x32 grayscale "images".
num_batches = 3
np.random.seed(42)
batch_rand_gray_imgs = np.random.randint(0, 256, size=(num_batches, 1, 32, 32))
# Layers (construction order matters: each draws from the seeded global RNG).
layer1 = ConvLayer(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=0)
layer2 = PoolingLayer(kernel_size=2, stride=2, mode="avg")
layer3 = ConvLayer(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0)
layer4 = PoolingLayer(kernel_size=2, stride=2, mode="avg")
layer5 = FCLayer(in_features=400, out_features=120)  # 400 = 16 * 5 * 5 after flattening
layer6 = FCLayer(in_features=120, out_features=84)
layer7 = FCLayer(in_features=84, out_features=1)  # scalar "digit" output; today a 10-way softmax would be preferred
# Forward pass: conv/pool stage, flatten, then the dense stage.
output = batch_rand_gray_imgs
for stage in (layer1, layer2, layer3, layer4):
    output = stage.forward(output)
output = output.reshape(num_batches, -1)
for stage in (layer5, layer6, layer7):
    output = stage.forward(output)
print(output)
[[-14004126.31782551] [-11998811.82382246] [-13235583.18652075]]
# AlexNet (simplified) on random 3x227x227 inputs.
num_batches = 3
np.random.seed(42)
batch_rand_gray_imgs = np.random.randint(0, 256, size=(num_batches, 3, 227, 227))
# Feature extractor (construction order matters for the seeded RNG).
feature_layers = [
    ConvLayer(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0),
    PoolingLayer(kernel_size=3, stride=2, mode="max"),
    ConvLayer(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),  # "same" conv
    PoolingLayer(kernel_size=3, stride=2, mode="max"),
    ConvLayer(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
    ConvLayer(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
    ConvLayer(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
    PoolingLayer(kernel_size=3, stride=2, mode="max"),
]
classifier_layers = [
    FCLayer(in_features=9216, out_features=4096),  # 9216 = 256 * 6 * 6
    FCLayer(in_features=4096, out_features=4096),
    FCLayer(in_features=4096, out_features=1000),
]
# Forward pass: features, flatten, classifier.
output = batch_rand_gray_imgs
for stage in feature_layers:
    output = stage.forward(output)
output = output.reshape(num_batches, -1)
for stage in classifier_layers:
    output = stage.forward(output)
# Normally a softmax would be applied to this 1000-feature output.
for i in range(num_batches):
    plt.semilogy(output[i, :])
plt.show()
# VGG-16 (simplified) on random 3x224x224 inputs: five blocks of 3x3 "same"
# convolutions each followed by a 2x2 max pool, then three dense layers.
num_batches = 3
np.random.seed(42)
batch_rand_gray_imgs = np.random.randint(0, 256, size=(num_batches, 3, 224, 224))
# Output-channel counts per layer; "P" marks a 2x2/stride-2 max pool. Layers
# are built in the exact order the hand-written version constructed them so
# the seeded RNG produces identical weights.
vgg_config = [64, 64, "P", 128, 128, "P", 256, 256, 256, "P",
              512, 512, 512, "P", 512, 512, 512, "P"]
feature_layers = []
channels = 3
for item in vgg_config:
    if item == "P":
        feature_layers.append(PoolingLayer(kernel_size=2, stride=2, mode="max"))
    else:
        feature_layers.append(ConvLayer(in_channels=channels, out_channels=item,
                                        kernel_size=3, stride=1, padding=1))
        channels = item
classifier_layers = [
    FCLayer(in_features=25088, out_features=4096),  # 25088 = 512 * 7 * 7
    FCLayer(in_features=4096, out_features=4096),
    FCLayer(in_features=4096, out_features=1000),
]
# Forward pass: features, flatten, classifier.
output = batch_rand_gray_imgs
for stage in feature_layers:
    output = stage.forward(output)
output = output.reshape(num_batches, -1)
for stage in classifier_layers:
    output = stage.forward(output)
for i in range(num_batches):
    plt.semilogy(output[i, :])
plt.show()
$\textbf{Problem}$: Why is a CNN a better choice of architecture than an MLP for computer vision tasks?
$\textbf{Solution}$: As a simple example, given a convolutional layer of a CNN in which a square $n\times n\times c$ image is convolved (without striding or padding) with $N$ kernels each of size $k\times k\times c$, producing an output volume of dimensions $(n-k+1)\times (n-k+1)\times N$, the number of parameters is expected to be $N(ck^2+1)$ (each of the $N$ kernels has $ck^2$ weights and a single bias term). By contrast, if one were to instead implement a fully connected layer of dense connections between the flattened input image of size $cn^2$ and the flattened output volume of size $N(n-k+1)^2$, the total number of weights alone would be $cn^2\cdot N(n-k+1)^2$ (much larger!). Essentially, this is due to the $\textit{parameter sharing}$ property of CNNs and the sparsity of their connections. Finally, the convolutional structure is better at capturing $\textit{translational invariance}$ of the image (unlike the MLP where pixels are just flattened arbitrarily, losing their spatial information).
$\textbf{Problem}$: Explain what a $\textit{residual block}$ is and how several residual blocks may be composed together to define a $\textit{residual neural network}$ (ResNet).
$\textbf{Solution}$: A residual block may be viewed as a simple additive perturbation to a standard $2$-layer feedforward neural network. Recall that, given an input feature vector $\mathbf x$, a feedforward layer $(W,\mathbf b)$ may be viewed as the composition of a $\textit{linear layer}$ $\mathbf x\mapsto W\mathbf x + \mathbf b$ with a nonlinear activation function $\boldsymbol{\sigma}$. Thus, the overall action of a $2$-layer feedforward looks something like: $$\mathbf x\mapsto\boldsymbol{\sigma}(W_2\boldsymbol{\sigma}(W_1\mathbf x + \mathbf b_1) + \mathbf b_2)$$
By contrast, in a residual block, the overall action looks like: $$\mathbf x\mapsto\boldsymbol{\sigma}(W_2\boldsymbol{\sigma}(W_1\mathbf x + \mathbf b_1) + \mathbf b_2 + \mathbf x)$$
(assuming compatible dimensions for vector addition with $\mathbf x$). Thus, although the $1^{\text{st}}$ internal layer of the residual block is no different from a standard feedforward, the difference lies in the insertion of a so-called $\textit{skip connection}$ $+\mathbf x$ in between the linear layer and the activation function of the $2^{\text{nd}}$ layer. A ResNet is then literally just a bunch of residual blocks composed together!
$\textbf{Problem}$: Give some hand-wavy intuition/justification for what makes ResNets interesting/useful.
$\textbf{Solution}$: Consider the following $\textit{thought experiment}$. A standard feedforward neural network classifier with $20$ layers, post-training, has a misclassification rate of $15\%$ $\textit{on the training set}$ itself. Clearly, this seems to be a bias/underfitting problem rather than a variance/overfitting problem because it’s on the $\textit{training set}$. So one reasons that, to make the network more expressive and improve its performance on the training set, one might decide to append another $36$ feedforward layers on top of the $20$ layers already there, thus creating a $56$-layer feedforward. Theoretically, it’s manifest that the optimal training set error can only get lower because one could simply have the first $20$ layers doing exactly what they were doing before, and then have the remaining $36$ layers implement an identity function $\mathbf x\mapsto\mathbf x$. In practice however, it turns out that, paradoxically, training set error actually increases!
The problem is that it turns out to be surprisingly delicate to learn an identity mapping $\mathbf x\mapsto\mathbf x$ due to exploding/vanishing gradient problems during cost function minimization. So rather than struggling so hard to learn the identity, why not redesign the architecture such that the identity $\mathbf x\mapsto\mathbf x$ is the $\textit{default}$ mapping, and to then merely learn whatever perturbation (a.k.a. $\textit{residual}$ hence the network’s name) $\boldsymbol{\varepsilon}(\mathbf x)$ to this identity mapping is needed to learn the actual underlying map of interest $\mathbf x\mapsto \mathbf x+\boldsymbol{\varepsilon}(\mathbf x)$. Roughly speaking, in the earlier notation, one has parameterized the residual by $\boldsymbol{\varepsilon}(\mathbf x)=W_2\boldsymbol{\sigma}(W_1\mathbf x + \mathbf b_1) + \mathbf b_2$, assuming the outer activation function $\boldsymbol{\sigma}$ may be neglected (or if it happens to be a ReLU, this argument is even more appealing!). Notice then that (loosely) $W_1=W_2=\mathbf b_1=\mathbf b_2=\mathbf 0$ gives $\boldsymbol{\varepsilon}(\mathbf x)=\mathbf 0$, hence learning an exact identity map $\mathbf x\mapsto\mathbf x$. But it’s very easy to coerce weights and biases towards $\mathbf 0$ using standard regularization techniques.
# ResNets were first used in computer vision, so the W_1, W_2 matrices above really correspond to convolutional layers in a CNN structure
class ResBlock():
    """Residual block: out = ReLU(conv2(ReLU(conv1(x))) + shortcut(x)).

    The shortcut is the identity when input and output shapes match;
    otherwise a 1x1 "projection" convolution adjusts channels/stride.
    """
    def __init__(self, in_channels, out_channels, stride=1):
        self.conv1 = ConvLayer(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.conv2 = ConvLayer(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        # Projection shortcut only when the residual path changes dimensions.
        if stride != 1 or in_channels != out_channels:
            self.projection = ConvLayer(in_channels, out_channels, kernel_size=1, stride=stride, padding=0)
        else:
            self.projection = None

    def relu(self, x):
        """Elementwise ReLU activation."""
        return np.maximum(0, x)

    def forward(self, x):
        # Residual path: conv -> ReLU -> conv.
        residual = self.relu(self.conv1.forward(x))
        residual = self.conv2.forward(residual)
        # Shortcut path: identity, or 1x1 projection when dims change.
        shortcut = x if self.projection is None else self.projection.forward(x)
        # Skip connection followed by the final activation.
        return self.relu(residual + shortcut)
$\textbf{Problem}$: Just like a ResNet is made from composing a sequence of residual blocks together, an inception neural network (InceptionNet) is made from composing a sequence of inception blocks together (mostly true; in practice there may also some other side branches, pooling layers, etc). Explain the architecture of an inception block.
$\textbf{Solution}$: Whereas typical CNN architectures require one to choose a filter size, an inception block asks “why not try several filter sizes?”. To this effect, a naive inception block implementation might look something like:
class NaiveInceptionBlock():
    """Naive inception block: 1x1, 3x3, 5x5 conv branches plus a pooling
    branch, concatenated along the channel axis.

    Bug fix: the original pooling branch used kernel 2 / stride 2, which
    halved the spatial dimensions while every conv branch is a "same"
    convolution — np.concatenate then raised a shape mismatch. The pool
    branch must preserve H and W (a stride-1 "same" max pool), as in the
    original inception architecture.
    """
    def __init__(self, in_channels, out_channels):
        self.conv1 = ConvLayer(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvLayer(in_channels, out_channels, kernel_size=3, stride=1, padding=1)  # same convolution
        self.conv3 = ConvLayer(in_channels, out_channels, kernel_size=5, stride=1, padding=2)  # same convolution
        # Kept for interface compatibility; forward() pools inline so the
        # spatial dimensions are guaranteed to be preserved.
        self.pool = PoolingLayer(kernel_size=3, stride=1, mode="max")

    def forward(self, x):
        x1 = self.conv1.forward(x)
        x2 = self.conv2.forward(x)
        x3 = self.conv3.forward(x)
        # "Same" 3x3 stride-1 max pool: pad with -inf so border windows
        # ignore positions outside the input and H/W match the conv branches.
        padded = np.pad(x, ((0, 0), (0, 0), (1, 1), (1, 1)), 'constant',
                        constant_values=-np.inf)
        x4 = np.zeros(x.shape, dtype=float)
        for i in range(x.shape[2]):
            for j in range(x.shape[3]):
                x4[:, :, i, j] = np.max(padded[:, :, i:i+3, j:j+3], axis=(2, 3))
        return np.concatenate([x1, x2, x3, x4], axis=1)
$\textbf{Problem}$: Needless to say, one of the drawbacks to asking “why not try several filter sizes?” is that, by having to literally try several filter sizes, the naive inception block implementation above can get computationally expensive in terms of the number of multiplications and additions required per forward pass. Explain how the introduction of $1\times 1$ $\textit{bottleneck layers}$ can help reduce computational burden without sacrificing too much accuracy.
$\textbf{Solution}$: The idea is to insert $1\times 1$ convolutional layers before the more computationally expensive $3\times 3$ and $5\times 5$ convolutions. These $1\times 1$ convolutions act as “bottlenecks” by reducing the number of channels (and thus the computational cost) before the larger convolutions are applied. For example, instead of applying a $5\times 5$ convolution directly to 256 input channels, we can first apply a $1\times 1$ convolution to reduce the channels to, say, 64, and then apply the $5\times 5$ convolution to the reduced-channel tensor. This significantly reduces the number of parameters and computations while maintaining most of the representational power of the larger filters.
class InceptionBlockWithBottleNeck(NaiveInceptionBlock):
    """Inception block with 1x1 bottleneck reductions before the expensive
    3x3 and 5x5 convolutions, plus a pooled-and-projected branch.

    Bug fixes vs. the original:
    - super().__init__ was called with five arguments although
      NaiveInceptionBlock.__init__ accepts only (in_channels, out_channels),
      raising TypeError; the branches are now constructed directly.
    - forward() invoked layers as callables (e.g. self.conv_1x1(x)) even
      though ConvLayer/PoolingLayer only define .forward().
    - self.pool_proj was referenced but never created.
    """
    def __init__(self, in_channels, num_1x1, num_3x3_reduce, num_3x3, num_5x5_reduce, num_5x5, num_pool_proj):
        # 1x1 branch.
        self.conv_1x1 = ConvLayer(in_channels, num_1x1, kernel_size=1)
        # 1x1 reduce -> 3x3 "same" conv branch.
        self.conv_3x3_reduce = ConvLayer(in_channels, num_3x3_reduce, kernel_size=1)
        self.conv_3x3_reduce_out = ConvLayer(num_3x3_reduce, num_3x3, kernel_size=3, padding=1)
        # 1x1 reduce -> 5x5 "same" conv branch.
        self.conv_5x5_reduce = ConvLayer(in_channels, num_5x5_reduce, kernel_size=1)
        self.conv_5x5_reduce_out = ConvLayer(num_5x5_reduce, num_5x5, kernel_size=5, padding=2)
        # Pool branch: "same" max pool (done inline in forward) then a 1x1
        # projection to num_pool_proj channels.
        self.pool = PoolingLayer(kernel_size=3, stride=1, mode="max")
        self.pool_proj = ConvLayer(in_channels, num_pool_proj, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.conv_1x1.forward(x)
        out_3x3 = self.conv_3x3_reduce_out.forward(self.conv_3x3_reduce.forward(x))
        out_5x5 = self.conv_5x5_reduce_out.forward(self.conv_5x5_reduce.forward(x))
        # "Same" 3x3 stride-1 max pool (pad with -inf so H and W are
        # preserved), then project channels with the 1x1 convolution.
        padded = np.pad(x, ((0, 0), (0, 0), (1, 1), (1, 1)), 'constant',
                        constant_values=-np.inf)
        pooled = np.zeros(x.shape, dtype=float)
        for i in range(x.shape[2]):
            for j in range(x.shape[3]):
                pooled[:, :, i, j] = np.max(padded[:, :, i:i+3, j:j+3], axis=(2, 3))
        out_pool_proj = self.pool_proj.forward(pooled)
        return np.concatenate([out_1x1, out_3x3, out_5x5, out_pool_proj], axis=1)
$\textbf{Problem}$: Explain how the MobileNet architecture replaces the standard convolutional layer with a depthwise separable convolutional layer.
$\textbf{Solution}$: The idea is to factor the standard convolutional layer into two parts: a $\textit{depthwise convolutional layer}$ and a $\textit{pointwise convolutional layer}$. The depthwise convolutional layer applies a single $k\times k$ spatial filter to each input channel separately (so it convolves only over the height and width dimensions, one channel at a time, without mixing channels), while the pointwise convolutional layer is a $1\times 1$ convolution that mixes information across the channels dimension at each spatial location. Together they approximate a full convolution at a fraction of the multiplication count.





