################
# GENERAL ADVICE
# This is _precise work_ - we need very few lines of code.
# At this point we need not write any for loops.
# Step 1:
# Compute the output response to this 'img' input for each neuron (linear unit).
output = img @ self.W + self.b
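# (Shape note - an assumption about this starter code: 'img' is a flattened
# 1-D vector of length D, self.W has shape (D, K) and self.b has shape (K,),
# so 'output' is the length-K vector of per-class scores.)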
# Step 2:
# Convert these to probabilities by implementing the softmax function.
# Subtract the max before exponentiating for numerical stability
# (softmax is invariant to a constant shift of its inputs).
exp_output = np.exp(output - np.max(output))
probs = exp_output / np.sum(exp_output)
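# Worked example: softmax([1.0, 2.0, 3.0]) ~= [0.090, 0.245, 0.665], and the
# result is unchanged after the max-subtraction, i.e. softmax([-2.0, -1.0, 0.0])
# gives the same probabilities.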
# Step 3:
# Compute the error against the training label 'gt_label' using the cross-entropy loss
# Remember:
# log has a potential divide-by-zero error, so clamp the probability away from 0.
loss_sum = -np.log(np.maximum(probs[gt_label], 1e-12))
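# Worked example: if the network assigns probability 0.25 to the correct class,
# the loss is -log(0.25) ~= 1.386; a perfect prediction (probability 1.0) gives
# a loss of 0.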
################
# BACKWARD PASS (BACK PROPAGATION):
# This is where we find which direction to move in for gradient descent to
# optimize our weights and biases.
# Use the derivations from the questions handout.
# Step 4:
# Compute the delta_W and delta_b gradient terms for the weights and biases
# using the provided derivations in Eqs. 6 and 7 of the handout.
delta_output = probs.copy()
delta_output[gt_label] -= 1
delta_W = img.reshape((-1, 1)) @ delta_output.reshape((1, -1))
delta_b = delta_output
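# In other words, delta_output = probs - one_hot(gt_label): the gradient of the
# cross-entropy-of-softmax loss with respect to the scores. delta_W is then the
# outer product of the input with that error vector, so it has the same shape
# as self.W, and delta_b has the same shape as self.b.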
# Step 5:
# Update self.W and self.b using the gradient terms
# and the self.learning_rate hyperparameter.
self.W -= self.learning_rate * delta_W
self.b -= self.learning_rate * delta_b
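# Optional sanity check (a sketch, not part of the assignment): compare one
# entry of delta_W (computed *before* the update above) against a central
# finite difference of the loss, using a hypothetical helper loss_at() that
# reruns the forward pass with perturbed weights:
#   eps = 1e-5
#   W_plus, W_minus = self.W.copy(), self.W.copy()
#   W_plus[i, j] += eps; W_minus[i, j] -= eps
#   numeric = (loss_at(W_plus) - loss_at(W_minus)) / (2 * eps)
# The analytic delta_W[i, j] should match 'numeric' to several decimal places.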
return loss_sum