# set up the env
import pytest
import ipytest
import unittest
import numpy as np
# Configure ipytest for notebook use so the `%%ipytest` test cells further down can run
ipytest.autoconfig()
9. Linear Regression Metrics#
9.1. Mean Squared Error (MSE)#
In the realm of linear regression metrics, one fundamental measure of model performance is the Mean Squared Error (MSE). MSE serves as a valuable indicator of how well your linear regression model aligns its predictions with the actual data points. This metric quantifies the average of the squared differences between predicted values and observed values.
9.1.1. The Formula#
Mathematically, the MSE is computed using the following formula:
\[ \text{MSE} = \frac{1}{n} \sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2 \]
Where:
\(n\) is the number of data points.
\(y_i\) represents the actual observed value for the \(i^{th}\) data point.
\(\hat{y}_i\) represents the predicted value for the \(i^{th}\) data point.
9.1.2. Python Implementation#
def mean_squared_error(y_true, y_pred):
    """
    Calculate the Mean Squared Error (MSE).

    Parameters:
    y_true -- Actual values, can be a list, array, or other sequence.
    y_pred -- Predicted values, can be a list, array, or other sequence.

    Returns:
    mse -- Mean Squared Error.

    Raises:
    ValueError -- If y_true and y_pred do not have the same length.
    """
    # Ensure that the lengths of the input lists are the same
    if len(y_true) != len(y_pred):
        raise ValueError("Length mismatch: y_true and y_pred")

    # Convert to arrays so that plain lists/tuples support element-wise
    # subtraction (list - list would raise a TypeError)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate the Mean Squared Error
    squared_errors = (y_true - y_pred) ** 2
    mse = np.mean(squared_errors)
    return mse
# Example
y_true = np.array([22.1, 19.9, 24.5, 20.1, 18.7])
y_pred = np.array([23.5, 20.2, 23.9, 19.8, 18.5])
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)
# Compare with a tolerance: the trailing digits of a floating-point mean depend
# on the order of the arithmetic, so `==` could reject a correct implementation.
assert np.isclose(mse, 0.508)
Check result by executing below... 📝
%%ipytest -qq
class Test_MSE:
    """Checks for mean_squared_error."""

    def test_mean_squared_error_identical_values(self):
        # Perfect predictions leave no error at all, so the MSE must be 0
        actual = np.array([3, -0.5, 2, 7])
        predicted = actual.copy()
        assert mean_squared_error(actual, predicted) == 0

    def test_mean_squared_error_all_zeros(self):
        # With all-zero predictions the MSE is the mean of the squared actual
        # values: (1 + 4 + 9 + 16 + 25) / 5 = 11
        actual = np.array([1, 2, 3, 4, 5])
        predicted = np.zeros_like(actual)
        assert mean_squared_error(actual, predicted) == 11.0

    def test_mean_squared_error_length_mismatch(self):
        # Inputs of different lengths must be rejected with a ValueError
        shorter = np.array([1, 2, 3])
        longer = np.array([1, 2, 3, 4])
        with pytest.raises(ValueError):
            mean_squared_error(shorter, longer)
9.2. R-squared (R2) Score#
The R-squared (R2) score, also known as the coefficient of determination, measures the proportion of the variance in the dependent variable that is predictable from the independent variables. It shows how well the model performs compared to a baseline that always predicts the mean of the observed values: a score of 1 means perfect predictions, 0 means the model is no better than that baseline, and a negative score means it is worse.
9.2.1. The Formula#
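The formula for R2 score is:
\[ R^2 = 1 - \frac{\sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2}{\sum_{i=1}^{n} \left( y_i - \bar{y} \right)^2} \]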
Where:
\( n \) is the number of data points
\( y_i \) represents the actual value of the dependent variable
\( \hat{y}_i \) represents the predicted value of the dependent variable
\( \bar{y} \) is the mean of the dependent variable
9.2.2. Python Implementation#
def r2_score(y_true, y_pred):
    """
    Calculate the R2 score (coefficient of determination).

    Parameters:
    y_true -- Actual values, can be a list, array, or other sequence.
    y_pred -- Predicted values, can be a list, array, or other sequence.

    Returns:
    r2 -- R2 score. When every value in y_true is the same, the total sum
          of squares is zero and the score is undefined; the division then
          yields a non-finite result (nan or -inf).

    Raises:
    ValueError -- If y_true and y_pred do not have the same length.
    """
    # Ensure that the lengths of the inputs are the same; fail explicitly
    # rather than relying on a NumPy broadcasting error
    if len(y_true) != len(y_pred):
        raise ValueError("Length mismatch: y_true and y_pred")

    # Convert to arrays so that plain lists/tuples support element-wise
    # subtraction (list - list would raise a TypeError)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate the mean of actual values
    mean_y_true = np.mean(y_true)
    # Calculate the total sum of squares
    total_sum_squares = np.sum((y_true - mean_y_true) ** 2)
    # Calculate the residual sum of squares
    residual_sum_squares = np.sum((y_true - y_pred) ** 2)
    # Calculate R2 score
    r2 = 1 - (residual_sum_squares / total_sum_squares)
    return r2
# Example: every prediction is exactly 1 above the actual value
y_true = np.array([1, 2, 3, 4])
y_pred = np.array([2, 3, 4, 5])
# Here SS_res = 4 and SS_tot = 5, so a correct implementation should give 1 - 4/5, i.e. about 0.2
r2_score(y_true, y_pred)
Check result by executing below... 📝
%%ipytest -qq
class Test_r2_score:
    """Checks for r2_score."""

    def test_r2_score_identical_values(self):
        # Test when the predicted values are identical to the actual values, the R2 score should be 1
        y_true = np.array([3, -0.5, 2, 7])
        y_pred = np.array([3, -0.5, 2, 7])
        assert r2_score(y_true, y_pred) == 1

    def test_r2_score_completely_different_values(self):
        # Test with imperfect predictions: SS_res = 11.25 and SS_tot = 29.1875,
        # so the R2 score should be 1 - 11.25 / 29.1875 (about 0.6146).
        # Compared with a tolerance because the value is a floating-point quotient.
        y_true = np.array([3, -0.5, 2, 7])
        y_pred = np.array([2, -1, 1, 10])
        assert r2_score(y_true, y_pred) == pytest.approx(0.6145610278372591)

    def test_r2_score_length_mismatch(self):
        # Test when the length of y_true and y_pred is not the same, it should raise a ValueError
        y_true = np.array([1, 2, 3])
        y_pred = np.array([1, 2, 3, 4])
        with pytest.raises(ValueError):
            r2_score(y_true, y_pred)