This notebook will demonstrate how to build and interpret 2 types of regression models: Linear Regression and Random Forests.
#Common imports for data science
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #For visualizations
import seaborn as sns #For visualizations
#Imports for machine learning
from sklearn.model_selection import train_test_split #For validation split
#Imports for feature transformations
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
#Imports for building preprocessing object
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
#Imports for regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
#Imports for model metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#Set sklearn output to pandas
from sklearn import set_config
set_config(transform_output = 'pandas')
#Mute warnings
import warnings
warnings.filterwarnings('ignore')
#Read in sample dataset from repo folder
file_path = "Data/insurance_mod.csv"
df = pd.read_csv(file_path)
#Preview data
df.head()
age | sex | bmi | children | smoker | region | charges | |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | 1 | southwest | 16885.0 |
1 | 18 | male | 33.770 | 1 | 0 | southeast | 1726.0 |
2 | 28 | male | 33.000 | 3 | 0 | southeast | 4449.0 |
3 | 33 | male | 22.705 | 0 | 0 | northwest | 21984.0 |
4 | 32 | male | 28.880 | 0 | 0 | northwest | 3867.0 |
#Define X and y variables
y = df['charges']
X = df.drop(columns = 'charges')
#Perform validation split
#Setting a random state will make this reproducible in the future
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#Verify the split is correct
X_train.head() #Note the absence of the charges column from the X_train data
age | sex | bmi | children | smoker | region | |
---|---|---|---|---|---|---|
693 | 24 | male | 23.655 | 0 | 0 | northwest |
1297 | 28 | female | 26.510 | 2 | 0 | southeast |
634 | 51 | male | 39.700 | 1 | 0 | southwest |
1022 | 47 | male | 36.080 | 1 | 1 | southeast |
178 | 46 | female | 28.900 | 2 | 0 | southwest |
##Create numeric pipeline
#Define numeric columns
num_cols = X_train.select_dtypes('number').columns
#Instantiate transformers
impute_mean = SimpleImputer(strategy='mean')
scaler = StandardScaler()
#Set numeric pipeline
num_pipe = make_pipeline(impute_mean, scaler)
#Create tuple for column transformer
num_tuple = ("Numeric", num_pipe, num_cols)
##Create categorical pipeline
#Define categorical columns
cat_cols = X_train.select_dtypes('object').columns
#Instantiate transformers
impute_missing = SimpleImputer(strategy='constant', fill_value='Missing')
cat_encode = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
#Set categorical pipeline
cat_pipe = make_pipeline(impute_missing, cat_encode)
#Create tuple for column transformer
cat_tuple = ("Categorical", cat_pipe, cat_cols)
#Finalize preprocessing object
preprocessor = ColumnTransformer([num_tuple, cat_tuple], verbose_feature_names_out=False)
#Fit preprocessor on training data
preprocessor.fit(X_train)
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)
Index(['age', 'bmi', 'children', 'smoker'], dtype='object')
SimpleImputer()
StandardScaler()
Index(['sex', 'region'], dtype='object')
SimpleImputer(fill_value='Missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
#Transform training and testing data
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)
#View preprocessed training data
X_train_tf.head()
age | bmi | children | smoker | sex_female | sex_male | region_northeast | region_northwest | region_southeast | region_southwest | |
---|---|---|---|---|---|---|---|---|---|---|
693 | -1.087167 | -1.140875 | -0.917500 | -0.508399 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1297 | -0.802106 | -0.665842 | 0.743605 | -0.508399 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
634 | 0.836992 | 1.528794 | -0.086947 | -0.508399 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1022 | 0.551932 | 0.926476 | -0.086947 | 1.966960 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
178 | 0.480667 | -0.268178 | 0.743605 | -0.508399 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
#Instantiate the model
lin_reg = LinearRegression()
#Fit the model onto the training data
lin_reg.fit(X_train_tf, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
#View the intercept of the line
lin_reg.intercept_
3.75043980544144e+16
#Determine the coeffcients from the regression
lin_reg.coef_
array([ 3.64288226e+03, 2.04174591e+03, 5.13908932e+02, 9.54795339e+03, -3.90284646e+16, -3.90284646e+16, 1.52406651e+15, 1.52406651e+15, 1.52406651e+15, 1.52406651e+15])
#Get predictions for training data
y_pred_train = lin_reg.predict(X_train_tf)
#Get predictions for testing data
y_pred_test = lin_reg.predict(X_test_tf)
#View results of the testing predictions in a new data frame
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test,
'Error': y_pred_test - y_test})
results
Actual | Predicted | Error | |
---|---|---|---|
764 | 9095.0 | 8960.0 | -135.0 |
887 | 5272.0 | 7064.0 | 1792.0 |
890 | 29331.0 | 36904.0 | 7573.0 |
1293 | 9302.0 | 9488.0 | 186.0 |
259 | 33750.0 | 26928.0 | -6822.0 |
... | ... | ... | ... |
342 | 13217.0 | 12808.0 | -409.0 |
308 | 11945.0 | 14768.0 | 2823.0 |
1128 | 14358.0 | 7424.0 | -6934.0 |
503 | 32548.0 | 25936.0 | -6612.0 |
1197 | 5700.0 | 9136.0 | 3436.0 |
335 rows × 3 columns
##Using code to derive metrical evaluation
#Calculating MAE
train_MAE = mean_absolute_error(y_train, y_pred_train)
test_MAE = mean_absolute_error(y_test, y_pred_test)
print(f'Model Training MAE: {train_MAE:,.2f}')
print(f'Model Testing MAE: {test_MAE:,.2f}')
Model Training MAE: 4,180.30 Model Testing MAE: 4,239.17
#Calculating MSE
train_MSE = mean_squared_error(y_train, y_pred_train)
test_MSE = mean_squared_error(y_test, y_pred_test)
print(f'Model Training MSE: {train_MSE:,.2f}')
print(f'Model Testing MSE: {test_MSE:,.2f}')
Model Training MSE: 37,004,850.50 Model Testing MSE: 35,103,353.69
#Calculating RMSE
train_RMSE = mean_squared_error(y_train, y_pred_train, squared=False)
test_RMSE = mean_squared_error(y_test, y_pred_test, squared=False)
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')
Model Training RMSE: 6,083.16 Model Testing RMSE: 5,924.81
#Calcuating R2
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print(f'Model Training R2: {train_r2:.2f}')
print(f'Model Testing R2: {test_r2:.2f}')
Model Training R2: 0.74 Model Testing R2: 0.77
#View training metrics altogether
print(f'Model Training MAE: {train_MAE:,.2f}')
print(f'Model Training MSE: {train_MSE:,.2f}')
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Training R2: {train_r2:.2f}')
Model Training MAE: 4,180.30 Model Training MSE: 37,004,850.50 Model Training RMSE: 6,083.16 Model Training R2: 0.74
#View testing metrics altogether
print(f'Model Testing MAE: {test_MAE:,.2f}')
print(f'Model Testing MSE: {test_MSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')
print(f'Model Testing R2: {test_r2:.2f}')
Model Testing MAE: 4,239.17 Model Testing MSE: 35,103,353.69 Model Testing RMSE: 5,924.81 Model Testing R2: 0.77
#Instantiate the model
random_forest = RandomForestRegressor()
#Fit the model on the training data
random_forest.fit(X_train_tf, y_train)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
#Get predictions for training data
y_pred_train = random_forest.predict(X_train_tf)
#Get predictions for testing data
y_pred_test = random_forest.predict(X_test_tf)
#Evaluate model using MAE
train_MAE = mean_absolute_error(y_train, y_pred_train)
test_MAE = mean_absolute_error(y_test, y_pred_test)
print(f'Model Training MAE: {train_MAE:,.2f}')
print(f'Model Testing MAE: {test_MAE:,.2f}')
Model Training MAE: 1,021.42 Model Testing MAE: 2,596.28
#Evaluate model using MSE
train_MSE = mean_squared_error(y_train, y_pred_train)
test_MSE = mean_squared_error(y_test, y_pred_test)
print(f'Model Training MSE: {train_MSE:,.2f}')
print(f'Model Testing MSE: {test_MSE:,.2f}')
Model Training MSE: 3,506,607.41 Model Testing MSE: 22,616,773.35
#Evaluate model using RMSE
train_RMSE = mean_squared_error(y_train, y_pred_train, squared=False)
test_RMSE = mean_squared_error(y_test, y_pred_test, squared=False)
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')
Model Training RMSE: 1,872.59 Model Testing RMSE: 4,755.71
#Evaluate model using R2
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print(f'Model Training R2: {train_r2:.2f}')
print(f'Model Testing R2: {test_r2:.2f}')
Model Training R2: 0.98 Model Testing R2: 0.85