#Common imports for data science
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  #For visualizations
import seaborn as sns #For visualizations

#Imports for machine learning 
from sklearn.model_selection import train_test_split  #For validation split

#Imports for feature transformations
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#Imports for building preprocessing object
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

#Imports for regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

#Imports for model metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#Set sklearn output to pandas
from sklearn import set_config
set_config(transform_output = 'pandas')

#Mute warnings
import warnings
warnings.filterwarnings('ignore')


#Read in sample dataset from repo folder
#NOTE: the path is relative, so it is resolved against the current working directory
file_path = "Data/insurance_mod.csv"
df = pd.read_csv(file_path)
#Preview data (the bare expression is what the notebook cell displays)
df.head()


#Separate the feature columns from the 'charges' target
X = df.drop(columns=['charges'])
y = df['charges']


#Hold out a validation split
#A fixed random_state keeps the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

#Preview the training features; the charges column should not appear here
X_train.head()


##Numeric branch of the preprocessor
#Numeric columns are selected by dtype from the training features
num_cols = X_train.select_dtypes(include='number').columns

#Transformers: standardize after filling gaps with the column mean
scaler = StandardScaler()
impute_mean = SimpleImputer(strategy='mean')

#Chain the steps so imputation runs before scaling
num_pipe = make_pipeline(
    impute_mean,
    scaler,
)

#(name, transformer, columns) entry for the column transformer
num_tuple = (
    "Numeric",
    num_pipe,
    num_cols,
)


##Create categorical pipeline
#Define categorical columns (object dtype in the training features)
cat_cols = X_train.select_dtypes('object').columns

#Instantiate transformers
#Missing categories become their own explicit 'Missing' level
impute_missing = SimpleImputer(strategy='constant', fill_value='Missing')
#drop='first' removes one dummy column per feature. When every level is kept,
#the dummies of each feature sum to 1 and are perfectly collinear with the
#intercept of LinearRegression, which produces unstable coefficients on the
#order of 1e16 and predictions that lose floating-point precision.
#Categories unseen during fit are still tolerated: they encode as all zeros,
#i.e. the same as the dropped level.
cat_encode = OneHotEncoder(sparse_output=False, handle_unknown='ignore',
                           drop='first')

#Set categorical pipeline (impute first, then encode)
cat_pipe = make_pipeline(impute_missing, cat_encode)

#Create tuple for column transformer: (name, transformer, columns)
cat_tuple = ("Categorical", cat_pipe, cat_cols)


#Combine the numeric and categorical branches into one preprocessing object
#verbose_feature_names_out=False keeps the branch name out of the output column names
preprocessor = ColumnTransformer(
    transformers=[num_tuple, cat_tuple],
    verbose_feature_names_out=False,
)


#Learn the preprocessing parameters from the training split only
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('Numeric',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['age', 'bmi', 'children', 'smoker'], dtype='object')),
                                ('Categorical',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 Index(['sex', 'region'], dtype='object'))],
                  verbose_feature_names_out=False)

ColumnTransformer(transformers=[('Numeric',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['age', 'bmi', 'children', 'smoker'], dtype='object')),
                                ('Categorical',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 Index(['sex', 'region'], dtype='object'))],
                  verbose_feature_names_out=False)

Index(['age', 'bmi', 'children', 'smoker'], dtype='object')

SimpleImputer()

StandardScaler()

Index(['sex', 'region'], dtype='object')

SimpleImputer(fill_value='Missing', strategy='constant')

OneHotEncoder(handle_unknown='ignore', sparse_output=False)


#Transform training and testing data
#The preprocessor was fit on X_train, so the test split is transformed with
#the parameters learned from the training data
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)

#View preprocessed training data
X_train_tf.head()


#Instantiate the model
lin_reg = LinearRegression()


#Fit the model onto the training data
#(kept as a bare expression so the notebook cell displays the fitted estimator)
lin_reg.fit(X_train_tf, y_train)

LinearRegression()

LinearRegression()


#View the intercept of the line
#(the constant term learned by the fitted model)
lin_reg.intercept_

3.75043980544144e+16


#Determine the coefficients from the regression
#(presumably one value per column of X_train_tf, in column order)
lin_reg.coef_

array([ 3.64288226e+03,  2.04174591e+03,  5.13908932e+02,  9.54795339e+03,
       -3.90284646e+16, -3.90284646e+16,  1.52406651e+15,  1.52406651e+15,
        1.52406651e+15,  1.52406651e+15])


#Predict on the training split first, then on the testing split
y_pred_train, y_pred_test = (
    lin_reg.predict(X_train_tf),
    lin_reg.predict(X_test_tf),
)


#Put the actual and predicted test values side by side, indexed like y_test
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})
#Error is prediction minus actual, so positive values are over-predictions
results = results.assign(Error=results['Predicted'] - results['Actual'])
results


##Evaluating the model with metrics computed in code
#Mean absolute error on each split
train_MAE = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
test_MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training MAE: {train_MAE:,.2f}',
    f'Model Testing MAE: {test_MAE:,.2f}',
    sep='\n',
)

Model Training MAE: 4,180.30
Model Testing MAE: 4,239.17


#Mean squared error on each split
train_MSE = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
test_MSE = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training MSE: {train_MSE:,.2f}',
    f'Model Testing MSE: {test_MSE:,.2f}',
    sep='\n',
)

Model Training MSE: 37,004,850.50
Model Testing MSE: 35,103,353.69


#Calculating RMSE
#RMSE is the square root of MSE. The squared=False argument of
#mean_squared_error is deprecated and removed in newer scikit-learn releases,
#so the root is taken explicitly to keep this cell working on any version.
train_RMSE = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')

Model Training RMSE: 6,083.16
Model Testing RMSE: 5,924.81


#Calculating R2 (coefficient of determination) on each split
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training R2: {train_r2:.2f}',
    f'Model Testing R2: {test_r2:.2f}',
    sep='\n',
)

Model Training R2: 0.74
Model Testing R2: 0.77


#Show all four training metrics together, one per line
print('\n'.join([
    f'Model Training MAE: {train_MAE:,.2f}',
    f'Model Training MSE: {train_MSE:,.2f}',
    f'Model Training RMSE: {train_RMSE:,.2f}',
    f'Model Training R2: {train_r2:.2f}',
]))

Model Training MAE: 4,180.30
Model Training MSE: 37,004,850.50
Model Training RMSE: 6,083.16
Model Training R2: 0.74


#Show all four testing metrics together, one per line
print('\n'.join([
    f'Model Testing MAE: {test_MAE:,.2f}',
    f'Model Testing MSE: {test_MSE:,.2f}',
    f'Model Testing RMSE: {test_RMSE:,.2f}',
    f'Model Testing R2: {test_r2:.2f}',
]))

Model Testing MAE: 4,239.17
Model Testing MSE: 35,103,353.69
Model Testing RMSE: 5,924.81
Model Testing R2: 0.77


#Instantiate the model
#A fixed random_state makes the forest's random sampling, and therefore the
#reported metrics, reproducible (same seed as the validation split)
random_forest = RandomForestRegressor(random_state=42)


#Fit the model on the training data
#(kept as a bare expression so the notebook cell displays the fitted estimator)
random_forest.fit(X_train_tf, y_train)

RandomForestRegressor()

RandomForestRegressor()


#Predict with the random forest on the training split, then the testing split
#NOTE: this rebinds y_pred_train / y_pred_test, replacing the earlier predictions
y_pred_train, y_pred_test = (
    random_forest.predict(X_train_tf),
    random_forest.predict(X_test_tf),
)


#Mean absolute error of the random forest on each split
train_MAE = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
test_MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training MAE: {train_MAE:,.2f}',
    f'Model Testing MAE: {test_MAE:,.2f}',
    sep='\n',
)

Model Training MAE: 1,021.42
Model Testing MAE: 2,596.28


#Mean squared error of the random forest on each split
train_MSE = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
test_MSE = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training MSE: {train_MSE:,.2f}',
    f'Model Testing MSE: {test_MSE:,.2f}',
    sep='\n',
)

Model Training MSE: 3,506,607.41
Model Testing MSE: 22,616,773.35


#Evaluate model using RMSE
#RMSE is the square root of MSE. The squared=False argument of
#mean_squared_error is deprecated and removed in newer scikit-learn releases,
#so the root is taken explicitly to keep this cell working on any version.
train_RMSE = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'Model Training RMSE: {train_RMSE:,.2f}')
print(f'Model Testing RMSE: {test_RMSE:,.2f}')

Model Training RMSE: 1,872.59
Model Testing RMSE: 4,755.71


#R2 (coefficient of determination) of the random forest on each split
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
#Report both values, one per line
print(
    f'Model Training R2: {train_r2:.2f}',
    f'Model Testing R2: {test_r2:.2f}',
    sep='\n',
)

Model Training R2: 0.98
Model Testing R2: 0.85

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	1	southwest	16885.0
1	18	male	33.770	1	0	southeast	1726.0
2	28	male	33.000	3	0	southeast	4449.0
3	33	male	22.705	0	0	northwest	21984.0
4	32	male	28.880	0	0	northwest	3867.0

	age	sex	bmi	children	smoker	region
693	24	male	23.655	0	0	northwest
1297	28	female	26.510	2	0	southeast
634	51	male	39.700	1	0	southwest
1022	47	male	36.080	1	1	southeast
178	46	female	28.900	2	0	southwest

	age	bmi	children	smoker	sex_female	sex_male	region_northwest	region_southeast	region_southwest
693	-1.087167	-1.140875	-0.917500	-0.508399	0.0	1.0	1.0	0.0	0.0
1297	-0.802106	-0.665842	0.743605	-0.508399	1.0	0.0	0.0	1.0	0.0
634	0.836992	1.528794	-0.086947	-0.508399	0.0	1.0	0.0	0.0	1.0
1022	0.551932	0.926476	-0.086947	1.966960	0.0	1.0	0.0	1.0	0.0
178	0.480667	-0.268178	0.743605	-0.508399	1.0	0.0	0.0	0.0	1.0

	Actual	Predicted	Error
764	9095.0	8960.0	-135.0
887	5272.0	7064.0	1792.0
890	29331.0	36904.0	7573.0
1293	9302.0	9488.0	186.0
259	33750.0	26928.0	-6822.0
...	...	...	...
342	13217.0	12808.0	-409.0
308	11945.0	14768.0	2823.0
1128	14358.0	7424.0	-6934.0
503	32548.0	25936.0	-6612.0
1197	5700.0	9136.0	3436.0

Machine Learning 101 - Regression

How to Build and Interpret Regression Models

Overview:

Regression Models Overview:

Regression Models in Code

Import Libraries and Read in Data

Preprocess Data

Model 1: Linear Regression

Different Metrics Explained

Linear Regression Metrics Interpreted

Model 2: Random Forest

Random Forest Metrics Interpreted

Conclusions