Machine Learning 101 - Preprocessing¶

How to Prepare a Dataset for Machine Learning Models¶

Author: Kris Barbier

Overview:¶

This notebook aims to outline the steps needed to prepare a set of data for machine learning, a process called preprocessing.

Preprocessing Overview¶

  • Once a data set has been sufficiently cleaned and explored for the task at hand, it needs to be processed into a form a machine learning model can use to solve the problem. This involves several steps, listed below:

    • Import libraries for machine learning, and load in data set.
    • Perform a validation split (train/test split).
    • Transform features: impute missing values, standardize numeric columns, and encode categorical columns into numeric data types.
    • Use ETL-style pipelines to build a reusable preprocessing object.
    • Finally, use the preprocessor to transform the data into the needed format.
  • After completing these steps, the data will be ready to plug into a machine learning model!

Preprocessing Steps in Code¶

Load Libraries and Read in Data¶

  • Here, we will import common libraries for data science and machine learning. Scikit-learn, or sklearn, is a widely used machine learning library organized into several modules from which we select specific tools.
In [1]:
#Common imports for data science
import pandas as pd
import numpy as np

#Imports for machine learning 
from sklearn.model_selection import train_test_split  #For validation split

#Imports for feature transformations
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#Imports for building preprocessing object
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

#Set sklearn output to pandas
from sklearn import set_config
set_config(transform_output='pandas')

#Mute warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
#Read in sample dataset from repo folder
file_path = "Data/insurance_mod.csv"
df = pd.read_csv(file_path)
#Preview data
df.head()
Out[2]:
   age     sex     bmi  children  smoker     region  charges
0   19  female  27.900         0       1  southwest  16885.0
1   18    male  33.770         1       0  southeast   1726.0
2   28    male  33.000         3       0  southeast   4449.0
3   33    male  22.705         0       0  northwest  21984.0
4   32    male  28.880         0       0  northwest   3867.0
  • Before moving on, we would normally do some EDA (exploratory data analysis) to familiarize ourselves with the data. EDA is not the focus of this lesson, so the sketch below just shows a few typical starting points before we move on to the next step, the validation split.
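  • These are standard pandas inspection methods; which ones you run (and what you look for) depends on the dataset:
In [ ]:
#A few typical EDA starting points (not the focus of this lesson)
df.info()              #Column dtypes and non-null counts
print(df.describe())   #Summary statistics for numeric columns
print(df.isna().sum()) #Missing-value counts per column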

Perform Validation Split¶

  • Now, we are ready to start building our preprocessor. To create a machine learning model we can have confidence in on new, unseen data, we need to perform a validation split. This split uses the majority of the data to train the model, while a smaller subset is reserved for testing. The testing set acts as new/unseen data and gives us a chance to see how well our machine learning models might perform in the real world. This step is essential for supervised machine learning and cannot be skipped!
  • To perform the split, we define two variables: X and y. The y variable is the target, the variable the model will be predicting; in this case, it is charges. X holds everything else, the features.
  • Then, the data is split into training and testing sets. sklearn's default split is 75%/25% for training and testing, respectively (we verify the proportions right after the split below).
In [3]:
#Define X and y variables
y = df['charges']
X = df.drop(columns = 'charges')
In [4]:
#Perform validation split
#Setting a random state will make this reproducible in the future
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Verify the split is correct
X_train.head()  #Note the absence of the charges column from the X_train data
Out[4]:
age sex bmi children smoker region
693 24 male 23.655 0 0 northwest
1297 28 female 26.510 2 0 southeast
634 51 male 39.700 1 0 southwest
1022 47 male 36.080 1 1 southeast
178 46 female 28.900 2 0 southwest
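  • As a quick sanity check on the default proportions (a minimal sketch using the variables defined above):
In [ ]:
#Verify the default 75/25 train/test proportions
print(f"Training rows: {len(X_train) / len(X):.2%}")
print(f"Testing rows:  {len(X_test) / len(X):.2%}")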

Transform Features¶

  • After splitting the data into two sets, we will transform the features into machine-learning-ready variables: imputing missing values (if any), scaling the numeric variables, and encoding the categorical variables.
  • Because machine learning models cannot run when null values are present in a data set, all null values must be filled. As a data scientist, you must choose a strategy for filling the missing values. Common strategies include using the median or mean for numeric values, or a meaningful placeholder for categorical values. Below, I use the mean for numeric columns and a placeholder for categorical columns.
  • Scaling standardizes numeric values to account for the different units of measurement across columns. Standard scaling is a common way to do this: it transforms each column to have a mean of 0 and a standard deviation of 1, computing z = (x - mean) / std.
  • Finally, categorical variables need to be "encoded," which turns them into numeric values. OneHotEncoding is a popular choice for encoding, but not the only one. With OneHotEncoding, each categorical column is split into one new column per category, containing values of 0 and 1: if a row's original value matches the category for the new column, the new value is 1; otherwise it is 0. A small standalone demonstration of both transformers follows this list.
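  • To make scaling and encoding concrete, here is a minimal standalone sketch on a made-up two-column frame (toy_df is hypothetical and not part of the insurance data):
In [ ]:
#Toy demonstration of scaling and encoding (toy_df is made up for illustration)
toy_df = pd.DataFrame({'height_cm': [150.0, 160.0, 170.0],
                       'color': ['red', 'blue', 'red']})

#StandardScaler rescales the column to mean 0, standard deviation 1
print(StandardScaler().fit_transform(toy_df[['height_cm']]))

#OneHotEncoder creates one new 0/1 column per category (color_blue, color_red)
print(OneHotEncoder(sparse_output=False).fit_transform(toy_df[['color']]))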
In [5]:
#Define numeric and categorical columns
num_cols = X_train.select_dtypes('number').columns
cat_cols = X_train.select_dtypes('object').columns
In [6]:
##Instantiate transformers

#Simple Imputer - imputes missing values
#Here are two imputers, one for categorical and one for numeric data
#Categorical imputer:
impute_missing = SimpleImputer(strategy='constant', fill_value='Missing')

#Numeric imputer:
impute_mean = SimpleImputer(strategy='mean')

#Standard scaler:
scaler = StandardScaler()

#OneHotEncoder:
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

Create Pipelines¶

  • The next step will be to create pipelines for each type of data that will include the transformers created above. These pipelines make it easy to control the flow of data into the final preprocessing object.
  • This is a more advanced way to create a preprocessing object, and is meant to mimic the ETL workflows used in many real-world data projects.
In [7]:
##Create two pipelines
#Numerical pipeline
num_pipe = make_pipeline(impute_mean, scaler)

#Categorical pipeline
cat_pipe = make_pipeline(impute_missing, cat_encoder)
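  • A small detail worth knowing: make_pipeline auto-names each step from its lowercased class name, which is why the preprocessor repr below shows 'simpleimputer', 'standardscaler', and 'onehotencoder'. A quick way to see this:
In [ ]:
#make_pipeline names each step after its lowercased class name
print(num_pipe.steps)   #[('simpleimputer', ...), ('standardscaler', ...)]
print(cat_pipe.steps)   #[('simpleimputer', ...), ('onehotencoder', ...)]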

Final Preprocessor¶

  • Here, we will make the final preprocessing object that we will use to transform the data into the format needed for machine learning. The preprocessing object is called a "column transformer," and it is created from a list of tuples, one per pipeline built above, each containing a name, the pipeline, and the columns it applies to.
In [8]:
##Create tuples for each pipeline
##Each tuple holds a name for the pipeline, the pipeline object, and the columns it applies to

#Numeric tuple
num_tuple = ('Numeric', num_pipe, num_cols)

#Categorical tuple
cat_tuple = ('Categorical', cat_pipe, cat_cols)
In [9]:
##Finally, create final preprocessor
preprocessor = ColumnTransformer([num_tuple, cat_tuple], verbose_feature_names_out=False)

#View preprocessor
preprocessor
Out[9]:
ColumnTransformer(transformers=[('Numeric',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['age', 'bmi', 'children', 'smoker'], dtype='object')),
                                ('Categorical',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 Index(['sex', 'region'], dtype='object'))],
                  verbose_feature_names_out=False)

  • When we view the final preprocessor, we can see that it includes all of the transformers we created earlier, each applied to the appropriate type of data (numeric or categorical).

Final Steps: Fit and Transform Data¶

  • With the preprocessing object complete, the next step is to use it to actually transform the data into the format machine learning models need. Until now, the data has not been scaled, imputed, or encoded, so we need to complete this final step before we can start using models.
  • Fitting and transforming are separate operations: fit learns the parameters (means, standard deviations, categories) from the training data, and transform applies them. We never want to fit the preprocessor on the testing data, as that would leak information from the test set and invalidate the split. Fit on the training data only, then transform both sets!
In [10]:
#Fit preprocessor to training data
preprocessor.fit(X_train)
Out[10]:
ColumnTransformer(transformers=[('Numeric',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Index(['age', 'bmi', 'children', 'smoker'], dtype='object')),
                                ('Categorical',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 Index(['sex', 'region'], dtype='object'))],
                  verbose_feature_names_out=False)
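
  • Once fitted, the pieces of the preprocessor can also be inspected programmatically, using the names we gave the tuples (a minimal sketch):
In [ ]:
#Inspect the fitted preprocessor by the names given in the tuples
print(preprocessor.named_transformers_['Numeric'])
print(preprocessor.named_transformers_['Categorical'])

#Column names the preprocessor will output after transforming
print(preprocessor.get_feature_names_out())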

In [11]:
#Transform training and testing data and save as new, transformed variables (tf)
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)

#Verify the change:
X_train_tf.head()
Out[11]:
            age       bmi  children    smoker  sex_female  sex_male  region_northeast  region_northwest  region_southeast  region_southwest
693   -1.087167 -1.140875 -0.917500 -0.508399         0.0       1.0               0.0               1.0               0.0               0.0
1297  -0.802106 -0.665842  0.743605 -0.508399         1.0       0.0               0.0               0.0               1.0               0.0
634    0.836992  1.528794 -0.086947 -0.508399         0.0       1.0               0.0               0.0               0.0               1.0
1022   0.551932  0.926476 -0.086947  1.966960         0.0       1.0               0.0               0.0               1.0               0.0
178    0.480667 -0.268178  0.743605 -0.508399         1.0       0.0               0.0               0.0               0.0               1.0
  • In this final output, we can see that the numeric data was scaled and the categorical data was encoded. The data is now ready for machine learning! As a quick illustration of the payoff, a minimal model-fitting sketch follows.
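  • The sketch below uses LinearRegression purely as an arbitrary example estimator; any sklearn model would slot in the same way:
In [ ]:
#Minimal illustration: fit an example model on the preprocessed data
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_tf, y_train)           #Fit on transformed training data
print(model.score(X_test_tf, y_test))    #R^2 on the transformed testing data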

Conclusion¶

  • Preprocessing is a necessary step to prepare data for machine learning modeling. The steps include:

    1. Validation split: Split the data into two parts, training and testing, so the model can be validated on unseen data.
    2. Transform features based on data types:
    • Numeric data needs missing values imputed and values scaled to account for different units of measurement.
    • Categorical data needs missing values imputed and features encoded into numeric values.
    3. Create pipelines to easily plug data into the preprocessing object.
    4. Use the preprocessor to transform data into the proper format. Never fit the preprocessor on the testing data, only the training data. (A pipeline-based way to enforce this is sketched at the end of this notebook.)
  • For additional questions or more information, contact the author at krisbarbier02@gmail.com
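  • One common way to make rule 4 foolproof in practice is to chain the preprocessor and a model into a single object, so that fit only ever sees training data and predict/score apply transform automatically. A minimal sketch, again using LinearRegression as an arbitrary example:
In [ ]:
#Chain preprocessing and modeling so fit/transform leakage cannot happen:
#.fit() fits the preprocessor and the model on the training data only;
#.score() applies transform (never fit) to the test data before scoring
from sklearn.linear_model import LinearRegression

full_pipe = make_pipeline(preprocessor, LinearRegression())
full_pipe.fit(X_train, y_train)
print(full_pipe.score(X_test, y_test))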
