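Once a data set has been sufficiently cleaned and explored for the task at hand, it needs to be processed in such a way that a machine learning model can use it to solve the problem outlined. This involves several steps, outlined below:
- Separate the features (X) from the target (y) and perform a train/test (validation) split
- Identify the numeric and categorical columns
- Instantiate the transformers (imputers, scaler, and one-hot encoder)
- Build one pipeline per column type
- Combine the pipelines into a single ColumnTransformer preprocessor
- Fit the preprocessor on the training data, then transform both the training and testing data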
After completing these steps, the data will be ready to plug into a machine learning model!
#Common imports for data science
import pandas as pd
import numpy as np
#Imports for machine learning
from sklearn.model_selection import train_test_split #For validation split
#Imports for feature transformations
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
#Imports for building preprocessing object
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
#Set sklearn output to pandas
#(transformers then return DataFrames with column names instead of numpy arrays)
from sklearn import set_config
set_config(transform_output = 'pandas')
#Mute warnings
#NOTE: 'ignore' with no other arguments silences every warning category from this point on
import warnings
warnings.filterwarnings('ignore')
#Read in sample dataset from repo folder
#NOTE: the path is relative, so it is resolved against the current working directory
file_path = "Data/insurance_mod.csv"
df = pd.read_csv(file_path)
#Preview the first rows of the data
df.head()
| | age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | 1 | southwest | 16885.0 |
1 | 18 | male | 33.770 | 1 | 0 | southeast | 1726.0 |
2 | 28 | male | 33.000 | 3 | 0 | southeast | 4449.0 |
3 | 33 | male | 22.705 | 0 | 0 | northwest | 21984.0 |
4 | 32 | male | 28.880 | 0 | 0 | northwest | 3867.0 |
#Separate the features (X) from the target (y); 'charges' is the value to predict
X = df.drop(columns='charges')
y = df['charges']
#Hold out a test set for validation
#Fixing random_state makes the split reproducible on future runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
#Confirm the split worked: the charges column should be absent from X_train
X_train.head()
| | age | sex | bmi | children | smoker | region |
---|---|---|---|---|---|---|
693 | 24 | male | 23.655 | 0 | 0 | northwest |
1297 | 28 | female | 26.510 | 2 | 0 | southeast |
634 | 51 | male | 39.700 | 1 | 0 | southwest |
1022 | 47 | male | 36.080 | 1 | 1 | southeast |
178 | 46 | female | 28.900 | 2 | 0 | southwest |
#Group the feature names by dtype so each group gets its own treatment
num_cols = X_train.select_dtypes(include='number').columns
cat_cols = X_train.select_dtypes(include='object').columns

##Instantiate transformers
#Numeric columns: fill missing values with the column mean, then standardize
impute_mean = SimpleImputer(strategy='mean')
scaler = StandardScaler()
#Categorical columns: fill missing values with the literal label 'Missing',
#then one-hot encode (dense output; categories unseen during fit are ignored)
impute_missing = SimpleImputer(fill_value='Missing', strategy='constant')
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

##Chain the transformers into one pipeline per column type
num_pipe = make_pipeline(impute_mean, scaler)
cat_pipe = make_pipeline(impute_missing, cat_encoder)

##Describe each branch as a (title, pipeline, columns) tuple
num_tuple = ('Numeric', num_pipe, num_cols)
cat_tuple = ('Categorical', cat_pipe, cat_cols)

##Finally, assemble both branches into the final preprocessor
#verbose_feature_names_out=False keeps the output column names free of the branch-title prefix
preprocessor = ColumnTransformer(
    transformers=[num_tuple, cat_tuple],
    verbose_feature_names_out=False,
)
#View preprocessor
preprocessor
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)
Index(['age', 'bmi', 'children', 'smoker'], dtype='object')
SimpleImputer()
StandardScaler()
Index(['sex', 'region'], dtype='object')
SimpleImputer(fill_value='Missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
#Fit preprocessor to training data
#(only X_train is passed here, so the test data plays no part in the fitted imputation and scaling values)
preprocessor.fit(X_train)
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('Numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), Index(['age', 'bmi', 'children', 'smoker'], dtype='object')), ('Categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='Missing', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['sex', 'region'], dtype='object'))], verbose_feature_names_out=False)
Index(['age', 'bmi', 'children', 'smoker'], dtype='object')
SimpleImputer()
StandardScaler()
Index(['sex', 'region'], dtype='object')
SimpleImputer(fill_value='Missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
#Apply the fitted preprocessor to both splits; the "_tf" suffix marks the transformed copies
X_train_tf, X_test_tf = (
    preprocessor.transform(split) for split in (X_train, X_test)
)
#Check the result: numeric columns are scaled and categorical columns are one-hot encoded
X_train_tf.head()
| | age | bmi | children | smoker | sex_female | sex_male | region_northeast | region_northwest | region_southeast | region_southwest |
---|---|---|---|---|---|---|---|---|---|---|
693 | -1.087167 | -1.140875 | -0.917500 | -0.508399 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1297 | -0.802106 | -0.665842 | 0.743605 | -0.508399 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
634 | 0.836992 | 1.528794 | -0.086947 | -0.508399 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1022 | 0.551932 | 0.926476 | -0.086947 | 1.966960 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
178 | 0.480667 | -0.268178 | 0.743605 | -0.508399 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
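Preprocessing is a necessary step in order to prepare data for machine learning modeling. The steps to be taken include:
- Splitting the data into features and target, and into training and testing sets
- Identifying the numeric and categorical columns
- Instantiating the imputers, scaler, and one-hot encoder
- Building a pipeline for each column type and combining them in a ColumnTransformer
- Fitting the preprocessor on the training data and transforming both the training and testing data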
For additional questions or more information, contact the author at krisbarbier02@gmail.com