import pandas as pd

# Read the diabetes table from disk and preview its first rows.
diabetes_path = 'data/diabetes.csv'
df = pd.read_csv(diabetes_path)
df.head()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score

# Split the frame into labels (the 'class' column) and features (all other columns).
y = df['class']
X = df.drop(columns='class')

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=27,
)

# Each transformer is fitted on the training split only, then applied to it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

pca = PCA(n_components=3).fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)

model = LogisticRegression().fit(X_train_pca, y_train)

# Test portion
# The already-fitted scaler and PCA are reused; nothing is refitted on test data.
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

y_pred = model.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')

Test set accuracy: 0.6948051948051948

from sklearn.pipeline import Pipeline

# The same scale -> PCA -> classify chain, expressed as a single Pipeline so
# that fitting and predicting each take one call.
steps = [
    ('scaling', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=steps).fit(X_train, y_train)

# The pipeline applies every fitted step to the test rows before predicting.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')

Test set accuracy: 0.6948051948051948

from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest

# Two feature views computed from the same scaled input and concatenated:
# 3 principal components plus the 6 columns kept by SelectKBest.
feature_union = FeatureUnion(transformer_list=[
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
])

# Scale first, build the combined feature set, then classify.
pipeline = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('features', feature_union),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')

Test set accuracy: 0.7337662337662337

# Switch to the heart-disease table and preview it.
df = pd.read_csv('data/heart_disease.csv')
df.head()

# Labels come from the 'target' column (as a NumPy array); features are the rest.
y = df['target'].values
X = df.drop(columns='target')

# Default split proportions, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Column groups: each group is routed to its own transformer below.
numerical_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_features = [
    "sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal",
]

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_transform = StandardScaler()
categorical_transform = OneHotEncoder(handle_unknown="ignore")

transformers = [
    ('numeric', numeric_transform, numerical_features),
    ('categorical', categorical_transform, categorical_features),
]
preprocessor = ColumnTransformer(transformers)

# Preprocessing and the classifier are fitted together as one estimator.
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exang', 'slope',
                                                   'ca', 'thal'])])),
                ('classifier', LogisticRegression())])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exang', 'slope',
                                                   'ca', 'thal'])])),
                ('classifier', LogisticRegression())])

ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                 ['age', 'trestbps', 'chol', 'thalach',
                                  'oldpeak']),
                                ('categorical',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['sex', 'cp', 'fbs', 'restecg', 'exang',
                                  'slope', 'ca', 'thal'])])

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

StandardScaler()

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

OneHotEncoder(handle_unknown='ignore')

LogisticRegression()

from sklearn.metrics import accuracy_score
# Predict on the held-out rows and compare against the true labels.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.618421052631579

# Initial data
# Preview of the raw feature frame, before any preprocessing.
X.head()

# Preprocessed data
# `preprocessor` is the ColumnTransformer used as the first pipeline step; it is
# called here without a separate fit, relying on the earlier `pipeline.fit(...)`.
X_preprocessed = preprocessor.transform(X)
# Show the first transformed row (scaled numeric values followed by one-hot columns).
X_preprocessed[0]

array([ 0.92303809,  0.78129791, -0.2614227 ,  0.08134766,  1.07875303,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ])

# Display HTML representation in a jupyter context
from sklearn import set_config
# Switch scikit-learn's estimator display to the interactive diagram form.
set_config(display='diagram')

# A bare expression at the end of a notebook cell is echoed, so this renders the pipeline.
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exang', 'slope',
                                                   'ca', 'thal'])])),
                ('classifier', LogisticRegression())])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exang', 'slope',
                                                   'ca', 'thal'])])),
                ('classifier', LogisticRegression())])

ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                 ['age', 'trestbps', 'chol', 'thalach',
                                  'oldpeak']),
                                ('categorical',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['sex', 'cp', 'fbs', 'restecg', 'exang',
                                  'slope', 'ca', 'thal'])])

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

StandardScaler()

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

OneHotEncoder(handle_unknown='ignore')

LogisticRegression()

# Or, save the HTML to a file
from sklearn.utils import estimator_html_repr

# Write as UTF-8 explicitly: without `encoding`, open() uses the platform
# default (e.g. cp1252 on Windows), which may fail on or garble non-ASCII
# characters in the generated HTML.
with open('img/model_pipeline.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(pipeline))

from sklearn.model_selection import GridSearchCV, cross_val_score

# Impute -> scale -> PCA -> logistic regression; PCA's size and the classifier's
# settings are left at their defaults here and tuned by the grid search below.
search_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler()),
    ('pca', PCA()),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=search_steps)

# Define the parameter grid
# Keys follow the "<step name>__<parameter>" convention to reach into the pipeline.
param_grid = {
    'pca__n_components': [2, 3, 4],
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs'],
}

# GridSearchCV with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Score: {grid_search.best_score_}')

# Evaluate the test set with the best model found
best_pipeline = grid_search.best_estimator_
acc = best_pipeline.score(X_test, y_test)
print(f'Test set accuracy with best parameters: {acc}')

Best Parameters: {'classifier__C': 0.1, 'classifier__solver': 'liblinear', 'pca__n_components': 3}
Best Cross-Validation Score: 0.6125603864734299
Test set accuracy with best parameters: 0.5921052631578947

from sklearn.svm import SVC

# Template pipeline; the final step is swapped out by the grid below.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', LogisticRegression()),  # Placeholder for classifier
])

# Value lists shared by several of the sub-grids.
pca_sizes = [2, 3, 4]
c_values = [0.1, 1, 10, 100]

# One sub-grid per candidate model: the 'classifier' key replaces the whole
# final step, and the 'classifier__*' keys tune that model's own parameters.
logreg_grid = {
    'pca__n_components': pca_sizes,
    'classifier': [LogisticRegression()],
    'classifier__C': c_values,
    'classifier__solver': ['liblinear', 'lbfgs'],
}
svc_grid = {
    'pca__n_components': pca_sizes,
    'classifier': [SVC()],
    'classifier__C': c_values,
    'classifier__kernel': ['linear', 'rbf'],
}
ridge_grid = {
    'pca__n_components': pca_sizes,
    'classifier': [RidgeClassifier()],
    'classifier__alpha': [0.1, 0.01, 1.0],
}
param_grid = [logreg_grid, svc_grid, ridge_grid]


grid_search = GridSearchCV(
    estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__solver': ['liblinear', 'lbfgs'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [SVC()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__kernel': ['linear', 'rbf'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [RidgeClassifier()],
                          'classifier__alpha': [0.1, 0.01, 1.0],
                          'pca__n_components': [2, 3, 4]}],
             scoring='accuracy')

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__solver': ['liblinear', 'lbfgs'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [SVC()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__kernel': ['linear', 'rbf'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [RidgeClassifier()],
                          'classifier__alpha': [0.1, 0.01, 1.0],
                          'pca__n_components': [2, 3, 4]}],
             scoring='accuracy')

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('pca', PCA(n_components=2)), ('classifier', SVC(C=10))])

SimpleImputer()

StandardScaler()

PCA(n_components=2)

SVC(C=10)

# Report the winning configuration and its cross-validation score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

Best Parameters: {'classifier': SVC(), 'classifier__C': 10, 'classifier__kernel': 'rbf', 'pca__n_components': 2}
Best Cross-Validation Score: 0.6302415458937198

# Score the best pipeline found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.5921052631578947

import pickle        # Built-in python module

# Create some object and manipulate it in some way (e.g. train the model)
# NOTE: `SomeClass` and `some_method` are illustrative placeholders; they are not
# defined in the visible code, so this snippet is a template rather than runnable code.
myobj = SomeClass(...)
myobj = myobj.some_method(...)

# Save to a file using Pickle
# The file is opened in binary write mode ('wb'); `with` closes it afterwards.
with open('myfile.pickle', 'wb') as file_handle:
    pickle.dump(myobj, file_handle)

import pickle 

# Save the model
# Persist the tuned, fitted pipeline selected by the grid search. The bare
# `pipeline` object is only the unfitted template handed to GridSearchCV
# (which fits clones of it), so pickling it would store a model that cannot
# predict after loading.
with open('saved_models/pipeline.pickle', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

# Load the model
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that you created yourself or otherwise trust.
with open('saved_models/pipeline.pickle', 'rb') as f:
    pipeline_loaded = pickle.load(f)

# Echo the restored estimator (rendered by the notebook).
pipeline_loaded

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('pca', PCA()), ('classifier', LogisticRegression())])

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('pca', PCA()), ('classifier', LogisticRegression())])

SimpleImputer()

StandardScaler()

PCA()

LogisticRegression()

import joblib

# Create some object and manipulate it in some way (e.g. train the model)
# NOTE: `SomeClass`, `some_method` and `file_path` are illustrative placeholders;
# they are not defined in the visible code, so this snippet is a template only.
myobj = SomeClass(...)
myobj = myobj.some_method(...)

# Save to a file using Joblib
# Unlike the pickle template, joblib.dump is given the target path directly.
joblib.dump(myobj, file_path)

import joblib

# Persist the whole GridSearchCV object with joblib, then read it back.
model_path = 'saved_models/pipeline.can'
joblib.dump(grid_search, model_path)

# Load the model
pipeline_loaded = joblib.load(model_path)

pipeline_loaded

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__solver': ['liblinear', 'lbfgs'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [SVC()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__kernel': ['linear', 'rbf'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [RidgeClassifier()],
                          'classifier__alpha': [0.1, 0.01, 1.0],
                          'pca__n_components': [2, 3, 4]}],
             scoring='accuracy')

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__solver': ['liblinear', 'lbfgs'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [SVC()],
                          'classifier__C': [0.1, 1, 10, 100],
                          'classifier__kernel': ['linear', 'rbf'],
                          'pca__n_components': [2, 3, 4]},
                         {'classifier': [RidgeClassifier()],
                          'classifier__alpha': [0.1, 0.01, 1.0],
                          'pca__n_components': [2, 3, 4]}],
             scoring='accuracy')

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('pca', PCA(n_components=2)), ('classifier', SVC(C=10))])

SimpleImputer()

StandardScaler()

PCA(n_components=2)

SVC(C=10)

	preg	plas	pres	skin	test	mass	pedi	age	class
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

Feature	Description
`age`	Age of the patient in years
`sex`	Sex of the patient (1 = male, 0 = female)
`cp`	Chest pain type (1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic)
`trestbps`	Resting blood pressure in mm Hg
`chol`	Serum cholesterol in mg/dl
`fbs`	Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
`restecg`	Resting electrocardiographic results (0: normal, 1: ST-T wave abnormality, 2: probable left ventricular hypertrophy)
`thalach`	Maximum heart rate achieved
`exang`	Exercise-induced angina (1 = yes, 0 = no)
`oldpeak`	ST depression induced by exercise relative to rest
`slope`	Slope of the peak exercise ST segment (1: upsloping, 2: flat, 3: downsloping)
`ca`	Number of major vessels (0-3) colored by fluoroscopy
`thal`	Thalassemia (3 = normal, 6 = fixed defect, 7 = reversible defect)
`target`	Diagnosis of heart disease (0 = no disease; values 1–4 = disease present, so this is a multi-class label in the data shown)

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	ca	thal	target
0	63.0	1.0	1.0	145.0	233.0	1.0	2.0	150.0	0.0	2.3	3.0	0.0	6.0	0
1	67.0	1.0	4.0	160.0	286.0	0.0	2.0	108.0	1.0	1.5	2.0	3.0	3.0	2
2	67.0	1.0	4.0	120.0	229.0	0.0	2.0	129.0	1.0	2.6	2.0	2.0	7.0	1
3	37.0	1.0	3.0	130.0	250.0	0.0	0.0	187.0	0.0	3.5	3.0	0.0	3.0	0
4	41.0	0.0	2.0	130.0	204.0	0.0	2.0	172.0	0.0	1.4	1.0	0.0	3.0	0

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	ca	thal
0	63.0	1.0	1.0	145.0	233.0	1.0	2.0	150.0	0.0	2.3	3.0	0.0	6.0
1	67.0	1.0	4.0	160.0	286.0	0.0	2.0	108.0	1.0	1.5	2.0	3.0	3.0
2	67.0	1.0	4.0	120.0	229.0	0.0	2.0	129.0	1.0	2.6	2.0	2.0	7.0
3	37.0	1.0	3.0	130.0	250.0	0.0	0.0	187.0	0.0	3.5	3.0	0.0	3.0
4	41.0	0.0	2.0	130.0	204.0	0.0	2.0	172.0	0.0	1.4	1.0	0.0	3.0

Lighthouse Labs

W7D5 - Pipelines and Model Persistence

Overview - Pipelines

Why Use a Pipeline?

Using pipelines

Without a Pipeline

The solution: Sklearn Pipelines

Feature unions

Column transformers

Visualizing pipelines

Hyperparameter tuning with pipelines

Model Persistence

Pickle

Features

Limitations

Saving procedure

Loading procedure

Methods

Example

Joblib

Saving procedure

Loading procedure

Example