Lighthouse Labs

W4D5 - Introduction to Machine Learning (ML)

Instructor: Socorro E. Dominguez-Vidana
LHL


Overview:

  • Machine Learning
    • Supervised vs. Unsupervised Learning
  • Supervised Learning
    • X and y
    • Regression vs. Classification
    • The golden rule: train/test split
  • Simple Linear Regression
  • Polynomial Regression

Machine Learning¶

What is Machine Learning (ML)?¶

  • A subset of Artificial Intelligence.
  • ML algorithms learn from data (training data) to make predictions or decisions without explicit programming.
  • Models are built based on sample data to adapt and improve over time.
    "The field of study that gives computers the ability to learn without being explicitly programmed." — Arthur Samuel (1959)

Traditional programming vs machine learning

Types of Machine Learning¶

Supervised Learning¶

  • Involves training models using labeled data, where the correct output is known. The algorithm learns to map inputs (X) to the desired outputs (y).

Author unknown. (n.d.). Supervised vs Unsupervised Machine Learning. Medium.

Unsupervised Learning¶

  • Works with unlabeled data, where the algorithm explores patterns and structures within the data without knowing the correct answers (a minimal clustering sketch follows below).

Author unknown. (n.d.). Data Science vs. Machine Learning. Medium.
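To make this concrete, here is a minimal, illustrative sketch (not part of the lecture's own code) that clusters unlabeled synthetic points with sklearn's KMeans; the algorithm groups the points without ever seeing a target y:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# Two loose groups of unlabeled 2-D points; note that no target y exists.
X_unlabeled = np.vstack([
    rng.normal(loc=[0, 0], scale=0.5, size=(50, 2)),
    rng.normal(loc=[5, 5], scale=0.5, size=(50, 2)),
])

kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_unlabeled)   # cluster labels discovered from structure alone
print(clusters[:10])
print(kmeans.cluster_centers_)               # centers should land near (0, 0) and (5, 5)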

Supervised Learning: Regression & Classification¶

Classification Problems¶

  • Classification is about assigning items to specific categories or groups, also called classes.
  • The goal is to predict which class an input belongs to, based on patterns in the data (a short code sketch follows this list).
    • Binary Classification: Involves two possible outcomes.
      • Predicting whether a patient has liver disease or not.
    • Multi-class Classification: Involves more than two classes.
      • Predicting a student’s letter grade (A, B, C, D, or F).
  • Real-world applications:
    • Spam detection in emails (classify as spam or not spam).
    • Image recognition (classify objects or people in a photo).
    • Sentiment analysis (classify text as positive, negative, or neutral).
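As a quick illustration (a hypothetical sketch on synthetic data, not the lecture's dataset), a binary classifier such as sklearn's LogisticRegression is trained on labeled examples and then predicts a discrete class for new inputs:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic labeled data (illustrative names): X_demo holds features, y_demo the known class (0 or 1).
X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=42)
X_demo_train, X_demo_test, y_demo_train, y_demo_test = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42
)

clf = LogisticRegression()
clf.fit(X_demo_train, y_demo_train)            # learn the mapping from features to class labels
print(clf.predict(X_demo_test[:5]))            # discrete class predictions (0 or 1)
print(clf.score(X_demo_test, y_demo_test))     # accuracy on unseen data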

Regression Problems¶

  • Regression is used when the task is to predict a continuous value rather than a discrete category.
  • The goal is to model the relationship between the input variables and the output to make numerical predictions.
    • Linear Regression: The relationship between input features and the predicted value is modeled as a straight line.
      • Predicting house prices based on features like square footage, number of bedrooms, or location.
    • Non-linear Regression: The relationship is more complex and doesn’t follow a straight line.
      • Predicting the growth of a population over time.
  • Real-world applications:
    • Forecasting stock prices based on historical data.
    • Predicting the amount of rainfall based on weather conditions like temperature and humidity.

Mukherjee, A. (n.d.). AI Part 3: Regression vs Classification Models. LinkedIn.

The Golden Rule of Supervised Learning¶

  • Once you've identified your features (X) and target (y), it's crucial to split your data into two sets: training and testing.

  • You should only work with the training data when building your model.

    • If you involve test data in your decision-making (e.g., selecting or dropping features), you're allowing the test set to influence your choices.
    • This can cause data leakage, meaning your model has been indirectly exposed to the test data.
    • As a result, your model's performance won't reflect how it would perform on unseen data in real-world scenarios.

By keeping the test data untouched, your evaluation will be more reliable and representative of how the model generalizes to new data.

Training vs. Test Scores¶

  • When evaluating a model, we typically look at two scores: training score vs. test score.
  • The test score is more important because it reflects how the model performs on unseen data.
  • Good models that generalize well will have similar training and test scores.
  • Our goal is to choose models that can generalize well to new, unseen data, not just memorize the training data.

The fundamental tradeoff¶

Model          Training score relative to test score           Performance
Too Complex    High training score compared to test score      Overfit
Too Simple     Low training score and low test score           Underfit
  • Models with extremely high training scores that have learned highly complex relationships in the training data can be overfit.
  • On the other hand, models with low training scores that are very simple may not have learned the relationships in the training data needed to predict well on unseen data; they are underfit. (A short code sketch comparing the two cases follows below.)
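The sketch below uses illustrative toy data (not the coffee dataset used later in this lecture): fit a too-simple and a very complex polynomial model on the same curved data and compare their train and test scores:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
X_toy = np.sort(rng.uniform(0, 3, 60)).reshape(-1, 1)
y_toy = np.sin(2 * X_toy).ravel() + rng.normal(0, 0.2, 60)   # curved relationship + noise
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, test_size=0.3, random_state=0)

for degree in [1, 15]:   # too simple vs. very complex
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(X_tr, y_tr)
    print(degree, model.score(X_tr, y_tr), model.score(X_te, y_te))
# Typically the degree-1 model scores low on both sets (underfit), while the
# degree-15 model scores much higher on the training set than on the test set (overfit).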

Sahu, P. (2020, September 9). Techniques for handling underfitting and overfitting in machine learning. Towards Data Science.

Empirical Risk Minimization (ERM)¶

  • In ML we aim to minimize the empirical risk (the average loss over the training data); a small worked example follows below.
  • The goal is to approximate the best possible model by minimizing the loss function on the training set.
  • However, errors can occur at different stages of the learning process.
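For example, with squared-error loss the empirical risk is just the mean squared error over the training set. The numbers below are purely illustrative:

import numpy as np

y_train_toy = np.array([3.0, 5.0, 7.0])   # toy training targets
y_hat_toy = np.array([2.5, 5.5, 6.0])     # a model's predictions for the same rows

empirical_risk = np.mean((y_train_toy - y_hat_toy) ** 2)   # average squared-error loss
print(empirical_risk)   # 0.5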

Three Main Errors¶

Approximation Error¶

  • This error occurs when the model is too simple to capture the true underlying function.
  • It represents the gap between the best possible model and the true function.
  • Example: Using a linear model to fit a highly non-linear function.
  • To minimize error (future lectures):
    • Use more complex models with higher capacity.
    • Apply feature engineering to capture more relevant information.

Estimation Error¶

  • This error arises from the fact that we only have a finite sample of data.
  • The model learned on the training data may not generalize well to unseen data.
  • Common causes include a small training set or sampling noise.
  • Overfitting to the training data leads to poor performance on unseen data.
  • To minimize error (future lectures):
    • Increase the size of the training dataset.
    • Use regularization techniques to prevent overfitting.
    • Cross-validation for better model selection (a small preview sketch follows this list).
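As a preview of cross-validation (covered properly in future lectures), here is a minimal sketch on synthetic data: the validation score is averaged over several folds instead of relying on a single split:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(1)
X_cv = rng.uniform(0, 10, size=(100, 1))
y_cv = 2.5 * X_cv.ravel() + rng.normal(0, 1, 100)   # noisy linear toy data

scores = cross_val_score(LinearRegression(), X_cv, y_cv, cv=5)   # R^2 on each of 5 folds
print(scores, scores.mean())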

Optimization Error¶

  • This error occurs when the algorithm fails to find the global minimum of the loss function during training.
  • This can happen when the optimization process gets stuck in local minima or saddle points.
  • To minimize error (future lectures):
    • Use more effective optimization algorithms (e.g., Adam, RMSprop).
    • Tune hyperparameters carefully.

Fundamental Trade-off¶

  • Minimizing the approximation error (here measured as the gap between the test and training errors) helps ensure that our model generalizes well to unseen data.

$$E_{approx} = E_{test} - E_{train}$$

  • There is often a trade-off between model complexity and test error:

    • A more complex model can fit the specific patterns in the training data, increasing $E_{approx}$.
    • This can lead to overfitting, where the model doesn't generalize well to new data.
  • As model complexity increases, $E_{approx}$ tends to grow, making it harder for the model to generalize.

  • However, more data can reduce $E_{approx}$ and improve generalization (a small illustrative computation follows below).
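A small illustrative sketch of this quantity, using mean squared error as the error measure (an assumption for this sketch) and synthetic data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(2)
X_gap = rng.uniform(0, 10, size=(200, 1))
y_gap = 3.0 * X_gap.ravel() + rng.normal(0, 2, 200)   # noisy linear toy data

X_tr, X_te, y_tr, y_te = train_test_split(X_gap, y_gap, test_size=0.2, random_state=2)
model = LinearRegression().fit(X_tr, y_tr)

E_train = mean_squared_error(y_tr, model.predict(X_tr))
E_test = mean_squared_error(y_te, model.predict(X_te))
print(E_test - E_train)   # a small gap suggests good generalization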


Bias-Variance Trade-off¶

  • Bias error: Error due to incorrect assumptions in the learning model.

    • High bias leads to underfitting (the model fails to capture key patterns).
  • Variance: Error due to the model's sensitivity to small fluctuations in training data.

    • High variance leads to overfitting (the model captures noise instead of the true signal).
  • Key balance: The goal is to find the right balance between bias and variance for a model that generalizes well.

Cornell University. (2018). Lecture 12: Bias/Variance Tradeoff. Cornell University CS4780: Machine Learning.

Linear Regression¶

  • Linear regression is one of the most basic and popular ML/statistical techniques.
  • Used as a predictive model.
  • Assumes a linear relationship between the dependent variable (the variable we are trying to predict/estimate, y) and the independent variable(s) (the input variable(s) used in the prediction, X).

Simple Linear Regression¶

  • Only one independent/input variable is used to predict the dependent variable.

$$\hat{y} = wx + b$$

$\hat{y}$ = Predicted value of the dependent variable

$b$ = Intercept (constant term)

$w$ = Coefficient (weight) of the independent variable

$x$ = Independent variable

Multiple Linear Regression¶

  • Many $x$'s and $w$'s

$$\hat{y} = w_1x_1 + w_2x_2 + \dots + w_dx_d + b$$

  • The larger the value of $w_i$, the more influence $x_i$ has on the target $\hat{y}$

Matrix representation¶

  • $\hat{y}$ is the linear function of features $x$ and weights $w$.

$$\hat{y} = w^Tx + b$$

  • $\hat{y} \rightarrow$ prediction
  • $w \rightarrow$ weight vector
  • $b \rightarrow$ bias
  • $x \rightarrow$ features

$$\hat{y} = \begin{bmatrix}w_1 & w_2 & \cdots & w_d\end{bmatrix}\begin{bmatrix}x_1 \\ x_2 \\ \vdots \\ x_d\end{bmatrix} + b$$
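To make the matrix form concrete, here is a tiny numeric check with made-up numbers (purely illustrative):

import numpy as np

w = np.array([2.0, -1.0, 0.5])   # weight vector
x = np.array([3.0, 4.0, 2.0])    # feature vector for one observation
b = 1.0                          # bias / intercept

y_hat = w @ x + b                # dot product plus bias
print(y_hat)                     # 2*3 + (-1)*4 + 0.5*2 + 1 = 4.0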

The Coffee-Productivity Dilemma: A Data-Driven Approach¶

Sarah is a team leader at a marketing firm. She has noticed a strange trend in her team's productivity. Some days, her team is incredibly efficient, completing projects ahead of schedule, while other days they struggle to meet even the simplest deadlines. After a few weeks of observation, Sarah notices that on days when her team seems especially productive, there are a lot more coffee cups piling up in the office trash bin.

Curious about the potential link between coffee consumption and work output, Sarah decides to investigate. She wonders, "Could there be a relationship between how much coffee my team drinks and their productivity (how many tasks are completed) throughout the day?"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Step 1: Load the data¶
In [2]:
df = pd.read_csv('data/coffee_productivity.csv', usecols = ['Coffee_Cups', 'Productivity'])
df.head()
Out[2]:
Coffee_Cups Productivity
0 7 24.851737
1 4 12.687456
2 8 21.917630
3 5 19.208558
4 7 17.402905
Step 2: Identify the features X and the target y¶
In [3]:
X = df['Coffee_Cups'].values.reshape(-1, 1)
y = df['Productivity'].values
Step 3: Split into train and test sets.¶
  • Using sklearn's train_test_split.
  • It shuffles the data and then splits it.
  • 80/20, 75/25, 70/30 are common splits.
In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Step 4: Choosing the model¶

Our problem's target is a numerical value. We need a regressor.

In [5]:
from sklearn.linear_model import LinearRegression
lr_sample = LinearRegression()
lr_sample.fit(X_train, y_train)
Out[5]:
LinearRegression()
Step 5: Observe some of the outputs in the train set.¶
In [6]:
lr_sample.predict(X_train)[:5]
Out[6]:
array([17.62058075, 20.11357862, 25.09957436, 12.634585  , 27.59257223])
In [7]:
y_train[:5]
Out[7]:
array([18.7002218 , 23.09371016, 27.84047591,  9.24645982, 30.72015875])
Step 6: Observe some of the outputs in the test set.¶
In [8]:
lr_sample.predict(X_test)[:5]
Out[8]:
array([17.62058075, 27.59257223, 20.11357862, 25.09957436, 12.634585  ])
In [9]:
y_test[:5]
Out[9]:
array([16.21165539, 28.80351029, 23.433057  , 26.89351191, 13.10162284])
Step 7: Compare the scores¶
In [10]:
lr_sample.score(X_train, y_train)
Out[10]:
0.8309714784972858
In [11]:
lr_sample.score(X_test, y_test)
Out[11]:
0.82765549336117
In [12]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

axes[0].scatter(X_train, y_train, marker='o', s=2)
axes[0].set_title('Coffee vs Productivity', fontsize=16)
axes[0].set_xlabel('Cups of Coffee')
axes[0].set_ylabel('Productivity (completed tasks)')

axes[1].scatter(X_train, y_train, marker='o', alpha=0.5, s=2)
axes[1].plot(X_train, (lr_sample.coef_ * X_train) + lr_sample.intercept_, c='black')
axes[1].set_title('Coffee vs Productivity (with regression)', fontsize=16)
axes[1].set_xlabel('Cups of Coffee')
axes[1].set_ylabel('Productivity (completed tasks)')
axes[1].text(
    x=0.4,
    y=0.08,
    s=f'$\\hat{{y}}={lr_sample.coef_[0]:.2f}x+{lr_sample.intercept_:.2f}$',  # annotate with the fitted slope and intercept
    fontsize=18,
    fontweight='bold',
    transform=axes[1].transAxes
)

plt.show()
  • The intuition behind Linear Regression is in the coefficients and intercept.

  • Some literature refers to the coefficients as weights and the intercept as the bias. These are the parameters that are learned during fit, or training.

  • In sklearn you can access them with the attributes .coef_ and .intercept_

In [13]:
lr_sample.coef_
Out[13]:
array([2.49299787])
In [14]:
lr_sample.intercept_
Out[14]:
np.float64(5.155591389534953)
Step 8: New observations¶

Sarah is interviewing new potential candidates for her team. In the interview, she asks the following question:

  • How many cups of coffee do you drink in the morning?

She then gathers the following rows of data:

In [15]:
new_obs = pd.DataFrame({'Coffee_Cups': [4,5]})
new_obs
Out[15]:
Coffee_Cups
0 4
1 5
Step 9a: Calculation by Hand¶
In [16]:
# First person:
lr_sample.coef_*new_obs.loc[0, 'Coffee_Cups'] + lr_sample.intercept_
Out[16]:
array([15.12758288])
In [17]:
# Second person:
lr_sample.coef_*new_obs.loc[1, 'Coffee_Cups'] + lr_sample.intercept_
Out[17]:
array([17.62058075])
Step 9b: Calculation using sklearn .predict()¶
In [18]:
y_pred = lr_sample.predict(new_obs['Coffee_Cups'].values.reshape(-1, 1))
y_pred
Out[18]:
array([15.12758288, 17.62058075])

Multivariable Regression¶

As Sarah digs deeper into the data, she begins to wonder, "Is coffee the only thing driving productivity? Could there be other factors at play?" One day, after observing that some team members seemed especially sluggish despite their high coffee intake, Sarah realizes that sleep might also be a factor.

Sarah decides to test her hypothesis. In addition to tracking coffee consumption, she starts asking her team to report their hours of sleep each day. Over the next month, Sarah records this new variable.

After collecting enough data, Sarah now has three columns:

1. Coffee consumption (cups per day).
2. Hours of sleep (hours per night).
3. Productivity (number of tasks achieved in the day).
In [19]:
df = pd.read_csv('data/coffee_productivity.csv')
df.head()
Out[19]:
Coffee_Cups Hours_Sleep Productivity
0 7 7.605595 24.851737
1 4 6.963707 12.687456
2 8 5.644447 21.917630
3 5 7.486539 19.208558
4 7 5.231440 17.402905
In [20]:
X = df.drop(columns=['Productivity'])
y = df['Productivity'].values
In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Additional step: Scale the data so that it is comparable.¶
In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
Important¶

Never do .fit_transform() on test data. Always use .transform() only.

In [23]:
X_test_scaled = scaler.transform(X_test)
In [24]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
Out[24]:
LinearRegression()
In [25]:
lr_coeffs = lr.coef_
lr_coeffs
Out[25]:
array([6.4249594 , 2.09152413])
In [26]:
lr_intercept = lr.intercept_
lr_intercept
Out[26]:
np.float64(17.36193221782039)
In [27]:
words_coeffs_df = pd.DataFrame(data = lr_coeffs.T, index = X_train.columns, columns=['Coefficients'])
words_coeffs_df
Out[27]:
Coefficients
Coffee_Cups 6.424959
Hours_Sleep 2.091524
Why did we scale?¶

Sometimes variables are measured on very different scales and are not directly comparable. Scaling puts them in the same range, which allows us to interpret the coefficients properly.

For interpreting linear models:

  • if the coefficient is +, then as the feature value goes UP, the predicted value goes UP
  • if the coefficient is -, then as the feature value goes UP, the predicted value goes DOWN
  • if the coefficient is 0, the feature is not used in making a prediction (a quick numeric check follows this list).
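This check uses the model fitted above (lr and X_test_scaled from the cells above): increasing one scaled feature by one unit changes the prediction by exactly that feature's coefficient.

x_row = X_test_scaled[0].copy()              # one scaled test observation
bumped = x_row.copy()
bumped[0] += 1                               # raise scaled Coffee_Cups by one unit

base_pred = lr.predict(x_row.reshape(1, -1))[0]
bumped_pred = lr.predict(bumped.reshape(1, -1))[0]
print(bumped_pred - base_pred, lr.coef_[0])  # the two numbers match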
Prediction¶

$$\hat{y} = w_1x_1 + w_2x_2 + b$$

In [28]:
lr.predict(X_train_scaled)[:5]
Out[28]:
array([15.53617259, 20.72490379, 28.2837986 , 10.5896231 , 27.82100943])
In [29]:
lr.predict(X_test_scaled)[:5]
Out[29]:
array([17.23753722, 28.37399184, 22.42152222, 26.65615455, 13.31313734])
In [30]:
lr.intercept_ + (lr.coef_[0] * X_test_scaled[0][0]) + (lr.coef_[1] * X_test_scaled[0][1])
Out[30]:
np.float64(17.237537221177554)
In [31]:
lr.score(X_train_scaled, y_train)
Out[31]:
0.9192148183081769
In [32]:
lr.score(X_test_scaled, y_test)
Out[32]:
0.9321424686508619

Linear Regression with statsmodels¶

In [33]:
import statsmodels.api as sm

model = sm.OLS(y_train, sm.add_constant(X_train_scaled))
results = model.fit()
In [34]:
results.params
Out[34]:
array([17.36193222,  6.4249594 ,  2.09152413])
In [35]:
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     4534.
Date:                Tue, 22 Oct 2024   Prob (F-statistic):               0.00
Time:                        13:45:08   Log-Likelihood:                -1690.1
No. Observations:                 800   AIC:                             3386.
Df Residuals:                     797   BIC:                             3400.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         17.3619      0.071    244.930      0.000      17.223      17.501
x1             6.4250      0.071     90.638      0.000       6.286       6.564
x2             2.0915      0.071     29.506      0.000       1.952       2.231
==============================================================================
Omnibus:                        0.145   Durbin-Watson:                   2.019
Prob(Omnibus):                  0.930   Jarque-Bera (JB):                0.143
Skew:                           0.032   Prob(JB):                        0.931
Kurtosis:                       2.989   Cond. No.                         1.00
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Score interpretation¶

  • R-squared measures the proportion of the variation in the dependent variable (Y) that is explained by the independent variables (X) in a linear regression model.

  • Adjusted R-squared adjusts the statistic based on the number of independent variables in the model.

  • $R^2$ is a measure of fit.

  • It indicates how much variation of a dependent variable is explained by the independent variables.

  • An R-squared of 100% means that $y$ is completely explained by the independent variables.

$R^2 = 1 - \frac{Unexplained\:Variation}{Total\:Variation}$

$R^2 = 1 - \frac{RSS}{TSS}$

$R^2 = $ coefficient of determination
$RSS = $ sum of squares of residuals $ =\displaystyle\sum_{i=1}^{n}(y_{i}-{\hat{y}_{i}})^{2}$
$TSS = $ total sum of squares $ = \displaystyle\sum_{i=1}^{n}(y_{i}-\bar{y})^{2}$
$\bar{y}$ = mean value

Thus,

$ R^2 = 1 - \frac{\displaystyle\sum_{i=1}^{n}(y_{i}-{\hat{y}_{i}})^{2}}{\displaystyle\sum_{i=1}^{n}(y_{i}-\bar{y})^{2}} $
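Putting this together, the formula can be checked directly against sklearn's .score() using the fitted multivariable model from above (lr, X_test_scaled, y_test):

import numpy as np

y_hat = lr.predict(X_test_scaled)
rss = np.sum((y_test - y_hat) ** 2)            # residual sum of squares
tss = np.sum((y_test - y_test.mean()) ** 2)    # total sum of squares
print(1 - rss / tss, lr.score(X_test_scaled, y_test))   # both give the same R^2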

Going beyond linear regression: Polynomial regression¶

  • Linear regression assumes a straight-line relationship between the variables.
  • But what if the true relationship between the target (e.g., productivity) and the features (coffee cups and hours of sleep) is non-linear?
  • Linear models can be limiting when the data follows a curve or more complex pattern.

Sarah's Coffee-Sleep-Productivity Example Revisited¶

  • Sarah noticed that both coffee and sleep influence her team's productivity.
  • However, as she collects more data, she realizes that productivity doesn't increase linearly with coffee or sleep.
  • After a certain number of coffee cups, the productivity boost slows down.
  • Too much or too little sleep seems to hurt productivity in a non-linear way.

What is Polynomial Regression?¶

  • Polynomial regression extends linear regression by creating quadratic, cubic, and higher-order terms.
  • We still use the linear regression framework, but we model more complex relationships by transforming the features.

Example:¶

  • Instead of using just Coffee_Cups, we also use Coffee_Cups^2, Coffee_Cups^3, etc., to capture non-linear effects.

How Polynomial Regression Works:¶

  • In addition to the original feature (Coffee_Cups), we create new features:

    • Coffee_Cups^2
    • Coffee_Cups^3
  • These terms allow us to capture curves and more complex patterns.

  • Our new model might look like: $$ \hat{y} = \beta_0 + \beta_1 x + \beta_2 x^2 + \beta_3 x^3 $$

  • Key Takeaway: We can still use the linear regression framework by transforming our features into polynomial terms!

In [36]:
from sklearn.preprocessing import PolynomialFeatures
In [37]:
df = pd.read_csv('data/non_linear_coffee_productivity.csv')
df.head(2)
Out[37]:
Coffee_Cups Hours_Sleep Productivity
0 3 9.055615 11.421316
1 2 9.144187 6.674547
In [38]:
X = df.drop(columns=['Productivity'])
y = df['Productivity']
In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [40]:
# Create polynomial features (degree 2)
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)

# Fit the polynomial regression model
model = LinearRegression()
model.fit(X_poly_train, y_train)
Out[40]:
LinearRegression()
In [41]:
y_pred = model.predict(X_poly_train)
In [42]:
# We'll plot only the actual vs predicted productivity
plt.scatter(y_train, y_pred, color='blue', label='Predicted vs Actual')
plt.plot([min(y_train), max(y_train)], [min(y_train), max(y_train)], color='red', linestyle='--', label='Perfect Fit Line')
plt.xlabel('Actual Productivity (completed tasks)')
plt.ylabel('Predicted Productivity (completed tasks)')
plt.title('Polynomial Regression: Coffee, Sleep vs. Productivity')
plt.legend()
plt.show()
In [43]:
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'iframe'

scatter = go.Scatter3d(
    x=X_train['Coffee_Cups'], 
    y=X_train['Hours_Sleep'], 
    z=y_train,
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.8),
    name='Data'
)

surface = go.Mesh3d(
    x=X_train['Coffee_Cups'],
    y=X_train['Hours_Sleep'],
    z=y_pred,
    opacity=0.6,
    color='red',
    name='Polynomial Fit'
)

layout = go.Layout(
    title='Polynomial Regression: Coffee, Sleep vs. Productivity',
    scene=dict(
        xaxis_title='Cups of Coffee',
        yaxis_title='Hours of Sleep',
        zaxis_title='Productivity (completed tasks)',
        aspectratio=dict(x=1, y=1, z=1)
    )
)

fig = go.Figure(data=[scatter, surface], layout=layout)
fig.show()

What sklearn does: if you substitute $x^2$ with a new variable, say $m$, the equation becomes

$$\hat{y} = wm + b$$

The relationship between $\hat{y}$ and $m$ is linear, even though the relationship between $\hat{y}$ and $x$ is not. Technically, then, this is still linear regression; the model is simply linear in the transformed feature $m = x^2$ rather than in the original $x$.
Because of this "technically", it is linear regression just the variables between which it happens is $x^2$ (m) and y and not x and y.