Lighthouse Labs
W8D4 - Unsupervised Learning
Instructor: Socorro E. Dominguez-Vidana
Overview:
- [] Different types of learning in machine learning
- [] Unsupervised learning use cases
- [] Different types of clustering
- [] Evaluation of clusters
Unsupervised Learning:¶
- Learning from data without explicit labels, where the algorithm finds hidden patterns or intrinsic structures.
- Market segmentation, anomaly detection, and exploratory data analysis.
Author unknown. (n.d.). Data Science vs. Machine Learning. Medium.
Reinforcement Learning:¶
- Learning by interacting with an environment to maximize cumulative reward.
- Game AI, robotics, optimization problems.
Rana, P. (2020, March 25). Reinforcement learning overview. Medium.
Semi-supervised Learning:¶
- Learning from a small amount of labeled data combined with a large amount of unlabeled data.
- Text classification, image recognition.
Sharma, G. (2020, June 20). A gentle introduction to semi-supervised learning. Medium.
Unsupervised Learning Use Cases¶
Customer Segmentation:
- Clustering algorithms group customers based on purchasing behavior, helping businesses identify different market segments.
Anomaly Detection:
- Identify unusual data points, often used for fraud detection, network security, and medical diagnosis.
Recommendation Systems:
- Algorithms like K-means clustering can recommend products by grouping similar users or products based on features.
Dimensionality Reduction (PCA):
- Used for reducing the complexity of data, particularly when dealing with high-dimensional datasets, while preserving variance for tasks like visualization or preprocessing.
Clustering¶
- Clustering is the task of dividing a dataset into groups (or clusters) such that the data points in the same group are more similar to each other than to those in other groups.
Clustering Algorithms:
- Partitioning Methods:
  - K-Means: Divides the data into K clusters based on minimizing variance within clusters.
- Hierarchical Methods:
  - Agglomerative Hierarchical Clustering: A bottom-up approach where each observation starts in its own cluster, and clusters are merged iteratively.
  - Divisive Hierarchical Clustering: A top-down approach where all data points start in one cluster and are split iteratively.
- Density-Based Methods:
  - DBSCAN (Density-Based Spatial Clustering of Applications with Noise): Clusters based on density, identifying areas of high data point concentration. Works well with irregularly shaped clusters and is robust to outliers.
“Gremlin Risk Management Task Force” in Kingston Falls¶

Imagine that you are a new Data Scientist at Kingston Falls's Gremlin Risk Management Task Force and you have to develop a system that categorizes Gremlins based on their behaviors, physical traits, and environmental exposure.
The goal is to manage the risks posed by different types of Gremlins, ranging from harmless Mogwai to the most dangerous and aggressive Gremlins. The town lacks labeled data about what makes a Gremlin dangerous or manageable.
The office gives you the Gremlins dataset with the following columns:
Feature | Description |
---|---|
Size (cm) | The height of the Gremlin or Mogwai in centimeters. |
Weight (kg) | The weight of the Gremlin or Mogwai in kilograms. |
Color_Intensity | A scale (0-100) representing the intensity of the creature's color (darker = more intense). |
Aggressiveness | A scale (1-10) measuring how aggressive the Gremlin is (1 = docile, 10 = highly aggressive). |
Intelligence | A scale (1-10) measuring the creature's intelligence. |
Number of Spikes | The number of spikes or physical protrusions on the Gremlin (0 for Mogwai). |
Age (years) | The age of the Gremlin or Mogwai in years. |
Moisture Exposure (hrs) | The number of hours the creature has been exposed to moisture (triggers Gremlin transformation). |
Fed_After_Midnight | Binary feature (0 = not fed after midnight, 1 = fed after midnight, which transforms Mogwai to Gremlins). |
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
gremlins_df = pd.read_csv('data/gremlins.csv')
gremlins_df.head()
Size (cm) | Weight (kg) | Color_Intensity | Aggressiveness | Intelligence | Number of Spikes | Age (years) | Moisture Exposure (hrs) | Fed_After_Midnight | |
---|---|---|---|---|---|---|---|---|---|
0 | 26.236204 | 14.082659 | 43.100903 | 6 | 8 | 2 | 2.204537 | 1.813789 | 1 |
1 | 43.521429 | 7.395619 | 71.881278 | 7 | 7 | 12 | 7.197828 | 4.305340 | 0 |
2 | 36.959818 | 6.448949 | 72.438107 | 10 | 10 | 11 | 5.041019 | 1.097553 | 0 |
3 | 32.959755 | 9.894528 | 78.245794 | 10 | 3 | 4 | 7.595426 | 4.872736 | 1 |
4 | 19.680559 | 14.856505 | 83.565480 | 3 | 1 | 5 | 1.208577 | 3.898794 | 0 |
Hierarchical Clustering¶
- Builds a hierarchy of clusters, either agglomeratively (bottom-up) or divisively (top-down). We will focus on agglomerative clustering here.
from scipy.cluster.hierarchy import dendrogram, linkage
linked = linkage(gremlins_df, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
Interpretation: The y-axis in the dendrogram represents the dissimilarity or distance between clusters, with higher points indicating the merging of more dissimilar clusters.
- A large gap at the top (around the ~250 mark) suggests a strong division between the two main branches (orange and green), indicating two distinct groups.
- Leaves at the bottom represent individual Gremlins, and as you move up, they merge into larger clusters based on similarity.
- To choose the number of clusters, you can "cut" the dendrogram at a particular height. For example, cutting at around 100 on the y-axis yields three main clusters (one large orange group, one smaller green group, and an intermediate group - probably Mogwai-like, Mischievous, and Highly Aggressive Gremlins); see the fcluster sketch below.
- A lower cut (around 50) would result in smaller, more detailed clusters.
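As a minimal sketch of "cutting" the dendrogram in code (reusing the linked matrix from the cell above), scipy's fcluster returns the flat cluster labels at a chosen height:
from scipy.cluster.hierarchy import fcluster

# Cut the dendrogram at height 100: merges above this distance are ignored,
# so each remaining branch becomes one flat cluster
labels_at_100 = fcluster(linked, t=100, criterion='distance')
print(np.unique(labels_at_100, return_counts=True))

# A lower cut (t=50) produces more, smaller clusters
labels_at_50 = fcluster(linked, t=50, criterion='distance')
print(np.unique(labels_at_50, return_counts=True))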
KMeans Clustering¶
- Algorithm based on Expectation Maximization.
- K is a hyperparameter that stands for the number of clusters.
- Random initial centroids.
- Data points assigned to clusters based on Euclidean (or another type of) distance.
- Cluster centroids updated based on the assigned data points.
- Steps repeat until convergence (a minimal from-scratch sketch follows below).
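To make those steps concrete, here is a from-scratch sketch of one K-Means run. It is an illustration only, not sklearn's implementation: it does not handle empty clusters or multiple restarts.
import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=42):
    rng = np.random.default_rng(seed)
    # Random initial centroids: pick k observations at random
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Assignment step: each point goes to its nearest centroid (Euclidean distance)
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):  # convergence: centroids stopped moving
            break
        centroids = new_centroids
    return labels, centroids

labels, centroids = kmeans_sketch(gremlins_df.to_numpy(dtype=float), k=3)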
Characteristics:¶
- Simple to understand and interpret
- Convergence (to some solution) is guaranteed
- Can work with very large datasets
- Number of clusters needs to be determined
- May converge on local optimum
- Sensitive to outliers
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=5, n_init=10)
kmeans_model.fit(gremlins_df)
KMeans(n_clusters=5, n_init=10)
gremlins_df['Kmeans_Cluster'] = kmeans_model.fit_predict(gremlins_df)
gremlins_df.head()
Size (cm) | Weight (kg) | Color_Intensity | Aggressiveness | Intelligence | Number of Spikes | Age (years) | Moisture Exposure (hrs) | Fed_After_Midnight | Kmeans_Cluster | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 26.236204 | 14.082659 | 43.100903 | 6 | 8 | 2 | 2.204537 | 1.813789 | 1 | 2 |
1 | 43.521429 | 7.395619 | 71.881278 | 7 | 7 | 12 | 7.197828 | 4.305340 | 0 | 0 |
2 | 36.959818 | 6.448949 | 72.438107 | 10 | 10 | 11 | 5.041019 | 1.097553 | 0 | 0 |
3 | 32.959755 | 9.894528 | 78.245794 | 10 | 3 | 4 | 7.595426 | 4.872736 | 1 | 3 |
4 | 19.680559 | 14.856505 | 83.565480 | 3 | 1 | 5 | 1.208577 | 3.898794 | 0 | 1 |
How can I visualize the 5 clusters?¶
Plot clusters using 2 features¶
plt.scatter(gremlins_df['Size (cm)'], gremlins_df['Weight (kg)'], c=gremlins_df['Kmeans_Cluster'], cmap='viridis')
plt.xlabel('Size (cm)')
plt.ylabel('Weight (kg)')
plt.title('K-Means Clustering of Gremlins')
plt.show()
Interpretation:
This plot displays the K-Means clustering results for the Gremlins dataset, with the points colored according to their assigned cluster. Here, only two features, Size (cm) on the x-axis and Weight (kg) on the y-axis, are used for visualization, but the clustering model itself considered all features in the dataset to determine the clusters.
Clusters Representation:
- Each color represents a distinct cluster identified by K-Means. Points with similar characteristics (across all features, not just size and weight) are grouped into the same cluster.
- Although the model was fit with five clusters, only about four groupings are clearly distinguishable in this two-feature view; some clusters overlap when projected onto size and weight alone.
Cluster Separation:
- The purple cluster (right side of the plot) includes Gremlins that are relatively larger and heavier. These could represent older or more transformed Gremlins.
- The yellow cluster (left side of the plot) seems to include Gremlins that are generally smaller and lighter, possibly resembling more Mogwai-like characteristics.
- The green and blue clusters occupy the central areas and show more moderate values for size and weight, indicating a mix of different types.
How do I know how many clusters (k) I should use?¶
scores = {}
for k in range(1, 10):
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(gremlins_df)
    scores[k] = model.inertia_
sns.lineplot(x=list(scores.keys()), y=list(scores.values()))
plt.xlabel('k')
plt.ylabel('Inertia (SSE)')
plt.title('K-Means Elbow Plot')
plt.show()
Based on the elbow method plot, the elbow or inflection point appears around 3 clusters, which suggests that three is likely an ideal number of clusters.
Let's repeat with only 3 clusters:
kmeans_model = KMeans(n_clusters=3, n_init=10)
kmeans_model.fit(gremlins_df)
gremlins_df['Kmeans_Cluster'] = kmeans_model.fit_predict(gremlins_df)
gremlins_df.head()
Size (cm) | Weight (kg) | Color_Intensity | Aggressiveness | Intelligence | Number of Spikes | Age (years) | Moisture Exposure (hrs) | Fed_After_Midnight | Kmeans_Cluster | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 26.236204 | 14.082659 | 43.100903 | 6 | 8 | 2 | 2.204537 | 1.813789 | 1 | 0 |
1 | 43.521429 | 7.395619 | 71.881278 | 7 | 7 | 12 | 7.197828 | 4.305340 | 0 | 2 |
2 | 36.959818 | 6.448949 | 72.438107 | 10 | 10 | 11 | 5.041019 | 1.097553 | 0 | 2 |
3 | 32.959755 | 9.894528 | 78.245794 | 10 | 3 | 4 | 7.595426 | 4.872736 | 1 | 2 |
4 | 19.680559 | 14.856505 | 83.565480 | 3 | 1 | 5 | 1.208577 | 3.898794 | 0 | 1 |
plt.scatter(gremlins_df['Size (cm)'], gremlins_df['Weight (kg)'], c=gremlins_df['Kmeans_Cluster'], cmap='viridis')
plt.xlabel('Size (cm)')
plt.ylabel('Weight (kg)')
plt.title('K-Means Clustering of Gremlins')
plt.show()
Discussion: Why does it look less clear?
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)¶
DBSCAN is a density-based clustering algorithm that groups together points that are close to each other and marks outliers that are in low-density regions.
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=1, min_samples=3)
gremlins_df['DBSCAN_Cluster'] = dbscan.fit_predict(gremlins_df)
gremlins_df.head()
Size (cm) | Weight (kg) | Color_Intensity | Aggressiveness | Intelligence | Number of Spikes | Age (years) | Moisture Exposure (hrs) | Fed_After_Midnight | Kmeans_Cluster | DBSCAN_Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 26.236204 | 14.082659 | 43.100903 | 6 | 8 | 2 | 2.204537 | 1.813789 | 1 | 0 | -1 |
1 | 43.521429 | 7.395619 | 71.881278 | 7 | 7 | 12 | 7.197828 | 4.305340 | 0 | 2 | -1 |
2 | 36.959818 | 6.448949 | 72.438107 | 10 | 10 | 11 | 5.041019 | 1.097553 | 0 | 2 | -1 |
3 | 32.959755 | 9.894528 | 78.245794 | 10 | 3 | 4 | 7.595426 | 4.872736 | 1 | 2 | -1 |
4 | 19.680559 | 14.856505 | 83.565480 | 3 | 1 | 5 | 1.208577 | 3.898794 | 0 | 1 | -1 |
plt.scatter(gremlins_df['Size (cm)'], gremlins_df['Weight (kg)'], c=gremlins_df['DBSCAN_Cluster'], cmap='plasma')
plt.xlabel('Size (cm)')
plt.ylabel('Weight (kg)')
plt.title('DBSCAN Clustering of Gremlins')
plt.show()
Interpretation and Analysis of DBSCAN Result
In this plot, DBSCAN attempted (and failed) to identify clusters in the Gremlins dataset. All points were labeled -1, indicating that they were all treated as outliers (noise).
Why Did DBSCAN Fail?¶
Unscaled Features:
- DBSCAN, as well as K-Means, relies on distance measurements to define clusters. When features (e.g., size and weight) have different scales, the larger values can dominate the distance calculations, skewing the results.
Impact on Distance-Based Approaches:
- Distance-based algorithms require scaled data to compute meaningful distances between points.
Scaling¶
StandardScaler is a technique used to standardize the features in a dataset by removing the mean and scaling to unit variance. It transforms each feature to have a mean of 0 and a standard deviation of 1, putting all features on the same scale.
In the Gremlins dataset, features like Size (cm) and Weight (kg) have different ranges. For example:
- Size might range from 15 to 45 cm.
- Weight could range from 5 to 15 kg.
Size has a larger numerical range than Weight. Without scaling, distance-based algorithms will give more importance to Size because of its larger range, treating Gremlins with similar sizes as close together regardless of their weight differences.
With StandardScaler, we make each feature equally important by standardizing its values. This allows distance-based algorithms to consider both Size and Weight fairly. The short sketch below illustrates the effect on a pairwise distance.
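As a quick illustration (a sketch for intuition, not part of the original analysis), compare the Euclidean distance between the first two Gremlins on a few columns before and after scaling; the raw distance is dominated by the feature with the largest range:
from sklearn.preprocessing import StandardScaler

cols = ['Size (cm)', 'Weight (kg)', 'Moisture Exposure (hrs)']
raw = gremlins_df[cols].to_numpy()
print('Raw distance between Gremlin 0 and 1:   ', np.linalg.norm(raw[0] - raw[1]))

# After standardizing, each feature contributes on a comparable scale
scaled = StandardScaler().fit_transform(gremlins_df[cols])
print('Scaled distance between Gremlin 0 and 1:', np.linalg.norm(scaled[0] - scaled[1]))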
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(gremlins_df.drop(['Kmeans_Cluster', 'DBSCAN_Cluster'], axis=1))
dbscan = DBSCAN(eps=5, min_samples=2)
gremlins_df['DBSCAN_Cluster_Scaled'] = dbscan.fit_predict(scaled_data)
gremlins_df.head()
Size (cm) | Weight (kg) | Color_Intensity | Aggressiveness | Intelligence | Number of Spikes | Age (years) | Moisture Exposure (hrs) | Fed_After_Midnight | Kmeans_Cluster | DBSCAN_Cluster | DBSCAN_Cluster_Scaled | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 26.236204 | 14.082659 | 43.100903 | 6 | 8 | 2 | 2.204537 | 1.813789 | 1 | 0 | -1 | 0 |
1 | 43.521429 | 7.395619 | 71.881278 | 7 | 7 | 12 | 7.197828 | 4.305340 | 0 | 2 | -1 | 0 |
2 | 36.959818 | 6.448949 | 72.438107 | 10 | 10 | 11 | 5.041019 | 1.097553 | 0 | 2 | -1 | 0 |
3 | 32.959755 | 9.894528 | 78.245794 | 10 | 3 | 4 | 7.595426 | 4.872736 | 1 | 2 | -1 | 0 |
4 | 19.680559 | 14.856505 | 83.565480 | 3 | 1 | 5 | 1.208577 | 3.898794 | 0 | 1 | -1 | 0 |
plt.scatter(gremlins_df['Size (cm)'], gremlins_df['Weight (kg)'], c=gremlins_df['DBSCAN_Cluster_Scaled'], cmap='plasma')
plt.xlabel('Size (cm)')
plt.ylabel('Weight (kg)')
plt.title('DBSCAN Clustering (Scaled) of Gremlins')
plt.show()
DBSCAN works best with data that has distinct, dense regions separated by sparse areas. If the Gremlins dataset has more uniform or scattered points without clear dense groupings, DBSCAN will label most points as noise.
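One way to diagnose this, and to choose eps less arbitrarily, is a k-distance plot. The sketch below (an illustration, reusing the scaled_data array from the cell above) sorts each point's distance to its k-th neighbour; a pronounced "knee" in the curve is a common heuristic for eps:
from sklearn.neighbors import NearestNeighbors

k = 3  # match min_samples
nn = NearestNeighbors(n_neighbors=k).fit(scaled_data)
distances, _ = nn.kneighbors(scaled_data)  # each point's first "neighbour" is itself (distance 0)

plt.plot(np.sort(distances[:, -1]))  # sorted distance to each point's k-th neighbour
plt.xlabel('Points sorted by k-distance')
plt.ylabel('k-distance')
plt.title('k-distance plot for choosing eps')
plt.show()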
PCA (Principal Component Analysis)¶
PCA is a dimensionality reduction technique that projects data into fewer dimensions while retaining most of the variance.
Uses:¶
- Unsupervised ML model: PCA helps explore patterns in unlabeled data, capturing the directions of maximum variance.
- Preprocessing technique: PCA reduces dimensionality and improves model efficiency, making it an essential step for high-dimensional datasets in both unsupervised and supervised learning contexts.
How Does PCA Work?¶
gremlins_df.shape
(150, 12)
The gremlins_df has 150 rows and 12 columns. I cannot visualize the 12 dimensions (features). I want a new matrix (df) with only 2 columns that represents the information in all the 12 columns.
$$ \textbf{X} = \underbrace{\left[ \begin{array}{cccccc} \rule[-1ex]{0.5pt}{2.5ex} & \rule[-1ex]{0.5pt}{2.5ex} & & & & \rule[-1ex]{0.5pt}{2.5ex} \\ \textbf{X}_{1} & \textbf{X}_{2} & \ldots & \ldots & \ldots & \textbf{X}_{d} \\ \rule[-1ex]{0.5pt}{2.5ex} & \rule[-1ex]{0.5pt}{2.5ex} & & & & \rule[-1ex]{0.5pt}{2.5ex} \end{array} \right]}_{\text{d columns (wider)}}\\ \textbf{Z} = \underbrace{\left[ \begin{array}{ccc} \rule[-1ex]{0.5pt}{2.5ex} & & \rule[-1ex]{0.5pt}{2.5ex} \\ \textbf{Z}_{1} & \ldots & \textbf{Z}_{k} \\ \rule[-1ex]{0.5pt}{2.5ex} & & \rule[-1ex]{0.5pt}{2.5ex} \end{array} \right]}_{\text{k columns (narrower)}} $$
Imagine we only had 2 features: height and weight.
And we plot them this way:
Our task is to project this data into a smaller dimension: a line.
First, for all observations, we calculate the average measure for Height and then the average measure for Weight.
Now, let's shift the data in such a manner that the center of the data becomes the origin.
(Data points are still related among themselves in the same way.)
Now that the data is centered, let's fit a random line that captures most of the information in our data points. This line MUST pass through the origin.
- PCA projects the data onto the line.
- PCA finds the line that maximizes the distances from the projected points to the origin.
- This is the same as minimizing the distance between the line and the data observations.
PCA will measure the distance from the origin to each projected observation.
If we only had 5 observations, there would only be 5 distances:
$d_1, d_2, d_3, d_4, d_5$
PCA then squares them and adds them up:
${d_1}^2 + {d_2}^2 + {d_3}^2 + {d_4}^2 + {d_5}^2 = SS(distances)$
We do this until we get the largest $SS(distances)$
This new line is called Principal Component 1 (PC1)
SS(distances) for PC1 is called the eigenvalue for PC1
Let's say that our PC1 has a slope of 0.5: for every 2 unit increase in height, we increase 1 unit in weight.
Then for PC1, height is more important than weight: the data is more spread out along the height axis.
PC1 ends up being a linear combination of the two features:
$PC1 = 2 \cdot Height + 1 \cdot Weight$
When we scale this vector to have a length of one (normalize it), we end up with the eigenvector for PC1.
With 2 features, it is easy to find PC2: it has to be the line that also passes through the origin and is orthogonal to PC1.
If we had 3 or more features, to find PC2 we would have to repeat the process and find the best-fitting line that also:
i) passes through the origin
ii) is perpendicular/orthogonal to PC1
Then PC3 would just have to:
a) pass through the origin
b) be perpendicular to PC1 and PC2
For our final plot, we rotate everything so that PC1 and PC2 are horizontal
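The whole procedure above (center, find the direction that maximizes SS(distances), project) can be reproduced with a few lines of NumPy. This is a sketch for intuition, using the Size (cm) and Weight (kg) columns as stand-ins for the height/weight example; under the hood it uses the SVD of the centered data, which yields the same components PCA would.
# Two-feature illustration: center the data so the mean is at the origin
X = gremlins_df[['Size (cm)', 'Weight (kg)']].to_numpy()
X_centered = X - X.mean(axis=0)

# The right singular vectors (rows of Vt) are the principal directions (unit-length eigenvectors);
# the squared singular values are the SS(distances) for each PC
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
pc1 = Vt[0]
print('PC1 loadings (linear combination of Size and Weight):', pc1)
print('Variation explained by each PC:', S**2 / (len(X) - 1))

# Project onto the new axes: this is the "rotated" plot with PC1 and PC2 as the axes
Z = X_centered @ Vt.T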
Measuring Variation¶
We will aid ourselves with a Scree Plot to measure variation.
$Variation(PC1) = \frac{SS(distances_{PC1})}{n-1}$
$Variation(PC2) = \frac{SS(distances_{PC2})}{n-1}$
...
$Variation(PCn) = \frac{SS(distances_{PCn})}{n-1}$
From a Scree Plot you might determine that you only need the first 2 or 3 PCs rather than the complete set of PCs for a better model.
Remember, the maximum number of PCs you can have is the smaller of:
a) the number of features
b) the number of observations
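In code, the scree information comes from explained_variance_ratio_. As a sketch (the 90% threshold below is only a common rule of thumb, not a fixed rule), the cumulative ratio can suggest how many PCs to keep:
from sklearn.decomposition import PCA

features_only = gremlins_df.drop(columns=['Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'])
pca_full = PCA().fit(features_only)

cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_pcs = int(np.argmax(cum_var >= 0.90)) + 1  # smallest number of PCs covering 90% of the variance
print(cum_var.round(3))
print('PCs needed for 90% of the variance:', n_pcs)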
Let's apply PCA to the Gremlins dataset for Visualization Purposes¶
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_components = pca.fit_transform(gremlins_df.drop(['Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'], axis=1))
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=gremlins_df['Kmeans_Cluster'], cmap='viridis')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA of Gremlins Dataset with K-Means Clusters')
plt.show()
from sklearn.metrics import silhouette_score
silhouette_kmeans = silhouette_score(gremlins_df.drop(['Kmeans_Cluster',
'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'], axis=1), gremlins_df['Kmeans_Cluster'])
print(f'Silhouette Score for K-Means: {silhouette_kmeans}')
Silhouette Score for K-Means: 0.2964678965605285
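To complement the elbow plot, here is a sketch comparing silhouette scores across several values of k (scores range from -1 to 1, higher is better). This is illustrative only; the chosen k should also make domain sense.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

features_only = gremlins_df.drop(['Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'], axis=1)
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(features_only)
    print(f'k={k}: silhouette = {silhouette_score(features_only, labels):.3f}')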
PCA on the Gremlins dataset as a Preprocessing Tool¶
pca_components[:3]
array([[-26.39973483, -5.38378156], [ 0.9824435 , 14.5977015 ], [ 2.0911697 , 8.05267201]])
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(pca_components)
from matplotlib.colors import ListedColormap
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
cmap = ListedColormap(["purple", "teal", "yellow"])
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=kmeans_labels, cmap=cmap)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('KMeans Clustering on PCA-Reduced Data')
stripe_img = mpimg.imread('img/Stripe.webp')
gizmo_img = mpimg.imread('img/gizmo.png')
brain_img = mpimg.imread('img/brain.jpeg')
centroids = np.array([pca_components[kmeans_labels == i].mean(axis=0) for i in range(3)])
def add_image(ax, img, x, y, zoom=0.2):
imagebox = OffsetImage(img, zoom=zoom)
ab = AnnotationBbox(imagebox, (x, y), frameon=False)
ax.add_artist(ab)
ax = plt.gca()
add_image(ax, gizmo_img, centroids[0, 0], centroids[0, 1], zoom=0.1)
add_image(ax, stripe_img, centroids[1, 0], centroids[1, 1], zoom=0.1)
add_image(ax, brain_img, centroids[2, 0], centroids[2, 1], zoom=.3)
plt.show()
What does PCA Component 1 mean?¶
feature_importance_pc1 = pca.components_[0] # Loadings for the first principal component
feature_importance_pc1
array([-0.08743386, -0.00634363, 0.99577796, -0.00985658, 0.01670778, 0.01697297, 0.00713279, 0.00425874, -0.0028172 ])
features = gremlins_df.drop(columns=['Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled']).columns
pc1_importance_df = pd.DataFrame({'Feature': features, 'PC1 Loading': feature_importance_pc1})
pc1_importance_df['Absolute Loading'] = pc1_importance_df['PC1 Loading'].abs()
pc1_importance_df = pc1_importance_df.sort_values('Absolute Loading', ascending=False)
pc1_importance_df
Feature | PC1 Loading | Absolute Loading | |
---|---|---|---|
2 | Color_Intensity | 0.995778 | 0.995778 |
0 | Size (cm) | -0.087434 | 0.087434 |
5 | Number of Spikes | 0.016973 | 0.016973 |
4 | Intelligence | 0.016708 | 0.016708 |
3 | Aggressiveness | -0.009857 | 0.009857 |
6 | Age (years) | 0.007133 | 0.007133 |
1 | Weight (kg) | -0.006344 | 0.006344 |
7 | Moisture Exposure (hrs) | 0.004259 | 0.004259 |
8 | Fed_After_Midnight | -0.002817 | 0.002817 |
pca = PCA()
pca = pca.fit(gremlins_df.drop(['Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'], axis=1))
explained_variance = pca.explained_variance_ratio_
plt.figure(figsize=(8, 5))
plt.bar(range(1, len(explained_variance) + 1), explained_variance, color='skyblue', edgecolor='black')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot (Bar Chart)')
plt.xticks(range(1, len(explained_variance) + 1))
plt.show()
Other Feature Reduction Techniques¶
- Reducing the number of features in a dataset
- E.g., 1000 rows by 20 columns (features) to 1000 rows by 10 columns
- Helps our machine learning algorithms perform better
- Improves run-time of our algorithms
- Storing and using less data (memory)
- For visualization
Why do dimensionality reduction?¶
- For visualization:
- The human visual system only works in up to 3 dimensions.
- Our screens are really only 2D.
- To improve the performance of our baseline model
It may or may not work. You will not know until you try.
Feature Selection¶
The easiest way to reduce features is to keep the most important features and "eliminate" the others.
The resulting feature set will still be interpretable.
Feature Selection Techniques: Filter and Wrapper Methods¶
Filter methods¶
- Measure the relevance of a feature by its correlation with the dependent variable (target).
- If a feature is correlated with the target, keep it; otherwise, discard it (see the sketch after this list).
- Applied before training the ML model.
- Advantages:
- Fast, no training involved
- Disadvantages:
- Ignores feature combinations
- Keeps redundant features
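A minimal filter-method sketch. The Gremlins data has no real label, so purely for illustration we assume Aggressiveness plays the role of the target and rank the remaining features by absolute correlation with it:
X = gremlins_df.drop(columns=['Aggressiveness', 'Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'])
y = gremlins_df['Aggressiveness']

# Rank features by |correlation| with the target; keep the top-ranked, discard the rest
correlations = X.corrwith(y).abs().sort_values(ascending=False)
print(correlations)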
Wrapper methods¶
- Train the ML model with different subsets of features.
- If a feature improves performance, add/keep it. Otherwise, ignore/remove it.
- Applied while training the ML model.
- Advantages:
- Evaluates features in context of others
- Performance-driven
- Disadvantages:
- Slow, retrain model several times
Forward selection wrapper method¶
- SelectedFeatures = [ ]
- Find F in (AllFeatures - SelectedFeatures) that, if added to SelectedFeatures, best improves model performance
- If adding F improved performance more than some threshold, permanently add it to SelectedFeatures and go back to (2)
Backward elimination wrapper method¶
1. SelectedFeatures = AllFeatures
2. Find the feature F in SelectedFeatures that, if removed from SelectedFeatures, decreases model performance the least.
3. If removing F decreased performance less than some threshold, permanently remove it from SelectedFeatures and go back to (2).
A sketch of both directions using sklearn follows below.
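A hedged sketch of both wrapper directions with sklearn's SequentialFeatureSelector. As before, Aggressiveness is only an assumed stand-in target, and LinearRegression is an arbitrary choice of base model.
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

X = gremlins_df.drop(columns=['Aggressiveness', 'Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'])
y = gremlins_df['Aggressiveness']

# direction='forward' starts empty and adds features; 'backward' starts full and removes them
sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=4, direction='forward', cv=5)
sfs.fit(X, y)
print('Selected features:', list(X.columns[sfs.get_support()]))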
Recursive Feature Elimination¶
1. Decide $k$, the number of features to select.
2. Use a model (usually a linear model) to assign weights to the features. The weights of important features have a higher absolute value.
3. Rank the features based on the absolute value of the weights.
4. Drop the least useful feature.
5. Repeat steps 2-4 until the desired number of features is reached (see the RFE sketch below).
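A minimal sketch with sklearn.feature_selection.RFE, again assuming Aggressiveness as a stand-in target and a linear model to supply the feature weights:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

X = gremlins_df.drop(columns=['Aggressiveness', 'Kmeans_Cluster', 'DBSCAN_Cluster', 'DBSCAN_Cluster_Scaled'])
y = gremlins_df['Aggressiveness']

rfe = RFE(estimator=LinearRegression(), n_features_to_select=4)  # k = 4
rfe.fit(X, y)
print('Kept features:   ', list(X.columns[rfe.support_]))
print('Feature rankings:', dict(zip(X.columns, rfe.ranking_)))  # rank 1 = kept; larger = eliminated earlier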
Variable Selection - Wrapper Methods Tips¶
- Look for implementations: sklearn has an RFE implementation, for example.
- It's not possible to tell which method will work better until you try.
- Different variable selection algorithms may give you different answers.
- Different machine learning algorithms with the same variable selection method may also give you different answers.
- Over this process, you'll (hopefully) find out which features tend to get eliminated and which tend to be kept.