Lighthouse Labs | W4D5 - Introduction to Exploratory Data Analysis (EDA) | Instructor: Socorro E. Dominguez-Vidana
Overview:
- [ ] Introduction to EDA
- [ ] Statistics for EDA
- [ ] Data visualization for EDA
- [ ] Matplotlib
- [ ] Seaborn
- [ ] Plotly
- [ ] Outlier Detection
- [ ] Handling Missing Values
Introduction¶
- EDA involves exploration and preliminary analysis of a dataset.
- Involves examining summary statistics and exploring visualizations of the data.
- Can uncover anomalies, patterns, and relationships between variables.
- EDA usually goes hand-in-hand with data cleaning.
- Further examination can be done by using hypothesis tests.
Meet Aisha
Aisha is an urban planner tasked with designing new urban spaces in a major European city, where rapid urbanization has caused several issues, including the formation of Urban Heat Islands (UHIs). UHIs result in elevated temperatures in certain areas of the city, which negatively affect public health and increase energy consumption.
Aisha found Open AQ, an API that contains pollution data. She also found weather data for her city using the Meteostat Python library. Once the data has been downloaded, Aisha will do Exploratory Data Analysis (EDA) to identify the pollution and temperature patterns that contribute to these heat islands and find ways to mitigate them through smarter urban planning.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Loading and Understanding the Data¶
df = pd.read_csv('data/amsterdam_AQ.csv', parse_dates = ['date'])
df.shape
(5439, 8)
# Observe the raw data.
df.head()
sensor_id | parameter | parameter_units | value | date | latitude | longitude | tavg | |
---|---|---|---|---|---|---|---|---|
0 | 164 | co | µg/m³ | 0.22 | 2022-08-30 | 52.389983 | 4.887811 | 18.9 |
1 | 164 | co | µg/m³ | 0.27 | 2022-08-31 | 52.389983 | 4.887811 | 18.8 |
2 | 164 | co | µg/m³ | 0.26 | 2022-09-01 | 52.389983 | 4.887811 | 18.9 |
3 | 164 | co | µg/m³ | 0.39 | 2022-09-02 | 52.389983 | 4.887811 | 19.6 |
4 | 164 | co | µg/m³ | 0.35 | 2022-09-03 | 52.389983 | 4.887811 | 20.5 |
Here is the data dictionary Aisha wrote, so that she does not forget in the future what the data means:
| Field Name | Description | Data Type | Example Value |
|---|---|---|---|
| `sensor_id` | Unique identifier for the air quality sensor | String or Integer | 12345 |
| `parameter` | Air quality parameter being measured (e.g., pollutants) | String | PM2.5 |
| `parameter_units` | Units of measurement for the parameter | String | µg/m³ |
| `value` | Measured value of the parameter | Float or Integer | 15.2 |
| `date` | Date of the measurement | Datetime | 2022-09-01 |
| `latitude` | Latitude of the sensor's location | Float | 52.3676 |
| `longitude` | Longitude of the sensor's location | Float | 4.9041 |
| `tavg` | Average temperature at the time of measurement | Float | 18.5 |
Aisha's Goal: explore how pollution and temperature interact and where heat patterns occur.
What does she need? Measurements of pollutants like CO, NO2, and temperature variations.
Initial Data Exploration¶
- Generate summary statistics.
- Visualize the distribution of temperature and pollutants to identify any initial trends or potential issues, such as missing data or measurement errors.
df.describe(include='all')
sensor_id | parameter | parameter_units | value | date | latitude | longitude | tavg | |
---|---|---|---|---|---|---|---|---|
count | 5.439000e+03 | 5439 | 5439 | 5439.000000 | 5439 | 5439.000000 | 5439.000000 | 5439.000000 |
unique | NaN | 7 | 1 | NaN | NaN | NaN | NaN | NaN |
top | NaN | no | µg/m³ | NaN | NaN | NaN | NaN | NaN |
freq | NaN | 1382 | 5439 | NaN | NaN | NaN | NaN | NaN |
mean | 1.335117e+06 | NaN | NaN | 15.310678 | 2023-02-15 01:08:02.515168 | 52.376177 | 4.885997 | 10.614837 |
min | 1.190000e+02 | NaN | NaN | -5.100000 | 2022-08-30 00:00:00 | 52.358039 | 4.860319 | -3.300000 |
25% | 4.244000e+03 | NaN | NaN | 1.700000 | 2022-12-01 00:00:00 | 52.359714 | 4.866208 | 6.600000 |
50% | 4.390000e+03 | NaN | NaN | 10.000000 | 2023-02-08 00:00:00 | 52.374786 | 4.887811 | 10.300000 |
75% | 4.275709e+06 | NaN | NaN | 21.000000 | 2023-04-30 00:00:00 | 52.389983 | 4.899700 | 15.200000 |
max | 5.079267e+06 | NaN | NaN | 180.000000 | 2023-08-30 00:00:00 | 52.393972 | 4.943822 | 24.700000 |
std | 2.062524e+06 | NaN | NaN | 17.367817 | NaN | 0.014159 | 0.023998 | 5.927832 |
Although we can see some statistics, the `parameter` column does not seem very useful as it is.
For our analysis, we do not really need the `sensor_id` data, as each sensor only captures one pollutant. Let's remove `sensor_id` and pivot the table so that each pollutant becomes its own column.
Let's display a few rows of the resulting `DataFrame` to get an idea of whether this is what we need.
df2 = df.drop(columns = 'sensor_id').pivot_table(index=['date', 'latitude',
'longitude', 'tavg'], columns='parameter', values='value').reset_index()
df2.shape
(2016, 11)
df2.head()
parameter | date | latitude | longitude | tavg | co | no | no2 | o3 | pm10 | pm25 | so2 |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-08-30 | 52.358039 | 4.899700 | 18.9 | NaN | NaN | 12.0 | NaN | 13.0 | 5.8 | NaN |
1 | 2022-08-30 | 52.359714 | 4.866208 | 18.9 | 0.20 | NaN | 14.0 | 79.0 | 12.0 | 7.2 | NaN |
2 | 2022-08-30 | 52.374786 | 4.860319 | 18.9 | NaN | NaN | 16.0 | NaN | 13.0 | NaN | NaN |
3 | 2022-08-30 | 52.389314 | 4.943822 | 18.9 | NaN | NaN | 4.0 | 69.0 | NaN | NaN | NaN |
4 | 2022-08-30 | 52.389983 | 4.887811 | 18.9 | 0.22 | NaN | 14.0 | 62.0 | 14.0 | 6.3 | NaN |
# Observe a specific date.
df2[df2['date'] == '2022-08-30']
parameter | date | latitude | longitude | tavg | co | no | no2 | o3 | pm10 | pm25 | so2 |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-08-30 | 52.358039 | 4.899700 | 18.9 | NaN | NaN | 12.0 | NaN | 13.0 | 5.8 | NaN |
1 | 2022-08-30 | 52.359714 | 4.866208 | 18.9 | 0.20 | NaN | 14.0 | 79.0 | 12.0 | 7.2 | NaN |
2 | 2022-08-30 | 52.374786 | 4.860319 | 18.9 | NaN | NaN | 16.0 | NaN | 13.0 | NaN | NaN |
3 | 2022-08-30 | 52.389314 | 4.943822 | 18.9 | NaN | NaN | 4.0 | 69.0 | NaN | NaN | NaN |
4 | 2022-08-30 | 52.389983 | 4.887811 | 18.9 | 0.22 | NaN | 14.0 | 62.0 | 14.0 | 6.3 | NaN |
5 | 2022-08-30 | 52.393972 | 4.870157 | 18.9 | NaN | NaN | NaN | NaN | 11.0 | 5.4 | 0.95 |
# View summary statistics of the pivoted data
df2.describe()
parameter | date | latitude | longitude | tavg | co | no | no2 | o3 | pm10 | pm25 | so2 |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 2016 | 2016.000000 | 2016.000000 | 2016.000000 | 464.000000 | 1382.000000 | 1286.000000 | 519.000000 | 749.000000 | 770.000000 | 269.000000 |
mean | 2023-03-08 07:18:34.285714176 | 52.376930 | 4.890129 | 11.424405 | 0.282269 | 6.563606 | 22.816991 | 46.377823 | 17.405073 | 9.659675 | 0.690746 |
min | 2022-08-30 00:00:00 | 52.358039 | 4.860319 | -3.300000 | 0.075000 | -5.100000 | 0.550000 | -1.800000 | 3.200000 | -4.200000 | -0.100000 |
25% | 2022-12-17 00:00:00 | 52.359714 | 4.866208 | 7.000000 | 0.210000 | 0.300000 | 14.000000 | 31.500000 | 12.000000 | 4.900000 | 0.300000 |
50% | 2023-03-07 00:00:00 | 52.374786 | 4.887811 | 11.600000 | 0.270000 | 2.550000 | 20.000000 | 48.000000 | 16.000000 | 7.300000 | 0.580000 |
75% | 2023-06-04 00:00:00 | 52.389983 | 4.904400 | 16.300000 | 0.332500 | 7.575000 | 30.000000 | 61.000000 | 21.000000 | 12.000000 | 0.930000 |
max | 2023-08-30 00:00:00 | 52.393972 | 4.943822 | 24.700000 | 0.650000 | 180.000000 | 82.000000 | 110.000000 | 60.000000 | 45.000000 | 3.700000 |
std | NaN | 0.013718 | 0.027191 | 5.991488 | 0.105376 | 12.862397 | 12.823187 | 21.528311 | 8.160573 | 7.327437 | 0.552440 |
Some observations we can make:
- Date: The data spans only one year (2022-08-30 to 2023-08-30).
- Negative minimums: NO, O3, PM2.5, and SO2 all have negative minimum values, which may indicate measurement or reporting errors, since concentrations cannot be negative.
- Nitric Oxide (NO): The standard deviation (≈12.9) is much larger than the mean (≈6.6), and the maximum of 180 suggests extreme readings.
- Nitrogen Dioxide (NO2): Some days show a moderate level of air pollution.
- Ozone (O3): Some days show a high ozone concentration (maximum of 110).
- Particulate Matter: PM10 and PM2.5 reach maxima of 60 and 45 µg/m³, well above their medians (16 and 7.3).
- Sulfur Dioxide (SO2): Averages 0.69 but reaches 3.7 on some days.
Note: pandas' `describe()` excludes `NaN`s from its statistical calculations (e.g., count, mean).
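A quick, self-contained illustration of this behaviour (a toy Series, not part of Aisha's dataset):
demo = pd.Series([1.0, np.nan, 3.0])
print(demo.count())   # 2   -> the NaN is not counted
print(demo.mean())    # 2.0 -> mean of the two non-missing values only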
Let's check for missing values:
missing_values = df2.isnull().sum()
missing_percentage = (missing_values / df2.shape[0]) * 100
missing_data = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
missing_data[missing_data['Missing Values'] > 0]
Missing Values | Percentage | |
---|---|---|
parameter | ||
co | 1552 | 76.984127 |
no | 634 | 31.448413 |
no2 | 730 | 36.210317 |
o3 | 1497 | 74.255952 |
pm10 | 1267 | 62.847222 |
pm25 | 1246 | 61.805556 |
so2 | 1747 | 86.656746 |
Handling Missing Values¶
Why do missing values happen?
- The sensor might not have worked that day.
- In a survey, people may choose not to answer.
- The value may be recorded inconsistently.
Based on the missing values, we need to decide how to handle them. Some possible options are:
- Dropping rows with missing values
- Imputing missing values with the `mean`, `median`, or `mode` (see the sketch below)
- Using regression or another ML method
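A minimal sketch of the first two options, using the pollutant column names from this dataset (which strategy is appropriate depends on your reasoning):
pollutants = ['co', 'no', 'no2', 'o3', 'pm10', 'pm25', 'so2']
# Option 1: drop any row that is missing a pollutant reading
df_dropped = df2.dropna(subset=pollutants)
# Option 2: impute each pollutant with its column median (more robust to outliers than the mean)
df_imputed = df2.copy()
df_imputed[pollutants] = df_imputed[pollutants].fillna(df_imputed[pollutants].median())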
Discussion: Why do we have so many "missing values" in this particular data set?
We can fill all missing values with the mean of the column by simply doing:
df_filled = df2.fillna(df2.mean())
But consider whether that would be appropriate here, and document your reasoning.
Is it feasible for us to drop rows with `NaN`s today?
df2.shape
(2016, 11)
df2.isnull().any(axis=1).sum()
np.int64(2016)
# Fill missing values with the mean of all readings from the same date
# (transform() drops the grouping column, so 'date' is added back afterwards)
df_filled = df2.groupby(['date']).transform(lambda x: x.fillna(x.mean()))
df_filled['date'] = df2['date']
# Fill any remaining gaps with the mean of all readings from the same location
df_filled = df_filled.groupby(['latitude', 'longitude']
).transform(lambda x: x.fillna(x.mean()))
df_filled['latitude'] = df2['latitude']
df_filled['longitude'] = df2['longitude']
df_filled.head(2)
parameter | tavg | co | no | no2 | o3 | pm10 | pm25 | so2 | date | latitude | longitude |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.9 | 0.21 | 9.220395 | 12.0 | 70.0 | 13.0 | 5.8 | 0.95 | 2022-08-30 | 52.358039 | 4.899700 |
1 | 18.9 | 0.20 | 1.580324 | 14.0 | 79.0 | 12.0 | 7.2 | 0.95 | 2022-08-30 | 52.359714 | 4.866208 |
missing_values = df_filled.isnull().sum()
missing_percentage = (missing_values / df_filled.shape[0]) * 100
missing_data = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
missing_data[missing_data['Missing Values'] > 0]
Missing Values | Percentage | |
---|---|---|
parameter |
You can also drop rows that contain too many nulls, if that is appropriate for your use case. For that:
# Keep only rows that have at most 3 missing values
df_filled = df_filled.dropna(thresh=len(df_filled.columns) - 3)
Understanding Data Distribution¶
Understanding the distribution of air quality data is crucial for Aisha. It informs her about the typical levels of pollutants and helps identify potential health risks for residents. For instance, high levels of PM2.5 can lead to respiratory issues, making it essential to monitor.
A box-plot is also often used to examine distributions. Here you can visualize the `min`, `max`, each of the `quartiles`, the `IQR`, and any "outliers".
In the default box-plot settings, outliers are determined to be those which are larger than $Q_3 + 1.5 \times IQR$ or smaller than $Q_1 - 1.5 \times IQR$. This value of 1.5 can be changed by specifying the `whis` parameter.
Example 1 and Example 2 in W4D5_Examples.html
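As a quick sanity check, the same fences can be computed directly with pandas; a minimal sketch using the `pm25` column (the 1.5 multiplier matches the default whiskers):
q1, q3 = df_filled['pm25'].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = df_filled[(df_filled['pm25'] < lower) | (df_filled['pm25'] > upper)]
print(f"{len(outliers)} PM2.5 readings fall outside [{lower:.1f}, {upper:.1f}]")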
df_melted = df_filled.melt(value_vars=['co', 'no', 'no2', 'o3', 'pm10', 'pm25', 'so2'],
var_name='parameter',
value_name='value')
df_melted.head(2)
parameter | value | |
---|---|---|
0 | co | 0.21 |
1 | co | 0.20 |
plt.figure(figsize=(12, 6))
sns.boxplot(x='parameter', y='value', data=df_melted)
plt.title('Box Plot of Pollutants')
plt.xlabel('Pollutants')
plt.ylabel('Concentration (µg/m³)')
plt.show()
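If you want looser or stricter whiskers, pass `whis` explicitly; for example, whiskers at 3 × IQR flag fewer points as outliers (a sketch reusing the melted data):
plt.figure(figsize=(12, 6))
sns.boxplot(x='parameter', y='value', data=df_melted, whis=3.0)
plt.title('Box Plot of Pollutants (whis=3.0)')
plt.xlabel('Pollutants')
plt.ylabel('Concentration (µg/m³)')
plt.show()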
More null values¶
- Sometimes `null` values aren't exactly `NaN`s.
- They may be encoded as `-1`, `0`, `9999`, etc.
Aisha examines the box plots and notices what she had already seen in the summary statistics: there are negative values. She researches and finds out that those are possibly errors from the sensors.
for pollutant in df_melted['parameter'].unique():
print(f"Negative values for {pollutant}: {(df_filled[pollutant] < 0).sum()}")
Negative values for co: 0 Negative values for no: 262 Negative values for no2: 0 Negative values for o3: 1 Negative values for pm10: 0 Negative values for pm25: 3 Negative values for so2: 35
She decides to remove rows with negative values as they make no sense and she cannot find a person who can explain those values.
# Filter numeric columns
numeric_cols = df_filled.select_dtypes(include='number')
# Remove rows where there is a negative value
df_filled = df_filled[(numeric_cols >= 0).all(axis=1)]
df_filled.shape
(1681, 11)
If it were just `0`s and you preferred replacing them:
# cols with inappropriate 0s
cols_missing_vals = ['pm25', 'o3']
# replace 0's with NaNs
df_filled[cols_missing_vals] = df_filled[cols_missing_vals].replace(0, np.nan)
df_filled.isnull().sum()
Visualizing a Single Variable at a Time¶
One useful visualization for EDA is a histogram. A histogram gives us an idea of the distribution of a set of numbers.
Example 3 and Example 4 in W4D5_Examples.html
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
sns.histplot(df_filled['o3'], bins=30, ax=axes[0])
axes[0].set_title('Distribution of O3 Levels')
axes[0].set_xlabel('O3 (µg/m³)')
axes[0].set_ylabel('Frequency')
sns.histplot(df_filled['o3'], bins=30, color='green', ax=axes[1])
axes[1].set_title('Distribution of O3 Levels')
axes[1].set_xlabel('O3 (µg/m³)')
axes[1].set_ylabel('Frequency')
axes[1].set_xlim(left=0)
plt.tight_layout()
plt.show()
fig, axes = plt.subplots(nrows=2,
ncols=2,
figsize=(14, 10))
sns.histplot(df_filled['o3'], bins=30, ax=axes[0, 0])
axes[0, 0].set_title('Distribution of O3 Levels')
axes[0, 0].set_xlabel('O3 (µg/m³)')
axes[0, 0].set_ylabel('Frequency')
sns.histplot(df_filled['no'], bins=30, color='green', ax=axes[0, 1])
axes[0, 1].set_title('Distribution of NO Levels')
axes[0, 1].set_xlabel('NO (µg/m³)')
axes[0, 1].set_ylabel('Frequency')
sns.histplot(df_filled['so2'], bins=30, color='grey', ax=axes[1, 0])
axes[1, 0].set_title('Distribution of SO2 Levels')
axes[1, 0].set_xlabel('SO2 (µg/m³)')
axes[1, 0].set_ylabel('Frequency')
sns.histplot(df_filled['tavg'], bins=30, color='orange', ax=axes[1, 1])
axes[1, 1].set_title('Distribution of Temperature')
axes[1, 1].set_xlabel('Temperature')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_xlim(left=0)
plt.tight_layout()
plt.show()
sns.kdeplot(data=df_filled, x='pm25', fill=True, label='PM25')
plt.title('Distribution of PM25 Levels')
plt.xlabel('PM25 (µg/m³)')
plt.ylabel('Distribution')
plt.show()
Example 5 in W4D5_Examples.html
Skewness¶
From the two plots above, Aisha notices that the distribution of "PM25" is concentrated at lower values with a long tail to the right: it is right-skewed (positively skewed).
Some ML models assume naively that the data is:
- symmetrical or "normally distributed"
In future lectures pay attention to:
- Hypothesis testing (Is the data normally distributed?)
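A quick numeric check is pandas' `.skew()`: a positive value indicates a long right tail, a negative value a long left tail (exact numbers depend on the cleaning steps above):
print(df_filled[['pm25', 'o3', 'tavg']].skew())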
Outlier Detection¶
Outliers are observations that lie far from the rest of the data and may indicate that something unusual happened, either a real event or an error.
As valid data points:
- During a local fire, a very high air pollution reading.
- Weather conditions such as temperature or wind speed can significantly influence pollution levels.
As errors:
- Measurement issues (malfunction of the sensor).
- Data entry issue (typing a wrong number).
To deal with errors, often domain-specific knowledge is required to determine the proper course of action.
Note: Before simply deleting outliers, determine if this is needed. It depends on your use case and if the outliers are important (e.g., fraud detection).
For now:
- Drop the observation (if appropriate)
- Consider fixing the observation (e.g., obvious typo, missing value)
- Explore what caused the outlier
Z-scores¶
Another way to detect outliers is by identifying observations with a z-score greater than 3. A z-score indicates how many standard deviations an observation is from the mean, so values with z-scores above 3 are considered significantly different from the average and could be potential outliers.
$$ Z = \dfrac{x-\mu}{\sigma} $$
Rule of thumb
- label values with a z-score above 3 as outliers
from scipy import stats
# Equivalent idea "by hand" with pandas:
# df_filled['pm25'][np.abs((df_filled['pm25'] - df_filled['pm25'].mean()) / df_filled['pm25'].std()) > 3]
# Compute absolute z-scores for PM2.5 and keep observations more than 3 standard deviations from the mean
pm25_z = np.abs(stats.zscore(df_filled['pm25']))
df_filled['pm25'][pm25_z > 3].head(2)
391 30.0 447 32.0 Name: pm25, dtype: float64
Using data visualization for EDA on multiple variables¶
Up until now, we have only been exploring a single variable at a time (univariate analysis).
Another part of EDA is examining multiple variables at the same time to look for trends, patterns, or relationships between variables (multivariate analysis).
We can create a histogram equivalent for two variables by placing one variable on the `x` axis and the other on the `y` axis, with the counts represented by color density.
fig, ax = plt.subplots(1, 3, figsize=(12, 5))
sns.histplot(data=df_filled, x='o3', y='pm25', ax=ax[0])
ax[0].set_title('Density Plot', fontsize=16)
sns.kdeplot(data=df_filled, x='o3', y='pm25', fill=True, ax=ax[1])
ax[1].set_title('Kernel Density Plot', fontsize=16)
sns.scatterplot(data=df_filled, x='o3', y='pm25',alpha=0.7, ax=ax[2])
ax[2].set_title('Scatter Plot', fontsize=16)
plt.show()
sns.jointplot(
data=df_filled,
x='pm25',
y='o3',
height=5
)
plt.suptitle('Joint Plot Comparing PM25 to O3', y=1)
plt.show()
Key Points:
- There seems to be no clear linear relationship between `PM2.5` and `O3`.
- The lack of a clear trend between `PM2.5` and `O3` could indicate that these two pollutants are not directly correlated in this dataset. External factors such as weather conditions, geographic location, and time of year could play a role in the observed concentrations.
- A cluster of points is visible at lower `PM2.5` values (less than 15), with `O3` levels varying widely from near 0 to over 80.
- The marginal histograms give insight into the overall distribution of both pollutants, showing that `PM2.5` levels tend to be lower overall, while `O3` levels are more widely distributed.
Linear relationships between two variables can be identified by examining the correlation between them. Pandas has a convenient `.corr()` method to view the correlations between all variables at once via its correlation matrix.
Correlation is a number between -1 and 1. It is positive when the variables increase together, and negative when one variable decreases as the other increases. A correlation near zero indicates the variables are not linearly related.
# pandas method
df_filled.corr()
parameter | tavg | co | no | no2 | o3 | pm10 | pm25 | so2 | date | latitude | longitude |
---|---|---|---|---|---|---|---|---|---|---|---|
parameter | |||||||||||
tavg | 1.000000 | 0.058953 | -0.221376 | -0.267582 | 0.300348 | -0.324104 | -0.236687 | 0.249517 | 0.343563 | 0.025871 | 0.001357 |
co | 0.058953 | 1.000000 | 0.408081 | 0.606741 | -0.399448 | 0.510728 | 0.527517 | 0.167175 | 0.152583 | 0.121998 | 0.086315 |
no | -0.221376 | 0.408081 | 1.000000 | 0.642967 | -0.460738 | 0.430354 | 0.318059 | 0.146812 | -0.224529 | -0.020413 | -0.140993 |
no2 | -0.267582 | 0.606741 | 0.642967 | 1.000000 | -0.571758 | 0.573999 | 0.464198 | 0.229643 | -0.224146 | 0.029190 | -0.031145 |
o3 | 0.300348 | -0.399448 | -0.460738 | -0.571758 | 1.000000 | -0.484241 | -0.547283 | -0.085718 | 0.371098 | 0.013634 | 0.003422 |
pm10 | -0.324104 | 0.510728 | 0.430354 | 0.573999 | -0.484241 | 1.000000 | 0.824989 | 0.282404 | -0.175353 | -0.011140 | 0.005043 |
pm25 | -0.236687 | 0.527517 | 0.318059 | 0.464198 | -0.547283 | 0.824989 | 1.000000 | 0.269116 | -0.001839 | -0.002965 | 0.047681 |
so2 | 0.249517 | 0.167175 | 0.146812 | 0.229643 | -0.085718 | 0.282404 | 0.269116 | 1.000000 | 0.026164 | 0.001148 | 0.006324 |
date | 0.343563 | 0.152583 | -0.224529 | -0.224146 | 0.371098 | -0.175353 | -0.001839 | 0.026164 | 1.000000 | 0.034172 | 0.036451 |
latitude | 0.025871 | 0.121998 | -0.020413 | 0.029190 | 0.013634 | -0.011140 | -0.002965 | 0.001148 | 0.034172 | 1.000000 | 0.134697 |
longitude | 0.001357 | 0.086315 | -0.140993 | -0.031145 | 0.003422 | 0.005043 | 0.047681 | 0.006324 | 0.036451 | 0.134697 | 1.000000 |
We can also view this correlation matrix in a visually pleasing way by using a seaborn heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(df_filled.corr(), cmap='Blues', annot=True, annot_kws={'fontsize': 8})
plt.xticks(rotation=45, ha='right')
plt.show()
import numpy as np
mask = np.triu(np.ones_like(df_filled.corr(), dtype=bool), k=1)
plt.figure(figsize=(6, 4))
sns.heatmap(
df_filled.corr(),
mask=mask,
cmap='Blues',
annot=True,
cbar=False,
annot_kws={'fontsize': 8})
plt.xticks(rotation=45, ha='right')
plt.show()
Aisha can view different graphs that perform pairwise comparisons all at once beyond just looking at the correlation.
To view and compare many distributions at the same time, use the `pairplot()` function or the `PairGrid` class.
sns.pairplot(df_filled[['co', 'no', 'no2', 'o3', 'pm10', 'pm25', 'so2', 'tavg']])
plt.show()
g = sns.PairGrid(df_filled[['co', 'no', 'no2', 'o3', 'pm10', 'pm25', 'so2', 'tavg']])
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot, fill=True)
g.map_diag(sns.histplot, kde=True)
plt.show()
Displaying info of more than two variables¶
Most seaborn plotting functions take an optional `hue` parameter, which corresponds to a categorical variable used to group the data with different colors. The colors and legend are created automatically, but can be customized using parameters.
Adding a 3rd variable¶
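Note: the next two plots use a `pollution_level` column and a `month` column that are not in the pivoted data as loaded. Below is one possible way they might be derived; the PM2.5 bin edges and labels are illustrative assumptions, not part of the original dataset:
# Hypothetical derived columns used by the hue/size examples below
df_filled['pollution_level'] = pd.cut(
    df_filled['pm25'],
    bins=[0, 10, 25, np.inf],           # illustrative thresholds only
    labels=['low', 'moderate', 'high']
)
df_filled['month'] = df_filled['date'].dt.month
# If you recompute df_filled.corr() later, drop these helper columns or pass numeric_only=True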
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df_filled, x='pm25', y='o3', hue='pollution_level', s=75)
plt.legend(loc='upper left', fontsize='x-large')
plt.show()
Adding a 4th variable¶
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df_filled, x='pm25', y='o3', hue='pollution_level', size='month')
plt.show()
Note: More dimensions = more difficult to interpret
Analyzing Trends Over Time¶
Aisha would also like to see the trends in air quality over time to identify patterns, such as seasonal variations in pollution levels.
# Plotting trends for PM2.5
plt.figure(figsize=(14, 7))
sns.lineplot(data=df_filled, x='date', y='pm25', errorbar=None)
plt.title('PM2.5 Levels Over Time in Amsterdam')
plt.xlabel('Date')
plt.ylabel('PM2.5 (µg/m³)')
plt.xticks(rotation=45)
plt.show()
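Daily readings are noisy; resampling to monthly means can make seasonal patterns easier to see (a sketch; 'ME' is the month-end alias in recent pandas, use 'M' on older versions):
monthly_pm25 = df_filled.set_index('date')['pm25'].resample('ME').mean()
plt.figure(figsize=(14, 5))
monthly_pm25.plot()
plt.title('Monthly Mean PM2.5 in Amsterdam')
plt.ylabel('PM2.5 (µg/m³)')
plt.show()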
Why 3 different visualization libraries?¶
Visualizing Pollution Distribution with a Heatmap¶
To gain deeper insights into air quality across different locations in Amsterdam, Aisha decides to create a heatmap. This visualization will help her understand which areas are most affected by pollution, allowing her to focus her urban planning efforts effectively.
Using `latitude` and `longitude`, let's create a heatmap of `PM2.5` concentrations.
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Set the renderer to use in Jupyter Notebook
pio.renderers.default = 'iframe' # or 'notebook_connected'
global_min = df_filled['pm25'].min()
global_max = df_filled['pm25'].max()
fig = px.density_mapbox(
df_filled,
lat='latitude',
lon='longitude',
z='pm25',
radius=10,
center=dict(lat=52.3676, lon=4.9041),
mapbox_style="carto-positron",
zoom=10,
animation_frame='date',
title='PM2.5 Concentrations in Amsterdam Over Time',
range_color=[global_min, global_max], # Fix the color scale across all animation frames
)
fig.show()
Data Visualization¶
Visualizing data is a key part of the EDA process.
Tools that we used:
- Matplotlib: usually used when you want to quickly make a simple plot, or when you want very fine-grained control over every aspect of the plot.
- Seaborn: usually used when presenting visualizations at a more professional level or when attempting to visualize more complex relationships easily. It is built on top of matplotlib, so you can use matplotlib to fine-tune your seaborn plots.
- Plotly: usually used when you would like interactivity. Steeper learning curve and uses its own syntax.
Visualization allows us to interpret and summarize large amounts of data in an efficient manner rather than just looking at the raw data or summary statistics.
Plotly Favs¶
hover_cols =['tavg', 'o3', 'so2']
fig = px.scatter(
df_filled,
x='pm25',
y='pm10',
marginal_x='histogram',
marginal_y='histogram',
hover_data=hover_cols,
title='PM25 vs. PM10'
)
fig.show()
corr_matrix = np.abs(df_filled.corr())
fig = px.imshow(corr_matrix, color_continuous_scale='RdBu_r')
fig.update_layout(
title = 'Correlation Heatmap',
xaxis = dict(title='Features'),
yaxis = dict(title='Features')
)
fig.show()