# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
PCA Baseline: Python
This Python notebook demonstrates a conventional PCA workflow using `scikit-learn`. Through a series of standard steps — data loading, standardization, dimensionality reduction, and visualization — it establishes the expected behavior and results of a PCA analysis. The notebook employs widely used libraries: `pandas` for data manipulation, `scikit-learn` for PCA computation, and `matplotlib` for visualization. This is the typical approach data scientists take when performing PCA, making it an ideal reference point for validating our implementation.
# Load and prepare the data
iris_data = pd.read_csv("../../data/iris.csv")
X = iris_data.iloc[:, :4]  # Select all rows and first 4 columns
y = iris_data['variety']   # Label column, used later to colour the plots
# Standardize the features
scaler = StandardScaler()
# fit_transform learns the scaling from X and returns the scaled values
X_scaled = scaler.fit_transform(X)
# Perform PCA
pca = PCA()  # no n_components given, so the estimator's default is used
# Fit on the standardized features and project them onto the components
X_pca = pca.fit_transform(X_scaled)
# Print PCA summary: per-component variance share, then its running total
print("Explained variance ratio:", pca.explained_variance_ratio_)
print(
    "Cumulative variance ratio:",
    pca.explained_variance_ratio_.cumsum(),
)
Explained variance ratio: [0.72962445 0.22850762 0.03668922 0.00517871]
Cumulative variance ratio: [0.72962445 0.95813207 0.99482129 1. ]
# Scatter plot of the first two principal components, coloured by class
plt.figure(figsize=(10, 6))
species = pd.Categorical(y)
categories = species.codes  # integer code per row, used as the colour value
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=categories, cmap='viridis')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance explained)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance explained)')
plt.title('PCA of Iris Dataset')
# Legend handles follow the integer codes, so label them with the
# Categorical's own categories (same ordering as the codes) rather than
# y.unique(), which is in order of first appearance.
plt.legend(scatter.legend_elements()[0], list(species.categories))
plt.show()
# Create biplot
def biplot(score, coef, labels=None):
    """Draw a PCA biplot: a score scatter with one arrow per variable.

    The first two columns of ``score`` are plotted as points, coloured by
    the categorical codes of the module-level ``y``. For every row ``i`` of
    ``coef`` an arrow is drawn from the origin to
    ``(coef[i, 0] * 5, coef[i, 1] * 5)`` and annotated with a label.

    The figure is created and decorated here but not shown; the caller is
    expected to call ``plt.show()``.

    Args:
        score: 2-D array-like indexable as ``score[:, 0]`` / ``score[:, 1]``
            (the projected observations).
        coef: 2-D array with a ``shape`` attribute, one row per variable;
            columns 0 and 1 give the arrow direction on the two plotted axes.
        labels: Optional sequence indexable by integer position giving one
            label per row of ``coef``. When ``None``, arrows are labelled
            ``Var1``, ``Var2``, ...
    """
    xs = score[:, 0]
    ys = score[:, 1]
    n = coef.shape[0]

    plt.figure(figsize=(10, 6))
    # NOTE: colours come from the global ``y``, not from an argument.
    plt.scatter(xs, ys, c=pd.Categorical(y).codes, cmap='viridis')

    for i in range(n):
        # Arrows use a hard-coded scale factor of 5; the text is placed at
        # 5.2 so the label sits just beyond the arrow tip.
        plt.arrow(0, 0, coef[i, 0] * 5, coef[i, 1] * 5, color='r', alpha=0.5)
        if labels is None:
            plt.text(coef[i, 0] * 5.2, coef[i, 1] * 5.2, f'Var{i+1}')
        else:
            plt.text(coef[i, 0] * 5.2, coef[i, 1] * 5.2, labels[i])

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA Biplot of Iris Dataset')
    plt.grid()
# Create and show biplot
# components_ is transposed so that each row corresponds to one original
# feature, matching the per-row arrows and the X.columns labels.
biplot(X_pca, pca.components_.T, X.columns)
plt.show()
# Create scree plot
plt.figure(figsize=(10, 6))
# One bar per principal component (numbered from 1), height in percent
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1),
        pca.explained_variance_ratio_ * 100)
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained (%)')
plt.title('Scree Plot')
plt.show()