# import
import numpy as np
from numpy import random
import plotly
import plotly.graph_objs as go
from sklearn.decomposition import PCA
# Switch plotly to its offline notebook mode so that figures are rendered
# inline in the notebook output.
plotly.offline.init_notebook_mode()
We randomly generate a dataset of 1000 points inside an ellipsoid. We deliberately do not apply a scaler, so that the dimensions keep their unequal spreads.
Then we perform a PCA with 3 components and display the result in 3D. We can see that the first PCA component (in black) is aligned with the longest ellipsoid dimension (X axis) and that the second PCA component (in blue) is aligned with the second-longest ellipsoid dimension (Y axis).
To see what a PCA with only 2 components would give, rotate the display so that the black vector is horizontal and the blue vector is vertical.
# Dataset 1: 1000 pseudo-random points drawn inside a single ellipsoid.
# The seed is fixed so the sample, and therefore the PCA output, is
# reproducible from run to run.
random.seed(0)
n = 1000
# Per point: a radius fraction and two angles spanning a full turn.
r = random.random(n)
theta = random.random(n) * 2 * np.pi
phi = random.random(n) * 2 * np.pi
# Semi-axes of the ellipsoid along X, Y and Z; the centre is (0, 0, 0).
# Change these three coefficients to reshape the ellipsoid.
a, b, c = 3, 2, 1
cos_theta = np.cos(theta)
x = a * r * cos_theta * np.cos(phi)
y = b * r * cos_theta * np.sin(phi)
z = c * r * np.sin(theta)
# Stack the coordinates into an (n, 3) array, one row per point.
data = np.vstack((x, y, z)).T
# Fit a 3-component PCA on the raw (unscaled) points, then report the
# share of the total variance carried by each component.
pca = PCA(n_components=3).fit(data)
print('explained variance ratio:', pca.explained_variance_ratio_)
# Scatter trace for the sampled points.
ellipsoid = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker={'size': 3, 'opacity': 1.0},
    name='ellipsoid',
)

# One segment per principal axis, drawn from the origin to the coordinates
# of the corresponding row of pca.components_.  Pairing each component with
# its colour and legend name through zip() avoids a hard-coded index range,
# so the cell keeps working if n_components is lowered.
colors = ['black', 'blue', 'red']
names = ['PCA 1', 'PCA 2', 'PCA 3']
vectors = [
    go.Scatter3d(
        x=[0, component[0]],
        y=[0, component[1]],
        z=[0, component[2]],
        marker={'size': 3, 'symbol': 'diamond', 'color': color},
        line={'color': color, 'width': 3},
        name=name,
    )
    for component, color, name in zip(pca.components_, colors, names)
]

# Layout with all four margins set to zero.
layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})

# Keep the traces in their own variable instead of rebinding `data`, which
# still holds the point array that was passed to pca.fit().
traces = [ellipsoid] + vectors
plot_figure = go.Figure(data=traces, layout=layout)

# Render the figure in the notebook.
plotly.offline.iplot(plot_figure)
We randomly generate a dataset of 1000 points spread over two ellipsoids. We deliberately do not apply a scaler, so that the dimensions keep their unequal spreads.
Then we perform a PCA with 3 components and display the result in 3D. We can see that the first PCA component (in black) is aligned with the direction that separates the 2 ellipsoids (Y axis) and that the second PCA component (in blue) is aligned with the longest ellipsoid dimension (X axis).
To see what a PCA with only 2 components would give, rotate the display so that the black vector is horizontal and the blue vector is vertical.
# Dataset 2: 1000 pseudo-random points, 500 in each of two ellipsoids.
# The seed is fixed so the sample is reproducible.
random.seed(0)
n = 500  # points per ellipsoid

# Ellipsoid 1: semi-axes 2 (X), 1 (Y), 1 (Z), centred at (0, -1, 0).
r = random.random(n)
theta = random.random(n) * 2 * np.pi
phi = random.random(n) * 2 * np.pi
cos_theta = np.cos(theta)
x1 = 2 * r * cos_theta * np.cos(phi)
y1 = -1 + r * cos_theta * np.sin(phi)
z1 = r * np.sin(theta)

# Ellipsoid 2: same semi-axes, centred at (0, 1, 0), built from a fresh
# set of radius/angle draws.
r = random.random(n)
theta = random.random(n) * 2 * np.pi
phi = random.random(n) * 2 * np.pi
cos_theta = np.cos(theta)
x2 = 2 * r * cos_theta * np.cos(phi)
y2 = 1 - r * cos_theta * np.sin(phi)
z2 = r * np.sin(theta)

# Join both clouds coordinate by coordinate, then stack them into a
# (2 * n, 3) array with one row per point.
x, y, z = (
    np.concatenate(pair)
    for pair in ((x1, x2), (y1, y2), (z1, z2))
)
data = np.vstack((x, y, z)).T
# Fit a 3-component PCA on the combined (unscaled) points, then report the
# share of the total variance carried by each component.
pca = PCA(n_components=3).fit(data)
print('explained variance ratio:', pca.explained_variance_ratio_)
# One scatter trace per ellipsoid so each gets its own legend entry.
ellipsoid1 = go.Scatter3d(
    x=x1,
    y=y1,
    z=z1,
    mode='markers',
    marker={'size': 3, 'opacity': 1.0},
    name='ellipsoid 1',
)
ellipsoid2 = go.Scatter3d(
    x=x2,
    y=y2,
    z=z2,
    mode='markers',
    marker={'size': 3, 'opacity': 1.0},
    name='ellipsoid 2',
)

# One segment per principal axis, drawn from the origin to the coordinates
# of the corresponding row of pca.components_.  Pairing each component with
# its colour and legend name through zip() avoids a hard-coded index range,
# so the cell keeps working if n_components is lowered.
colors = ['black', 'blue', 'red']
names = ['PCA 1', 'PCA 2', 'PCA 3']
vectors = [
    go.Scatter3d(
        x=[0, component[0]],
        y=[0, component[1]],
        z=[0, component[2]],
        marker={'size': 3, 'symbol': 'diamond', 'color': color},
        line={'color': color, 'width': 3},
        name=name,
    )
    for component, color, name in zip(pca.components_, colors, names)
]

# Layout with all four margins set to zero.
layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})

# Keep the traces in their own variable instead of rebinding `data`, which
# still holds the point array that was passed to pca.fit().
traces = [ellipsoid1, ellipsoid2, *vectors]
plot_figure = go.Figure(data=traces, layout=layout)

# Render the figure in the notebook.
plotly.offline.iplot(plot_figure)