In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import geopandas
import matplotlib.pyplot as plt
import numpy
import pandas
import shapely
import sklearn
import sklearn.decomposition
import sklearn.impute
import sklearn.preprocessing
In [2]:
# Read the data and build the standardized feature matrix `x`
# (same preprocessing as the code in the previous two lectures).
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")

# Reshape to wide form: one row per GeoName, one column per LineCode,
# filled with the values from the "2021" column.
sub = data.pivot(index="GeoName", columns="LineCode", values="2021")

# Remove the "United States" row (presumably the national aggregate, which
# should not be treated as one of the observations -- confirm against the data).
features = sub.drop("United States", axis="index")

# Replace missing entries with the mean of their column.
# fit_transform learns the column means and applies them in one call; the
# fitted imputer stays available as `impute`.
impute = sklearn.impute.SimpleImputer(strategy="mean")
full = impute.fit_transform(features)

# Standardize every column (StandardScaler defaults: remove the mean, scale to
# unit variance) so that PCA is not dominated by large-magnitude columns.
scale = sklearn.preprocessing.StandardScaler()
x = scale.fit_transform(full)

# Preview the first five standardized rows.
x[:5, :]
Out[2]:
array([[-3.41837110e-01, -2.72299505e-01, -2.43077934e-01, -3.40324616e-01, -3.19624545e-01, -3.09324344e-01, -3.06725318e-01, -1.04464533e+00, -7.99733159e-01, -1.28769981e+00, -1.35594162e+00, -1.23288204e+00, -1.28890469e+00, -1.28888500e+00, -2.68065547e-01], [-6.75623478e-01, -7.65635948e-01, -7.83613007e-01, -6.76094478e-01, -7.11945327e-01, -7.23348772e-01, -7.23099059e-01, -9.65791949e-02, 7.54489458e-01, 3.32493069e-01, 7.17488439e-01, 9.32445961e-01, 1.03569578e+00, 1.03567467e+00, -7.85144660e-01], [-4.85733029e-02, 2.58396711e-03, 7.44911590e-02, -4.67443476e-02, -1.83734086e-02, -3.13965049e-03, 5.26418474e-02, -9.14165266e-01, -4.89604513e-01, -6.27182798e-01, -6.67621151e-01, -2.31373971e-01, -6.49954153e-02, -6.49326720e-02, 2.47314722e-02], [-5.19596988e-01, -5.06788992e-01, -5.14358886e-01, -5.18963369e-01, -5.04544431e-01, -5.04932052e-01, -5.14728333e-01, -1.02545708e+00, -1.09656692e+00, -1.12394641e+00, -1.15422124e+00, -1.22454711e+00, -1.10265143e+00, -1.10263407e+00, -5.08782074e-01], [ 5.03631776e+00, 4.70296477e+00, 4.44532525e+00, 4.98062713e+00, 4.97479028e+00, 4.84627386e+00, 4.82259576e+00, 7.01484923e-01, -2.49785330e-01, 1.50887999e+00, 1.30876655e+00, 1.05761619e+00, 2.08504610e+00, 2.08509383e+00, 4.43617422e+00]])
In [3]:
# Fit a 3-component PCA on the standardized matrix, then show two rows:
# row 0 = variance explained by each component,
# row 1 = that variance as a fraction of the total.
pca = sklearn.decomposition.PCA(n_components=3).fit(x)
numpy.vstack((pca.explained_variance_, pca.explained_variance_ratio_))
Out[3]:
array([[9.01412012, 5.03351571, 0.90684963], [0.58915818, 0.32898795, 0.05927122]])
In [4]:
# Total fraction of the variance captured by the components kept in `pca`.
# ndarray.sum() does the reduction inside numpy rather than iterating over the
# array element by element with the builtin sum().
pca.explained_variance_ratio_.sum()
Out[4]:
0.9774173497603195
In [5]:
# Let PCA choose the number of dimensions: a float n_components between 0 and 1
# keeps as many components as are needed for the cumulative explained-variance
# ratio to exceed that fraction (0.99 here).
# NOTE: this rebinds `pca`, replacing the 3-component model fitted earlier.
pca = sklearn.decomposition.PCA(n_components=0.99).fit(x)
numpy.vstack((pca.explained_variance_, pca.explained_variance_ratio_))
Out[5]:
array([[9.01412012, 5.03351571, 0.90684963, 0.27976852], [0.58915818, 0.32898795, 0.05927122, 0.01828552]])