# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import geopandas
import matplotlib.pyplot as plt
import numpy
import pandas
import shapely
import sklearn
import sklearn.decomposition
import sklearn.impute
import sklearn.preprocessing

# Read the data and build the standardized feature matrix x
# (no map is read in this section).
# Same as the code in the previous two lectures
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")

# One row per GeoName, one column per LineCode, filled from the "2021" column
sub = data.pivot(index="GeoName", columns="LineCode", values="2021")

# Drop the "United States" row
# (presumably the national aggregate rather than a peer of the other rows --
# confirm against the data)
features = sub.drop("United States", axis="index")

# Fill missing entries with the mean of their column
impute = sklearn.impute.SimpleImputer(strategy="mean")
full = impute.fit_transform(features)

# Standardize every column (StandardScaler defaults: zero mean, unit variance)
# so the later PCA is not dominated by the columns with the largest scale
scale = sklearn.preprocessing.StandardScaler()
x = scale.fit_transform(full)

# Preview the first five rows of the standardized matrix
x[:5, :]

array([[-3.41837110e-01, -2.72299505e-01, -2.43077934e-01,
        -3.40324616e-01, -3.19624545e-01, -3.09324344e-01,
        -3.06725318e-01, -1.04464533e+00, -7.99733159e-01,
        -1.28769981e+00, -1.35594162e+00, -1.23288204e+00,
        -1.28890469e+00, -1.28888500e+00, -2.68065547e-01],
       [-6.75623478e-01, -7.65635948e-01, -7.83613007e-01,
        -6.76094478e-01, -7.11945327e-01, -7.23348772e-01,
        -7.23099059e-01, -9.65791949e-02,  7.54489458e-01,
         3.32493069e-01,  7.17488439e-01,  9.32445961e-01,
         1.03569578e+00,  1.03567467e+00, -7.85144660e-01],
       [-4.85733029e-02,  2.58396711e-03,  7.44911590e-02,
        -4.67443476e-02, -1.83734086e-02, -3.13965049e-03,
         5.26418474e-02, -9.14165266e-01, -4.89604513e-01,
        -6.27182798e-01, -6.67621151e-01, -2.31373971e-01,
        -6.49954153e-02, -6.49326720e-02,  2.47314722e-02],
       [-5.19596988e-01, -5.06788992e-01, -5.14358886e-01,
        -5.18963369e-01, -5.04544431e-01, -5.04932052e-01,
        -5.14728333e-01, -1.02545708e+00, -1.09656692e+00,
        -1.12394641e+00, -1.15422124e+00, -1.22454711e+00,
        -1.10265143e+00, -1.10263407e+00, -5.08782074e-01],
       [ 5.03631776e+00,  4.70296477e+00,  4.44532525e+00,
         4.98062713e+00,  4.97479028e+00,  4.84627386e+00,
         4.82259576e+00,  7.01484923e-01, -2.49785330e-01,
         1.50887999e+00,  1.30876655e+00,  1.05761619e+00,
         2.08504610e+00,  2.08509383e+00,  4.43617422e+00]])

# Compute explained variance:
# fit a PCA that keeps three components, then stack the per-component
# explained variance (row 0) on top of the explained variance ratio (row 1)
pca = sklearn.decomposition.PCA(n_components=3).fit(x)
numpy.vstack(
    (
        pca.explained_variance_,
        pca.explained_variance_ratio_,
    )
)

array([[9.01412012, 5.03351571, 0.90684963],
       [0.58915818, 0.32898795, 0.05927122]])

# Check the total amount of variation explained:
# adding up the explained variance ratios of the components kept by the PCA
# fitted above gives the share of the total variance they retain together
sum(pca.explained_variance_ratio_)

0.9774173497603195

# Set the explained variance ratio to find the number of dimensions:
# a fractional n_components (here 0.99) is a variance target rather than a
# component count, so PCA itself picks how many components are needed to
# explain that share of the variance (per the scikit-learn PCA documentation).
# Note that this rebinds pca, replacing the three-component fit.
pca = sklearn.decomposition.PCA(n_components=0.99).fit(x)
numpy.vstack(
    (
        pca.explained_variance_,
        pca.explained_variance_ratio_,
    )
)

array([[9.01412012, 5.03351571, 0.90684963, 0.27976852],
       [0.58915818, 0.32898795, 0.05927122, 0.01828552]])