In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import geopandas
import matplotlib.pyplot as plt
import pandas
import shapely
import sklearn
import sklearn.cluster
import sklearn.impute
import sklearn.preprocessing
In [2]:
# Read the US Map
# Load the state boundary archive into a GeoDataFrame and draw it.
# NOTE: `map` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
map = geopandas.read_file("cb_2018_us_state_20m.zip")
ax = map.plot()
# Restrict the visible window; presumably these are longitude/latitude bounds
# chosen to frame the contiguous states -- TODO confirm against the file's CRS.
ax.set_xlim(left=-130, right=-60)
ax.set_ylim(bottom=20, top=50)
Out[2]:
(20.0, 50.0)
No description has been provided for this image
In [3]:
# Read the economic data
# Each row of the CSV is one (GeoName, LineCode) pair; the "2021" column is read
# for three line codes and placed next to the area names.
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")
line_code = data["LineCode"]

# Rows with LineCode 1 are used only as the source of the area names.
name = data.loc[line_code == 1.0, "GeoName"].reset_index(drop=True)
# Presumably 11/12/13 are disposable personal income, personal consumption
# expenditures and regional price parity -- TODO confirm against the BEA
# line-code definitions.
dpi = data.loc[line_code == 11.0, "2021"].reset_index(drop=True)
pce = data.loc[line_code == 12.0, "2021"].reset_index(drop=True)
rpp = data.loc[line_code == 13.0, "2021"].reset_index(drop=True)

# The four series are joined by position (their indexes were reset above), so
# this relies on every line code listing the areas in the same order.
sub = pandas.concat([name, dpi, pce, rpp], axis=1)
sub.columns = ["NAME", "DPI", "PCE", "RPP"]
sub
Out[3]:
NAME DPI PCE RPP
0 United States 56175 48318 100.000
1 Alabama 45191 39174 88.139
2 Alaska 60126 53982 104.439
3 Arizona 50149 46023 96.721
4 Arkansas 46644 39231 89.445
5 California 64385 54838 111.797
6 Colorado 62487 54126 103.009
7 Connecticut 67013 56371 102.603
8 Delaware 51079 50700 97.677
9 District of Columbia 80275 79105 111.271
10 Florida 55483 51038 101.430
11 Georgia 49598 43681 95.784
12 Hawaii 53862 49543 113.227
13 Idaho 48076 39639 91.776
14 Illinois 58619 49429 101.412
15 Indiana 50803 42565 92.735
16 Iowa 51922 42247 89.568
17 Kansas 51785 42438 91.157
18 Kentucky 46306 41027 89.124
19 Louisiana 50057 42115 91.276
20 Maine 52104 51730 97.205
21 Maryland 58805 48549 106.223
22 Massachusetts 69743 59267 106.555
23 Michigan 50179 46019 94.253
24 Minnesota 57651 48846 98.423
25 Mississippi 42851 36681 86.601
26 Missouri 49831 44827 92.022
27 Montana 51272 48015 91.567
28 Nebraska 55943 45469 91.751
29 Nevada 54223 46176 95.543
30 New Hampshire 63939 56507 102.510
31 New Jersey 65290 55472 109.099
32 New Mexico 46918 39950 89.907
33 New York 62891 53354 109.504
34 North Carolina 50150 44287 93.805
35 North Dakota 59808 49209 91.103
36 Ohio 50733 44215 92.459
37 Oklahoma 50461 38889 90.269
38 Oregon 53449 47649 103.032
39 Pennsylvania 56289 48874 96.371
40 Rhode Island 55918 48711 102.083
41 South Carolina 47130 42930 93.693
42 South Dakota 59734 45944 90.147
43 Tennessee 52012 42741 90.854
44 Texas 54726 45460 98.502
45 Utah 49589 43850 94.592
46 Vermont 54076 51197 98.660
47 Virginia 57713 48003 102.278
48 Washington 65315 52155 108.885
49 West Virginia 44715 41273 90.763
50 Wisconsin 53513 45491 93.347
51 Wyoming 62022 48858 91.418
In [4]:
# Merge the two data sets
# Left join on the state name: every row of the map is kept, and map rows with
# no match in `sub` get missing values in the economic columns.
combined = map.merge(right=sub, on="NAME", how="left")
# Keep only the name, the geometry and the three economic columns.
combined = combined[["NAME", "geometry", "DPI", "PCE", "RPP"]]
combined.head()
Out[4]:
NAME geometry DPI PCE RPP
0 Maryland MULTIPOLYGON (((-76.04621 38.02553, -76.00734 ... 58805 48549 106.223
1 Iowa POLYGON ((-96.62187 42.77925, -96.57794 42.827... 51922 42247 89.568
2 Delaware POLYGON ((-75.77379 39.72220, -75.75323 39.757... 51079 50700 97.677
3 Ohio MULTIPOLYGON (((-82.86334 41.69369, -82.82572 ... 50733 44215 92.459
4 Pennsylvania POLYGON ((-80.51989 40.90666, -80.51964 40.987... 56289 48874 96.371
In [5]:
# Extract the feature columns and standardize
# Column order here (DPI, PCE, RPP) fixes the column order of `x` below.
features = combined[["DPI", "PCE", "RPP"]]

# Fill missing entries with the column mean. Missing values presumably come
# from map rows that had no match in the economic table -- TODO confirm.
impute = sklearn.impute.SimpleImputer(strategy="mean")
full = impute.fit_transform(features)

# StandardScaler lives in sklearn.preprocessing, which must be imported
# explicitly at the top of the notebook rather than relied on as a side effect
# of importing other sklearn submodules.
scale = sklearn.preprocessing.StandardScaler()
x = scale.fit_transform(full)

# Preview the first five standardized rows.
x[:5, :]
Out[5]:
array([[ 0.53930445,  0.13934152,  1.30270512],
       [-0.42558713, -0.79117111, -1.09569665],
       [-0.54376287,  0.45694429,  0.07203916],
       [-0.59226679, -0.50058894, -0.67937846],
       [ 0.18659964,  0.18732892, -0.11603125]])
In [6]:
# Show clusters on map
# Group the standardized rows of `x` into five clusters.
cluster = sklearn.cluster.AgglomerativeClustering(n_clusters=5)
cluster.fit(x)

# Plot `combined` (the frame the features were taken from) rather than `map`,
# so each label is drawn on the geometry of the row it was computed for without
# relying on the merge having preserved row order and count.
# Cluster ids are class labels, not magnitudes: draw them as categories with a
# legend instead of on a continuous colour scale.
ax = combined.plot(column=cluster.labels_, categorical=True, legend=True)
ax.set_title("Agglomerative clusters (n = 5)")
# Same viewing window as the base map cell.
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[6]:
(20.0, 50.0)
No description has been provided for this image
In [7]:
# Show clusters in feature space
# 3-D scatter of the standardized features, one point per row of `x`,
# coloured by the cluster label assigned to that row.
fig = plt.figure()
ax = fig.add_subplot(projection="3d")

# Label the axes so the figure can be read on its own; the axis order follows
# the feature column order used to build `x` (DPI, PCE, RPP).
ax.set_xlabel("DPI (standardized)")
ax.set_ylabel("PCE (standardized)")
ax.set_zlabel("RPP (standardized)")
ax.set_title("Clusters in feature space")

# Scatter call stays last so the cell's displayed value is unchanged.
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=cluster.labels_)
Out[7]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1fdaee4b9d0>
No description has been provided for this image