In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import geopandas
import matplotlib.pyplot as plt
import pandas
import shapely
import sklearn
import sklearn.cluster
import sklearn.impute
import sklearn.preprocessing
In [2]:
# Load the US state outlines from the zipped shapefile and draw them.
# NOTE: the name `map` shadows Python's builtin of the same name; it is kept
# because later cells refer to this variable.
map = geopandas.read_file("cb_2018_us_state_20m.zip")
ax = map.plot()
# Crop the view to x in [-130, -60] and y in [20, 50]
# (the geometry coordinates look like longitude/latitude degrees).
ax.set_xlim(left = -130, right = -60)
ax.set_ylim(bottom = 20, top = 50)
Out[2]:
(20.0, 50.0)
In [3]:
# Load the economic summary table and keep one column per measure for 2021.
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")
# Each LineCode value selects the rows of one measure. Every selection is
# re-indexed from 0 so that the four series line up by position below
# (this assumes the areas appear in the same order under every LineCode).
name = data.loc[data["LineCode"] == 1.0, "GeoName"].reset_index(drop = True)
dpi = data.loc[data["LineCode"] == 11.0, "2021"].reset_index(drop = True)
pce = data.loc[data["LineCode"] == 12.0, "2021"].reset_index(drop = True)
rpp = data.loc[data["LineCode"] == 13.0, "2021"].reset_index(drop = True)
# Place the series side by side; `keys` supplies the column names.
sub = pandas.concat(
    [name, dpi, pce, rpp],
    axis = 1,
    keys = ["NAME", "DPI", "PCE", "RPP"],
)
sub
Out[3]:
NAME | DPI | PCE | RPP | |
---|---|---|---|---|
0 | United States | 56175 | 48318 | 100.000 |
1 | Alabama | 45191 | 39174 | 88.139 |
2 | Alaska | 60126 | 53982 | 104.439 |
3 | Arizona | 50149 | 46023 | 96.721 |
4 | Arkansas | 46644 | 39231 | 89.445 |
5 | California | 64385 | 54838 | 111.797 |
6 | Colorado | 62487 | 54126 | 103.009 |
7 | Connecticut | 67013 | 56371 | 102.603 |
8 | Delaware | 51079 | 50700 | 97.677 |
9 | District of Columbia | 80275 | 79105 | 111.271 |
10 | Florida | 55483 | 51038 | 101.430 |
11 | Georgia | 49598 | 43681 | 95.784 |
12 | Hawaii | 53862 | 49543 | 113.227 |
13 | Idaho | 48076 | 39639 | 91.776 |
14 | Illinois | 58619 | 49429 | 101.412 |
15 | Indiana | 50803 | 42565 | 92.735 |
16 | Iowa | 51922 | 42247 | 89.568 |
17 | Kansas | 51785 | 42438 | 91.157 |
18 | Kentucky | 46306 | 41027 | 89.124 |
19 | Louisiana | 50057 | 42115 | 91.276 |
20 | Maine | 52104 | 51730 | 97.205 |
21 | Maryland | 58805 | 48549 | 106.223 |
22 | Massachusetts | 69743 | 59267 | 106.555 |
23 | Michigan | 50179 | 46019 | 94.253 |
24 | Minnesota | 57651 | 48846 | 98.423 |
25 | Mississippi | 42851 | 36681 | 86.601 |
26 | Missouri | 49831 | 44827 | 92.022 |
27 | Montana | 51272 | 48015 | 91.567 |
28 | Nebraska | 55943 | 45469 | 91.751 |
29 | Nevada | 54223 | 46176 | 95.543 |
30 | New Hampshire | 63939 | 56507 | 102.510 |
31 | New Jersey | 65290 | 55472 | 109.099 |
32 | New Mexico | 46918 | 39950 | 89.907 |
33 | New York | 62891 | 53354 | 109.504 |
34 | North Carolina | 50150 | 44287 | 93.805 |
35 | North Dakota | 59808 | 49209 | 91.103 |
36 | Ohio | 50733 | 44215 | 92.459 |
37 | Oklahoma | 50461 | 38889 | 90.269 |
38 | Oregon | 53449 | 47649 | 103.032 |
39 | Pennsylvania | 56289 | 48874 | 96.371 |
40 | Rhode Island | 55918 | 48711 | 102.083 |
41 | South Carolina | 47130 | 42930 | 93.693 |
42 | South Dakota | 59734 | 45944 | 90.147 |
43 | Tennessee | 52012 | 42741 | 90.854 |
44 | Texas | 54726 | 45460 | 98.502 |
45 | Utah | 49589 | 43850 | 94.592 |
46 | Vermont | 54076 | 51197 | 98.660 |
47 | Virginia | 57713 | 48003 | 102.278 |
48 | Washington | 65315 | 52155 | 108.885 |
49 | West Virginia | 44715 | 41273 | 90.763 |
50 | Wisconsin | 53513 | 45491 | 93.347 |
51 | Wyoming | 62022 | 48858 | 91.418 |
In [4]:
# Merge the two data sets
# A left join keeps every shape in `map`, in its original row order, even when
# no economic row shares its NAME (such rows get NaN in DPI / PCE / RPP).
# validate = "one_to_one" makes pandas raise a MergeError if NAME is duplicated
# on either side; without it a duplicate would silently add rows and break the
# row-for-row correspondence with `map` that the cluster map depends on.
combined = map.merge(
    sub,
    how = "left",
    on = "NAME",
    validate = "one_to_one",
)[["NAME", "geometry", "DPI", "PCE", "RPP"]]
combined.head(5)
Out[4]:
NAME | geometry | DPI | PCE | RPP | |
---|---|---|---|---|---|
0 | Maryland | MULTIPOLYGON (((-76.04621 38.02553, -76.00734 ... | 58805 | 48549 | 106.223 |
1 | Iowa | POLYGON ((-96.62187 42.77925, -96.57794 42.827... | 51922 | 42247 | 89.568 |
2 | Delaware | POLYGON ((-75.77379 39.72220, -75.75323 39.757... | 51079 | 50700 | 97.677 |
3 | Ohio | MULTIPOLYGON (((-82.86334 41.69369, -82.82572 ... | 50733 | 44215 | 92.459 |
4 | Pennsylvania | POLYGON ((-80.51989 40.90666, -80.51964 40.987... | 56289 | 48874 | 96.371 |
In [5]:
# Extract the feature columns and standardize
features = combined[["DPI", "PCE", "RPP"]]
# Replace missing values with the column mean so that rows left without
# economic data by the merge can still be clustered.
impute = sklearn.impute.SimpleImputer(strategy = "mean")
full = impute.fit_transform(features)
# Rescale every column to zero mean and unit variance so the features are on a
# comparable scale. `sklearn.preprocessing` has to be imported explicitly:
# importing `sklearn` alone does not guarantee the submodule is loaded.
scale = sklearn.preprocessing.StandardScaler()
x = scale.fit_transform(full)
# Preview the first five standardized rows (columns: DPI, PCE, RPP).
x[0:5, :]
Out[5]:
array([[ 0.53930445, 0.13934152, 1.30270512], [-0.42558713, -0.79117111, -1.09569665], [-0.54376287, 0.45694429, 0.07203916], [-0.59226679, -0.50058894, -0.67937846], [ 0.18659964, 0.18732892, -0.11603125]])
In [6]:
# Show clusters on map
# Group the rows of `x` into 5 clusters with agglomerative (hierarchical) clustering.
cluster = sklearn.cluster.AgglomerativeClustering(n_clusters = 5)
cluster.fit(x)
# Plot `combined` (the frame the features were taken from) so each label is
# drawn on the shape of the row it was computed for.
# The labels are arbitrary cluster ids, not quantities, so they are plotted as
# categories with a legend. The colormap is pinned to viridis, matplotlib's
# default for numeric colors, so the colors agree with a scatter plot that
# passes the same labels as `c`.
ax = combined.plot(
    column = cluster.labels_,
    categorical = True,
    cmap = "viridis",
    legend = True,
)
# Crop the view to x in [-130, -60] and y in [20, 50].
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[6]:
(20.0, 50.0)
In [7]:
# Show clusters in feature space
fig = plt.figure()
ax = fig.add_subplot(projection = "3d")
# Label the axes so the figure can be read on its own; the columns of `x`
# follow the feature order DPI, PCE, RPP and hold standardized values.
ax.set_xlabel("DPI (standardized)")
ax.set_ylabel("PCE (standardized)")
ax.set_zlabel("RPP (standardized)")
# One point per row of `x`, colored by its cluster label.
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c = cluster.labels_)
Out[7]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1fdaee4b9d0>