# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import geopandas
import matplotlib.pyplot as plt
import pandas
import shapely
import sklearn
import sklearn.cluster
import sklearn.impute
import sklearn.preprocessing

# Load the US state boundaries from the zipped shapefile and draw them,
# limiting the view to x in [-130, -60] and y in [20, 50]; anything outside
# that window is simply not shown.
map = geopandas.read_file("cb_2018_us_state_20m.zip")
ax = map.plot()
ax.set_xlim(left=-130, right=-60)
ax.set_ylim(bottom=20, top=50)

(20.0, 50.0)

# Read the economic data.
# The table holds one row per (GeoName, LineCode); the 2021 values of line
# codes 11, 12 and 13 become the DPI, PCE and RPP columns of `sub`.
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")

# Index by area name so the three series are paired by GeoName, not by row
# position. A positional join would silently attach values to the wrong area
# if a line code were missing for some area or listed in a different order.
by_area = data.set_index("GeoName")
dpi = by_area.loc[by_area["LineCode"] == 11.0, "2021"]
pce = by_area.loc[by_area["LineCode"] == 12.0, "2021"]
rpp = by_area.loc[by_area["LineCode"] == 13.0, "2021"]

# concat(axis=1) aligns on the shared GeoName index; the dict keys become the
# column names. The index is then turned back into a regular NAME column so
# the result has the columns ["NAME", "DPI", "PCE", "RPP"].
sub = pandas.concat({"DPI": dpi, "PCE": pce, "RPP": rpp}, axis=1)
sub = sub.rename_axis("NAME").reset_index()
name = sub["NAME"]
sub

# Attach the economic columns to the state geometries.
# A left merge keeps every row of the map; rows whose NAME has no match in
# `sub` end up with NaN in DPI / PCE / RPP.
merged = map.merge(sub, on="NAME", how="left")
keep = ["NAME", "geometry", "DPI", "PCE", "RPP"]
combined = merged[keep]
combined.head(n=5)

# Extract the feature columns and standardize.
features = combined[["DPI", "PCE", "RPP"]]

# Fill missing values (NaN) with the column mean so every row can be
# clustered, then rescale each column with StandardScaler so the three
# features are on a comparable scale.
impute = sklearn.impute.SimpleImputer(strategy="mean")
full = impute.fit_transform(features)
scale = sklearn.preprocessing.StandardScaler()
x = scale.fit_transform(full)

# Peek at the first five standardized rows.
x[0:5, :]

array([[ 0.53930445,  0.13934152,  1.30270512],
       [-0.42558713, -0.79117111, -1.09569665],
       [-0.54376287,  0.45694429,  0.07203916],
       [-0.59226679, -0.50058894, -0.67937846],
       [ 0.18659964,  0.18732892, -0.11603125]])

# Show clusters on map.
# Group the standardized feature rows into five clusters.
cluster = sklearn.cluster.AgglomerativeClustering(n_clusters=5)
cluster.fit(x)

# Colour the rows of `combined`, the frame the features (and therefore the
# labels) were derived from, so label i is always drawn on the geometry of
# row i instead of relying on `map` having the same row count and order.
ax = combined.plot(column=cluster.labels_)
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)

(20.0, 50.0)

# Plot every row as a point in the 3-D standardized feature space
# (the three columns of x become the three axes), coloured by cluster label.
fig = plt.figure()
ax = fig.add_subplot(projection="3d")
# x has exactly three columns, so its transpose unpacks into the x/y/z inputs.
ax.scatter(*x.T, c=cluster.labels_)

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1fdaee4b9d0>

	NAME	DPI	PCE	RPP
0	United States	56175	48318	100.000
1	Alabama	45191	39174	88.139
2	Alaska	60126	53982	104.439
3	Arizona	50149	46023	96.721
4	Arkansas	46644	39231	89.445
5	California	64385	54838	111.797
6	Colorado	62487	54126	103.009
7	Connecticut	67013	56371	102.603
8	Delaware	51079	50700	97.677
9	District of Columbia	80275	79105	111.271
10	Florida	55483	51038	101.430
11	Georgia	49598	43681	95.784
12	Hawaii	53862	49543	113.227
13	Idaho	48076	39639	91.776
14	Illinois	58619	49429	101.412
15	Indiana	50803	42565	92.735
16	Iowa	51922	42247	89.568
17	Kansas	51785	42438	91.157
18	Kentucky	46306	41027	89.124
19	Louisiana	50057	42115	91.276
20	Maine	52104	51730	97.205
21	Maryland	58805	48549	106.223
22	Massachusetts	69743	59267	106.555
23	Michigan	50179	46019	94.253
24	Minnesota	57651	48846	98.423
25	Mississippi	42851	36681	86.601
26	Missouri	49831	44827	92.022
27	Montana	51272	48015	91.567
28	Nebraska	55943	45469	91.751
29	Nevada	54223	46176	95.543
30	New Hampshire	63939	56507	102.510
31	New Jersey	65290	55472	109.099
32	New Mexico	46918	39950	89.907
33	New York	62891	53354	109.504
34	North Carolina	50150	44287	93.805
35	North Dakota	59808	49209	91.103
36	Ohio	50733	44215	92.459
37	Oklahoma	50461	38889	90.269
38	Oregon	53449	47649	103.032
39	Pennsylvania	56289	48874	96.371
40	Rhode Island	55918	48711	102.083
41	South Carolina	47130	42930	93.693
42	South Dakota	59734	45944	90.147
43	Tennessee	52012	42741	90.854
44	Texas	54726	45460	98.502
45	Utah	49589	43850	94.592
46	Vermont	54076	51197	98.660
47	Virginia	57713	48003	102.278
48	Washington	65315	52155	108.885
49	West Virginia	44715	41273	90.763
50	Wisconsin	53513	45491	93.347
51	Wyoming	62022	48858	91.418

	NAME	geometry	DPI	PCE	RPP
0	Maryland	MULTIPOLYGON (((-76.04621 38.02553, -76.00734 ...	58805	48549	106.223
1	Iowa	POLYGON ((-96.62187 42.77925, -96.57794 42.827...	51922	42247	89.568
2	Delaware	POLYGON ((-75.77379 39.72220, -75.75323 39.757...	51079	50700	97.677
3	Ohio	MULTIPOLYGON (((-82.86334 41.69369, -82.82572 ...	50733	44215	92.459
4	Pennsylvania	POLYGON ((-80.51989 40.90666, -80.51964 40.987...	56289	48874	96.371