In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.cluster
import sklearn.impute
import geopandas
import pandas
import shapely
import matplotlib.pyplot as plt
In [2]:
# Read the US Map and the data
# See the example in the previous lecture for getting the same dataset without "pivot"
map = geopandas.read_file("cb_2018_us_state_20m.zip")
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")
sub = data.pivot(index = "GeoName", columns = "LineCode", values = "2021")
sub["NAME"] = sub.index
combined = map.merge(sub, how = "left", on = "NAME")[["NAME", "geometry", 11, 12, 13]]
combined
Out[2]:
NAME geometry 11 12 13
0 Maryland MULTIPOLYGON (((-76.04621 38.02553, -76.00734 ... 58805 48549 106.223
1 Iowa POLYGON ((-96.62187 42.77925, -96.57794 42.827... 51922 42247 89.568
2 Delaware POLYGON ((-75.77379 39.72220, -75.75323 39.757... 51079 50700 97.677
3 Ohio MULTIPOLYGON (((-82.86334 41.69369, -82.82572 ... 50733 44215 92.459
4 Pennsylvania POLYGON ((-80.51989 40.90666, -80.51964 40.987... 56289 48874 96.371
5 Nebraska POLYGON ((-104.05314 41.11446, -104.05245 41.2... 55943 45469 91.751
6 Washington MULTIPOLYGON (((-123.23715 48.68347, -123.0704... 65315 52155 108.885
7 Puerto Rico MULTIPOLYGON (((-65.34207 18.34529, -65.25593 ... NaN NaN NaN
8 Alabama POLYGON ((-88.46866 31.89386, -88.46866 31.933... 45191 39174 88.139
9 Arkansas POLYGON ((-94.61792 36.49941, -94.36120 36.499... 46644 39231 89.445
10 New Mexico POLYGON ((-109.04919 31.79655, -109.04830 32.0... 46918 39950 89.907
11 Texas POLYGON ((-106.62345 31.91403, -106.63011 31.9... 54726 45460 98.502
12 California MULTIPOLYGON (((-118.59397 33.46720, -118.4847... 64385 54838 111.797
13 Kentucky POLYGON ((-89.54443 36.57451, -89.47935 36.566... 46306 41027 89.124
14 Georgia POLYGON ((-85.60516 34.98468, -85.47434 34.983... 49598 43681 95.784
15 Wisconsin MULTIPOLYGON (((-86.93428 45.42115, -86.83575 ... 53513 45491 93.347
16 Oregon POLYGON ((-124.55244 42.84057, -124.48094 42.9... 53449 47649 103.032
17 Missouri POLYGON ((-95.76564 40.58521, -95.53318 40.582... 49831 44827 92.022
18 Virginia MULTIPOLYGON (((-76.02347 37.28907, -75.98712 ... 57713 48003 102.278
19 Tennessee POLYGON ((-90.30070 35.02879, -90.26530 35.040... 52012 42741 90.854
20 Louisiana POLYGON ((-94.04305 32.69303, -94.04303 32.797... 50057 42115 91.276
21 New York MULTIPOLYGON (((-72.01893 41.27411, -71.92680 ... 62891 53354 109.504
22 Michigan MULTIPOLYGON (((-84.61622 45.89447, -84.51789 ... 50179 46019 94.253
23 Idaho POLYGON ((-117.24303 44.39097, -117.21507 44.4... 48076 39639 91.776
24 Florida MULTIPOLYGON (((-81.81169 24.56874, -81.75127 ... 55483 51038 101.43
25 Alaska MULTIPOLYGON (((179.48132 51.97530, 179.58286 ... 60126 53982 104.439
26 Illinois POLYGON ((-91.50626 40.20016, -91.49696 40.248... 58619 49429 101.412
27 Montana POLYGON ((-116.04919 49.00091, -115.50102 49.0... 51272 48015 91.567
28 Minnesota POLYGON ((-97.22904 49.00069, -96.93096 48.999... 57651 48846 98.423
29 Indiana POLYGON ((-88.05947 37.86669, -88.04086 37.891... 50803 42565 92.735
30 Massachusetts MULTIPOLYGON (((-70.27553 41.31046, -70.19371 ... 69743 59267 106.555
31 Kansas POLYGON ((-102.05174 40.00308, -101.83216 40.0... 51785 42438 91.157
32 Nevada POLYGON ((-120.00480 39.31648, -120.00303 39.4... 54223 46176 95.543
33 Vermont POLYGON ((-73.41632 44.09942, -73.39987 44.152... 54076 51197 98.66
34 Connecticut POLYGON ((-73.69594 41.11526, -73.48271 41.212... 67013 56371 102.603
35 New Jersey POLYGON ((-75.55945 39.62981, -75.53514 39.647... 65290 55472 109.099
36 District of Columbia POLYGON ((-77.11976 38.93434, -77.04102 38.995... 80275 79105 111.271
37 North Carolina POLYGON ((-84.28660 35.20576, -84.28322 35.226... 50150 44287 93.805
38 Utah POLYGON ((-114.05247 37.60478, -114.05173 37.7... 49589 43850 94.592
39 North Dakota POLYGON ((-104.04874 48.99988, -103.37547 48.9... 59808 49209 91.103
40 South Carolina POLYGON ((-83.35324 34.72865, -83.32006 34.759... 47130 42930 93.693
41 Mississippi POLYGON ((-91.62136 31.26781, -91.56419 31.261... 42851 36681 86.601
42 Colorado POLYGON ((-109.05996 38.49999, -109.05151 39.1... 62487 54126 103.009
43 South Dakota POLYGON ((-104.05770 44.99743, -104.03914 44.9... 59734 45944 90.147
44 Oklahoma POLYGON ((-103.00252 36.67519, -103.00196 36.9... 50461 38889 90.269
45 Wyoming POLYGON ((-111.05689 44.86666, -111.04432 45.0... 62022 48858 91.418
46 West Virginia POLYGON ((-82.59886 38.20101, -82.58469 38.240... 44715 41273 90.763
47 Maine MULTIPOLYGON (((-68.92401 43.88541, -68.87478 ... 52104 51730 97.205
48 Hawaii MULTIPOLYGON (((-156.04965 19.78045, -156.0062... 53862 49543 113.227
49 New Hampshire POLYGON ((-72.55611 42.86625, -72.53147 42.897... 63939 56507 102.51
50 Arizona POLYGON ((-114.79968 32.59362, -114.80939 32.6... 50149 46023 96.721
51 Rhode Island MULTIPOLYGON (((-71.63147 41.16668, -71.59334 ... 55918 48711 102.083
In [3]:
# Extract the feature columns and standardize
features = combined[[11, 12, 13]]
impute = sklearn.impute.SimpleImputer(strategy = "mean")
impute.fit(features)
full = impute.transform(features)
scale = sklearn.preprocessing.StandardScaler()
scale.fit(full)
x = scale.transform(full)
x[0:5, :]
Out[3]:
array([[ 0.53930445,  0.13934152,  1.30270512],
       [-0.42558713, -0.79117111, -1.09569665],
       [-0.54376287,  0.45694429,  0.07203916],
       [-0.59226679, -0.50058894, -0.67937846],
       [ 0.18659964,  0.18732892, -0.11603125]])
In [4]:
# Show clusters on map
cluster = sklearn.cluster.KMeans(n_clusters = 5, n_init = 1)
cluster.fit(x)
ax = map.plot(cluster.labels_)
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[4]:
(20.0, 50.0)
No description has been provided for this image
In [5]:
# Run the same function again
cluster.fit(x)
ax = map.plot(cluster.labels_)
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[5]:
(20.0, 50.0)
No description has been provided for this image
In [6]:
# Compare the total distortion (inertia) for k = 1, 2, ..., 50
td = []
n = range(1, len(x))
for k in n:
    cluster = sklearn.cluster.KMeans(n_clusters = k, n_init = 1)
    cluster.fit(x)
    td.append(cluster.inertia_)
plt.scatter(n, td)
Out[6]:
<matplotlib.collections.PathCollection at 0x2559fba95d0>
No description has been provided for this image