In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import sklearn
import sklearn.cluster
import sklearn.impute
import geopandas
import pandas
import shapely
import matplotlib.pyplot as plt
In [2]:
# Read the US Map and the data
# See the example in the previous lecture for getting the same dataset without "pivot"
map = geopandas.read_file("cb_2018_us_state_20m.zip")
data = pandas.read_csv("SASUMMARY__ALL_AREAS_1998_2022.csv")
sub = data.pivot(index = "GeoName", columns = "LineCode", values = "2021")
sub["NAME"] = sub.index
combined = map.merge(sub, how = "left", on = "NAME")[["NAME", "geometry", 11, 12, 13]]
combined
Out[2]:
NAME | geometry | 11 | 12 | 13 | |
---|---|---|---|---|---|
0 | Maryland | MULTIPOLYGON (((-76.04621 38.02553, -76.00734 ... | 58805 | 48549 | 106.223 |
1 | Iowa | POLYGON ((-96.62187 42.77925, -96.57794 42.827... | 51922 | 42247 | 89.568 |
2 | Delaware | POLYGON ((-75.77379 39.72220, -75.75323 39.757... | 51079 | 50700 | 97.677 |
3 | Ohio | MULTIPOLYGON (((-82.86334 41.69369, -82.82572 ... | 50733 | 44215 | 92.459 |
4 | Pennsylvania | POLYGON ((-80.51989 40.90666, -80.51964 40.987... | 56289 | 48874 | 96.371 |
5 | Nebraska | POLYGON ((-104.05314 41.11446, -104.05245 41.2... | 55943 | 45469 | 91.751 |
6 | Washington | MULTIPOLYGON (((-123.23715 48.68347, -123.0704... | 65315 | 52155 | 108.885 |
7 | Puerto Rico | MULTIPOLYGON (((-65.34207 18.34529, -65.25593 ... | NaN | NaN | NaN |
8 | Alabama | POLYGON ((-88.46866 31.89386, -88.46866 31.933... | 45191 | 39174 | 88.139 |
9 | Arkansas | POLYGON ((-94.61792 36.49941, -94.36120 36.499... | 46644 | 39231 | 89.445 |
10 | New Mexico | POLYGON ((-109.04919 31.79655, -109.04830 32.0... | 46918 | 39950 | 89.907 |
11 | Texas | POLYGON ((-106.62345 31.91403, -106.63011 31.9... | 54726 | 45460 | 98.502 |
12 | California | MULTIPOLYGON (((-118.59397 33.46720, -118.4847... | 64385 | 54838 | 111.797 |
13 | Kentucky | POLYGON ((-89.54443 36.57451, -89.47935 36.566... | 46306 | 41027 | 89.124 |
14 | Georgia | POLYGON ((-85.60516 34.98468, -85.47434 34.983... | 49598 | 43681 | 95.784 |
15 | Wisconsin | MULTIPOLYGON (((-86.93428 45.42115, -86.83575 ... | 53513 | 45491 | 93.347 |
16 | Oregon | POLYGON ((-124.55244 42.84057, -124.48094 42.9... | 53449 | 47649 | 103.032 |
17 | Missouri | POLYGON ((-95.76564 40.58521, -95.53318 40.582... | 49831 | 44827 | 92.022 |
18 | Virginia | MULTIPOLYGON (((-76.02347 37.28907, -75.98712 ... | 57713 | 48003 | 102.278 |
19 | Tennessee | POLYGON ((-90.30070 35.02879, -90.26530 35.040... | 52012 | 42741 | 90.854 |
20 | Louisiana | POLYGON ((-94.04305 32.69303, -94.04303 32.797... | 50057 | 42115 | 91.276 |
21 | New York | MULTIPOLYGON (((-72.01893 41.27411, -71.92680 ... | 62891 | 53354 | 109.504 |
22 | Michigan | MULTIPOLYGON (((-84.61622 45.89447, -84.51789 ... | 50179 | 46019 | 94.253 |
23 | Idaho | POLYGON ((-117.24303 44.39097, -117.21507 44.4... | 48076 | 39639 | 91.776 |
24 | Florida | MULTIPOLYGON (((-81.81169 24.56874, -81.75127 ... | 55483 | 51038 | 101.43 |
25 | Alaska | MULTIPOLYGON (((179.48132 51.97530, 179.58286 ... | 60126 | 53982 | 104.439 |
26 | Illinois | POLYGON ((-91.50626 40.20016, -91.49696 40.248... | 58619 | 49429 | 101.412 |
27 | Montana | POLYGON ((-116.04919 49.00091, -115.50102 49.0... | 51272 | 48015 | 91.567 |
28 | Minnesota | POLYGON ((-97.22904 49.00069, -96.93096 48.999... | 57651 | 48846 | 98.423 |
29 | Indiana | POLYGON ((-88.05947 37.86669, -88.04086 37.891... | 50803 | 42565 | 92.735 |
30 | Massachusetts | MULTIPOLYGON (((-70.27553 41.31046, -70.19371 ... | 69743 | 59267 | 106.555 |
31 | Kansas | POLYGON ((-102.05174 40.00308, -101.83216 40.0... | 51785 | 42438 | 91.157 |
32 | Nevada | POLYGON ((-120.00480 39.31648, -120.00303 39.4... | 54223 | 46176 | 95.543 |
33 | Vermont | POLYGON ((-73.41632 44.09942, -73.39987 44.152... | 54076 | 51197 | 98.66 |
34 | Connecticut | POLYGON ((-73.69594 41.11526, -73.48271 41.212... | 67013 | 56371 | 102.603 |
35 | New Jersey | POLYGON ((-75.55945 39.62981, -75.53514 39.647... | 65290 | 55472 | 109.099 |
36 | District of Columbia | POLYGON ((-77.11976 38.93434, -77.04102 38.995... | 80275 | 79105 | 111.271 |
37 | North Carolina | POLYGON ((-84.28660 35.20576, -84.28322 35.226... | 50150 | 44287 | 93.805 |
38 | Utah | POLYGON ((-114.05247 37.60478, -114.05173 37.7... | 49589 | 43850 | 94.592 |
39 | North Dakota | POLYGON ((-104.04874 48.99988, -103.37547 48.9... | 59808 | 49209 | 91.103 |
40 | South Carolina | POLYGON ((-83.35324 34.72865, -83.32006 34.759... | 47130 | 42930 | 93.693 |
41 | Mississippi | POLYGON ((-91.62136 31.26781, -91.56419 31.261... | 42851 | 36681 | 86.601 |
42 | Colorado | POLYGON ((-109.05996 38.49999, -109.05151 39.1... | 62487 | 54126 | 103.009 |
43 | South Dakota | POLYGON ((-104.05770 44.99743, -104.03914 44.9... | 59734 | 45944 | 90.147 |
44 | Oklahoma | POLYGON ((-103.00252 36.67519, -103.00196 36.9... | 50461 | 38889 | 90.269 |
45 | Wyoming | POLYGON ((-111.05689 44.86666, -111.04432 45.0... | 62022 | 48858 | 91.418 |
46 | West Virginia | POLYGON ((-82.59886 38.20101, -82.58469 38.240... | 44715 | 41273 | 90.763 |
47 | Maine | MULTIPOLYGON (((-68.92401 43.88541, -68.87478 ... | 52104 | 51730 | 97.205 |
48 | Hawaii | MULTIPOLYGON (((-156.04965 19.78045, -156.0062... | 53862 | 49543 | 113.227 |
49 | New Hampshire | POLYGON ((-72.55611 42.86625, -72.53147 42.897... | 63939 | 56507 | 102.51 |
50 | Arizona | POLYGON ((-114.79968 32.59362, -114.80939 32.6... | 50149 | 46023 | 96.721 |
51 | Rhode Island | MULTIPOLYGON (((-71.63147 41.16668, -71.59334 ... | 55918 | 48711 | 102.083 |
In [3]:
# Extract the feature columns and standardize
features = combined[[11, 12, 13]]
impute = sklearn.impute.SimpleImputer(strategy = "mean")
impute.fit(features)
full = impute.transform(features)
scale = sklearn.preprocessing.StandardScaler()
scale.fit(full)
x = scale.transform(full)
x[0:5, :]
Out[3]:
array([[ 0.53930445, 0.13934152, 1.30270512], [-0.42558713, -0.79117111, -1.09569665], [-0.54376287, 0.45694429, 0.07203916], [-0.59226679, -0.50058894, -0.67937846], [ 0.18659964, 0.18732892, -0.11603125]])
In [4]:
# Show clusters on map
cluster = sklearn.cluster.KMeans(n_clusters = 5, n_init = 1)
cluster.fit(x)
ax = map.plot(cluster.labels_)
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[4]:
(20.0, 50.0)
In [5]:
# Run the same function again
cluster.fit(x)
ax = map.plot(cluster.labels_)
ax.set_xlim(-130, -60)
ax.set_ylim(20, 50)
Out[5]:
(20.0, 50.0)
In [6]:
# Compare the total distortion (inertia) for k = 1, 2, ..., 50
td = []
n = range(1, len(x))
for k in n:
cluster = sklearn.cluster.KMeans(n_clusters = k, n_init = 1)
cluster.fit(x)
td.append(cluster.inertia_)
plt.scatter(n, td)
Out[6]:
<matplotlib.collections.PathCollection at 0x2559fba95d0>