SEC Schools (11/14 Teams)
Scraping
All of the scraping for this project was done in R, using the rvest package with the read_html() and html_table() functions. These conveniently take a webpage and automatically create a list of R dataframes, allowing for easy, simple scraping. Here's an example:
# Scraping data for the University of Missouri's rosters for all sports in 2023
library(rvest)     # read_html(), html_table()
library(tidyverse) # str_trim(), str_split_1(), %>%, select()

# The sports that the school offers
sports = c("baseball",
"softball",
"mens-basketball",
"womens-basketball",
"cross-country",
"football",
"mens-golf",
"womens-golf",
"mens-swimming-and-diving",
"womens-swimming-and-diving",
"womens-gymnastics",
"womens-soccer",
"track-and-field",
"wrestling",
"womens-tennis",
"womens-volleyball")
# "Type" is just determining what the format of the year is. If the sport's urls are formatted as 2023, 2024, etc, its type is 0. If it is formatted like 2023-24, it is 1.
types = c(0,0,1,1,0,0,1,1,0,0,0,0,0,1,1,0)
sports_df = data.frame(Sport = sports, Type = types) # DataFrame combining sports with their types'
# This is a helper function taking in a column that has Hometown data. Some of the columns are presented as "Hometown/High School"; we only want Hometown, so we can split on the "/" character, and take the first element.
hometown_split = function(L) {
new_list = c()
for (i in 1:length(L)) {
new_list = c(new_list, str_trim(str_split_1(L[i], "/")[1]))
}
return (new_list)
}
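# (Equivalently, this helper could be vectorized, e.g.:
#   hometown_split = function(L) str_trim(sapply(str_split(L, "/"), `[`, 1))
# The loop version above is what was actually used; this is just a sketch.)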
# Scraping
BASE_URL = "https://mutigers.com"
years = 2023:2023 # List of years that we want to look at. Normally this would be something like 2013:2023.
college_df = data.frame() # Initializing an empty dataframe that we can add to.
for (i in 1:nrow(sports_df)) {
sport = sports_df$Sport[i] # Easier for readability
for (y in 1:length(years)) {
year = years[y] # Easier for readability
# If the sport's url is presented as 2023-24 (type 1), we want to convert the year in our for loop to be of that format.
if (sports_df$Type[i] == 1) {
next_year = as.character(years[y] + 1)
year_str = paste(years[y], "-", substr(next_year, nchar(next_year) -1, nchar(next_year)), sep="")
}
else { # otherwise, keep year as is.
year_str = as.character(years[y])
}
# Put together the pieces of the url
url = paste(BASE_URL, "/sports/", sport, "/roster/", year_str, sep="")
# Read the data.
tables = read_html(url) %>% html_table()
# Which element of the tables list the roster is on depends on the school.
roster = data.frame(tables[3])
# Some sports have men's and women's rosters combined in two different tables. This adds that extra table in.
if (sport %in% c("cross-country", "track-and-field")) {
roster = rbind(roster, data.frame(tables[4]))
}
# Adding a column for the year and the sport
roster$Year = rep(year, nrow(roster))
roster$Sport = rep(sport, nrow(roster))
# This checks if there is a column whose name includes "hometown". If there is one, it adds a Hometown column using the hometown_split() function. (The inner loop uses j so it doesn't clobber the outer loop's sport index i.)
for (j in 1:length(colnames(roster))){
if (length(grep("[hH]ometown", colnames(roster)[j])) > 0){
roster$Hometown = hometown_split(roster[[colnames(roster)[j]]])
}
# If there is a column "Name", add it to the roster dataframe.
if (length(grep("[nN]ame", colnames(roster)[j])) > 0){
roster$Name = roster[[colnames(roster)[j]]]
}
}
# If we don't find a column with "Hometown" in it, likely this url doesn't contain any data. We move on to the next year/sport.
if (!("Hometown" %in% colnames(roster))) {
next
}
# Get the columns we want from the roster, and add them to the overall college dataframe.
roster = roster %>% select(Name, Hometown, Year, Sport)
college_df = rbind(college_df, roster)
}
# For monitoring purposes
# print(paste("Finished with ", sport))
}
# Typically, csv files are written for each school, so we can combine them in the cleaning. I will not do that here, but instead show a snapshot of the dataframe.
# write.csv(college_df, "MIZZOU.csv", row.names=F)
Here is a snapshot of MIZZOU.csv.
Name | Hometown | Year | Sport |
---|---|---|---|
Jerry Houston | Dolton, Ill. | 2013 | baseball |
Brett Peel | St. Charles, Mo. | 2013 | baseball |
Shane Benes | Town and Country, Mo. | 2013 | baseball |
Case Munson | St. Charles, Mo. | 2013 | baseball |
Brett Bond | St. Louis, Mo. | 2013 | baseball |
Trey Harris | Powder Springs, Ga. | 2013 | baseball |
Data Cleaning
The way I did the scraping, we are now left with 11 different directories, each containing a .csv file and the .R script that scrapes the data and generates that .csv file.
Reading and Combining
The approach I took is to gather those .csv files recursively from the home directory that houses all of these subdirectories.
# Get all files from this directory
files = list.files(recursive=T)
# Keep only the .csv files; this drops the .R scripts
csv_files = files[endsWith(files, ".csv")]
# Exclude output files that are not school rosters
csv_files = csv_files[!(csv_files %in% c("geocodes.csv", "rosters.csv"))]
all_schools = data.frame()
# Combine all of the content from each csv together into one large dataframe.
for (f in csv_files) {
file = read_csv(f)
file$School = rep(str_split_1(f, "/")[1], nrow(file))
all_schools = rbind(all_schools, file)
}
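A more compact equivalent of this loop (a sketch for comparison, not the code that was actually run; it assumes the tidyverse is loaded) would use purrr to read and row-bind every file in one call:
# Sketch: read and row-bind every roster csv in one call with purrr.
all_schools = purrr::map_dfr(csv_files, function(f) {
  read_csv(f) %>% mutate(School = str_split_1(f, "/")[1])
})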
We can see the head and tail of this dataset, showing we have combined all the data for all schools.
Name | Hometown | Year | Sport | School |
---|---|---|---|---|
Mikey White | Birmingham, Ala. | 2014 | baseball | Alabama |
Daniel Cucjen | Shreveport, La. | 2014 | baseball | Alabama |
Kyle Overstreet | Boaz, Ala. | 2014 | baseball | Alabama |
Ben Moore | Cullman, Ala. | 2014 | baseball | Alabama |
Casey Hughston | Mobile, Ala. | 2014 | baseball | Alabama |
Wade Wass | Pensacola, Fla. | 2014 | baseball | Alabama |
Name | Hometown | Year | Sport | School |
---|---|---|---|---|
Molly Finlay | Oakton, Va. | 2023 | wlax | Vanderbilt |
Ashley Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
Taylor Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
Kristin Gruber | Woodbine, Md. | 2023 | wlax | Vanderbilt |
Molly Krestinski | Bronxville, N.Y. | 2023 | wlax | Vanderbilt |
Remy Ricciardi | Ridgewood, N.J. | 2023 | wlax | Vanderbilt |
When scraping and creating these .csv files, I kept the sport names exactly as each university had structured them, so some of them have different names. We need to map these onto some sort of standard naming so that they can be visualized. Here are all the different sport names we have:
## [1] "baseball" "softball"
## [3] "mens-basketball" "womens-basketball"
## [5] "football" "mens-golf"
## [7] "womens-golf" "swimming-and-diving"
## [9] "womens-gymnastics" "womens-soccer"
## [11] "xctrack" "womens-tennis"
## [13] "mens-tennis" "womens-volleyball"
## [15] "womens-rowing" "m-basebl"
## [17] "w-softbl" "m-baskbl"
## [19] "w-baskbl" "m-footbl"
## [21] "m-golf" "w-golf"
## [23] "w-swim" "w-gym"
## [25] "w-soccer" "m-track"
## [27] "w-track" "m-xc"
## [29] "w-xc" "w-tennis"
## [31] "m-tennis" "w-volley"
## [33] "cross-country" "mens-swimming-and-diving"
## [35] "womens-swimming-and-diving" "track-and-field"
## [37] "womens-lacrosse" "mbball"
## [39] "wbball" "cross"
## [41] "mgolf" "wgolf"
## [43] "swimming" "wgym"
## [45] "msoc" "wsoc"
## [47] "track" "mten"
## [49] "wten" "wvball"
## [51] "bsb" "sb"
## [53] "mb" "xc"
## [55] "fb" "mg"
## [57] "wg" "sd"
## [59] "gm" "sc"
## [61] "tf" "mt"
## [63] "wt" "vb"
## [65] "bvb" "wrestling"
## [67] "wcross" "mcross"
## [69] "wswim" "wtrack"
## [71] "wlax"
I don't want to do this automatically (with things like regular expressions), so I'm going to create the mapping manually as a dataframe combining two vectors.
names = unique(sport_names) # sport_names is the vector of scraped sport names; its unique values are listed above
name_replacements = c("baseball", "softball", "mens-basketball", "womens-basketball", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "womens-tennis", "mens-tennis", "volleyball", "womens-rowing", "baseball", "softball", "mens-basketball", "womens-basketball", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "xc-track", "xc-track", "xc-track", "womens-tennis", "mens-tennis", "volleyball", "xc-track", "swimming-and-diving", "swimming-and-diving", "xc-track", "womens-lacrosse", "mens-basketball", "womens-basketball", "xc-track", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "mens-soccer", "womens-soccer", "xc-track", "mens-tennis", "womens-tennis", "volleyball", "baseball", "softball", "mens-basketball", "xc-track", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "mens-tennis", "womens-tennis", "volleyball", "beach-volleyball", "wrestling", "xc-track", "xc-track", "swimming-and-diving", "xc-track", "womens-lacrosse")
sport_name_changes = data.frame(Name = names, Modified = name_replacements)
show_table(tail(sport_name_changes))
 | Name | Modified |
---|---|---|
66 | wrestling | wrestling |
67 | wcross | xc-track |
68 | mcross | xc-track |
69 | wswim | swimming-and-diving |
70 | wtrack | xc-track |
71 | wlax | womens-lacrosse |
This adjustment also allows me to combine some men’s/women’s sports together. Some universities combine the rosters, while some don’t, so this allows me to combine sports like cross country, track and field, and swimming and diving.
# Apply this to the all_schools sport list
new_sports = c()
for (sport in all_schools$Sport){
new_sports = c(new_sports, sport_name_changes$Modified[sport_name_changes$Name == sport])
}
all_schools$Sport = new_sports
print(paste("Old number of names:", length(names), "New number of names", length(unique(new_sports))))
## [1] "Old number of names: 71 New number of names 19"
Standardizing State Names
Not all of the hometown strings are formatted the same way; that is just the nature of scraping from these different websites. So we have to standardize them as best as possible in order to produce relevant data.
hometowns = data.frame(City = all_schools$Hometown)
states = c()
cities = c()
for (city in hometowns$City){
if (is.na(city)){
states = c(states, NA)
cities = c(cities, NA)
next
}
cities = c(cities, str_trim(str_split_1(city, ",")[1]))
states = c(states, str_trim(str_split_1(city, ",")[2]))
}
hometowns$Town = cities
hometowns$State = states
Behind the scenes, I used a script to standardize the state names, so that we don't end up with multiple versions of the same city. Once the states are standardized, we can combine this back into the original format.
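The standardization script itself isn't shown here, but a minimal sketch of the idea (assuming AP-style abbreviations such as "Mo." or "N.Y." in the scraped hometowns; the real script may differ) could look like this:
# Minimal sketch of state standardization (the actual script is not shown and may differ).
# Assumes AP-style abbreviations like "Mo.", "Ill.", "N.Y."; unmatched values are left as-is.
standardize_state = function(state) {
  if (is.na(state)) return(NA)
  cleaned = gsub("\\.", "", str_trim(state))          # "N.Y." -> "NY", "Mo." -> "Mo"
  idx = match(toupper(cleaned), state.abb)            # try the two-letter postal codes
  if (!is.na(idx)) return(state.name[idx])
  idx = match(toupper(cleaned), toupper(state.name))  # try full state names
  if (!is.na(idx)) return(state.name[idx])
  # A few AP-style abbreviations that don't match the two-letter codes
  ap = c("Ala" = "Alabama", "Fla" = "Florida", "Ill" = "Illinois",
         "Tenn" = "Tennessee", "Miss" = "Mississippi", "Ark" = "Arkansas")
  if (cleaned %in% names(ap)) return(unname(ap[cleaned]))
  return(state)
}
hometowns$State = sapply(hometowns$State, standardize_state)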
Geocoding the Data
I created a geocodes.R
script that takes the cities in
the players dataset and geocodes all 5000+ unique cities. This way we
can map them and show all the different areas these student-athletes
come from. Here is what that script looks like.
library(tidyverse)
library(ggmap)
# Get the rosters.csv file we just wrote
players = read_csv("rosters.csv")
# Read the API key from our .REnviron file
readRenviron("./.Renviron")
register_google(Sys.getenv("MAPSAPIKEY"))
cities = data.frame(Hometown = unique(players$Hometown)) %>% drop_na()
# Geocode all of the cities
geocoded_cities = mutate_geocode(cities, Hometown)
write_csv(distinct(geocoded_cities %>% select(Hometown, lon, lat)), "./geocodes.csv")
Here is part of what that returns.
Hometown | lon | lat |
---|---|---|
Birmingham, Alabama | -86.81036 | 33.51859 |
Shreveport, Louisiana | -93.75018 | 32.52515 |
Boaz, Alabama | -86.15210 | 34.20317 |
Cullman, Alabama | -86.84361 | 34.17482 |
Mobile, Alabama | -88.03989 | 30.69537 |
Pensacola, Florida | -87.21691 | 30.42131 |
To help with the map, I will add, for each hometown, the school that recruited the most players from it, along with that player count.
schools = all_schools %>% select(Hometown, School) %>% group_by(Hometown, School) %>% summarize(n = length(Hometown)) %>% drop_na()
# SQL query (via the sqldf package) to find the School with the most players from each city.
cities = sqldf("SELECT Hometown, School, max(n) as Num_Players FROM schools GROUP BY Hometown")
# Add these schools and their respective player counts to the geocodes dataframe (read back in from geocodes.csv)
top_schools = c()
ns = c()
for (city in geocodes$Hometown) {
df = cities[cities["Hometown"] == city,] %>% select(School, Num_Players)
top_schools = append(top_schools, df$School[1])
ns = append(ns, df$Num_Players[1])
}
geocodes$School = top_schools
geocodes$Num_Players = ns
Mapping the Data
We will use the leaflet package for this portion. This will give us a nice interactive map we can look at.
# This helper assigns each school its primary color, so that on top of the city statistics in the popups, the markers show which school gets the most recruits from each city.
school_color = function (school) {
school = school[1]
if (is.na(school)){
return ("black")
}
if (school == "Alabama") {
return ("#9E1B32")
}
if (school == "Auburn"){
return ("#E87722")
}
if (school == "Missouri"){
return ("#F1B82D")
}
if (school == "Arkansas"){
return ("#9D2235")
}
if (school == "Kentucky") {
return ("#0033A0")
}
if (school == "Florida") {
return ("#0021A5")
}
if (school == "Mississippi State") {
return ("#5D1725")
}
if (school == "LSU") {
return ("#461D7C")
}
if (school == "South Carolina") {
return ("#73000A")
}
if (school == "Vanderbilt") {
return ("#866D4B")
}
}
colors = c()
for (school in geocodes$School){
colors = c(colors, school_color(school))
}
geocodes$Color = colors
# Colors group for future plotting
group.colors = c("Alabama" = "#9E1B32", "Arkansas" = "#9D2235", "Auburn" = "#E87722", "Florida" = "#0021A5", "Kentucky" = "#0033A0", "LSU" = "#461D7C", "Mississippi State" = "#5D1725", "Missouri" = "#F1B82D", "South Carolina" = "#73000A", "Vanderbilt" = "#866D4B")
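# (Side note: since group.colors maps each school to its color, the Color column above could
# also be built with a direct lookup instead of the school_color() loop, for example:
#   colors = unname(group.colors[geocodes$School]); colors[is.na(colors)] = "black"
# This is just an equivalent sketch, not necessarily how it was run.)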
m = leaflet() %>%
addProviderTiles(provider=providers$Esri.WorldGrayCanvas) %>%
addCircleMarkers(geocodes$lon, geocodes$lat, label=geocodes$Hometown, popup=paste("<b>", geocodes$Hometown, "</b><br>", geocodes$Num_Players, "were recruited to", geocodes$School), radius=4.5, color=geocodes$Color, fillOpacity=1.0, stroke=F)
m
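If a legend for the school colors is wanted, leaflet's addLegend() can be chained onto the map; here is a small optional sketch using the group.colors vector defined above:
# Optional sketch: add a legend mapping each school to its marker color.
m %>% addLegend(position = "bottomright",
                colors = unname(group.colors),
                labels = names(group.colors),
                title = "Top School")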
Adding Census Data
First, we retrieve the data from the Census Bureau API (2019 ACS 5-year estimates) and clean it up slightly.
readRenviron("./.Renviron")
api_key = Sys.getenv("CENSUSKEY")
# Create our url string
url = str_glue("https://api.census.gov/data/2019/acs/acs5?get=NAME,B01002_001E,B02001_002E,B02001_003E,B25010_001E,B01003_001E&for=place:*&in=state:*&key={api_key}")
data = fromJSON(url) # fromJSON() comes from the jsonlite package
census_data = data.frame(data) %>% slice(2:nrow(data))
# Rename the columns (B01002_001E = median age, B02001_002E = White alone, B02001_003E = Black alone, B25010_001E = average household size, B01003_001E = total population)
colnames(census_data) = c("City", "Avg_Age", "B02001_002E", "B02001_003E", "Avg_Household_Size", "Total_Pop", "state", "place")
# Convert the counts to numeric
census_data$B02001_002E = as.numeric(census_data$B02001_002E)
census_data$B02001_003E = as.numeric(census_data$B02001_003E)
census_data$Total_Pop = as.numeric(census_data$Total_Pop)
# Calculate percentages
census_data = census_data %>%
mutate(Percent_White = (B02001_002E/Total_Pop) * 100) %>%
mutate(Percent_Black = (B02001_003E/Total_Pop) * 100) %>%
select(City, Avg_Age, Percent_White, Percent_Black, Avg_Household_Size)
# Split City and State
# split_city() strips the trailing place type from a Census place name, e.g. "Birmingham city, Alabama" -> "Birmingham".
split_city = function(name){
split_list = strsplit(name, ",")[[1]]
city_name = split_list[1]
# Drop the last word of the place name ("city", "town", "CDP", etc.)
city_split = strsplit(city_name, " ")[[1]]
city_name = paste(city_split[-length(city_split)], collapse = " ")
return (city_name)
}
# get_state() returns the trimmed state portion after the comma.
get_state = function(name) {
split_list = strsplit(name, ",")[[1]]
state = str_trim(split_list[2])
return(state)
}
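The chunk above defines these helpers but never applies them; the City_str column used in the next section is presumably built by recombining the cleaned city and state, along these lines (the City_str name and the exact recombination are my assumption):
# Hypothetical reconstruction of the City_str column used below: recombine the cleaned
# city name and state, e.g. "Birmingham city, Alabama" -> "Birmingham, Alabama".
census_data$City_str = sapply(census_data$City, function(name) {
  paste(split_city(name), get_state(name), sep = ", ")
})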
Data Visualizations
Below are some different visualizations that I think represent some of this data in an effective manner.
Census Data
cities_w_census = census_data %>%
filter(City_str %in% cities$Hometown)
# Which cities in our dataset have the highest percentage of Black residents?
pct_black = cities_w_census %>%
select(City_str, Percent_Black) %>%
arrange(desc(Percent_Black)) %>%
top_n(25)
ggplot(pct_black, aes(y=reorder(City_str, Percent_Black), x=Percent_Black)) +
geom_col(fill="lightblue", color="black") +
ylab("City Name") +
xlab("% of Black People") +
ggtitle("% of Black Population by Census City")
Let's plot the top 50 towns now and see where they land on the map alongside the players' hometowns.
register_google(Sys.getenv("MAPSAPIKEY"))
geocoded_top_50 = cities_w_census %>%
select(City_str, Percent_Black) %>%
arrange(desc(Percent_Black)) %>%
top_n(50) %>%
mutate_geocode(City_str)
m %>% addCircleMarkers(geocoded_top_50$lon, geocoded_top_50$lat, label=geocoded_top_50$City_str, popup=paste("<b>", geocoded_top_50$City_str, "</b><br>", round(geocoded_top_50$Percent_Black, 2), "% Black"), color="purple")
Let's use the cities dataframe to find the top school for each of these census cities, so we can see which schools players from these places attend.
# Get top schools for each census city
tempdf1 = cities %>% filter(Hometown %in% cities_w_census$City_str)
census_top_schools = c()
for (i in 1:nrow(cities_w_census)){
city = tempdf1[tempdf1["Hometown"] == cities_w_census$City_str[i],]
census_top_schools = c(census_top_schools, city$School)
}
cities_w_census$Top_School = census_top_schools
Which schools are recruits from majority-Black and majority-White cities attending?
cities_w_census %>%
filter(Percent_Black > 50) %>%
count(Top_School) %>%
arrange(desc(n)) %>%
ggplot(aes(x=reorder(Top_School, -n), y=n, fill=Top_School)) +
geom_col(color = "black") +
scale_fill_manual(values=group.colors) +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
xlab("School") +
ylab("Number of Majority Black Cities") +
ggtitle("Schools who have Most Recruits from Majority Black Cities") +
guides(fill=guide_legend(title="School"))
Naturally, we would expect to see opposite results from majority white cities.
cities_w_census %>%
filter(Percent_White > 50) %>%
count(Top_School) %>%
arrange(desc(n)) %>%
ggplot(aes(x=reorder(Top_School, -n), y=n, fill=Top_School)) +
geom_col(color = "black") +
scale_fill_manual(values=group.colors) +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
xlab("School") +
ylab("Number of Majority White Cities") +
ggtitle("Schools who have Most Recruits from Majority White Cities") +
guides(fill=guide_legend(title="School"))
We see that many of the schools that previously brought in more Black student-athletes tend to be less likely to be the top school in a majority-White city.
Let's also take into account how many majority-White cities there are compared with majority-Black cities. The imbalance is to be expected, since Black Americans are a minority of the overall population, but it is still important to visualize.
num_black_cities = nrow(cities_w_census %>% filter(Percent_Black > 50))
num_white_cities = nrow(cities_w_census %>% filter(Percent_White > 50))
num_cities_df = data.frame(Race = c("Black", "White"), Num_Majority_Cities = c(num_black_cities, num_white_cities))
ggplot(num_cities_df, aes(x=reorder(Race, -Num_Majority_Cities), y=Num_Majority_Cities)) +
geom_col(fill="lightblue", color="black") +
ylab("Number of Majority Cities") +
xlab("Race")
Distance Away From Home
I am quite fascinated by the world of Name, Image, and Likeness (NIL), as well as the impact of the transfer portal on college sports. NIL went into effect on July 1, 2021, so let's take its impact into account in a few ways.
First, let's do some data cleaning and calculations to determine spherical distances from each city to its respective school.
universities = unique(schools$School)
uni_cities = c("Nashville, TN", "Lexington, KY", "Tuscaloosa, AL", "Gainesville, FL", "Baton Rouge, LA", "Starkville, MS", "Columbia, SC", "Fayetteville, AR", "Auburn, AL", "Columbia, MO")
# Add each university's city so it can be geocoded for distance calculations
school_locs = data.frame(School = universities, City = uni_cities) %>% mutate_geocode(City)
In the next two chunks, I add each city’s geocoded location to the
all_schools
dataframe, as well as the geocoded location for
each school in the dataframe.
lons = c()
lats = c()
# Apply to all players dataframe
for (city in all_schools$Hometown) {
geocode = geocodes[geocodes$Hometown == city,]
lons = c(lons, geocode$lon)
lats = c(lats, geocode$lat)
}
all_schools$lon = lons
all_schools$lat = lats
sc_lons = c()
sc_lats = c()
for (school in all_schools$School) {
code = school_locs[school_locs$School == school,]
sc_lons = c(sc_lons, code$lon)
sc_lats = c(sc_lats, code$lat)
}
all_schools$school_lon = sc_lons
all_schools$school_lat = sc_lats
Now with those geocodes in place, we can calculate the Haversine distance (the great-circle distance across a sphere) for each city-school pair.
library(geosphere) # provides distHaversine()
dists = c()
for (i in 1:nrow(all_schools)) {
# Calculate the distance in meters, then divide by 1000 for km
dists = c(dists, distHaversine(c(all_schools$lon[i], all_schools$lat[i]), c(all_schools$school_lon[i], all_schools$school_lat[i]))/1000)
}
all_schools$school_dist = dists
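As a side note, distHaversine() also accepts two-column lon/lat matrices, so the same distances could be computed without the loop (an equivalent sketch, not the code that was run):
# Vectorized sketch: distHaversine() works row-wise on lon/lat matrices.
all_schools$school_dist = distHaversine(
  cbind(all_schools$lon, all_schools$lat),
  cbind(all_schools$school_lon, all_schools$school_lat)
) / 1000  # meters -> kilometers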
Now we can group this by school and year, and find the variance of those distances.
schools_dist_var = all_schools %>%
drop_na() %>%
group_by(School, Year) %>%
summarize(distance_var = var(school_dist)) %>%
mutate(Color = school_color(School), label= toupper(substr(School, 1, 3)))
Now we can take each school's variance in distance over time and see how "spread out" its recruitment is.
# This allows for nice labels
ends = schools_dist_var %>% filter(Year == 2023)
ggplot(schools_dist_var, aes(x=Year, y=distance_var)) +
geom_smooth(aes(color=School), se=F, size=1.25) +
scale_color_manual(values=group.colors) +
scale_x_continuous(breaks=c(2013, 2015, 2017, 2019, 2021, 2023)) +
geom_text_repel(aes(label=label), data=ends, nudge_x=1, na.rm=T) + # geom_text_repel() is from the ggrepel package
ggtitle("Variance in Distance from 2013-2023") +
ylab("Variance in Distance") +
theme(axis.text.y=element_blank(), axis.ticks.y = element_blank())
We see from this that Florida has the most variance in distance at the moment, while schools like Kentucky have dropped off over the years.
Now that the transfer portal and NIL have come about, let's take a look at the difference in variance across all schools before and after this change.
nil_difference = schools_dist_var %>%
mutate(nil = ifelse(Year >= 2021, "2021-2023", "2013-2020")) %>%
group_by(nil) %>%
summarize(avg_var = mean(distance_var))
ggplot(nil_difference, aes(x=nil, y=avg_var)) +
geom_col(width=0.5, color="black", fill="lightblue") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
ylab("Average Variance") +
xlab("Year Range") +
ggtitle("Average Variance in Distance Before and After NIL Introduction")
We can see a slight increase in variance, but it is likely negligible; we will probably need a few more years of data to see real changes, if any. However, we can still look at this by school.
nil_diff_school = schools_dist_var %>%
mutate(nil = ifelse(Year >= 2021, "after", "before")) %>%
group_by(nil, School) %>%
summarize(avg_var = mean(distance_var)) %>%
pivot_wider(names_from=nil, values_from=avg_var) %>%
mutate(var_diff = after-before)
ggplot(nil_diff_school, aes(x=School, y=var_diff, fill=School)) +
geom_col(width=0.5, color="black") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(angle=90, vjust=1)) +
scale_fill_manual(values=group.colors) +
ylab("Difference in Variance") +
xlab("School") +
ggtitle("Difference in Variance Before and After NIL Introduction")
We see from this that 6 out of 10 schools have increased their recruitment range since NIL was introduced, while some, like Kentucky and Auburn, have adjusted to bring in recruits who are closer to campus.
In-State Recruiting
Which schools (out of the subset that I have scraped) are the best at recruiting in their home state? Out of all of the cities in their state that appear in our data, for how many are they the top recruiter?
unis = sort(unique(cities$School))
uni_states = data.frame(School = unis, State = c("Alabama", "Arkansas", "Alabama", "Florida", "Kentucky", "Louisiana", "Mississippi", "Missouri", "South Carolina", "Tennessee"))
# Give the cities dataframe its states (this relies on cities and hometowns_unique lining up row-for-row by city name)
hometowns_unique = distinct(hometowns) %>% arrange(City)
cities$State = hometowns_unique$State
cities_filtered = cities %>% filter(State %in% uni_states$State)
# SQL query to determine percentage of cities in state that a school has a majority in
sql_calc = sqldf("with cte as (SELECT State, School, COUNT(Hometown) as Num_Cities,
count(*) * 100.0 / sum(count(*)) over(PARTITION BY State) as Percentage_Of_Cities
FROM cities_filtered
GROUP BY State, School)
SELECT * FROM cte WHERE State == School OR (State = 'Alabama' AND School = 'Auburn') OR (State = 'Louisiana' AND School = 'LSU') OR (State = 'Tennessee' AND School = 'Vanderbilt') OR (State = 'Mississippi' AND School = 'Mississippi State')")
# Plot
ggplot(sql_calc, aes(x=reorder(School, -Percentage_Of_Cities), y=Percentage_Of_Cities, fill=School)) +
geom_col(color="black") +
theme(axis.text.x=element_text(angle=90, vjust=1)) +
scale_fill_manual(values=group.colors) +
xlab("School") +
ylab("Percentage of Cities in State") +
ggtitle("Percentage of Majority Cities in State")
Thank You!
Thanks for listening to my presentation! This took a lot of hard work over the last week or so, so I appreciate the support.