SEC Schools (11/14 Teams)
Scraping
All of the scraping for this project was done in R, using the rvest package with the read_html() and html_table() functions. These conveniently take a webpage and automatically create a list of R dataframes, allowing for easy, simple scraping. Here's an example:
# Scraping data for the University of Missouri's rosters for all sports in 2023
library(rvest)     # read_html(), html_table()
library(tidyverse) # str_trim(), str_split_1(), %>%, select()

# The sports that the school offers
sports = c("baseball",
"softball",
"mens-basketball",
"womens-basketball",
"cross-country",
"football",
"mens-golf",
"womens-golf",
"mens-swimming-and-diving",
"womens-swimming-and-diving",
"womens-gymnastics",
"womens-soccer",
"track-and-field",
"wrestling",
"womens-tennis",
"womens-volleyball")
# "Type" is just determining what the format of the year is. If the sport's urls are formatted as 2023, 2024, etc, its type is 0. If it is formatted like 2023-24, it is 1.
types = c(0,0,1,1,0,0,1,1,0,0,0,0,0,1,1,0)
sports_df = data.frame(Sport = sports, Type = types) # DataFrame combining sports with their types'
# This is a helper function taking in a column that has Hometown data. Some of the columns are presented as "Hometown/High School"; we only want Hometown, so we can split on the "/" character, and take the first element.
hometown_split = function(L) {
new_list = c()
for (i in 1:length(L)) {
new_list = c(new_list, str_trim(str_split_1(L[i], "/")[1]))
}
return (new_list)
}
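# (Equivalently, this helper could be vectorized, e.g.:
#   hometown_split = function(L) str_trim(sapply(str_split(L, "/"), `[`, 1))
# The loop version above is what was actually used; this is just a sketch.)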
# Scraping
BASE_URL = "https://mutigers.com"
years = 2023:2023 # List of years that we want to look at. Normally this would be something like 2013:2023.
college_df = data.frame() # Initializing an empty dataframe that we can add to.
for (i in 1:nrow(sports_df)) {
sport = sports_df$Sport[i] # Easier for readability
for (y in 1:length(years)) {
year = years[y] # Easier for readability
# If the sport's url is presented as 2023-24 (type 1), we want to convert the year in our for loop to be of that format.
if (sports_df$Type[i] == 1) {
next_year = as.character(years[y] + 1)
year_str = paste(years[y], "-", substr(next_year, nchar(next_year) -1, nchar(next_year)), sep="")
}
else { # otherwise, keep year as is.
year_str = as.character(years[y])
}
# Put together the pieces of the url
url = paste(BASE_URL, "/sports/", sport, "/roster/", year_str, sep="")
# Read the data.
tables = read_html(url) %>% html_table()
# Which element of the tables list the roster is on depends on the school.
roster = data.frame(tables[3])
# Some sports have men's and women's rosters combined in two different tables. This adds that extra table in.
if (sport %in% c("cross-country", "track-and-field")) {
roster = rbind(roster, data.frame(tables[4]))
}
# Adding a column for the year and the sport
roster$Year = rep(year, nrow(roster))
roster$Sport = rep(sport, nrow(roster))
# This checks if there is a column whose name includes "hometown". If there is one, it adds a Hometown column using the hometown_split() function. (The inner loop uses j so it doesn't clobber the outer loop's sport index i.)
for (j in 1:length(colnames(roster))){
if (length(grep("[hH]ometown", colnames(roster)[j])) > 0){
roster$Hometown = hometown_split(roster[[colnames(roster)[j]]])
}
# If there is a column "Name", add it to the roster dataframe.
if (length(grep("[nN]ame", colnames(roster)[j])) > 0){
roster$Name = roster[[colnames(roster)[j]]]
}
}
# If we don't find a column with "Hometown" in it, likely this url doesn't contain any data. We move on to the next year/sport.
if (!("Hometown" %in% colnames(roster))) {
next
}
# Get the columns we want from the roster, and add them to the overall college dataframe.
roster = roster %>% select(Name, Hometown, Year, Sport)
college_df = rbind(college_df, roster)
}
# For monitoring purposes
# print(paste("Finished with ", sport))
}
# Typically, csv files are written for each school, so we can combine them in the cleaning. I will not do that here, but instead show a snapshot of the dataframe.
# write.csv(college_df, "MIZZOU.csv", row.names=F)
Here is a snapshot of MIZZOU.csv.
Name | Hometown | Year | Sport |
---|---|---|---|
Jerry Houston | Dolton, Ill. | 2013 | baseball |
Brett Peel | St. Charles, Mo. | 2013 | baseball |
Shane Benes | Town and Country, Mo. | 2013 | baseball |
Case Munson | St. Charles, Mo. | 2013 | baseball |
Brett Bond | St. Louis, Mo. | 2013 | baseball |
Trey Harris | Powder Springs, Ga. | 2013 | baseball |
Data Cleaning
The way I did the scraping, we are now left with 11 different directories, each containing a .csv file and the .R script that scrapes the data and generates that .csv file.
Reading and Combining
The approach I took is to gather those .csv files recursively from the home directory that houses all of these subdirectories.
# Get all files from this directory
files = list.files(recursive=T)
# Keep only the .csv files; this drops the .R scripts
csv_files = files[endsWith(files, ".csv")]
# Exclude output files that are not school rosters
csv_files = csv_files[!(csv_files %in% c("geocodes.csv", "rosters.csv"))]
all_schools = data.frame()
# Combine all of the content from each csv together into one large dataframe.
for (f in csv_files) {
file = read_csv(f)
file$School = rep(str_split_1(f, "/")[1], nrow(file))
all_schools = rbind(all_schools, file)
}
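A more compact equivalent of this loop (a sketch for comparison, not the code that was actually run; it assumes the tidyverse is loaded) would use purrr to read and row-bind every file in one call:
# Sketch: read and row-bind every roster csv in one call with purrr.
all_schools = purrr::map_dfr(csv_files, function(f) {
  read_csv(f) %>% mutate(School = str_split_1(f, "/")[1])
})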
We can see the head and tail of this dataset, showing we have combined all the data for all schools.
Name | Hometown | Year | Sport | School |
---|---|---|---|---|
Mikey White | Birmingham, Ala. | 2014 | baseball | Alabama |
Daniel Cucjen | Shreveport, La. | 2014 | baseball | Alabama |
Kyle Overstreet | Boaz, Ala. | 2014 | baseball | Alabama |
Ben Moore | Cullman, Ala. | 2014 | baseball | Alabama |
Casey Hughston | Mobile, Ala. | 2014 | baseball | Alabama |
Wade Wass | Pensacola, Fla. | 2014 | baseball | Alabama |
Name | Hometown | Year | Sport | School |
---|---|---|---|---|
Molly Finlay | Oakton, Va. | 2023 | wlax | Vanderbilt |
Ashley Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
Taylor Sampone | Rochester, N.Y. | 2023 | wlax | Vanderbilt |
Kristin Gruber | Woodbine, Md. | 2023 | wlax | Vanderbilt |
Molly Krestinski | Bronxville, N.Y. | 2023 | wlax | Vanderbilt |
Remy Ricciardi | Ridgewood, N.J. | 2023 | wlax | Vanderbilt |
When scraping and creating these .csv files, I kept the sport names exactly as each university had structured them, so some of them have different names. We need to map these onto some sort of standard naming so that they can be visualized. Here are all the different sport names we have:
## [1] "baseball" "softball"
## [3] "mens-basketball" "womens-basketball"
## [5] "football" "mens-golf"
## [7] "womens-golf" "swimming-and-diving"
## [9] "womens-gymnastics" "womens-soccer"
## [11] "xctrack" "womens-tennis"
## [13] "mens-tennis" "womens-volleyball"
## [15] "womens-rowing" "m-basebl"
## [17] "w-softbl" "m-baskbl"
## [19] "w-baskbl" "m-footbl"
## [21] "m-golf" "w-golf"
## [23] "w-swim" "w-gym"
## [25] "w-soccer" "m-track"
## [27] "w-track" "m-xc"
## [29] "w-xc" "w-tennis"
## [31] "m-tennis" "w-volley"
## [33] "cross-country" "mens-swimming-and-diving"
## [35] "womens-swimming-and-diving" "track-and-field"
## [37] "womens-lacrosse" "mbball"
## [39] "wbball" "cross"
## [41] "mgolf" "wgolf"
## [43] "swimming" "wgym"
## [45] "msoc" "wsoc"
## [47] "track" "mten"
## [49] "wten" "wvball"
## [51] "bsb" "sb"
## [53] "mb" "xc"
## [55] "fb" "mg"
## [57] "wg" "sd"
## [59] "gm" "sc"
## [61] "tf" "mt"
## [63] "wt" "vb"
## [65] "bvb" "wrestling"
## [67] "wcross" "mcross"
## [69] "wswim" "wtrack"
## [71] "wlax"
I don't want to do this automatically (with things like regular expressions), so I'm going to create the mapping manually as a dataframe combining two vectors.
names = unique(sport_names) # sport_names is the vector of scraped sport names; its unique values are listed above
name_replacements = c("baseball", "softball", "mens-basketball", "womens-basketball", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "womens-tennis", "mens-tennis", "volleyball", "womens-rowing", "baseball", "softball", "mens-basketball", "womens-basketball", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "xc-track", "xc-track", "xc-track", "womens-tennis", "mens-tennis", "volleyball", "xc-track", "swimming-and-diving", "swimming-and-diving", "xc-track", "womens-lacrosse", "mens-basketball", "womens-basketball", "xc-track", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "mens-soccer", "womens-soccer", "xc-track", "mens-tennis", "womens-tennis", "volleyball", "baseball", "softball", "mens-basketball", "xc-track", "football", "mens-golf", "womens-golf", "swimming-and-diving", "gymnastics", "womens-soccer", "xc-track", "mens-tennis", "womens-tennis", "volleyball", "beach-volleyball", "wrestling", "xc-track", "xc-track", "swimming-and-diving", "xc-track", "womens-lacrosse")
sport_name_changes = data.frame(Name = names, Modified = name_replacements)
show_table(tail(sport_name_changes))
 | Name | Modified |
---|---|---|
66 | wrestling | wrestling |
67 | wcross | xc-track |
68 | mcross | xc-track |
69 | wswim | swimming-and-diving |
70 | wtrack | xc-track |
71 | wlax | womens-lacrosse |
This adjustment also allows me to combine some men’s/women’s sports together. Some universities combine the rosters, while some don’t, so this allows me to combine sports like cross country, track and field, and swimming and diving.
# Apply this to the all_schools sport list
new_sports = c()
for (sport in all_schools$Sport){
new_sports = c(new_sports, sport_name_changes$Modified[sport_name_changes$Name == sport])
}
all_schools$Sport = new_sports
print(paste("Old number of names:", length(names), "New number of names", length(unique(new_sports))))
## [1] "Old number of names: 71 New number of names 19"
Standardizing State Names
Not all of the hometown strings are formatted the same way; that is just the nature of scraping from these different websites. So we have to standardize them as best as possible in order to produce relevant data.
hometowns = data.frame(City = all_schools$Hometown)
states = c()
cities = c()
for (city in hometowns$City){
if (is.na(city)){
states = c(states, NA)
cities = c(cities, NA)
next
}
cities = c(cities, str_trim(str_split_1(city, ",")[1]))
states = c(states, str_trim(str_split_1(city, ",")[2]))
}
hometowns$Town = cities
hometowns$State = states
Behind the scenes, I used a script to standardize the state names, so that we don't end up with multiple versions of the same city. Once the states are standardized, we can combine this back into the original format.
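The standardization script itself isn't shown here, but a minimal sketch of the idea (assuming AP-style abbreviations such as "Mo." or "N.Y." in the scraped hometowns; the real script may differ) could look like this:
# Minimal sketch of state standardization (the actual script is not shown and may differ).
# Assumes AP-style abbreviations like "Mo.", "Ill.", "N.Y."; unmatched values are left as-is.
standardize_state = function(state) {
  if (is.na(state)) return(NA)
  cleaned = gsub("\\.", "", str_trim(state))          # "N.Y." -> "NY", "Mo." -> "Mo"
  idx = match(toupper(cleaned), state.abb)            # try the two-letter postal codes
  if (!is.na(idx)) return(state.name[idx])
  idx = match(toupper(cleaned), toupper(state.name))  # try full state names
  if (!is.na(idx)) return(state.name[idx])
  # A few AP-style abbreviations that don't match the two-letter codes
  ap = c("Ala" = "Alabama", "Fla" = "Florida", "Ill" = "Illinois",
         "Tenn" = "Tennessee", "Miss" = "Mississippi", "Ark" = "Arkansas")
  if (cleaned %in% names(ap)) return(unname(ap[cleaned]))
  return(state)
}
hometowns$State = sapply(hometowns$State, standardize_state)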
Geocoding the Data
I created a geocodes.R
script that takes the cities in
the players dataset and geocodes all 5000+ unique cities. This way we
can map them and show all the different areas these student-athletes
come from. Here is what that script looks like.
library(tidyverse)
library(ggmap)
# Get the rosters.csv file we just wrote
players = read_csv("rosters.csv")
# Read the API key from our .REnviron file
readRenviron("./.Renviron")
register_google(Sys.getenv("MAPSAPIKEY"))
cities = data.frame(Hometown = unique(players$Hometown)) %>% drop_na()
# Geocode all of the cities
geocoded_cities = mutate_geocode(cities, Hometown)
write_csv(distinct(geocoded_cities %>% select(Hometown, lon, lat)), "./geocodes.csv")
Here is part of what that returns.
Hometown | lon | lat |
---|---|---|
Birmingham, Alabama | -86.81036 | 33.51859 |
Shreveport, Louisiana | -93.75018 | 32.52515 |
Boaz, Alabama | -86.15210 | 34.20317 |
Cullman, Alabama | -86.84361 | 34.17482 |
Mobile, Alabama | -88.03989 | 30.69537 |
Pensacola, Florida | -87.21691 | 30.42131 |
To help with the map, I will add, for each hometown, the school that recruited the most players from it, along with that player count.
schools = all_schools %>% select(Hometown, School) %>% group_by(Hometown, School) %>% summarize(n = length(Hometown)) %>% drop_na()
# SQL query (via the sqldf package) to find the School with the most players from each city.
cities = sqldf("SELECT Hometown, School, max(n) as Num_Players FROM schools GROUP BY Hometown")
# Add these schools and their respective player counts to the geocodes dataframe (read back in from geocodes.csv)
top_schools = c()
ns = c()
for (city in geocodes$Hometown) {
df = cities[cities["Hometown"] == city,] %>% select(School, Num_Players)
top_schools = append(top_schools, df$School[1])
ns = append(ns, df$Num_Players[1])
}
geocodes$School = top_schools
geocodes$Num_Players = ns
Mapping the Data
We will use the leaflet package for this portion. This will give us a nice interactive map we can look at.
# This helper assigns each school its primary color, so that on top of the city statistics in the popups, the markers show which school gets the most recruits from each city.
school_color = function (school) {
school = school[1]
if (is.na(school)){
return ("black")
}
if (school == "Alabama") {
return ("#9E1B32")
}
if (school == "Auburn"){
return ("#E87722")
}
if (school == "Missouri"){
return ("#F1B82D")
}
if (school == "Arkansas"){
return ("#9D2235")
}
if (school == "Kentucky") {
return ("#0033A0")
}
if (school == "Florida") {
return ("#0021A5")
}
if (school == "Mississippi State") {
return ("#5D1725")
}
if (school == "LSU") {
return ("#461D7C")
}
if (school == "South Carolina") {
return ("#73000A")
}
if (school == "Vanderbilt") {
return ("#866D4B")
}
}
colors = c()
for (school in geocodes$School){
colors = c(colors, school_color(school))
}
geocodes$Color = colors
# Colors group for future plotting
group.colors = c("Alabama" = "#9E1B32", "Arkansas" = "#9D2235", "Auburn" = "#E87722", "Florida" = "#0021A5", "Kentucky" = "#0033A0", "LSU" = "#461D7C", "Mississippi State" = "#5D1725", "Missouri" = "#F1B82D", "South Carolina" = "#73000A", "Vanderbilt" = "#866D4B")
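# (Side note: since group.colors maps each school to its color, the Color column above could
# also be built with a direct lookup instead of the school_color() loop, for example:
#   colors = unname(group.colors[geocodes$School]); colors[is.na(colors)] = "black"
# This is just an equivalent sketch, not necessarily how it was run.)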
m = leaflet() %>%
addProviderTiles(provider=providers$Esri.WorldGrayCanvas) %>%
addCircleMarkers(geocodes$lon, geocodes$lat, label=geocodes$Hometown, popup=paste("<b>", geocodes$Hometown, "</b><br>", geocodes$Num_Players, "were recruited to", geocodes$School), radius=4.5, color=geocodes$Color, fillOpacity=1.0, stroke=F)
m
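If a legend for the school colors is wanted, leaflet's addLegend() can be chained onto the map; here is a small optional sketch using the group.colors vector defined above:
# Optional sketch: add a legend mapping each school to its marker color.
m %>% addLegend(position = "bottomright",
                colors = unname(group.colors),
                labels = names(group.colors),
                title = "Top School")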
Adding Census Data
First, we retrieve the data from the Census Bureau API (2019 ACS 5-year estimates) and clean it up slightly.
readRenviron("./.Renviron")
api_key = Sys.getenv("CENSUSKEY")
# Create our url string
url = str_glue("https://api.census.gov/data/2019/acs/acs5?get=NAME,B01002_001E,B02001_002E,B02001_003E,B25010_001E,B01003_001E&for=place:*&in=state:*&key={api_key}")
data = fromJSON(url) # fromJSON() comes from the jsonlite package
census_data = data.frame(data) %>% slice(2:nrow(data))
# Rename the columns (B01002_001E = median age, B02001_002E = White alone, B02001_003E = Black alone, B25010_001E = average household size, B01003_001E = total population)
colnames(census_data) = c("City", "Avg_Age", "B02001_002E", "B02001_003E", "Avg_Household_Size", "Total_Pop", "state", "place")
# Convert the counts to numeric
census_data$B02001_002E = as.numeric(census_data$B02001_002E)
census_data$B02001_003E = as.numeric(census_data$B02001_003E)
census_data$Total_Pop = as.numeric(census_data$Total_Pop)
# Calculate percentages
census_data = census_data %>%
mutate(Percent_White = (B02001_002E/Total_Pop) * 100) %>%
mutate(Percent_Black = (B02001_003E/Total_Pop) * 100) %>%
select(City, Avg_Age, Percent_White, Percent_Black, Avg_Household_Size)
# Split City and State
# split_city() strips the trailing place type from a Census place name, e.g. "Birmingham city, Alabama" -> "Birmingham".
split_city = function(name){
split_list = strsplit(name, ",")[[1]]
city_name = split_list[1]
# Drop the last word of the place name ("city", "town", "CDP", etc.)
city_split = strsplit(city_name, " ")[[1]]
city_name = paste(city_split[-length(city_split)], collapse = " ")
return (city_name)
}
# get_state() returns the trimmed state portion after the comma.
get_state = function(name) {
split_list = strsplit(name, ",")[[1]]
state = str_trim(split_list[2])
return(state)
}
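The chunk above defines these helpers but never applies them; the City_str column used in the next section is presumably built by recombining the cleaned city and state, along these lines (the City_str name and the exact recombination are my assumption):
# Hypothetical reconstruction of the City_str column used below: recombine the cleaned
# city name and state, e.g. "Birmingham city, Alabama" -> "Birmingham, Alabama".
census_data$City_str = sapply(census_data$City, function(name) {
  paste(split_city(name), get_state(name), sep = ", ")
})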
Data Visualizations
Below are some different visualizations that I think represent some of this data in an effective manner.
Census Data
cities_w_census = census_data %>%
filter(City_str %in% cities$Hometown)
# Which cities in our dataset have the highest percentage of Black residents?
pct_black = cities_w_census %>%
select(City_str, Percent_Black) %>%
arrange(desc(Percent_Black)) %>%
top_n(25)
ggplot(pct_black, aes(y=reorder(City_str, Percent_Black), x=Percent_Black)) +
geom_col(fill="lightblue", color="black") +
ylab("City Name") +
xlab("% of Black People") +
ggtitle("% of Black Population by Census City")
Let's plot the top 50 towns now and see where they land on the map alongside the players' hometowns.
register_google(Sys.getenv("MAPSAPIKEY"))
geocoded_top_50 = cities_w_census %>%
select(City_str, Percent_Black) %>%
arrange(desc(Percent_Black)) %>%
top_n(50) %>%
mutate_geocode(City_str)
m %>% addCircleMarkers(geocoded_top_50$lon, geocoded_top_50$lat, label=geocoded_top_50$City_str, popup=paste("<b>", geocoded_top_50$City_str, "</b><br>", round(geocoded_top_50$Percent_Black, 2), "% Black"), color="purple")
Let's use the cities dataframe to find the top school for each of these census cities, so we can see which schools players from these places attend.
# Get top schools for each census city
tempdf1 = cities %>% filter(Hometown %in% cities_w_census$City_str)
census_top_schools = c()
for (i in 1:nrow(cities_w_census)){
city = tempdf1[tempdf1["Hometown"] == cities_w_census$City_str[i],]
census_top_schools = c(census_top_schools, city$School)
}
cities_w_census$Top_School = census_top_schools
Which schools are recruits from majority-Black and majority-White cities attending?
cities_w_census %>%
filter(Percent_Black > 50) %>%
count(Top_School) %>%
arrange(desc(n)) %>%
ggplot(aes(x=reorder(Top_School, -n), y=n, fill=Top_School)) +
geom_col(color = "black") +
scale_fill_manual(values=group.colors) +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
xlab("School") +
ylab("Number of Majority Black Cities") +
ggtitle("Schools who have Most Recruits from Majority Black Cities") +
guides(fill=guide_legend(title="School"))
Naturally, we would expect to see opposite results from majority white cities.
cities_w_census %>%
filter(Percent_White > 50) %>%
count(Top_School) %>%
arrange(desc(n)) %>%
ggplot(aes(x=reorder(Top_School, -n), y=n, fill=Top_School)) +
geom_col(color = "black") +
scale_fill_manual(values=group.colors) +
theme(axis.text.x = element_text(angle=90, vjust=1)) +
xlab("School") +
ylab("Number of Majority White Cities") +
ggtitle("Schools who have Most Recruits from Majority White Cities") +
guides(fill=guide_legend(title="School"))
We see that many of the schools that previously brought in more Black student-athletes tend to be less likely to be the top school in a majority-White city.
Let's also take into account how many majority-White cities there are compared with majority-Black cities. The imbalance is to be expected, since Black Americans are a minority of the overall population, but it is still important to visualize.
num_black_cities = nrow(cities_w_census %>% filter(Percent_Black > 50))
num_white_cities = nrow(cities_w_census %>% filter(Percent_White > 50))
num_cities_df = data.frame(Race = c("Black", "White"), Num_Majority_Cities = c(num_black_cities, num_white_cities))
ggplot(num_cities_df, aes(x=reorder(Race, -Num_Majority_Cities), y=Num_Majority_Cities)) +
geom_col(fill="lightblue", color="black") +
ylab("Number of Majority Cities") +
xlab("Race")
Distance Away From Home
I am quite fascinated by the world of Name, Image, and Likeness (NIL), as well as the impact of the transfer portal on college sports. NIL went into effect on July 1, 2021, so let's take its impact into account in a few ways.
First, let's do some data cleaning and calculations to determine spherical distances from each city to its respective school.
universities = unique(schools$School)
uni_cities = c("Nashville, TN", "Lexington, KY", "Tuscaloosa, AL", "Gainesville, FL", "Baton Rouge, LA", "Starkville, MS", "Columbia, SC", "Fayetteville, AR", "Auburn, AL", "Columbia, MO")
# Add each university's city so it can be geocoded for distance calculations
school_locs = data.frame(School = universities, City = uni_cities) %>% mutate_geocode(City)
In the next two chunks, I add each city’s geocoded location to the
all_schools
dataframe, as well as the geocoded location for
each school in the dataframe.
lons = c()
lats = c()
# Apply to all players dataframe
for (city in all_schools$Hometown) {
geocode = geocodes[geocodes$Hometown == city,]
lons = c(lons, geocode$lon)
lats = c(lats, geocode$lat)
}
all_schools$lon = lons
all_schools$lat = lats
sc_lons = c()
sc_lats = c()
for (school in all_schools$School) {
code = school_locs[school_locs$School == school,]
sc_lons = c(sc_lons, code$lon)
sc_lats = c(sc_lats, code$lat)
}
all_schools$school_lon = sc_lons
all_schools$school_lat = sc_lats
Now with those geocodes in place, we can calculate the Haversine distance (the great-circle distance across a sphere) for each city-school pair.
library(geosphere) # provides distHaversine()
dists = c()
for (i in 1:nrow(all_schools)) {
# Calculate the distance in meters, then divide by 1000 for km
dists = c(dists, distHaversine(c(all_schools$lon[i], all_schools$lat[i]), c(all_schools$school_lon[i], all_schools$school_lat[i]))/1000)
}
all_schools$school_dist = dists
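As a side note, distHaversine() also accepts two-column lon/lat matrices, so the same distances could be computed without the loop (an equivalent sketch, not the code that was run):
# Vectorized sketch: distHaversine() works row-wise on lon/lat matrices.
all_schools$school_dist = distHaversine(
  cbind(all_schools$lon, all_schools$lat),
  cbind(all_schools$school_lon, all_schools$school_lat)
) / 1000  # meters -> kilometers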
Now we can group this by school and year, and find the variance of those distances.
schools_dist_var = all_schools %>%
drop_na() %>%
group_by(School, Year) %>%
summarize(distance_var = var(school_dist)) %>%
mutate(Color = school_color(School), label= toupper(substr(School, 1, 3)))
Now we can take each school's variance in distance over time and see how "spread out" its recruitment is.
# This allows for nice labels
ends = schools_dist_var %>% filter(Year == 2023)
ggplot(schools_dist_var, aes(x=Year, y=distance_var)) +
geom_smooth(aes(color=School), se=F, size=1.25) +
scale_color_manual(values=group.colors) +
scale_x_continuous(breaks=c(2013, 2015, 2017, 2019, 2021, 2023)) +
geom_text_repel(aes(label=label), data=ends, nudge_x=1, na.rm=T) + # geom_text_repel() is from the ggrepel package
ggtitle("Variance in Distance from 2013-2023") +
ylab("Variance in Distance") +
theme(axis.text.y=element_blank(), axis.ticks.y = element_blank())
We see from this that Florida has the most variance in distance at the moment, while schools like Kentucky have dropped off over the years.
Now that the transfer portal and NIL have come about, let's take a look at the difference in variance across all schools before and after this change.
nil_difference = schools_dist_var %>%
mutate(nil = ifelse(Year >= 2021, "2021-2023", "2013-2020")) %>%
group_by(nil) %>%
summarize(avg_var = mean(distance_var))
ggplot(nil_difference, aes(x=nil, y=avg_var)) +
geom_col(width=0.5, color="black", fill="lightblue") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
ylab("Average Variance") +
xlab("Year Range") +
ggtitle("Average Variance in Distance Before and After NIL Introduction")
We can see a slight increase in variance, but it is likely negligible; we will probably need a few more years of data to see real changes, if any. However, we can still look at this by school.
nil_diff_school = schools_dist_var %>%
mutate(nil = ifelse(Year >= 2021, "after", "before")) %>%
group_by(nil, School) %>%
summarize(avg_var = mean(distance_var)) %>%
pivot_wider(names_from=nil, values_from=avg_var) %>%
mutate(var_diff = after-before)
ggplot(nil_diff_school, aes(x=School, y=var_diff, fill=School)) +
geom_col(width=0.5, color="black") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(angle=90, vjust=1)) +
scale_fill_manual(values=group.colors) +
ylab("Difference in Variance") +
xlab("School") +
ggtitle("Difference in Variance Before and After NIL Introduction")
We see from this that 6 out of 10 schools have increased their recruitment range since NIL was introduced, while some, like Kentucky and Auburn, have adjusted to bring in recruits who are closer to campus.
In-State Recruiting
Which schools (out of the subset that I have scraped) are the best at recruiting in their home state? Out of all of the cities in their state that appear in our data, for how many are they the top recruiter?
unis = sort(unique(cities$School))
uni_states = data.frame(School = unis, State = c("Alabama", "Arkansas", "Alabama", "Florida", "Kentucky", "Louisiana", "Mississippi", "Missouri", "South Carolina", "Tennessee"))
# Give the cities dataframe its states (this relies on cities and hometowns_unique lining up row-for-row by city name)
hometowns_unique = distinct(hometowns) %>% arrange(City)
cities$State = hometowns_unique$State
cities_filtered = cities %>% filter(State %in% uni_states$State)
# SQL query to determine percentage of cities in state that a school has a majority in
sql_calc = sqldf("with cte as (SELECT State, School, COUNT(Hometown) as Num_Cities,
count(*) * 100.0 / sum(count(*)) over(PARTITION BY State) as Percentage_Of_Cities
FROM cities_filtered
GROUP BY State, School)
SELECT * FROM cte WHERE State == School OR (State = 'Alabama' AND School = 'Auburn') OR (State = 'Louisiana' AND School = 'LSU') OR (State = 'Tennessee' AND School = 'Vanderbilt') OR (State = 'Mississippi' AND School = 'Mississippi State')")
# Plot
ggplot(sql_calc, aes(x=reorder(School, -Percentage_Of_Cities), y=Percentage_Of_Cities, fill=School)) +
geom_col(color="black") +
theme(axis.text.x=element_text(angle=90, vjust=1)) +
scale_fill_manual(values=group.colors) +
xlab("School") +
ylab("Percentage of Cities in State") +
ggtitle("Percentage of Majority Cities in State")
Thank You!
Thanks for listening to my presentation! This took a lot of hard work over the last week or so, so I appreciate the support.