Introduction

Although I am not a huge college basketball fan, I find this particular data set fascinating in its richness. Let’s dig in and see what we can learn about the history of March Madness. Hopefully along the way, I’ll discover something useful to use in your models.

Libraries

library(data.table)
library(dplyr)
library(magrittr)
library(ggplot2)
install.packages("gridExtra")

## package 'gridExtra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\adminlauraedell\AppData\Local\Temp\2\RtmpOEctyB\downloaded_packages

install.packages("ggExtra")

## package 'ggExtra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\adminlauraedell\AppData\Local\Temp\2\RtmpOEctyB\downloaded_packages

library(gridExtra)
library(ggExtra)

theme_set(theme_bw())

# Data Section 1
teams <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\Teams.csv')
seasons <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\Seasons.csv')
seeds <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\NCAATourneySeeds.csv')
seas_results <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\RegularSeasonCompactResults.csv')
tour_results <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\NCAATourneyCompactResults.csv')
seas_detail <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\RegularSeasonDetailedResults.csv')
tour_detail <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\NCAATourneyDetailedResults.csv')
conferences <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\Conferences.csv')
team_conferences <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\TeamConferences.csv')
coaches <- fread('E:\\sportsML\\sportsML\\kaggleNCAA-master\\March-Madness-Machine-Learning-2018-master\\Kaggle-March-Madness-Machine-Learning-2018-master\\DataFiles\\TeamCoaches.csv')

Data Overview

glimpse(teams)

## Observations: 364
## Variables: 4
## $ TeamID        <int> 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, ...
## $ TeamName      <chr> "Abilene Chr", "Air Force", "Akron", "Alabama", ...
## $ FirstD1Season <int> 2014, 1985, 1985, 1985, 2000, 1985, 2000, 1985, ...
## $ LastD1Season  <int> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, ...

glimpse(seasons)

## Observations: 34
## Variables: 6
## $ Season  <int> 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, ...
## $ DayZero <chr> "10/29/1984", "10/28/1985", "10/27/1986", "11/2/1987",...
## $ RegionW <chr> "East", "East", "East", "East", "East", "East", "East"...
## $ RegionX <chr> "West", "Midwest", "Southeast", "Midwest", "West", "Mi...
## $ RegionY <chr> "Midwest", "Southeast", "Midwest", "Southeast", "Midwe...
## $ RegionZ <chr> "Southeast", "West", "West", "West", "Southeast", "Wes...

glimpse(seeds)

## Observations: 2,150
## Variables: 3
## $ Season <int> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1...
## $ Seed   <chr> "W01", "W02", "W03", "W04", "W05", "W06", "W07", "W08",...
## $ TeamID <int> 1207, 1210, 1228, 1260, 1374, 1208, 1393, 1396, 1439, 1...

glimpse(seas_results)

## Observations: 150,684
## Variables: 8
## $ Season  <int> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, ...
## $ DayNum  <int> 20, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25...
## $ WTeamID <int> 1228, 1106, 1112, 1165, 1192, 1218, 1228, 1242, 1260, ...
## $ WScore  <int> 81, 77, 63, 70, 86, 79, 64, 58, 98, 97, 103, 75, 91, 7...
## $ LTeamID <int> 1328, 1354, 1223, 1432, 1447, 1337, 1226, 1268, 1133, ...
## $ LScore  <int> 64, 70, 56, 54, 74, 78, 44, 56, 80, 89, 71, 71, 72, 65...
## $ WLoc    <chr> "N", "H", "H", "H", "H", "H", "N", "N", "H", "H", "H",...
## $ NumOT   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

glimpse(tour_results)

## Observations: 2,117
## Variables: 8
## $ Season  <int> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, ...
## $ DayNum  <int> 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,...
## $ WTeamID <int> 1116, 1120, 1207, 1229, 1242, 1246, 1256, 1260, 1314, ...
## $ WScore  <int> 63, 59, 68, 58, 49, 66, 78, 59, 76, 79, 75, 96, 85, 83...
## $ LTeamID <int> 1234, 1345, 1250, 1425, 1325, 1449, 1338, 1233, 1292, ...
## $ LScore  <int> 54, 58, 43, 55, 38, 58, 54, 58, 57, 70, 64, 83, 68, 59...
## $ WLoc    <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",...
## $ NumOT   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

glimpse(seas_detail)

## Observations: 76,636
## Variables: 34
## $ Season  <int> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, ...
## $ DayNum  <int> 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13...
## $ WTeamID <int> 1104, 1272, 1266, 1296, 1400, 1458, 1161, 1186, 1194, ...
## $ WScore  <int> 68, 70, 73, 56, 77, 81, 80, 75, 71, 84, 106, 74, 66, 7...
## $ LTeamID <int> 1328, 1393, 1437, 1457, 1208, 1186, 1236, 1457, 1156, ...
## $ LScore  <int> 62, 63, 61, 50, 71, 55, 62, 61, 66, 56, 50, 73, 65, 48...
## $ WLoc    <chr> "N", "N", "N", "N", "N", "H", "H", "N", "N", "H", "H",...
## $ NumOT   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ WFGM    <int> 27, 26, 24, 18, 30, 26, 23, 28, 28, 32, 41, 29, 26, 25...
## $ WFGA    <int> 58, 62, 58, 38, 61, 57, 55, 62, 58, 67, 69, 51, 66, 56...
## $ WFGM3   <int> 3, 8, 8, 3, 6, 6, 2, 4, 5, 5, 15, 7, 5, 10, 11, 10, 5,...
## $ WFGA3   <int> 14, 20, 18, 9, 14, 12, 8, 14, 11, 17, 25, 13, 19, 23, ...
## $ WFTM    <int> 11, 10, 17, 17, 11, 23, 32, 15, 10, 15, 9, 9, 9, 16, 1...
## $ WFTA    <int> 18, 19, 29, 31, 13, 27, 39, 21, 18, 19, 13, 11, 13, 23...
## $ WOR     <int> 14, 15, 17, 6, 17, 12, 13, 13, 9, 14, 15, 6, 21, 8, 15...
## $ WDR     <int> 24, 28, 26, 19, 22, 24, 18, 35, 22, 22, 29, 21, 23, 35...
## $ WAst    <int> 13, 16, 15, 11, 12, 12, 14, 19, 9, 11, 21, 18, 15, 18,...
## $ WTO     <int> 23, 13, 10, 12, 14, 9, 17, 19, 17, 6, 11, 15, 17, 13, ...
## $ WStl    <int> 7, 4, 5, 14, 4, 9, 11, 7, 9, 12, 10, 7, 12, 14, 18, 2,...
## $ WBlk    <int> 1, 4, 2, 2, 4, 3, 1, 2, 2, 0, 6, 1, 3, 19, 5, 6, 3, 5,...
## $ WPF     <int> 22, 18, 25, 18, 20, 18, 25, 21, 23, 13, 16, 5, 17, 13,...
## $ LFGM    <int> 22, 24, 22, 18, 24, 20, 19, 20, 24, 23, 17, 29, 24, 18...
## $ LFGA    <int> 53, 67, 73, 49, 62, 46, 41, 59, 52, 52, 52, 63, 56, 64...
## $ LFGM3   <int> 2, 6, 3, 6, 6, 3, 4, 4, 6, 3, 4, 10, 6, 8, 4, 7, 2, 2,...
## $ LFGA3   <int> 10, 24, 26, 22, 16, 11, 15, 17, 18, 14, 11, 22, 19, 24...
## $ LFTM    <int> 16, 9, 14, 8, 17, 12, 20, 17, 12, 7, 12, 5, 11, 4, 17,...
## $ LFTA    <int> 22, 20, 23, 15, 27, 17, 28, 23, 27, 12, 17, 5, 17, 8, ...
## $ LOR     <int> 10, 20, 31, 17, 21, 6, 9, 8, 13, 9, 8, 13, 14, 14, 20,...
## $ LDR     <int> 22, 25, 22, 20, 15, 22, 21, 25, 26, 23, 15, 16, 21, 26...
## $ LAst    <int> 8, 7, 9, 9, 12, 8, 11, 10, 13, 10, 8, 15, 17, 12, 17, ...
## $ LTO     <int> 18, 12, 12, 19, 10, 19, 30, 15, 25, 18, 17, 12, 18, 17...
## $ LStl    <int> 9, 8, 2, 4, 7, 4, 10, 14, 8, 1, 7, 6, 8, 10, 7, 5, 15,...
## $ LBlk    <int> 2, 6, 5, 3, 1, 3, 4, 8, 2, 3, 3, 2, 4, 0, 7, 1, 3, 4, ...
## $ LPF     <int> 20, 16, 23, 23, 14, 25, 28, 18, 18, 18, 15, 12, 13, 17...

glimpse(tour_detail)

## Observations: 981
## Variables: 34
## $ Season  <int> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, ...
## $ DayNum  <int> 134, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,...
## $ WTeamID <int> 1421, 1112, 1113, 1141, 1143, 1163, 1181, 1211, 1228, ...
## $ WScore  <int> 92, 80, 84, 79, 76, 58, 67, 74, 65, 64, 72, 72, 70, 71...
## $ LTeamID <int> 1411, 1436, 1272, 1166, 1301, 1140, 1161, 1153, 1443, ...
## $ LScore  <int> 84, 51, 71, 73, 74, 53, 57, 69, 60, 61, 68, 71, 69, 54...
## $ WLoc    <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",...
## $ NumOT   <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ WFGM    <int> 32, 31, 31, 29, 27, 17, 19, 20, 24, 28, 22, 28, 23, 24...
## $ WFGA    <int> 69, 66, 59, 53, 64, 52, 54, 47, 56, 51, 51, 52, 54, 52...
## $ WFGM3   <int> 11, 7, 6, 3, 7, 4, 4, 6, 5, 2, 9, 5, 3, 10, 8, 8, 6, 7...
## $ WFGA3   <int> 29, 23, 14, 7, 20, 14, 13, 14, 14, 6, 16, 13, 13, 18, ...
## $ WFTM    <int> 17, 11, 16, 18, 15, 20, 25, 28, 12, 6, 19, 11, 21, 13,...
## $ WFTA    <int> 26, 14, 22, 25, 23, 27, 31, 37, 14, 11, 23, 18, 25, 24...
## $ WOR     <int> 14, 11, 10, 11, 18, 12, 13, 8, 15, 7, 11, 9, 11, 11, 1...
## $ WDR     <int> 30, 36, 27, 20, 20, 29, 27, 28, 23, 20, 20, 32, 33, 24...
## $ WAst    <int> 17, 22, 18, 15, 17, 8, 4, 12, 15, 13, 14, 7, 7, 16, 16...
## $ WTO     <int> 12, 16, 9, 18, 13, 14, 16, 12, 14, 11, 10, 23, 20, 14,...
## $ WStl    <int> 5, 10, 7, 13, 8, 3, 10, 2, 11, 8, 4, 4, 6, 8, 4, 6, 7,...
## $ WBlk    <int> 3, 7, 4, 1, 2, 8, 8, 2, 4, 4, 3, 6, 6, 3, 8, 1, 2, 5, ...
## $ WPF     <int> 22, 8, 19, 19, 14, 16, 23, 15, 14, 17, 23, 19, 19, 12,...
## $ LFGM    <int> 29, 20, 25, 27, 25, 20, 18, 26, 22, 23, 24, 26, 23, 21...
## $ LFGA    <int> 67, 64, 69, 60, 56, 64, 54, 66, 58, 56, 54, 65, 66, 52...
## $ LFGM3   <int> 12, 4, 7, 7, 9, 2, 3, 10, 8, 6, 5, 8, 8, 6, 4, 9, 7, 6...
## $ LFGA3   <int> 31, 16, 28, 17, 21, 17, 11, 27, 24, 17, 15, 19, 23, 20...
## $ LFTM    <int> 14, 7, 14, 12, 15, 11, 18, 7, 8, 9, 15, 11, 15, 6, 25,...
## $ LFTA    <int> 31, 7, 21, 17, 20, 13, 22, 10, 13, 10, 25, 21, 20, 8, ...
## $ LOR     <int> 17, 8, 20, 14, 10, 15, 11, 13, 17, 13, 14, 11, 14, 7, ...
## $ LDR     <int> 28, 26, 22, 17, 26, 26, 24, 22, 18, 19, 20, 18, 23, 22...
## $ LAst    <int> 16, 12, 11, 20, 16, 11, 8, 13, 10, 13, 14, 19, 15, 8, ...
## $ LTO     <int> 15, 17, 12, 21, 14, 11, 19, 10, 14, 13, 9, 13, 12, 18,...
## $ LStl    <int> 5, 10, 2, 6, 5, 8, 5, 7, 6, 6, 4, 6, 11, 7, 3, 3, 4, 8...
## $ LBlk    <int> 0, 3, 5, 6, 8, 4, 4, 6, 5, 1, 1, 0, 3, 2, 0, 3, 0, 4, ...
## $ LPF     <int> 22, 15, 18, 21, 19, 22, 19, 24, 16, 15, 20, 19, 25, 23...

glimpse(conferences)

## Observations: 51
## Variables: 2
## $ ConfAbbrev  <chr> "a_sun", "a_ten", "aac", "acc", "aec", "asc", "awc...
## $ Description <chr> "Atlantic Sun Conference", "Atlantic 10 Conference...

glimpse(team_conferences)

## Observations: 10,888
## Variables: 3
## $ Season     <int> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 198...
## $ TeamID     <int> 1114, 1147, 1204, 1209, 1215, 1223, 1273, 1359, 118...
## $ ConfAbbrev <chr> "a_sun", "a_sun", "a_sun", "a_sun", "a_sun", "a_sun...

glimpse(coaches)

## Observations: 10,992
## Variables: 5
## $ Season      <int> 1985, 1985, 1985, 1985, 1985, 1985, 1985, 1985, 19...
## $ TeamID      <int> 1102, 1103, 1104, 1106, 1108, 1109, 1110, 1111, 11...
## $ FirstDayNum <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LastDayNum  <int> 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, ...
## $ CoachName   <chr> "reggie_minton", "bob_huggins", "wimp_sanderson", ...

Historical Performance

Teams

What have been the most successful teams since 1985? We’ll attempt to determine this with the metrics: regular season wins, number 1 #seeds, tournament wins, and championships.

setkey(teams, TeamID)
setkey(seeds, TeamID)

g1 <-
    teams[seeds][, one_seed := as.numeric(substr(Seed, 2, 3)) == 1][, sum(one_seed), by = TeamName][order(V1, decreasing = T)][1:15,] %>%
    ggplot(aes(x = reorder(TeamName, V1), y = V1)) +
    geom_bar(stat = 'identity', fill = 'darkblue') +
    labs(x = '', y = 'No 1 seeds', title = 'No. 1 Seeds since 1985') +
    coord_flip()

setkey(seas_results, WTeamID)

g2 <-
    seas_results[teams][, .(wins = .N), by = TeamName][order(-wins)][1:15,] %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins)) +
    geom_bar(stat = 'identity', fill = 'darkblue') +
    labs(x = '', y = 'Wins', title = 'Regular Season Wins since 1985') +
    coord_flip()

setkey(tour_results, WTeamID)

g3 <-
    tour_results[teams][, .(wins = .N), by = TeamName][order(-wins)][1:15,] %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins)) +
    geom_bar(stat = 'identity', fill = 'darkblue') +
    labs(x = '', y = 'Wins', title = 'Tournament Wins since 1985') +
    coord_flip()

g4 <-
    tour_results[teams][DayNum == 154, .(wins = .N), by = TeamName][order(-wins)][1:15,] %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins)) +
    geom_bar(stat = 'identity', fill = 'darkblue') +
    labs(x = '', y = 'Championships', title = 'Tournament Championships since 1985') +
    coord_flip()

grid.arrange(g1, g2, g3, g4, nrow = 2)

Conferences

What about conferences? Which conferences have produced the most championships since 1985?

tour_results[team_conferences, on = c(WTeamID = 'TeamID', 'Season'), nomatch = 0
             ][DayNum == 154, .(ConfAbbrev, wins = .N), by = ConfAbbrev
               ][conferences, on = 'ConfAbbrev', nomatch = 0] %>%
    ggplot(aes(x = reorder(Description, wins), y = wins)) + 
    geom_bar(
        stat = 'identity', 
        fill = 'darkblue') + 
    labs(
        x = '', 
        y = 'Wins', 
        title = 'NCAA Championships by Conference (1985-2017)') + 
    scale_y_continuous(breaks = c(1:10)) + 
    coord_flip()

How conferences have fared against each other in past tournament matchups. Because there are so many conferences, I’m limiting them here to a subset of the better conferences. You could change this by editing the top_conf vector below.

top_conf <- c('acc', 'big_east', 'sec', 'big_ten', 'pac_ten', 'big_twelve')

tour_results[, .(Season,
                 TeamID1 = pmin(WTeamID, LTeamID), 
                 TeamID2 = pmax(WTeamID, LTeamID),
                 low_team_win = ifelse(WTeamID == pmin(WTeamID, LTeamID), 1, 0))
             ][team_conferences, on = c(TeamID1 = 'TeamID', 'Season'), nomatch = 0
               ][team_conferences, on = c(TeamID2 = 'TeamID', 'Season'), nomatch = 0
                 ][ConfAbbrev %in% top_conf & i.ConfAbbrev %in% top_conf
                   ][, .(win_pct = sum(low_team_win) / .N, n = .N), by = c('ConfAbbrev', 'i.ConfAbbrev')
                   ][conferences, on = 'ConfAbbrev',  nomatch = 0
                     ][conferences, on = c(i.ConfAbbrev = 'ConfAbbrev'), nomatch = 0
                       ][ConfAbbrev != i.ConfAbbrev] %>%
    ggplot(aes(x = Description, y = i.Description, fill = win_pct)) + 
    geom_tile() + 
    geom_text(aes(label = n)) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    labs(x = '', y = '', title = 'Conference Matchups in NCAA Tournament History 1985-2017') + 
    scale_fill_distiller(palette = "Blues")

Here, the fill color represents the percentage of times that a team from the conference on the x-axis beat a team from the conference on the y-axis. The number in the center of each tile shows how many times such a matchup occured. For example we see that when Big Ten Conference team played an Atlantic Coast Conference team 23 times and that the Big Ten Conference team only won a small percentage of the games.

Indicators of Regular Season Success

Let’s now turn to the regular season game statistics. We are interested in knowing how certain statistics correlate with winning vs losing. We will take the regular season detail and first convert it to a more ‘long’ format with only 1 column of TeamIDs and a factor indicating whether that row corresponds to a win or a loss. Here I also add some additional game statistcs. These include field goal percentage, free throw percentage, offensive/defensive rebounding efficiency, and possessions.

win_stats <- seas_detail[, .(
    Season,
    TeamID = WTeamID,
    Outcome = rep('W', .N),
    FGM = WFGM,
    FGA = WFGA,
    FGP = WFGM / WFGA,
    FGP2 = (WFGM - WFGM3) / (WFGA - WFGA3),
    FGM3 = WFGM3,
    FGA3 = WFGA3,
    FGP3 = WFGM3 / WFGA3,
    FTM = WFTM,
    FTA = WFTA,
    FTP = WFTM / WFTA,
    OR = WOR,
    DR = WDR,
    AST = WAst,
    TO = WTO,
    STL = WStl,
    BLK = WBlk,
    PF = WPF,
    ORP = WOR / (WOR + LDR),
    DRP = WDR / (WDR + LOR),
    POS = 0.96 * (WFGA + WTO + 0.44 * WFTA - WOR)
    )]

los_stats <- seas_detail[, .(
    Season,
    TeamID = LTeamID,
    Outcome = rep('L', .N),
    FGM = LFGM,
    FGA = LFGA,
    FGP = LFGM / LFGA,
    FGP2 = (LFGM - LFGM3) / (LFGA - LFGA3),
    FGM3 = LFGM3,
    FGA3 = LFGA3,
    FGP3 = LFGM3 / LFGA3,
    FTM = LFTM,
    FTA = LFTA,
    FTP = LFTM / LFTA,
    OR = LOR,
    DR = LDR,
    AST = LAst,
    TO = LTO,
    STL = LStl,
    BLK = LBlk,
    PF = LPF,
    ORP = (LOR / (LOR + WDR)),
    DRP = LDR / (LDR + WOR),
    POS = 0.96 * (LFGA + LTO + 0.44 * LFTA - LOR)
    )]

stats_all <- rbindlist(list(win_stats, los_stats))

Now let’s take a look at the distributions of these statistics for winning and losing teams.

g1 <- stats_all %>%
    ggplot(aes(x = FGP, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Field goal %', y = '', title = 'Field Goal Shooting')

g2 <- stats_all %>%
    ggplot(aes(x = FGP2, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = '2 pt Field goal %', y = '', title = '2 Pt Field Goal Shooting')

g3 <- stats_all %>%
    ggplot(aes(x = FGP3, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = '3 pt Field goal %', y = '', title = '3 Pt Field Goal Shooting')

g4 <- stats_all %>%
    ggplot(aes(x = FTP, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Free throw %', y = '', title = 'Free Throw Shooting')

g5 <- stats_all %>%
    ggplot(aes(x = ORP, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Offensive rebound %', y = '', title = 'Offensive Rebounding Efficiency')

g6 <- stats_all %>%
    ggplot(aes(x = DRP, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Defensive rebouding %', y = '', title = 'Defensive Rebounding Efficiency')

g7 <- stats_all %>%
    ggplot(aes(x = AST, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Assists', y = '', title = 'Assists per Game')

g8 <- stats_all %>%
    ggplot(aes(x = TO, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Turnovers', y = '', title = 'Turnovers per Game')

g9 <- stats_all %>%
    ggplot(aes(x = STL, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Steals', y = '', title = 'Steals per Game')

g10 <- stats_all %>%
    ggplot(aes(x = BLK, fill = Outcome)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c('darkblue', 'grey')) + 
    labs(x = 'Blocks', y = '', title = 'Blocks per Game')

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 2)

Unsurprisingly, we see that winning teams tend to have a higher mean (or lower in the case of turnover) in pretty much every metric. The last few plots are bit spikey due to the more discrete nature of the data.

We don’t have final game statistics until we have the game result, so we obviously can’t use these statistics in this form to predict the winners of tournament matchups. However, we can use regular season aggregate statistics to predict the winner in tournament matchups. Let’s take a look at that next.

Predictors of Tournament Success

One of the obvious predictors for how deep a team goes in the tournament would be regular season wins. Let’s see how regular season wins correlate to tournament progress each year.

wins_s <- seas_results[, .(rsW = .N), by = c('WTeamID', 'Season')]

wins_t <- tour_results[!(DayNum %in% c(134, 135)), .(tW = .N), by = c('WTeamID', 'Season')]


wins_teams <- wins_s[wins_t][teams]

wins_teams[!is.na(Season), ] %>%
    ggplot(aes(x = rsW, y = tW)) + 
    geom_point() + 
    geom_smooth(method = 'lm') + 
    facet_wrap( ~ as.factor(Season)) + 
    labs(
        x = 'Regular season wins', 
        y = 'Tournament wins', 
        title = 'Tournament Wins by Regular Season Wins')

In nearly every year, tournament wins is positively correlated with regular season wins. Of course, there are some exceptions - for example in 2000, the relationship is slightly negative! Single-elimination tournaments produce some variations as they leave little room for error. Sometimes strong favorites don’t get as far as expected.

The problem with using regular season wins is that in college basketball, not every team plays the same number of games in a regular season. Let’s do something similar to see if average scores during regular season are associated with better tournament progress.

wins <- seas_results[, .(n_games = .N, sum_score = sum(WScore)), by = c('WTeamID', 'Season')]

losses <- seas_results[, .(n_games = .N, sum_score = sum(LScore)), by = c('LTeamID', 'Season')]

all_games <- rbindlist(list(wins, losses))

all_games <- all_games[, .(rs_ppg = sum(sum_score) / sum(n_games)), by = c('WTeamID', 'Season')]

all_games[wins_t, on = c('WTeamID', 'Season')] %>%
    ggplot(aes(x = rs_ppg, y = tW)) + 
    geom_point() + 
    geom_smooth(method = 'lm') + 
    facet_wrap( ~ as.factor(Season)) + 
    labs(
        x = 'Regular season average score', 
        y = 'Tournament wins', 
        title = 'Tournament Wins by Regular Season Point per Game')

We can see that in each year, there is a positive relationship between regular season points per game and tournament wins. We can also see that the champion each year (as indicated by the point with 6 tournament wins) tends to have a ppg near the top of the range. There also appears to be an outlier. In 1990 there is a team that scored more than 120 points per game! It turns out this is not a bad data point. In 1990 Loyola Marymount was the highest scoring team in Division 1 history with 122 points per game!

Of course, going into the tournament, we know a team’s seed. How well do seeds predict tournament progress?

seeds[, .(Season, WTeamID = TeamID, seed_num = as.numeric(substr(Seed, 2, 3)))
      ][wins_t, on = c('Season', 'WTeamID')] %>%
    ggplot(aes(x = seed_num, y = tW)) + 
    geom_jitter(width = 0.2, height = 0.2) + 
    geom_smooth(method = 'lm') + 
    labs(
        x = 'Seed', 
        y = 'Tournament Wins', 
        title = 'Tournament Wins by Seed')

I’ve introduced some jiter to this plot to avoid overplotting. It exhibits a strong negative relationship between seed and tournament progress - the lower a team’s seed, the deeper they go into the tournament (as measured by tournament wins). We see that a 16 seed has never made it past the first round of the tournament. From the plot we can also determine that the lowest seed to ever win the tournament was a number 8. A vast majority of teams that have won the tournament since 1985 have been number 1 seeds.

We may also wonder how likely it is that a better-seeded (i.e. lower number) team will win any particular tournament matchup. Let’s look at the percentage of times the better-seeded team won by season.

tour_results_seeds <- seeds[, .(Season, WTeamID = TeamID, winner_seed = as.numeric(substr(Seed, 2, 3)))
                            ][tour_results, on = c('Season', 'WTeamID')
                              ][seeds[, .(Season, LTeamID = TeamID, loser_seed = as.numeric(substr(Seed, 2, 3)))
                                      ], on = c('Season', 'LTeamID')]



tour_results_seeds[Season != 2018, .(Season, low_seed_win = ifelse(winner_seed < loser_seed, 1, 0))
                   ][, sum(low_seed_win, na.rm = TRUE) / .N, by = Season] %>%
    ggplot(aes(x = reorder(Season, -V1), y = V1)) + 
    geom_point(color = 'darkblue', size = 2) + 
    labs(
        x = '', 
        y = '% of games in which better-seeded team won', 
        title = 'Better-seed winning percentage by year') + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

When examining these data by season, we see that the better-seeded team won games at a rate that varies between 0.79 in the 2007 tournament to approximately 0.59 in the 1999 tournament.

Now let’s examine the relationship between a team’s regular season win margin and its tournament performance.

seas_results[, .(avg_win_marg = mean(WScore - LScore)), by = c('WTeamID', 'Season')
             ][wins_t, on = c('WTeamID', 'Season')] %>%
    ggplot(aes(x = avg_win_marg, y = tW)) + 
    geom_point() + 
    geom_smooth(method = 'lm') +
    labs(
        x = 'Average regular season win margin', 
        y = 'Tournament wins', 
        title = 'Tournament Wins by Regular Season Win Margin') + 
    facet_wrap(~Season)

Now let’s move beyond the basic stats and use some of the box score data as well. To start, let’s create a standardized data frame in wide format with all of the teams regular season stats. We’ll create some additional statistics such as various shooting percentages, rebounds per game, steals per game, etc. Because of the format of the data, we first need to get the stats for the games winning teams and losing teams seperately. Then we will bind these row-wise and group by Season and TeamID to calculate the stats.

stats_season <- stats_all[, .(
    FGP = sum(FGM) / sum(FGA),
    FGP3 = sum(FGM3) / sum(FGA3),
    FTP = sum(FTM) / sum(FTA),
    ORPG = mean(OR),
    DRPG = mean(DR),
    ASPG = mean(AST),
    TOPG = mean(TO),
    STPG = mean(STL),
    BLPG = mean(BLK),
    PFPG = mean(PF),
    MORP = mean(ORP),
    MPOS = mean(POS))
    , by = c('TeamID', 'Season')]

Can we use a team’s regular season game statistics to predict tournament success. First let’s look at field goal % and free throw % during the regular season and see if these equate to tournament success. We’ll define success in the case as making the Final Four.

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = FGP, y = FTP, color = final_four)) + 
    geom_point(alpha = 0.6) +
    labs(
        x = 'Field goal %', 
        y = 'Free throw %', 
        title = 'Regular Season Shooting Performance of Tournament Teams') +
    scale_color_manual(values = c('darkgrey', 'steelblue'))

ggMarginal(g1, type = 'histogram', fill = 'steelblue')

We see that the distribution of field goal % appears to have a peak around 0.45. The distribution of free throw percentage peaks near 72%. Interestingly in terms of shooting %, there does not seem to be much of a difference between teams that make the Final Four and the rest of the tournament field in terms of their regular season performance; however it is hard to tell from this plot type. To double-check, let’s plot the densities of these two statistics for Final Four teams and the rest of the field.

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = FGP, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Field goal %', title = 'Regular Season Field Goal % of Tournament Teams')

g2 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = FTP, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Free throw %', title = 'Regular Season Free Throw % of Tournament Teams')

grid.arrange(g1, g2, ncol = 2)

From the density plots, it actually appears that Final Four teams do shoot better from the floor during the regular season. Non Final Four teams shoot around 0.45 during the regular season and Final Four teams seem to shoot around 0.475. It is very important to keep in mind however that the sample size for Final Four teams is much smaller than the sample size for the rest of the tournament field. Therefore its unclear whether we can consider this difference statistically significant. For free throw percentage, there does not appear to be much of a difference.

Let’s get a better idea of whether the difference in field goal percentage is real. We can use a two-sample t-test to determine if there is a difference in the sample means. Because the sample size of the two are different (and hence the variance), we can use Welch’s two-sample t-test.

fgp_noff <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4
                ][final_four == FALSE,  FGP]

fgp_ff <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4
                ][final_four == TRUE,  FGP]

t.test(fgp_noff, fgp_ff, alternative = 'two.sided', var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  fgp_noff and fgp_ff
## t = -2.6329, df = 71.732, p-value = 0.01036
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.015059158 -0.002081044
## sample estimates:
## mean of x mean of y 
## 0.4613265 0.4698966

When doing so, we get a test statistic of -3.1443 and a p-value of 0.002417. At the 95% significance level therefore, we can reject the null hypothesis of a zero difference in mean and accept evidence of the alternative hypothesis that there is a difference in the mean field goal percentage of Final Four teams and non-Final Four teams. That difference appears to be about one percentage point.

Now let’s do the same thing for rebounding performance.

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = DRPG, y = ORPG, color = final_four)) + 
    geom_point(alpha = 0.6) +
    labs(
        x = 'Defensive rebounds per game', 
        y = 'Offensive rebounds per game', 
        title = 'Regular Season Rebounding Performance of Tournament Teams') +
    scale_color_manual(values = c('darkgrey', 'steelblue')) 

ggMarginal(g1, type = 'histogram', fill = 'steelblue')

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = DRPG, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Defensive rebounds per game', title = 'Regular Season Rebounding of Tournament Teams')

g2 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = ORPG, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Offensive rebounds per game')

grid.arrange(g1, g2, ncol = 2)

In terms of defensive rebounding, there does not appear to be much separation between Final Four teams and the rest of the field. The same goes for offensive rebounding, however the appears to be a skew in the distribution for Final Four teams, perhaps an artifact of limited sample size.

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = TOPG, y = STPG, color = final_four)) + 
    geom_point(alpha = 0.6) +
    geom_smooth(aes(color = final_four), method = 'lm') +
    labs(
        x = 'Turnovers per game', 
        y = 'Steals per game', 
        title = 'Turnover Performance of Tournament Teams') +
    scale_color_manual(values = c('darkgrey', 'steelblue')) 

ggMarginal(g1, type = 'histogram', fill = 'steelblue')

The ratio of steals to turnovers is positive for all tournament teams, however the relatioship appears to be stronger for Final Four teams indicating that this ratio be be a good predictor of tournament success.

g1 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = TOPG, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Turnovers per game', title = 'Regular Season Turnovers of Tournament Teams')

g2 <- stats_season[wins_t, on = c(TeamID = 'WTeamID', 'Season'), nomatch = 0
              ][, final_four := tW >= 4] %>%
    ggplot(aes(x = STPG, fill = final_four)) + 
    geom_density(alpha = 0.6) + 
    labs(x = 'Steals per game',  title = 'Regular Season Steals of Tournament Teams')

grid.arrange(g1, g2, ncol = 2)

There seems to be some separation of means between all tournament teams and Final Four teams for regular season turnovers per game, however its inconclusive whether or not the difference is significant.

A Very Extensive NCAA Exploratory Analysis

Laura Edell

February 20, 2018

Table of Contents