2018 March Madness Prediction Results

Load Libraries and Data

# Load necessary libraries
library(dplyr)
library(data.table)
library(knitr)

# Model output saved by the prediction step
Results <- readRDS("saferesults.Rds")

# Bracket slots and seeds restricted to the 2018 season, plus the team lookup
tourneyslots <- filter(fread("NCAATourneySlots.csv"), Season == 2018)
tourneySeeds <- filter(fread("NCAATourneySeeds.csv"), Season == 2018)
teams <- fread("teams.csv")

Create Bracket

# Attach team names to seed info
tourneySeeds <- tourneySeeds %>%
    left_join(teams, by = "TeamID") %>%
    select(Season, Seed, TeamID, TeamName)

# Manually set the play-in winners.
# Each entry maps the StrongSeed of a first-round slot to the seed code of the
# play-in winner it faces. Matching on StrongSeed instead of a fixed
# [row, column] position keeps the assignment correct if the slots file is
# re-ordered or gains columns.
play_in_winners <- c(W01 = "W16b", W06 = "W11a", X06 = "X11b", Z01 = "Z16b")
for (strong_seed in names(play_in_winners)) {
    slot_row <- which(tourneyslots$StrongSeed == strong_seed)
    # Fail loudly rather than patch zero or several slots
    stopifnot(length(slot_row) == 1L)
    tourneyslots$WeakSeed[slot_row] <- play_in_winners[[strong_seed]]
}

# Complete Bracket: resolve both seeds of every slot to a team.
# Slots whose seeds are not yet known teams keep NA team columns.
seed_lookup <- select(tourneySeeds, -Season)
bracket <- tourneyslots %>%
    left_join(seed_lookup, by = c("StrongSeed" = "Seed")) %>%
    left_join(seed_lookup, by = c("WeakSeed" = "Seed"))

Making Predictions ATS

A team’s predicted spread must be more than 2 points beyond the sportsbook spread for it to have a greater than 55% chance of winning ATS (against the spread), which is the minimum winning percentage needed to break even. The ModelChoice variable is “No choice” if the model prediction is within 2 points of the sportsbook spread; otherwise it names the team the model predicts has a > 55% chance to cover the spread.

# Append model predicted spread
predictionbracket <- bracket %>%
    left_join(
        select(Results, TeamName.x, TeamName.y, Prediction, Win_Prob),
        by = c("TeamName.x", "TeamName.y")
    )

# Get the rows of lesser seeds (matchups with no prediction in this team order)
na_rows <- which(is.na(predictionbracket$Prediction))

# Impute prediction data from lesser seeds.
# The same matchups seen from the other team's side: the spread flips sign and
# the win probability becomes its complement. Columns are addressed by name so
# the step does not depend on column positions or on a fixed number of rows.
reversed <- data.frame(
    TeamName.x = Results$TeamName.y,
    TeamName.y = Results$TeamName.x,
    rev_prediction = -Results$Prediction,
    rev_win_prob = 1 - Results$Win_Prob,
    stringsAsFactors = FALSE
)

predictionbracket <- predictionbracket %>%
    # na_matches = "never": slots without known teams must not pick up a match
    left_join(
        reversed,
        by = c("TeamName.x", "TeamName.y"),
        na_matches = "never"
    ) %>%
    mutate(
        # Only fill values the direct join left missing
        Prediction = coalesce(Prediction, rev_prediction),
        Win_Prob = coalesce(Win_Prob, rev_win_prob)
    ) %>%
    select(-rev_prediction, -rev_win_prob)

# All Round 1 Spreads as of 12:30pm Thursday
# Sign convention: positive when the strong seed is favored, negative when it
# is the underdog.
spreads <- local({
    # One sportsbook line per game, keyed by team name
    book_lines <- c(
        "St Bonaventure" = -3.5,
        "Long Island" = -4.5,
        "Arizona St" = 1,
        "NC Central" = -4.5,
        "Rhode Island" = 1.5,
        "Tennessee" = 11.5,
        "Gonzaga" = 13.5,
        "Kansas" = 14.5,
        "Duke" = 20,
        "Miami FL" = 2,
        "Ohio St" = 8,
        "Seton Hall" = 3,
        "Villanova" = 23,
        "Kentucky" = 5,
        "Houston" = 4,
        "Texas Tech" = 11,
        "Virginia Tech" = 2.5,
        "Arizona" = 8.5,
        "Michigan" = 10,
        "Florida" = 5.5,
        "Texas A&M" = 2.5,
        "Purdue" = 20.5,
        "Wichita St" = 11.5,
        "Cincinnati" = 14,
        "North Carolina" = 19.5,
        "Arkansas" = -1.5,
        "West Virginia" = 10.5,
        "Nevada" = 0,
        "Creighton" = 1,
        "Michigan St" = 14.5,
        "Xavier" = 19.5,
        "Auburn" = 9,
        "Virginia" = 21,
        "TCU" = 4,
        "Missouri" = -1.5,
        "Clemson" = 4.5
    )
    # unname() so the team names do not become the data frame's row names
    data.frame(
        Team = names(book_lines),
        Spread = unname(book_lines),
        stringsAsFactors = FALSE
    )
})

# Append spreads on to bracket
# diff > 0 means the model likes the strong seed (TeamName.x) more than the
# sportsbook does; diff < 0 means it likes the weak seed (TeamName.y) more.
predictionbracket <- predictionbracket %>%
    left_join(spreads, by = c("TeamName.x" = "Team")) %>%
    mutate(
        diff = Prediction - Spread,
        # Determine who model would bet on.
        # Vectorized over every row instead of looping over a fixed 1:36, so
        # the result does not depend on the number of games. Rows without a
        # spread or prediction (diff is NA) match no branch and stay NA
        # instead of raising an error.
        ModelChoice = case_when(
            diff < -2 ~ TeamName.y,
            diff > 2 ~ TeamName.x,
            diff >= -2 & diff <= 2 ~ "No choice"
        )
    )

Game Results

# Input Game Results Manually
# Eventually I need to scrape this data from web
game_result <- local({
    # One row per finished game: c(Team.x.score, Team.y.score), keyed by team
    final_scores <- rbind(
        "St Bonaventure" = c(65, 58),
        "Long Island" = c(61, 71),
        "Arizona St" = c(56, 60),
        "NC Central" = c(46, 64),
        "Rhode Island" = c(83, 78),
        "Tennessee" = c(73, 47),
        "Gonzaga" = c(68, 64),
        "Kansas" = c(76, 60),
        "Duke" = c(89, 67),
        "Miami FL" = c(62, 64),
        "Ohio St" = c(81, 73),
        "Seton Hall" = c(94, 83),
        "Villanova" = c(87, 61),
        "Kentucky" = c(78, 73),
        "Houston" = c(67, 65),
        "Texas Tech" = c(70, 60),
        "Virginia Tech" = c(83, 86),
        "Arizona" = c(68, 89),
        "Michigan" = c(61, 47),
        "Florida" = c(77, 62)
    )
    # unname() so the team names do not become the data frame's row names
    data.frame(
        Team = rownames(final_scores),
        Team.x.score = unname(final_scores[, 1]),
        Team.y.score = unname(final_scores[, 2]),
        stringsAsFactors = FALSE
    )
})

# Append Game Results to Bracket
finalbracket <- predictionbracket %>%
    left_join(game_result, by = c("TeamName.x" = "Team")) %>%
    mutate(
        # Determine who covered the spread.
        # The final margin (Team.x.score - Team.y.score) is compared with
        # Spread: below it the weak seed covered, above it the strong seed
        # covered, equal is a push. Games without scores yield NA in every
        # comparison and therefore stay NA. Vectorized over all rows rather
        # than looping over a fixed 1:36.
        ATSWinner = case_when(
            Team.x.score - Team.y.score < Spread ~ TeamName.y,
            Team.x.score - Team.y.score > Spread ~ TeamName.x,
            Team.x.score - Team.y.score == Spread ~ "Push"
        )
    )

Accuracy of Predictions

# Determine if Model Prediction was Correct, Incorrect, or Not Made
# Branch order mirrors the precedence of the original rules:
#   1. no result yet, or the model made no pick  -> NA
#   2. the game landed exactly on the spread      -> "Push"
#   3. otherwise compare the pick with the ATS winner
# Vectorized over all rows rather than looping over a fixed 1:36; a missing
# ModelChoice yields NA instead of stopping with an error.
finalbracket <- finalbracket %>%
    mutate(
        Accuracy = case_when(
            is.na(ATSWinner) | ModelChoice == "No choice" ~ NA_character_,
            ATSWinner == "Push" ~ "Push",
            ModelChoice == ATSWinner ~ "Correct",
            ModelChoice != ATSWinner ~ "Incorrect"
        )
    )

Final Results

Note: The Prediction variable outputs how many points the StrongSeed is expected to beat the WeakSeed by. So Prediction will be 3 if the StrongSeed team is favored by 3 points. This is the opposite of conventional wagering notation (-3). The Spread variable is not the conventional spread, but gives how many points the StrongSeed team is favored over the WeakSeed team by. The Win_Prob variable is the probability of the StrongSeed team winning the game straight up.

# TABLE OF ALL RESULTS
# Keep only slots whose strong-seed team is known, then pick the report
# columns in display order.
output_table <- finalbracket %>%
    filter(!is.na(TeamName.x)) %>%
    select(
        StrongSeed, WeakSeed,
        TeamName.x, TeamName.y,
        Spread, Prediction, diff, Win_Prob,
        ModelChoice,
        Team.x.score, Team.y.score,
        ATSWinner, Accuracy
    )
kable(output_table)

StrongSeed	WeakSeed	TeamName.x	TeamName.y	Spread	Prediction	diff	Win_Prob	ModelChoice	Team.x.score	Team.y.score	ATSWinner	Accuracy
W11a	W11b	St Bonaventure	UCLA	-3.5	2.6	6.1	0.576	St Bonaventure	65	58	St Bonaventure	Correct
W16a	W16b	Long Island	Radford	-4.5	0.9	5.4	0.525	Long Island	61	71	Radford	Incorrect
X11a	X11b	Arizona St	Syracuse	1.0	1.3	0.3	0.539	No choice	56	60	Syracuse	NA
Z16a	Z16b	NC Central	TX Southern	-4.5	9.3	13.8	0.766	NC Central	46	64	TX Southern	Incorrect
W01	W16b	Villanova	Radford	23.0	9.5	-13.5	0.774	Radford	87	61	Villanova	Incorrect
W02	W15	Purdue	CS Fullerton	20.5	13.4	-7.1	0.885	CS Fullerton	NA	NA	NA	NA
W03	W14	Texas Tech	SF Austin	11.0	11.4	0.4	0.828	No choice	70	60	SF Austin	NA
W04	W13	Wichita St	Marshall	11.5	4.3	-7.2	0.623	Marshall	NA	NA	NA	NA
W05	W12	West Virginia	Murray St	10.5	1.1	-9.4	0.532	Murray St	NA	NA	NA	NA
W06	W11a	Florida	St Bonaventure	5.5	8.0	2.5	0.730	Florida	77	62	Florida	Correct
W07	W10	Arkansas	Butler	-1.5	-1.4	0.1	0.459	No choice	NA	NA	NA	NA
W08	W09	Virginia Tech	Alabama	2.5	9.2	6.7	0.765	Virginia Tech	83	86	Alabama	Incorrect
X01	X16	Kansas	Penn	14.5	6.8	-7.7	0.696	Penn	76	60	Kansas	Incorrect
X02	X15	Duke	Iona	20.0	15.0	-5.0	0.929	Iona	89	67	Duke	Incorrect
X03	X14	Michigan St	Bucknell	14.5	7.8	-6.7	0.723	Bucknell	NA	NA	NA	NA
X04	X13	Auburn	Col Charleston	9.0	-0.1	-9.1	0.498	Col Charleston	NA	NA	NA	NA
X05	X12	Clemson	New Mexico St	4.5	-0.3	-4.8	0.492	New Mexico St	NA	NA	NA	NA
X06	X11b	TCU	Syracuse	4.0	1.0	-3.0	0.529	Syracuse	NA	NA	NA	NA
X07	X10	Rhode Island	Oklahoma	1.5	4.7	3.2	0.635	Rhode Island	83	78	Rhode Island	Correct
X08	X09	Seton Hall	NC State	3.0	1.1	-1.9	0.531	No choice	94	83	Seton Hall	NA
Y01	Y16	Virginia	UMBC	21.0	3.5	-17.5	0.599	UMBC	NA	NA	NA	NA
Y02	Y15	Cincinnati	Georgia St	14.0	7.1	-6.9	0.703	Georgia St	NA	NA	NA	NA
Y03	Y14	Tennessee	Wright St	11.5	5.4	-6.1	0.654	Wright St	73	47	Tennessee	Incorrect
Y04	Y13	Arizona	Buffalo	8.5	-7.1	-15.6	0.297	Buffalo	68	89	Buffalo	Correct
Y05	Y12	Kentucky	Davidson	5.0	1.7	-3.3	0.548	Davidson	78	73	Push	Push
Y06	Y11	Miami FL	Loyola-Chicago	2.0	6.7	4.7	0.693	Miami FL	62	64	Loyola-Chicago	Incorrect
Y07	Y10	Nevada	Texas	0.0	6.2	6.2	0.678	Nevada	NA	NA	NA	NA
Y08	Y09	Creighton	Kansas St	1.0	-1.1	-2.1	0.468	Kansas St	NA	NA	NA	NA
Z01	Z16b	Xavier	TX Southern	19.5	15.6	-3.9	0.947	TX Southern	NA	NA	NA	NA
Z02	Z15	North Carolina	Lipscomb	19.5	13.1	-6.4	0.878	Lipscomb	NA	NA	NA	NA
Z03	Z14	Michigan	Montana	10.0	5.2	-4.8	0.650	Montana	61	47	Michigan	Incorrect
Z04	Z13	Gonzaga	UNC Greensboro	13.5	7.8	-5.7	0.725	UNC Greensboro	68	64	UNC Greensboro	Correct
Z05	Z12	Ohio St	S Dakota St	8.0	8.9	0.9	0.756	No choice	81	73	Push	NA
Z06	Z11	Houston	San Diego St	4.0	5.1	1.1	0.646	No choice	67	65	San Diego St	NA
Z07	Z10	Texas A&M	Providence	2.5	2.3	-0.2	0.565	No choice	NA	NA	NA	NA
Z08	Z09	Missouri	Florida St	-1.5	-11.0	-9.5	0.186	Florida St	NA	NA	NA	NA

# OVERALL RECORD
# Tally of graded picks; NA entries (no pick or no result) are not counted
table(output_table[["Accuracy"]])

## 
##   Correct Incorrect      Push 
##         5         9         1

Discussion

Play-In Games

I didn’t even think about the play-in games, but I feel that my model should have been able to predict them fairly well. Turns out the model would have gone 1-2 there, oh well.

Day 1

My day 1 record was 4-7-1. I knew from the beginning there is something wrong with my predictions for the top seeds, especially 1s and 2s, so I stayed away from following my model on any of those. Other kagglers have had this same issue and have manually input values for them, some trouble I didn’t want to bother with until I had the rest of the model running well. I’ll focus on that at some point in the future.

So, after eliminating the 1 and 2 seeds Villanova, Kansas, and Duke from the conversation, along with the “No choice” games where my model prediction was within two points of the spread, the only games the model advised me to wager on were:

URI -2.5 (W)
Wright St +12 (L)
UNC Greensboro +12.5 (W)
Miami FL -2 (L)
Davidson +5 (P)
Va Tech -2.5 (L)
Buffalo +9 (W)
Florida -5.5 (W)
Montana +11 (L)

The model’s ATS record today on those games was 4-4-1. Overall (disregarding 1s and 2s), the model’s record for the tournament is at 5-6-1. Small sample size, so looking forward to seeing tomorrow’s results.

For Day 2, I will be continuing to disregard the 1 and 2 seeds. The model’s recommended wagers are:

# Day 2 Recommended Wagers
# Games not yet graded where the model disagrees with the book by > 2 points
Day2 <- filter(output_table, abs(diff) > 2, is.na(ATSWinner))
kable(Day2)

StrongSeed	WeakSeed	TeamName.x	TeamName.y	Spread	Prediction	diff	Win_Prob	ModelChoice	Team.x.score	Team.y.score	ATSWinner	Accuracy
W02	W15	Purdue	CS Fullerton	20.5	13.4	-7.1	0.885	CS Fullerton	NA	NA	NA	NA
W04	W13	Wichita St	Marshall	11.5	4.3	-7.2	0.623	Marshall	NA	NA	NA	NA
W05	W12	West Virginia	Murray St	10.5	1.1	-9.4	0.532	Murray St	NA	NA	NA	NA
X03	X14	Michigan St	Bucknell	14.5	7.8	-6.7	0.723	Bucknell	NA	NA	NA	NA
X04	X13	Auburn	Col Charleston	9.0	-0.1	-9.1	0.498	Col Charleston	NA	NA	NA	NA
X05	X12	Clemson	New Mexico St	4.5	-0.3	-4.8	0.492	New Mexico St	NA	NA	NA	NA
X06	X11b	TCU	Syracuse	4.0	1.0	-3.0	0.529	Syracuse	NA	NA	NA	NA
Y01	Y16	Virginia	UMBC	21.0	3.5	-17.5	0.599	UMBC	NA	NA	NA	NA
Y02	Y15	Cincinnati	Georgia St	14.0	7.1	-6.9	0.703	Georgia St	NA	NA	NA	NA
Y07	Y10	Nevada	Texas	0.0	6.2	6.2	0.678	Nevada	NA	NA	NA	NA
Y08	Y09	Creighton	Kansas St	1.0	-1.1	-2.1	0.468	Kansas St	NA	NA	NA	NA
Z01	Z16b	Xavier	TX Southern	19.5	15.6	-3.9	0.947	TX Southern	NA	NA	NA	NA
Z02	Z15	North Carolina	Lipscomb	19.5	13.1	-6.4	0.878	Lipscomb	NA	NA	NA	NA
Z08	Z09	Missouri	Florida St	-1.5	-11.0	-9.5	0.186	Florida St	NA	NA	NA	NA

I will be avoiding the games involving the 1 and 2 seeds Purdue, Virginia, Cincinnati, Xavier, and UNC; for the remaining games, the model advises taking:

Marshall +11.5
Murray St +10.5
Bucknell +14.5 (I’m not putting my own money on this, but will consider it an outcome to evaluate the model on)
Col. Charleston +9
New Mexico St +4.5
Syracuse +4
Kansas St +1 (I don’t like this personally, but also will evaluate as valid)
Nevada EVEN
Florida St -1.5

Best bets in that crowd are Florida St, Murray St, and College of Charleston. I’ll be putting some $ on a few of those moneylines too.