Creating the Women’s Default 2018 Predictions

library(dplyr); library(data.table); library(reshape2)
library(lme4) 

TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")

TourneySeeds <- TourneySeeds %>% 
  mutate(SeedNum = gsub("[A-Z+a-z]", "", Seed)) %>% select(Season, TeamID, SeedNum)

## Regular Season Results

RegularSeasonCompactResults <- fread("/home/jcross/MarchMadness/data/WRegularSeasonCompactResults.csv",
                                     data.table=FALSE)

RegularSeasonCompactResults <- RegularSeasonCompactResults %>% mutate(home = case_when(
  WLoc == "N" ~ 0,
  WLoc == "H" ~ 1,
  WLoc == "A" ~ -1,
  TRUE ~ 0
))


# Making a Data Frame of "Game to Predict" using the Sample Submission

games.to.predict <- cbind(SampleSubmission$ID, colsplit(SampleSubmission$ID, pattern = "_", names = c('season', 'team1', 'team2')))   

temp <- left_join(games.to.predict, TourneySeeds, by=c("season"="Season", "team1"="TeamID"))
games.to.predict <- left_join(temp, TourneySeeds, by=c("season"="Season", "team2"="TeamID"))
colnames(games.to.predict)[c(1,5:6)] <- c("Id", "team1seed", "team2seed")
games.to.predict <- games.to.predict %>% mutate(team1seed = as.numeric(team1seed), team2seed = as.numeric(team2seed))
games.to.predict <- games.to.predict %>% mutate(seed_diff = team2seed-team1seed)
#games.to.predict$home <- 0

load('/home/jcross/MarchMadness/data/all_slots.rda') 
head(all_slots)

games.to.predict <- left_join(games.to.predict %>% filter(season==2018), 
                 all_slots %>% filter(women==1, season==2018) %>% select(-women, season), by=c("team1"="teamid_1", "team2"="teamid_2"))

games.to.predict <- games.to.predict %>% mutate(
  home = case_when(
    (round == 1 | round == 2) & team1seed <= 4 ~ 1,
    (round == 1 | round == 2) & team2seed <= 4 ~ -1,
    TRUE ~ 0
  )
)

season <- 2018
sub1 <- RegularSeasonCompactResults %>% 
  filter(Season==season) %>% 
  mutate(team1=as.factor(WTeamID), team2=as.factor(LTeamID), outcome=1, ptdiff=WScore-LScore) %>% 
  select(team1, team2, home, outcome, ptdiff)
sub2 <- RegularSeasonCompactResults %>% 
  filter(Season==season) %>% 
  mutate(team1=as.factor(LTeamID), team2=as.factor(WTeamID), home=-1*home, outcome=0, ptdiff=LScore-WScore) %>% 
  select(team1, team2, home, outcome, ptdiff)
reg.results <- rbind(sub1, sub2)


head(reg.results) #to take a look at what we've created


m.ptdiff <- lmer(ptdiff ~ home +  (1 | team1) + (1 | team2), data = reg.results)

re <- ranef(m.ptdiff)$team1
teamquality = data.frame(TeamID= as.numeric(row.names(re)),
                         quality=re[,"(Intercept)"])
left_join(teamquality %>% top_n(40, quality) %>% 
            arrange(desc(quality)), Teams)

summary(m.ptdiff) # residual 11.4 points


pred.pt.diffs <- predict(m.ptdiff, games.to.predict, 
                         type="response")
games.to.predict$Pred <- pnorm(pred.pt.diffs, mean=0, sd=11.2)

write.csv(games.to.predict %>% select(Id, Pred), '/home/jcross/MarchMadness/data/women_default_2018.csv', 
          row.names=FALSE)

Modifying the Bracket

library(kaggleNCAA); library(dplyr)
dat <- parseBracket('/home/jcross/MarchMadness/data/women_default_2018.csv', w=1) 


head(dat)

Adding Randomness

One way of modifying a set of predictions is to add random noise.

If we add randomness to a probabilities we risk forecasting something outside of [0,1]. Instead, we’ll convert our probabilities into Log Odds, add randomness to the Log Odds and then convert back into a probability.

In the code below, I’ll add random noise centered on 0 with a standard deviation of 0.5 to the log odds. You can change this code to add more or less random noise to your predictions.

dat <- dat %>% mutate(logodds = log(pred/(1-pred)), 
               logodds_plus_noise = logodds + rnorm(n(), 0, 0.5),
                 pred_plus_noise = exp(logodds_plus_noise)/(1+ exp(logodds_plus_noise)))

head(dat)

library(ggplot2)
dat %>% ggplot(aes(pred, pred_plus_noise))+geom_point()

Picking a Winner

Let’s say that I want to pick Baylor as a winner. First, I’ll find Baylor’s team number.

teams <- read.csv('/home/jcross/MarchMadness/data/WTeams.csv') 
teams %>% filter(TeamName=="Baylor")
# or just use View(teams)

Then, whenever Baylor is team #1, we’ll change the predicted probability to 1 and whenever Baylor is team #2, we’ll change the predicted probability to 0.

dat <- dat %>% mutate(pred_plus_pick = ifelse(teamid_1 == 3124, 1, pred),
                pred_plus_pick = ifelse(teamid_2 == 3124, 0, pred_plus_pick)
                )

dat %>% ggplot(aes(pred, pred_plus_pick))+geom_point()

Picking two Winners

Maybe, I think that Baylor will win all of its games until it faces Connecticut and then Connecticut will defeat it and will everything from there. I can code this with:

teams %>% filter(TeamName=="Connecticut")

dat <- dat %>% mutate(pred_plus_pick = ifelse(teamid_1 == 3124, 1, pred),
                pred_plus_pick = ifelse(teamid_2 == 3124, 0, pred_plus_pick),
                pred_plus_pick = ifelse(teamid_1 == 3163, 1, pred_plus_pick),
                pred_plus_pick = ifelse(teamid_2 == 3163, 0, pred_plus_pick)
                )

dat %>% ggplot(aes(pred, pred_plus_pick))+geom_point()

Picking Round One Games

Maybe I want my gambles to come early by picking one or more first round games. The advantage of gambling on first round games is that, unlike all other games, I know for certain that these games will take place. To do this, I need to identify which games are in which rounds. I’ll do that loading “all slots” data from our data folder.

load('/home/jcross/MarchMadness/data/all_slots.rda') 
head(all_slots)

dat <- left_join(dat, all_slots, by=c("women", "season", "teamid_1", "teamid_2"))

head(dat)

Now, I can find all of the close first round games (removing those where one team had to play in)

dat %>% filter(round==1, teamid_1_playedin == 0, teamid_1_playedin == 0, pred> 0.4, pred < 0.6) %>% arrange(pred)

I’m going to gamble and guess at the outcomes of the two closest match-ups

dat <- dat %>% mutate(pred_plus_pick = ifelse(teamid_1==3125 & teamid_2 ==3181, 1, pred), #picking 3125 to win
               pred_plus_pick = ifelse(teamid_1==3177 & teamid_2 ==3328, 0, pred_plus_pick) #picking 3177 to win
               )
dat %>% ggplot(aes(pred, pred_plus_pick))+geom_point()

Picking the Region Z (Lexignton) to Win it All

In 2018 the regions were Albany, Spokane, KansasCity and Lexington. Lexigton is region Z in our data set

tail(Seasons)

Let’s extract the region from each seed:

dat<- dat %>% 
    mutate(region_1 = gsub("[0-9+a-z]", "", seed_1),
           region_2 = gsub("[0-9+a-z]", "", seed_2)
           ) 
head(dat)

Now, to pick the Lexington winner (whoever it is) I’ll pick the Z region team to win whenever it’s playing a non-Z region team. (Note: Equivalently, we could pick Z region teams to win all round 5 and round 6 games.)

dat <- dat %>% mutate(pred_plus_pick = ifelse(region_1 == "Z" & region_2 != "Z", 1, pred),
               pred_plus_pick = ifelse(region_2 == "Z" & region_1 != "Z", 0, pred_plus_pick)
               )
dat %>% ggplot(aes(pred, pred_plus_pick))+geom_point()

Challenge and Report: Create 10 Brackets

After you’ve altered your bracket you will need to make sure that it’s back in the Kaggle-appropriate format and then write it to a .csv file.

submission_form <- dat %>% mutate(Id = paste(season, teamid_1, teamid_2, sep="_")) %>% select(Id, Pred=pred_plus_pick)

head(submission_form)

write.csv(submission_form, "west_must_win.csv", row.names=FALSE)

Chalenge:

Create 10 named sets of predictions for the 2018 Women’s Tournament. I will simulate this tournament 10,000 times using the probabilities in “/home/jcross/MarchMadness/data/women_default_2018.csv” and score each of your 10 brackets for each of these tournaments. I will also score 400 additional brackets that use the default bracket along with added random noise.