Loading Packages

library(dplyr); library(data.table); library(reshape2)

Reading in the data

# Directory that holds the input CSV files. Every path below is built from it,
# so pointing the script at another location is a one-line change.
data.dir <- "/home/jcross/MarchMadness/data"

# Each file is read with data.table::fread().
TourneySeeds <- fread(file.path(data.dir, "NCAATourneySeeds.csv"))
SampleSubmission <- fread(file.path(data.dir, "SampleSubmissionStage1.csv"))
Seasons <- fread(file.path(data.dir, "Seasons.csv"))
Teams <- fread(file.path(data.dir, "Teams.csv"))
TourneySlots <- fread(file.path(data.dir, "NCAATourneySlots.csv"))
TourneyDetailedResults <- fread(file.path(data.dir, "NCAATourneyDetailedResults.csv"))
TourneyCompactResults <- fread(file.path(data.dir, "NCAATourneyCompactResults.csv"))

# TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
# SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
# Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
# Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
# TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
# TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")

A Quick Look at the Data

# Preview the seeds table: columns Season, Seed (codes such as "W01") and TeamID.
head(TourneySeeds)
##    Season Seed TeamID
## 1:   1985  W01   1207
## 2:   1985  W02   1210
## 3:   1985  W03   1228
## 4:   1985  W04   1260
## 5:   1985  W05   1374
## 6:   1985  W06   1208
# Preview the slots table: each Slot (e.g. "R1W1") lists a StrongSeed and a WeakSeed.
head(TourneySlots)
##    Season Slot StrongSeed WeakSeed
## 1:   1985 R1W1        W01      W16
## 2:   1985 R1W2        W02      W15
## 3:   1985 R1W3        W03      W14
## 4:   1985 R1W4        W04      W13
## 5:   1985 R1W5        W05      W12
## 6:   1985 R1W6        W06      W11
# Preview the submission template: ID is "<season>_<team1>_<team2>" (split on "_"
# further down) and Pred holds a placeholder value of 0.5.
head(SampleSubmission)
##                ID Pred
## 1: 2014_1107_1110  0.5
## 2: 2014_1107_1112  0.5
## 3: 2014_1107_1113  0.5
## 4: 2014_1107_1124  0.5
## 5: 2014_1107_1140  0.5
## 6: 2014_1107_1142  0.5
# Preview the seasons table: Season, DayZero and the RegionW..RegionZ names.
head(Seasons)
##    Season    DayZero RegionW   RegionX   RegionY   RegionZ
## 1:   1985 10/29/1984    East      West   Midwest Southeast
## 2:   1986 10/28/1985    East   Midwest Southeast      West
## 3:   1987 10/27/1986    East Southeast   Midwest      West
## 4:   1988  11/2/1987    East   Midwest Southeast      West
## 5:   1989 10/31/1988    East      West   Midwest Southeast
## 6:   1990 10/30/1989    East   Midwest Southeast      West
# Preview the team lookup table: TeamID, TeamName, FirstD1Season and LastD1Season.
head(Teams)
##    TeamID    TeamName FirstD1Season LastD1Season
## 1:   1101 Abilene Chr          2014         2019
## 2:   1102   Air Force          1985         2019
## 3:   1103       Akron          1985         2019
## 4:   1104     Alabama          1985         2019
## 5:   1105 Alabama A&M          2000         2019
## 6:   1106  Alabama St          1985         2019
# Preview the detailed results: the compact-results columns plus per-game box-score
# counts in W*- and L*-prefixed columns.
# NOTE(review): this table is only printed in the visible part of the script; the
# seed model is built from TourneyCompactResults.
head(TourneyDetailedResults)
##    Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1:   2003    134    1421     92    1411     84    N     1   32   69    11
## 2:   2003    136    1112     80    1436     51    N     0   31   66     7
## 3:   2003    136    1113     84    1272     71    N     0   31   59     6
## 4:   2003    136    1141     79    1166     73    N     0   29   53     3
## 5:   2003    136    1143     76    1301     74    N     1   27   64     7
## 6:   2003    136    1163     58    1140     53    N     0   17   52     4
##    WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1:    29   17   26  14  30   17  12    5    3  22   29   67    12    31
## 2:    23   11   14  11  36   22  16   10    7   8   20   64     4    16
## 3:    14   16   22  10  27   18   9    7    4  19   25   69     7    28
## 4:     7   18   25  11  20   15  18   13    1  19   27   60     7    17
## 5:    20   15   23  18  20   17  13    8    2  14   25   56     9    21
## 6:    14   20   27  12  29    8  14    3    8  16   20   64     2    17
##    LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1:   14   31  17  28   16  15    5    0  22
## 2:    7    7   8  26   12  17   10    3  15
## 3:   14   21  20  22   11  12    2    5  18
## 4:   12   17  14  17   20  21    6    6  21
## 5:   15   20  10  26   16  14    5    8  19
## 6:   11   13  15  26   11  11    8    4  22
# Preview the compact results: Season, DayNum, WTeamID/WScore, LTeamID/LScore,
# WLoc and NumOT.
head(TourneyCompactResults)
##    Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1:   1985    136    1116     63    1234     54    N     0
## 2:   1985    136    1120     59    1345     58    N     0
## 3:   1985    136    1207     68    1250     43    N     0
## 4:   1985    136    1229     58    1425     55    N     0
## 5:   1985    136    1242     49    1325     38    N     0
## 6:   1985    136    1246     66    1449     58    N     0

Extracting seeds for each team

# Strip every alphabetic character from the seed code so only the digits remain
# ("W01" -> "01"), then keep just the columns needed for the joins.
# The original pattern "[A-Z+a-z]" carried a stray "+" inside the bracket
# expression (a literal plus there, not a quantifier); the [[:alpha:]] class
# states the intent directly and does not depend on locale-specific ranges.
# SeedNum is still a character string here; it is converted with as.numeric()
# later in the script.
TourneySeeds <- TourneySeeds %>%
    mutate(SeedNum = gsub("[[:alpha:]]", "", Seed)) %>%
    select(Season, TeamID, SeedNum)

head(TourneySeeds)
##   Season TeamID SeedNum
## 1   1985   1207      01
## 2   1985   1210      02
## 3   1985   1228      03
## 4   1985   1260      04
## 5   1985   1374      05
## 6   1985   1208      06
# Break each submission ID ("<season>_<team1>_<team2>") into its three parts
# with reshape2::colsplit().
id.parts <- colsplit(
    SampleSubmission$ID,
    pattern = "_",
    names = c("season", "team1", "team2")
)

# Keep the untouched ID next to its parts. cbind() names that first column
# after the expression passed to it, i.e. `SampleSubmission$ID`.
games.to.predict <- cbind(SampleSubmission$ID, id.parts)
head(games.to.predict)
##   SampleSubmission$ID season team1 team2
## 1      2014_1107_1110   2014  1107  1110
## 2      2014_1107_1112   2014  1107  1112
## 3      2014_1107_1113   2014  1107  1113
## 4      2014_1107_1124   2014  1107  1124
## 5      2014_1107_1140   2014  1107  1140
## 6      2014_1107_1142   2014  1107  1142

Joining Games with Team Seeds

# Build one seed lookup per side of the matchup, already carrying the key and
# value names used in games.to.predict. Naming the seed columns up front avoids
# relying on the ".x"/".y" suffixes and on column positions 5:6, which would
# silently break if TourneySeeds or games.to.predict gained a column.
# Seeds are converted from character ("01") to numeric (1) here.
team1.seeds <- TourneySeeds %>%
    transmute(season = Season, team1 = TeamID, team1seed = as.numeric(SeedNum))
team2.seeds <- TourneySeeds %>%
    transmute(season = Season, team2 = TeamID, team2seed = as.numeric(SeedNum))

# Left joins keep every game to predict; a team with no seed row gets NA.
games.to.predict <- games.to.predict %>%
    left_join(team1.seeds, by = c("season", "team1")) %>%
    left_join(team2.seeds, by = c("season", "team2"))

# The first column holds the original submission ID; give it its final name.
colnames(games.to.predict)[1] <- "Id"

head(games.to.predict)
##               Id season team1 team2 team1seed team2seed
## 1 2014_1107_1110   2014  1107  1110        16        15
## 2 2014_1107_1112   2014  1107  1112        16         1
## 3 2014_1107_1113   2014  1107  1113        16        10
## 4 2014_1107_1124   2014  1107  1124        16         6
## 5 2014_1107_1140   2014  1107  1140        16        10
## 6 2014_1107_1142   2014  1107  1142        16        16

Joining (compact) Results with Team Seeds

# Attach the winner's seed first. No column names clash yet, so the new column
# is simply called SeedNum.
temp <- TourneyCompactResults %>%
    as.data.frame() %>%
    left_join(TourneySeeds, by = c("Season", "WTeamID" = "TeamID"))

# Attach the loser's seed. The two SeedNum columns now clash, so dplyr suffixes
# them: SeedNum.x is the winner's seed and SeedNum.y is the loser's.
compact.results <- temp %>%
    left_join(TourneySeeds, by = c("Season", "LTeamID" = "TeamID"))
head(compact.results)
##   Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT SeedNum.x
## 1   1985    136    1116     63    1234     54    N     0        09
## 2   1985    136    1120     59    1345     58    N     0        11
## 3   1985    136    1207     68    1250     43    N     0        01
## 4   1985    136    1229     58    1425     55    N     0        09
## 5   1985    136    1242     49    1325     38    N     0        03
## 6   1985    136    1246     66    1449     58    N     0        12
##   SeedNum.y
## 1        08
## 2        06
## 3        16
## 4        08
## 5        14
## 6        05

Every win for one team is a loss for the other team…

# View each game from the winner's side: team1 is the winner, so team1win = 1.
set1 <- compact.results %>%
    transmute(team1seed = SeedNum.x, team2seed = SeedNum.y, team1win = 1)

# View the same games from the loser's side: team1 is the loser, so team1win = 0.
set2 <- compact.results %>%
    transmute(team1seed = SeedNum.y, team2seed = SeedNum.x, team1win = 0)

# Stack both views (every game appears twice) and turn the seed strings
# ("01", "16", ...) into numbers.
full.set <- rbind(set1, set2) %>%
    mutate(
        team1seed = as.numeric(team1seed),
        team2seed = as.numeric(team2seed)
    )

## Calculate the difference in seeds (team2seed minus team1seed)

# seed_diff = team2's seed minus team1's seed, added as a new last column to
# both the training rows and the games to be predicted.
full.set$seed_diff <- full.set$team2seed - full.set$team1seed
games.to.predict$seed_diff <- games.to.predict$team2seed - games.to.predict$team1seed

Building a Simple Linear Model Based on the Difference in Team Seeds

# Ordinary least squares of the 0/1 outcome team1win on seed_diff; the fitted
# value is used further down as the predicted probability that team1 wins.
# full.set contains every game twice with the two teams swapped, so the data
# are symmetric around seed_diff = 0, which matches the 0.5 intercept in the
# summary below.
# NOTE(review): lm() does not constrain fitted values to [0, 1], and the
# mirrored rows double the row count, so the reported standard errors are
# presumably understated -- confirm before relying on them.
m.seed.diff <- lm(team1win~ seed_diff, data=full.set)
summary(m.seed.diff)
## 
## Call:
## lm(formula = team1win ~ seed_diff, data = full.set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9958 -0.3678  0.0000  0.3678  0.9958 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.5000000  0.0065597   76.22   <2e-16 ***
## seed_diff   0.0330563  0.0008699   38.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4335 on 4366 degrees of freedom
## Multiple R-squared:  0.2485, Adjusted R-squared:  0.2484 
## F-statistic:  1444 on 1 and 4366 DF,  p-value: < 2.2e-16

Making Predictions using the Team Seeds Model

# Score every game to predict with the seed-difference model.
seed.pred <- predict(m.seed.diff, games.to.predict)

# A linear model can return values below 0 or above 1; clamp them so the
# submission only ever contains valid probabilities. NA predictions stay NA.
games.to.predict$Pred <- pmin(pmax(seed.pred, 0), 1)

# Write only the two submission columns (Id, Pred) to the working directory.
write.csv(games.to.predict %>% select(Id, Pred), "seed_submission.csv", row.names = FALSE)

head(games.to.predict)
##               Id season team1 team2 team1seed team2seed seed_diff
## 1 2014_1107_1110   2014  1107  1110        16        15        -1
## 2 2014_1107_1112   2014  1107  1112        16         1       -15
## 3 2014_1107_1113   2014  1107  1113        16        10        -6
## 4 2014_1107_1124   2014  1107  1124        16         6       -10
## 5 2014_1107_1140   2014  1107  1140        16        10        -6
## 6 2014_1107_1142   2014  1107  1142        16        16         0
##          Pred
## 1 0.466943679
## 2 0.004155192
## 3 0.301662077
## 4 0.169436794
## 5 0.301662077
## 6 0.500000000