Loading Packages
library(dplyr); library(data.table); library(reshape2)
Reading in the Data
# Directory that holds the competition CSV files. Set the
# MARCH_MADNESS_DATA_DIR environment variable to point somewhere else;
# when it is not set, the original location is used, so the paths read
# below are the same as before.
data_dir <- Sys.getenv(
  "MARCH_MADNESS_DATA_DIR",
  unset = "/home/jcross/MarchMadness/data"
)

# Read each input table with data.table::fread().
TourneySeeds <- fread(file.path(data_dir, "NCAATourneySeeds.csv"))
SampleSubmission <- fread(file.path(data_dir, "SampleSubmissionStage1.csv"))
Seasons <- fread(file.path(data_dir, "Seasons.csv"))
Teams <- fread(file.path(data_dir, "Teams.csv"))
TourneySlots <- fread(file.path(data_dir, "NCAATourneySlots.csv"))
TourneyDetailedResults <- fread(
  file.path(data_dir, "NCAATourneyDetailedResults.csv")
)
TourneyCompactResults <- fread(
  file.path(data_dir, "NCAATourneyCompactResults.csv")
)
# TourneySeeds <- fread("/home/jcross/MarchMadness/data/WNCAATourneySeeds.csv")
# SampleSubmission <- fread("/home/jcross/MarchMadness/data/WSampleSubmissionStage1.csv")
# Seasons <- fread("/home/jcross/MarchMadness/data/WSeasons.csv")
# Teams <- fread("/home/jcross/MarchMadness/data/WTeams.csv")
# TourneySlots <- fread("/home/jcross/MarchMadness/data/WNCAATourneySlots.csv")
# TourneyCompactResults <- fread("/home/jcross/MarchMadness/data/WNCAATourneyCompactResults.csv")
A Quick Look at the Data
# Preview the seeds table: columns Season, Seed and TeamID, where Seed is a
# code such as "W01".
head(TourneySeeds)
## Season Seed TeamID
## 1: 1985 W01 1207
## 2: 1985 W02 1210
## 3: 1985 W03 1228
## 4: 1985 W04 1260
## 5: 1985 W05 1374
## 6: 1985 W06 1208
# Preview the slots table: each Slot (e.g. "R1W1") lists a StrongSeed and a
# WeakSeed code for a Season.
head(TourneySlots)
## Season Slot StrongSeed WeakSeed
## 1: 1985 R1W1 W01 W16
## 2: 1985 R1W2 W02 W15
## 3: 1985 R1W3 W03 W14
## 4: 1985 R1W4 W04 W13
## 5: 1985 R1W5 W05 W12
## 6: 1985 R1W6 W06 W11
# Preview the submission template: an ID such as "2014_1107_1110" and a
# placeholder Pred of 0.5.
head(SampleSubmission)
## ID Pred
## 1: 2014_1107_1110 0.5
## 2: 2014_1107_1112 0.5
## 3: 2014_1107_1113 0.5
## 4: 2014_1107_1124 0.5
## 5: 2014_1107_1140 0.5
## 6: 2014_1107_1142 0.5
# Preview the seasons table: a DayZero date and the names of regions W, X, Y
# and Z for each Season.
head(Seasons)
## Season DayZero RegionW RegionX RegionY RegionZ
## 1: 1985 10/29/1984 East West Midwest Southeast
## 2: 1986 10/28/1985 East Midwest Southeast West
## 3: 1987 10/27/1986 East Southeast Midwest West
## 4: 1988 11/2/1987 East Midwest Southeast West
## 5: 1989 10/31/1988 East West Midwest Southeast
## 6: 1990 10/30/1989 East Midwest Southeast West
# Preview the teams table: TeamID, TeamName, FirstD1Season and LastD1Season.
head(Teams)
## TeamID TeamName FirstD1Season LastD1Season
## 1: 1101 Abilene Chr 2014 2019
## 2: 1102 Air Force 1985 2019
## 3: 1103 Akron 1985 2019
## 4: 1104 Alabama 1985 2019
## 5: 1105 Alabama A&M 2000 2019
## 6: 1106 Alabama St 1985 2019
# Preview the detailed results: the compact result columns plus per-game
# statistic columns prefixed W and L. This table is only displayed; the code
# shown in this file does not use it for the seed model.
head(TourneyDetailedResults)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1: 2003 134 1421 92 1411 84 N 1 32 69 11
## 2: 2003 136 1112 80 1436 51 N 0 31 66 7
## 3: 2003 136 1113 84 1272 71 N 0 31 59 6
## 4: 2003 136 1141 79 1166 73 N 0 29 53 3
## 5: 2003 136 1143 76 1301 74 N 1 27 64 7
## 6: 2003 136 1163 58 1140 53 N 0 17 52 4
## WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1: 29 17 26 14 30 17 12 5 3 22 29 67 12 31
## 2: 23 11 14 11 36 22 16 10 7 8 20 64 4 16
## 3: 14 16 22 10 27 18 9 7 4 19 25 69 7 28
## 4: 7 18 25 11 20 15 18 13 1 19 27 60 7 17
## 5: 20 15 23 18 20 17 13 8 2 14 25 56 9 21
## 6: 14 20 27 12 29 8 14 3 8 16 20 64 2 17
## LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1: 14 31 17 28 16 15 5 0 22
## 2: 7 7 8 26 12 17 10 3 15
## 3: 14 21 20 22 11 12 2 5 18
## 4: 12 17 14 17 20 21 6 6 21
## 5: 15 20 10 26 16 14 5 8 19
## 6: 11 13 15 26 11 11 8 4 22
# Preview the compact results: Season, DayNum, WTeamID/WScore,
# LTeamID/LScore, WLoc and NumOT.
head(TourneyCompactResults)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1: 1985 136 1116 63 1234 54 N 0
## 2: 1985 136 1120 59 1345 58 N 0
## 3: 1985 136 1207 68 1250 43 N 0
## 4: 1985 136 1229 58 1425 55 N 0
## 5: 1985 136 1242 49 1325 38 N 0
## 6: 1985 136 1246 66 1449 58 N 0
Joining Games with Team Seeds
# Reduce each seed code (e.g. "W01") to its digits, kept as text ("01"), and
# keep only Season, TeamID and SeedNum so that every join with TourneySeeds
# adds exactly one seed column. The later steps read SeedNum.x / SeedNum.y,
# so this column has to exist before the first join. The guard makes the
# chunk safe to re-run after TourneySeeds has already been reduced.
if ("Seed" %in% colnames(TourneySeeds)) {
  TourneySeeds <- TourneySeeds %>%
    mutate(SeedNum = gsub("[A-Za-z]", "", Seed)) %>%
    select(Season, TeamID, SeedNum)
}

# Build one row per matchup to predict by splitting every submission ID
# ("<season>_<team1>_<team2>") into its three parts.
games.to.predict <- data.frame(
  Id = SampleSubmission$ID,
  colsplit(
    SampleSubmission$ID,
    pattern = "_",
    names = c("season", "team1", "team2")
  ),
  stringsAsFactors = FALSE
)

# Look up team1's seed, then team2's seed. The second join finds SeedNum
# already present, so dplyr names the two columns SeedNum.x (team1) and
# SeedNum.y (team2); they are renamed by name rather than by position.
temp <- left_join(
  games.to.predict, TourneySeeds,
  by = c("season" = "Season", "team1" = "TeamID")
)
games.to.predict <- left_join(
  temp, TourneySeeds,
  by = c("season" = "Season", "team2" = "TeamID")
) %>%
  rename(team1seed = SeedNum.x, team2seed = SeedNum.y) %>%
  # Seeds arrive as text such as "01"; the model needs numbers.
  mutate(
    team1seed = as.numeric(team1seed),
    team2seed = as.numeric(team2seed)
  )
head(games.to.predict)
## Id season team1 team2 team1seed team2seed
## 1 2014_1107_1110 2014 1107 1110 16 15
## 2 2014_1107_1112 2014 1107 1112 16 1
## 3 2014_1107_1113 2014 1107 1113 16 10
## 4 2014_1107_1124 2014 1107 1124 16 6
## 5 2014_1107_1140 2014 1107 1140 16 10
## 6 2014_1107_1142 2014 1107 1142 16 16
Joining (compact) Results with Team Seeds
# Attach the winning team's seed record to every tournament game.
temp <- TourneyCompactResults %>%
  as.data.frame() %>%
  left_join(TourneySeeds, by = c("Season", "WTeamID" = "TeamID"))

# Attach the losing team's seed record as well; seed columns that clash with
# the ones from the first join receive dplyr's .x (winner) / .y (loser)
# suffixes.
compact.results <- temp %>%
  left_join(TourneySeeds, by = c("Season", "LTeamID" = "TeamID"))
head(compact.results)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT SeedNum.x
## 1 1985 136 1116 63 1234 54 N 0 09
## 2 1985 136 1120 59 1345 58 N 0 11
## 3 1985 136 1207 68 1250 43 N 0 01
## 4 1985 136 1229 58 1425 55 N 0 09
## 5 1985 136 1242 49 1325 38 N 0 03
## 6 1985 136 1246 66 1449 58 N 0 12
## SeedNum.y
## 1 08
## 2 06
## 3 16
## 4 08
## 5 14
## 6 05
Every win for one team is a loss for the other team…
# Winner's point of view: team1 is the winning team, so team1win is 1.
set1 <- compact.results %>%
  select(team1seed = SeedNum.x, team2seed = SeedNum.y) %>%
  mutate(team1win = 1)

# Loser's point of view: the same games with the teams swapped, team1win 0.
set2 <- compact.results %>%
  select(team1seed = SeedNum.y, team2seed = SeedNum.x) %>%
  mutate(team1win = 0)

# Stack both views, turn the seed text into numbers, and calculate the
# difference in seeds (positive when team1 has the lower seed number).
full.set <- rbind(set1, set2) %>%
  mutate(
    team1seed = as.numeric(team1seed),
    team2seed = as.numeric(team2seed),
    seed_diff = team2seed - team1seed
  )

# The matchups to predict need the same feature.
games.to.predict <- games.to.predict %>%
  mutate(seed_diff = team2seed - team1seed)
Building a Simple Linear Model Based on the Difference in Team Seeds
# Ordinary least squares on a 0/1 outcome: team1win is regressed on seed_diff,
# so the fitted value is used as team1's win probability. Nothing in lm()
# keeps fitted values inside [0, 1].
# NOTE(review): presumably fine here because seeds appear to run 1-16, which
# keeps the fitted values in range -- confirm.
m.seed.diff <- lm(team1win~ seed_diff, data=full.set)
# Print the coefficient table and fit statistics.
summary(m.seed.diff)
##
## Call:
## lm(formula = team1win ~ seed_diff, data = full.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9958 -0.3678 0.0000 0.3678 0.9958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5000000 0.0065597 76.22 <2e-16 ***
## seed_diff 0.0330563 0.0008699 38.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4335 on 4366 degrees of freedom
## Multiple R-squared: 0.2485, Adjusted R-squared: 0.2484
## F-statistic: 1444 on 1 and 4366 DF, p-value: < 2.2e-16
Making Predictions using the Team Seeds Model
# Score every matchup with the fitted seed-difference model.
games.to.predict[["Pred"]] <- predict(m.seed.diff, newdata = games.to.predict)

# Write only the Id and Pred columns to the submission file, without row
# names.
write.csv(
  select(games.to.predict, Id, Pred),
  file = "seed_submission.csv",
  row.names = FALSE
)
head(games.to.predict)
## Id season team1 team2 team1seed team2seed seed_diff
## 1 2014_1107_1110 2014 1107 1110 16 15 -1
## 2 2014_1107_1112 2014 1107 1112 16 1 -15
## 3 2014_1107_1113 2014 1107 1113 16 10 -6
## 4 2014_1107_1124 2014 1107 1124 16 6 -10
## 5 2014_1107_1140 2014 1107 1140 16 10 -6
## 6 2014_1107_1142 2014 1107 1142 16 16 0
## Pred
## 1 0.466943679
## 2 0.004155192
## 3 0.301662077
## 4 0.169436794
## 5 0.301662077
## 6 0.500000000