Uploading Data Files
#get only the seeds of 2017
seeds2017 <- seeds %>%
filter(Season == "2017")
#get only the tour results of 2017
tour_results2017 <- tour_results %>%
filter(Season == "2017")
#make the seed just a number
seeds2017$Seed = as.numeric(substr(seeds2017$Seed, 2, 3))
#make the names match, change back after
seeds2017$WTeamID <- seeds2017$TeamID
seeds2017$LTeamID <- seeds2017$TeamID
#create a new Winning Seed table of what you want to join
seedsWinningSeed <- seeds2017[, c(2,4)]
#make the column the winning seed for the join
seedsWinningSeed$WSeed <- seedsWinningSeed$Seed
#remove old column that says Seed
seedsWinningSeed$Seed <- NULL
#create a new Losing Seed table of what you want to join
seedsLosingSeed <- seeds2017[, c(2,5)]
#make the column the losing seed for the join
seedsLosingSeed$LSeed <- seedsLosingSeed$Seed
#remove old column that says Seed
seedsLosingSeed$Seed <- NULL
#remove column I made that was unneccesary now that I'm going to join them
tour_results2017$WTeamSeed <- NULL
#join the winning seed column
tour_results2017 <- tour_results2017 %>%
left_join(seedsWinningSeed, by = "WTeamID")
#join the losing seed column
tour_results2017 <- tour_results2017 %>%
left_join(seedsLosingSeed, by = "LTeamID")
#Create a column that shows seed difference
tour_results2017$SeedDiff <- tour_results2017$WSeed - tour_results2017$LSeed
#68 games in the data set or the tournament.
tour_results2017 %>%
filter(SeedDiff < 0)
#48 games the better ranked team won.
48/68
#or 70% of the time.
tour_results2017 %>%
filter(SeedDiff > 0)
#only 14 were upsets
14/68
#or 20% of the time.
#get the tour results of all 2000s tournaments
tour_results2000s <- tour_results %>%
filter(Season >= 2000)
#get only the seeds of the 2000s
seeds2000s <- seeds %>%
filter(Season >= 2000)
#make the seed just a number
seeds2000s$Seed = as.numeric(substr(seeds2000s$Seed, 2, 3))
#create two new tables so you can join
seeds2000WinningSeed <- seeds2000s
seeds2000LosingSeed <- seeds2000s
#change the winning seed column names in each for the join
seeds2000WinningSeed$WTeamID <- seeds2000WinningSeed$TeamID
seeds2000WinningSeed$TeamID <- NULL
seeds2000WinningSeed$WSeed <- seeds2000WinningSeed$Seed
seeds2000WinningSeed$Seed <- NULL
#join the winning seed to tour results
tour_results2000s <- tour_results2000s %>%
left_join(seeds2000WinningSeed, by = c("WTeamID", "Season"))
#change the losing seed column names in each for the join
seeds2000LosingSeed$LTeamID <- seeds2000LosingSeed$TeamID
seeds2000LosingSeed$TeamID <- NULL
seeds2000LosingSeed$LSeed <- seeds2000LosingSeed$Seed
seeds2000LosingSeed$Seed <- NULL
#join the losing seed to tour results
tour_results2000s <- tour_results2000s %>%
left_join(seeds2000LosingSeed, by = c("LTeamID", "Season"))
#Create a column that shows seed difference
tour_results2000s$SeedDiff <- tour_results2000s$WSeed - tour_results2000s$LSeed
#1172 Games Played
tour_results2000s %>%
filter(SeedDiff > 0)
#319 were upsets
319/1172
##27% of Games Played were upsets. Signifigantly higher than 20%, but confirms my theory.
#The Higher (Better) Ranked team wins the majority of the games.
#this shows the # of games in the 2000s by their seed differential
#Winning Team Seed # - Losing Team Seed #
#So countneg15 shows how many games were played where a 1 seed beat a 16 seed
#1-16 = -15
countneg15 <- length(which(tour_results2000s$SeedDiff == -15))
#72
countneg14 <- length(which(tour_results2000s$SeedDiff == -14))
#0
(countneg13 <- length(which(tour_results2000s$SeedDiff == -13)))
#67
(countneg12 <- length(which(tour_results2000s$SeedDiff == -12)))
#3
(countneg11 <- length(which(tour_results2000s$SeedDiff == -11)))
#73
(countneg10 <- length(which(tour_results2000s$SeedDiff == -10)))
#4
(countneg9 <- length(which(tour_results2000s$SeedDiff == -9)))
#66
(countneg8 <- length(which(tour_results2000s$SeedDiff == -8)))
#92
(countneg7 <- length(which(tour_results2000s$SeedDiff == -7)))
#84
(countneg6 <- length(which(tour_results2000s$SeedDiff == -6)))
#5
(countneg5 <- length(which(tour_results2000s$SeedDiff == -5)))
#78
(countneg4 <- length(which(tour_results2000s$SeedDiff == -4)))
#42
(countneg3 <- length(which(tour_results2000s$SeedDiff == -3)))
#98
(countneg2 <- length(which(tour_results2000s$SeedDiff == -2)))
#13
(countneg1 <- length(which(tour_results2000s$SeedDiff == -1)))
#105
count0 <- length(which(tour_results2000s$SeedDiff == 0))
#51
count1 <- length(which(tour_results2000s$SeedDiff == 1))
#84
count2 <- length(which(tour_results2000s$SeedDiff == 2))
#11
#At this point I realized I did not have to type the value in the next line for what
#I was planning to do.
count3 <- length(which(tour_results2000s$SeedDiff == 3))
count4 <- length(which(tour_results2000s$SeedDiff == 4))
count5 <- length(which(tour_results2000s$SeedDiff == 5))
count6 <- length(which(tour_results2000s$SeedDiff == 6))
count7 <- length(which(tour_results2000s$SeedDiff == 7))
count8 <- length(which(tour_results2000s$SeedDiff == 8))
count9 <- length(which(tour_results2000s$SeedDiff == 9))
count10 <- length(which(tour_results2000s$SeedDiff == 10))
count11 <- length(which(tour_results2000s$SeedDiff == 11))
count12 <- length(which(tour_results2000s$SeedDiff == 12))
count13 <- length(which(tour_results2000s$SeedDiff == 13))
count14 <- length(which(tour_results2000s$SeedDiff == 14))
count15 <- length(which(tour_results2000s$SeedDiff == 15))
#The DifferenceX will show the Probability % that the better seed will win a given match up.
#By dividing how many games the better seeded team won and dividing it by the sum of total games played
#between those two seeds.
(Difference15 <- countneg15/(countneg15+count15))
Difference14 <- countneg14/(countneg14+count14)
Difference13 <- countneg13/(countneg13+count13)
Difference12 <- countneg12/(countneg12+count12)
Difference11 <- countneg11/(countneg11+count11)
Difference10 <- countneg10/(countneg10+count10)
Difference9 <- countneg9/(countneg9+count9)
Difference8 <- countneg8/(countneg8+count8)
Difference7 <- countneg7/(countneg7+count7)
Difference6 <- countneg6/(countneg6+count6)
Difference5 <- countneg5/(countneg5+count5)
Difference4 <- countneg4/(countneg4+count4)
Difference3 <- countneg3/(countneg3+count3)
Difference2 <- countneg2/(countneg2+count2)
Difference1 <- countneg1/(countneg1+count1)
Difference0 <- .5
#I will use these difference values as my probability predictions for the games.
#input 2018 Team Data
SampleSubmission18 <- read.csv("SampleSubmissionStage2.csv")
#create the column for Winning team ID
SampleSubmission18$WTeamID <- substr(SampleSubmission18$ID, 6, 9)
#create the column for Losing Team ID
SampleSubmission18$LTeamID <- substr(SampleSubmission18$ID, 11, 14)
#Now I want to add the seeds for the winning team
#First upload the seeds
Seeds18 <- read.csv("Seeds2018.csv")
#Clean up the Seeds Column
Seeds18$Seed = as.numeric(substr(Seeds18$Seed, 2, 3))
#create two new tables so you can join
Seeds18WinningSeed <- Seeds18
Seeds18LosingSeed <- Seeds18
#change the winning seed column names in each for the join
Seeds18WinningSeed$WTeamID <- Seeds18WinningSeed$TeamID
Seeds18WinningSeed$TeamID <- NULL
Seeds18WinningSeed$WSeed <- Seeds18WinningSeed$Seed
Seeds18WinningSeed$Seed <- NULL
#join the winning seed to tour results
SampleSubmission18 <- SampleSubmission18 %>%
left_join(Seeds18WinningSeed, by = "WTeamID")
#issue becaues both WTeamIDs are not numeric
Seeds18WinningSeed$WTeamID <- as.numeric(Seeds18WinningSeed$WTeamID)
SampleSubmission18$WTeamID <- as.numeric(SampleSubmission18$WTeamID)
#trying again
SampleSubmission18 <- SampleSubmission18 %>%
left_join(Seeds18WinningSeed, by = "WTeamID")
#Ok Seed for the winnning team is in. Now need to add losing seed.
#change the losing seed column names in each for the join
Seeds18LosingSeed$LTeamID <- Seeds18LosingSeed$TeamID
Seeds18LosingSeed$TeamID <- NULL
Seeds18LosingSeed$LSeed <- Seeds18LosingSeed$Seed
Seeds18LosingSeed$Seed <- NULL
#issue becaues both WTeamIDs are not numeric
Seeds18LosingSeed$LTeamID <- as.numeric(Seeds18LosingSeed$LTeamID)
SampleSubmission18$LTeamID <- as.numeric(SampleSubmission18$LTeamID)
#join the winning seed to tour results
SampleSubmission18 <- SampleSubmission18 %>%
left_join(Seeds18LosingSeed, by = "LTeamID")
#Remove the unncessary season columns that were added on accident.
SampleSubmission18$Season.x <- NULL
SampleSubmission18$Season.y <- NULL
#Create a column that shows seed difference
SampleSubmission18$SeedDiff <- SampleSubmission18$WSeed - SampleSubmission18$LSeed
#I now want to assign my prediction values based off of what I
#calculated before in the Differencex variables.
Difference15
#1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -15] <- 1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -14] <- 1
(Difference13)
# .93
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -13] <- .93
Difference12
#1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -12] <- 1
Difference11
#.90
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -11] <- .9
Difference10
#.67
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -10] <- .67
Difference9
#.80
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -9] <- .80
Difference8
#.75
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -8] <- .75
Difference7
#.68
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -7] <- .68
Difference6
#.55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -6] <- .55
Difference5
#.63
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -5] <- .63
Difference4
#.68
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -4] <- .68
Difference3
#.64
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -3] <- .64
Difference2
#.54
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -2] <- .54
Difference1
#.55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -1] <- .55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 0] <- .5
#Realized I need to add the opposite values for the positive seed differences.
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 15] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 14] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 13] <- .07
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 12] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 11] <- .1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 10] <- .33
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 9] <- .20
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 8] <- .25
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 7] <- .32
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 6] <- .45
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 5] <- .37
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 4] <- .32
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 3] <- .36
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 2] <- .46
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 1] <- .45
Time to put it in a format for submission on Kaggle!
#Now I need to put into a database that only has the Game ID and the Pred %.
TJGrayMensTournament18Submission <- SampleSubmission18[,1:2]
#Create a CSV and submit it.
write.csv(TJGrayMensTournament18Submission, "TJGrayMensTournament18Submission.csv", row.names = F)