Uploading Data Files

I want to analyze how a team’s seeding will affect the outcome of a game.

2017 Analysis

#get only the seeds of 2017
seeds2017 <- seeds %>%
  filter(Season == "2017")
#get only the tour results of 2017
tour_results2017 <- tour_results %>%
  filter(Season == "2017")

#make the seed just a number
seeds2017$Seed = as.numeric(substr(seeds2017$Seed, 2, 3))
#make the names match, change back after
seeds2017$WTeamID <- seeds2017$TeamID
seeds2017$LTeamID <- seeds2017$TeamID

#create a new Winning Seed table of what you want to join
seedsWinningSeed <- seeds2017[, c(2,4)]

#make the column the winning seed for the join
seedsWinningSeed$WSeed <- seedsWinningSeed$Seed 

#remove old column that says Seed
seedsWinningSeed$Seed <- NULL

#create a new Losing Seed table of what you want to join
seedsLosingSeed <- seeds2017[, c(2,5)]

#make the column the losing seed for the join
seedsLosingSeed$LSeed <- seedsLosingSeed$Seed 

#remove old column that says Seed
seedsLosingSeed$Seed <- NULL

#remove column I made that was unneccesary now that I'm going to join them
tour_results2017$WTeamSeed <- NULL


#join the winning seed column
tour_results2017 <- tour_results2017 %>% 
  left_join(seedsWinningSeed, by = "WTeamID")

#join the losing seed column
tour_results2017 <- tour_results2017 %>% 
  left_join(seedsLosingSeed, by = "LTeamID")


#Create a column that shows seed difference 
tour_results2017$SeedDiff <- tour_results2017$WSeed - tour_results2017$LSeed

#68 games in the data set or the tournament. 

tour_results2017 %>% 
  filter(SeedDiff < 0)
#48 games the better ranked team won. 
48/68
#or 70% of the time. 

tour_results2017 %>% 
       filter(SeedDiff > 0)
#only 14 were upsets 
14/68
#or 20% of the time.

According to the 2017 Tournament there were only upsets in 20% of the games.

I’m now going to analyze all the Tournaments from the 2000s.

2000’s Analysis

#get the tour results of all 2000s tournaments 
tour_results2000s <- tour_results %>%
  filter(Season >= 2000)

#get only the seeds of the 2000s
seeds2000s <- seeds %>%
  filter(Season >= 2000)

#make the seed just a number
seeds2000s$Seed = as.numeric(substr(seeds2000s$Seed, 2, 3))


#create two new tables so you can join
seeds2000WinningSeed <- seeds2000s
seeds2000LosingSeed <- seeds2000s

#change the winning seed column names in each for the join
seeds2000WinningSeed$WTeamID <- seeds2000WinningSeed$TeamID
seeds2000WinningSeed$TeamID <- NULL
seeds2000WinningSeed$WSeed <- seeds2000WinningSeed$Seed
seeds2000WinningSeed$Seed <- NULL

#join the winning seed to tour results
tour_results2000s <- tour_results2000s %>% 
  left_join(seeds2000WinningSeed, by = c("WTeamID", "Season"))

#change the losing seed column names in each for the join
seeds2000LosingSeed$LTeamID <- seeds2000LosingSeed$TeamID
seeds2000LosingSeed$TeamID <- NULL
seeds2000LosingSeed$LSeed <- seeds2000LosingSeed$Seed
seeds2000LosingSeed$Seed <- NULL

#join the losing seed to tour results
tour_results2000s <- tour_results2000s %>% 
  left_join(seeds2000LosingSeed, by = c("LTeamID", "Season"))

#Create a column that shows seed difference 
tour_results2000s$SeedDiff <- tour_results2000s$WSeed - tour_results2000s$LSeed

#1172 Games Played
tour_results2000s %>% 
       filter(SeedDiff > 0)
#319 were upsets

319/1172

##27% of Games Played were upsets. Signifigantly higher than 20%, but confirms my theory. 
#The Higher (Better) Ranked team wins the majority of the games.

#this shows the # of games in the 2000s by their seed differential 
#Winning Team Seed # - Losing Team Seed #
#So countneg15 shows how many games were played where a 1 seed beat a 16 seed 
#1-16 = -15 

countneg15 <- length(which(tour_results2000s$SeedDiff == -15))
#72
countneg14 <- length(which(tour_results2000s$SeedDiff == -14))
#0
(countneg13 <- length(which(tour_results2000s$SeedDiff == -13)))
#67
(countneg12 <- length(which(tour_results2000s$SeedDiff == -12)))
#3
(countneg11 <- length(which(tour_results2000s$SeedDiff == -11)))
#73
(countneg10 <- length(which(tour_results2000s$SeedDiff == -10)))
#4
(countneg9 <- length(which(tour_results2000s$SeedDiff == -9)))
#66
(countneg8 <- length(which(tour_results2000s$SeedDiff == -8)))
#92
(countneg7 <- length(which(tour_results2000s$SeedDiff == -7)))
#84
(countneg6 <- length(which(tour_results2000s$SeedDiff == -6)))
#5
(countneg5 <- length(which(tour_results2000s$SeedDiff == -5)))
#78
(countneg4 <- length(which(tour_results2000s$SeedDiff == -4)))
#42
(countneg3 <- length(which(tour_results2000s$SeedDiff == -3)))
#98
(countneg2 <- length(which(tour_results2000s$SeedDiff == -2)))
#13
(countneg1 <- length(which(tour_results2000s$SeedDiff == -1)))
#105
count0 <- length(which(tour_results2000s$SeedDiff == 0))
#51
count1 <- length(which(tour_results2000s$SeedDiff == 1))
#84
count2 <- length(which(tour_results2000s$SeedDiff == 2))
#11
#At this point I realized I did not have to type the value in the next line for what 
#I was planning to do. 
count3 <- length(which(tour_results2000s$SeedDiff == 3))
count4 <- length(which(tour_results2000s$SeedDiff == 4))
count5 <- length(which(tour_results2000s$SeedDiff == 5))
count6 <- length(which(tour_results2000s$SeedDiff == 6))
count7 <- length(which(tour_results2000s$SeedDiff == 7))
count8 <- length(which(tour_results2000s$SeedDiff == 8))
count9 <- length(which(tour_results2000s$SeedDiff == 9))
count10 <- length(which(tour_results2000s$SeedDiff == 10))
count11 <- length(which(tour_results2000s$SeedDiff == 11))
count12 <- length(which(tour_results2000s$SeedDiff == 12))
count13 <- length(which(tour_results2000s$SeedDiff == 13))
count14 <- length(which(tour_results2000s$SeedDiff == 14))
count15 <- length(which(tour_results2000s$SeedDiff == 15))

#The DifferenceX will show the Probability % that the better seed will win a given match up. 
#By dividing how many games the better seeded team won and dividing it by the sum of total games played
#between those two seeds. 
(Difference15 <- countneg15/(countneg15+count15))
Difference14 <- countneg14/(countneg14+count14)
Difference13 <- countneg13/(countneg13+count13)
Difference12 <- countneg12/(countneg12+count12)
Difference11 <- countneg11/(countneg11+count11)
Difference10 <- countneg10/(countneg10+count10)
Difference9 <- countneg9/(countneg9+count9)
Difference8 <- countneg8/(countneg8+count8)
Difference7 <- countneg7/(countneg7+count7)
Difference6 <- countneg6/(countneg6+count6)
Difference5 <- countneg5/(countneg5+count5)
Difference4 <- countneg4/(countneg4+count4)
Difference3 <- countneg3/(countneg3+count3)
Difference2 <- countneg2/(countneg2+count2)
Difference1 <- countneg1/(countneg1+count1)
Difference0 <- .5

#I will use these difference values as my probability predictions for the games.

Creating My Predictions

2018 Data


#input 2018 Team Data
SampleSubmission18 <- read.csv("SampleSubmissionStage2.csv")

Need to make the data compatible with what I have been working on.


#create the column for Winning team ID
SampleSubmission18$WTeamID <- substr(SampleSubmission18$ID, 6, 9)

#create the column for Losing Team ID
SampleSubmission18$LTeamID <- substr(SampleSubmission18$ID, 11, 14)

#Now I want to add the seeds for the winning team 
#First upload the seeds
Seeds18 <- read.csv("Seeds2018.csv")
#Clean up the Seeds Column 
Seeds18$Seed = as.numeric(substr(Seeds18$Seed, 2, 3))

#create two new tables so you can join
Seeds18WinningSeed <- Seeds18
Seeds18LosingSeed <- Seeds18

#change the winning seed column names in each for the join
Seeds18WinningSeed$WTeamID <- Seeds18WinningSeed$TeamID
Seeds18WinningSeed$TeamID <- NULL
Seeds18WinningSeed$WSeed <- Seeds18WinningSeed$Seed
Seeds18WinningSeed$Seed <- NULL


#join the winning seed to tour results
SampleSubmission18 <- SampleSubmission18 %>% 
  left_join(Seeds18WinningSeed, by = "WTeamID")
#issue becaues both WTeamIDs are not numeric 
Seeds18WinningSeed$WTeamID <- as.numeric(Seeds18WinningSeed$WTeamID)
SampleSubmission18$WTeamID <- as.numeric(SampleSubmission18$WTeamID)
#trying again
SampleSubmission18 <- SampleSubmission18 %>% 
  left_join(Seeds18WinningSeed, by = "WTeamID")

#Ok Seed for the winnning team is in. Now need to add losing seed. 

#change the losing seed column names in each for the join
Seeds18LosingSeed$LTeamID <- Seeds18LosingSeed$TeamID
Seeds18LosingSeed$TeamID <- NULL
Seeds18LosingSeed$LSeed <- Seeds18LosingSeed$Seed
Seeds18LosingSeed$Seed <- NULL

#issue becaues both WTeamIDs are not numeric 
Seeds18LosingSeed$LTeamID <- as.numeric(Seeds18LosingSeed$LTeamID)
SampleSubmission18$LTeamID <- as.numeric(SampleSubmission18$LTeamID)

#join the winning seed to tour results
SampleSubmission18 <- SampleSubmission18 %>% 
  left_join(Seeds18LosingSeed, by = "LTeamID")

#Remove the unncessary season columns that were added on accident. 
SampleSubmission18$Season.x <- NULL
SampleSubmission18$Season.y <- NULL

#Create a column that shows seed difference 
SampleSubmission18$SeedDiff <- SampleSubmission18$WSeed - SampleSubmission18$LSeed



#I now want to assign my prediction values based off of what I
#calculated before in the Differencex variables. 

Difference15
#1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -15] <- 1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -14] <- 1

(Difference13)
# .93
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -13] <- .93
Difference12
#1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -12] <- 1
Difference11
#.90
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -11] <- .9
Difference10
#.67
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -10] <- .67
Difference9
#.80
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -9] <- .80
Difference8
#.75
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -8] <- .75
Difference7
#.68
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -7] <- .68
Difference6
#.55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -6] <- .55
Difference5
#.63 
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -5] <- .63
Difference4
#.68
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -4] <- .68
Difference3
#.64
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -3] <- .64
Difference2
#.54
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -2] <- .54
Difference1
#.55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == -1] <- .55
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 0] <- .5


#Realized I need to add the opposite values for the positive seed differences. 
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 15] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 14] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 13] <- .07
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 12] <- 0
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 11] <- .1
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 10] <- .33
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 9] <- .20
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 8] <- .25
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 7] <- .32
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 6] <- .45
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 5] <- .37
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 4] <- .32
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 3] <- .36
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 2] <- .46
SampleSubmission18$Pred[SampleSubmission18$SeedDiff == 1] <- .45

Time to put it in a format for submission on Kaggle!



#Now I need to put into a database that only has the Game ID and the Pred %. 
TJGrayMensTournament18Submission <- SampleSubmission18[,1:2]

#Create a CSV and submit it. 
write.csv(TJGrayMensTournament18Submission, "TJGrayMensTournament18Submission.csv", row.names = F)

NCAA Mens Tournament

T.J. Gray

March 15, 2018

I want to analyze how a team’s seeding will affect the outcome of a game.

2017 Analysis

According to the 2017 Tournament there were only upsets in 20% of the games.

I’m now going to analyze all the Tournaments from the 2000s.

2000’s Analysis

Creating My Predictions

2018 Data

Need to make the data compatible with what I have been working on.