Data

NCAA Women’s Basketball Tournament data contains information for every team that has participated in the NCAA Division I Women’s Basketball Tournament since it began in 1982. Every school is shown with its seed, conference record (when available), regular-season record, tournament record and full season record, including winning percentages. More information about select columns is in the table below.

library(tidyr)
library(dplyr)
library(tidyverse)
library(janitor)
library(kableExtra)
dsWBB <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/ncaa-womens-basketball-tournament/ncaa-womens-basketball-tournament-history.csv", sep = ",", stringsAsFactors = F)

View raw data

head(dsWBB) %>% kable() %>% kable_styling()
ï..Year School Seed Conference Conf..W Conf..L Conf… Conf..place Reg..W Reg..L Reg… How.qual X1st.game.at.home. Tourney.W Tourney.L Tourney.finish Full.W Full.L Full..
1982 Arizona St.  4 Western Collegiate
23 6 79.3 at-large Y 1 1 RSF 24 7 77.4
1982 Auburn 7 Southeastern
24 4 85.7 at-large N 0 1 1st 24 5 82.8
1982 Cheyney 2 Independent
24 2 92.3 at-large Y 4 1 N2nd 28 3 90.3
1982 Clemson 5 Atlantic Coast 6 3 66.7 4th 20 11 64.5 at-large N 0 1 1st 20 12 62.5
1982 Drake 4 Missouri Valley
26 6 81.3 auto Y 2 1 RF 28 7 80
1982 East Carolina 6 Independent
19 7 73.1 at-large N 0 1 1st 19 8 70.4
# Check columns
colnames(dsWBB)
##  [1] "ï..Year"            "School"             "Seed"              
##  [4] "Conference"         "Conf..W"            "Conf..L"           
##  [7] "Conf..."            "Conf..place"        "Reg..W"            
## [10] "Reg..L"             "Reg..."             "How.qual"          
## [13] "X1st.game.at.home." "Tourney.W"          "Tourney.L"         
## [16] "Tourney.finish"     "Full.W"             "Full.L"            
## [19] "Full.."

Rename columns and view data

# Rename columns
names(dsWBB) <- c("Year", "School", "Seed", "Conference", "ConfW", "ConfL","Conf", "ConfPlace", "RegW", "RegL", "Reg", "HowQual", "X1stGameAtHome", "TourneyW", "TourneyL", "TourneyFinish", "FullW", "FullL", "Full")

# Vew data
head(dsWBB) %>% kable() %>% kable_styling()
Year School Seed Conference ConfW ConfL Conf ConfPlace RegW RegL Reg HowQual X1stGameAtHome TourneyW TourneyL TourneyFinish FullW FullL Full
1982 Arizona St.  4 Western Collegiate
23 6 79.3 at-large Y 1 1 RSF 24 7 77.4
1982 Auburn 7 Southeastern
24 4 85.7 at-large N 0 1 1st 24 5 82.8
1982 Cheyney 2 Independent
24 2 92.3 at-large Y 4 1 N2nd 28 3 90.3
1982 Clemson 5 Atlantic Coast 6 3 66.7 4th 20 11 64.5 at-large N 0 1 1st 20 12 62.5
1982 Drake 4 Missouri Valley
26 6 81.3 auto Y 2 1 RF 28 7 80
1982 East Carolina 6 Independent
19 7 73.1 at-large N 0 1 1st 19 8 70.4

Drop columns that are not used and view data

# Drop columns that are not necessary
dsWBB_new <- select(dsWBB, "Year", "Seed", "Conference", "HowQual", "X1stGameAtHome", "TourneyW", "TourneyL", "TourneyFinish", "FullW", "FullL", "Full")

# View new data
head(dsWBB_new) %>% kable() %>% kable_styling()
Year Seed Conference HowQual X1stGameAtHome TourneyW TourneyL TourneyFinish FullW FullL Full
1982 4 Western Collegiate at-large Y 1 1 RSF 24 7 77.4
1982 7 Southeastern at-large N 0 1 1st 24 5 82.8
1982 2 Independent at-large Y 4 1 N2nd 28 3 90.3
1982 5 Atlantic Coast at-large N 0 1 1st 20 12 62.5
1982 4 Missouri Valley auto Y 2 1 RF 28 7 80
1982 6 Independent at-large N 0 1 1st 19 8 70.4

Extract and other data operations

# Create new column from existing columns
dsWBB_new <- mutate(dsWBB_new, FullWToLRatio = round(dsWBB_new$FullW/dsWBB_new$FullL))

# Filter for year 1982
dsWBB_new_1982 <- filter(dsWBB_new, Year == "1982")

# View new data for 1982
head(dsWBB_new_1982) %>% kable() %>% kable_styling()
Year Seed Conference HowQual X1stGameAtHome TourneyW TourneyL TourneyFinish FullW FullL Full FullWToLRatio
1982 4 Western Collegiate at-large Y 1 1 RSF 24 7 77.4 3
1982 7 Southeastern at-large N 0 1 1st 24 5 82.8 5
1982 2 Independent at-large Y 4 1 N2nd 28 3 90.3 9
1982 5 Atlantic Coast at-large N 0 1 1st 20 12 62.5 2
1982 4 Missouri Valley auto Y 2 1 RF 28 7 80 4
1982 6 Independent at-large N 0 1 1st 19 8 70.4 2

Sorting

# Sort desc on FullWToLRatio
dsWBB_new_sorted <- arrange(dsWBB_new, desc(FullWToLRatio))

# Remove na
dsWBB_new_sorted$FullWToLRatio <- as.numeric(na.omit(dsWBB_new_sorted$FullWToLRatio))

# View sorted data
head(dsWBB_new_sorted) %>% kable() %>% kable_styling()
Year Seed Conference HowQual X1stGameAtHome TourneyW TourneyL TourneyFinish FullW FullL Full FullWToLRatio
1986 1 Southwest auto Y 5 0 Champ 34 0 100 Inf
1995 1 Big East auto Y 6 0 Champ 35 0 100 Inf
2002 1 Big East auto Y 6 0 Champ 39 0 100 Inf
2009 1 Big East auto Y 6 0 Champ 39 0 100 Inf
2010 1 Big East auto N 6 0 Champ 39 0 100 Inf
2012 1 Big 12 auto N 6 0 Champ 40 0 100 Inf

As we see in the above data, we have a divide by 0 error in column FullWToLRatio. We can replace all FullWToLRatio data with FullW where FullL is 0

#Create new calculated colum
dsWBB_new_sorted = dsWBB_new_sorted %>% mutate(FullWToLRatio = factor(ifelse(FullL == 0, FullW, FullWToLRatio)))

# View sorted data
head(dsWBB_new_sorted) %>% kable() %>% kable_styling()
Year Seed Conference HowQual X1stGameAtHome TourneyW TourneyL TourneyFinish FullW FullL Full FullWToLRatio
1986 1 Southwest auto Y 5 0 Champ 34 0 100 34
1995 1 Big East auto Y 6 0 Champ 35 0 100 35
2002 1 Big East auto Y 6 0 Champ 39 0 100 39
2009 1 Big East auto Y 6 0 Champ 39 0 100 39
2010 1 Big East auto N 6 0 Champ 39 0 100 39
2012 1 Big 12 auto N 6 0 Champ 40 0 100 40

Splits and Levels

# View Levels
FullWToLRatioLevels <- cut(as.numeric(dsWBB_new_sorted$FullWToLRatio), breaks = 10) %>% levels
FullWToLRatioLevels %>% kable() %>% kable_styling()
x
(0.971,3.9]
(3.9,6.8]
(6.8,9.7]
(9.7,12.6]
(12.6,15.5]
(15.5,18.4]
(18.4,21.3]
(21.3,24.2]
(24.2,27.1]
(27.1,30]

Summarise and visualize data

# Summarise
summarise(dsWBB_new_sorted, "MeanWLRatio" = mean(as.numeric(dsWBB_new_sorted$FullWToLRatio), na.rm = TRUE), "SDWLRatio" = sd(as.numeric(dsWBB_new_sorted$FullWToLRatio), na.rm = TRUE))
##   MeanWLRatio SDWLRatio
## 1    3.970841  3.605831
# View hist
hist(x=as.numeric(dsWBB_new_sorted$FullWToLRatio), main = "Win to Loss Ratio")