Data
NCAA Women’s Basketball Tournament data contains information for every team that has participated in the NCAA Division I Women’s Basketball Tournament since it began in 1982. Every school is shown with its seed, conference record (when available), regular-season record, tournament record and full season record, including winning percentages. More information about select columns is in the table below.
library(tidyr)
library(dplyr)
library(tidyverse)
library(janitor)
library(kableExtra)
dsWBB <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/ncaa-womens-basketball-tournament/ncaa-womens-basketball-tournament-history.csv", sep = ",", stringsAsFactors = F)
View raw data
head(dsWBB) %>% kable() %>% kable_styling()
ï..Year
|
School
|
Seed
|
Conference
|
Conf..W
|
Conf..L
|
Conf…
|
Conf..place
|
Reg..W
|
Reg..L
|
Reg…
|
How.qual
|
X1st.game.at.home.
|
Tourney.W
|
Tourney.L
|
Tourney.finish
|
Full.W
|
Full.L
|
Full..
|
1982
|
Arizona St.
|
4
|
Western Collegiate
|
|
|
|
|
23
|
6
|
79.3
|
at-large
|
Y
|
1
|
1
|
RSF
|
24
|
7
|
77.4
|
1982
|
Auburn
|
7
|
Southeastern
|
|
|
|
|
24
|
4
|
85.7
|
at-large
|
N
|
0
|
1
|
1st
|
24
|
5
|
82.8
|
1982
|
Cheyney
|
2
|
Independent
|
|
|
|
|
24
|
2
|
92.3
|
at-large
|
Y
|
4
|
1
|
N2nd
|
28
|
3
|
90.3
|
1982
|
Clemson
|
5
|
Atlantic Coast
|
6
|
3
|
66.7
|
4th
|
20
|
11
|
64.5
|
at-large
|
N
|
0
|
1
|
1st
|
20
|
12
|
62.5
|
1982
|
Drake
|
4
|
Missouri Valley
|
|
|
|
|
26
|
6
|
81.3
|
auto
|
Y
|
2
|
1
|
RF
|
28
|
7
|
80
|
1982
|
East Carolina
|
6
|
Independent
|
|
|
|
|
19
|
7
|
73.1
|
at-large
|
N
|
0
|
1
|
1st
|
19
|
8
|
70.4
|
# Check columns
colnames(dsWBB)
## [1] "ï..Year" "School" "Seed"
## [4] "Conference" "Conf..W" "Conf..L"
## [7] "Conf..." "Conf..place" "Reg..W"
## [10] "Reg..L" "Reg..." "How.qual"
## [13] "X1st.game.at.home." "Tourney.W" "Tourney.L"
## [16] "Tourney.finish" "Full.W" "Full.L"
## [19] "Full.."
Rename columns and view data
# Rename columns
names(dsWBB) <- c("Year", "School", "Seed", "Conference", "ConfW", "ConfL","Conf", "ConfPlace", "RegW", "RegL", "Reg", "HowQual", "X1stGameAtHome", "TourneyW", "TourneyL", "TourneyFinish", "FullW", "FullL", "Full")
# Vew data
head(dsWBB) %>% kable() %>% kable_styling()
Year
|
School
|
Seed
|
Conference
|
ConfW
|
ConfL
|
Conf
|
ConfPlace
|
RegW
|
RegL
|
Reg
|
HowQual
|
X1stGameAtHome
|
TourneyW
|
TourneyL
|
TourneyFinish
|
FullW
|
FullL
|
Full
|
1982
|
Arizona St.
|
4
|
Western Collegiate
|
|
|
|
|
23
|
6
|
79.3
|
at-large
|
Y
|
1
|
1
|
RSF
|
24
|
7
|
77.4
|
1982
|
Auburn
|
7
|
Southeastern
|
|
|
|
|
24
|
4
|
85.7
|
at-large
|
N
|
0
|
1
|
1st
|
24
|
5
|
82.8
|
1982
|
Cheyney
|
2
|
Independent
|
|
|
|
|
24
|
2
|
92.3
|
at-large
|
Y
|
4
|
1
|
N2nd
|
28
|
3
|
90.3
|
1982
|
Clemson
|
5
|
Atlantic Coast
|
6
|
3
|
66.7
|
4th
|
20
|
11
|
64.5
|
at-large
|
N
|
0
|
1
|
1st
|
20
|
12
|
62.5
|
1982
|
Drake
|
4
|
Missouri Valley
|
|
|
|
|
26
|
6
|
81.3
|
auto
|
Y
|
2
|
1
|
RF
|
28
|
7
|
80
|
1982
|
East Carolina
|
6
|
Independent
|
|
|
|
|
19
|
7
|
73.1
|
at-large
|
N
|
0
|
1
|
1st
|
19
|
8
|
70.4
|
Drop columns that are not used and view data
# Drop columns that are not necessary
dsWBB_new <- select(dsWBB, "Year", "Seed", "Conference", "HowQual", "X1stGameAtHome", "TourneyW", "TourneyL", "TourneyFinish", "FullW", "FullL", "Full")
# View new data
head(dsWBB_new) %>% kable() %>% kable_styling()
Year
|
Seed
|
Conference
|
HowQual
|
X1stGameAtHome
|
TourneyW
|
TourneyL
|
TourneyFinish
|
FullW
|
FullL
|
Full
|
1982
|
4
|
Western Collegiate
|
at-large
|
Y
|
1
|
1
|
RSF
|
24
|
7
|
77.4
|
1982
|
7
|
Southeastern
|
at-large
|
N
|
0
|
1
|
1st
|
24
|
5
|
82.8
|
1982
|
2
|
Independent
|
at-large
|
Y
|
4
|
1
|
N2nd
|
28
|
3
|
90.3
|
1982
|
5
|
Atlantic Coast
|
at-large
|
N
|
0
|
1
|
1st
|
20
|
12
|
62.5
|
1982
|
4
|
Missouri Valley
|
auto
|
Y
|
2
|
1
|
RF
|
28
|
7
|
80
|
1982
|
6
|
Independent
|
at-large
|
N
|
0
|
1
|
1st
|
19
|
8
|
70.4
|
Extract and other data operations
# Create new column from existing columns
dsWBB_new <- mutate(dsWBB_new, FullWToLRatio = round(dsWBB_new$FullW/dsWBB_new$FullL))
# Filter for year 1982
dsWBB_new_1982 <- filter(dsWBB_new, Year == "1982")
# View new data for 1982
head(dsWBB_new_1982) %>% kable() %>% kable_styling()
Year
|
Seed
|
Conference
|
HowQual
|
X1stGameAtHome
|
TourneyW
|
TourneyL
|
TourneyFinish
|
FullW
|
FullL
|
Full
|
FullWToLRatio
|
1982
|
4
|
Western Collegiate
|
at-large
|
Y
|
1
|
1
|
RSF
|
24
|
7
|
77.4
|
3
|
1982
|
7
|
Southeastern
|
at-large
|
N
|
0
|
1
|
1st
|
24
|
5
|
82.8
|
5
|
1982
|
2
|
Independent
|
at-large
|
Y
|
4
|
1
|
N2nd
|
28
|
3
|
90.3
|
9
|
1982
|
5
|
Atlantic Coast
|
at-large
|
N
|
0
|
1
|
1st
|
20
|
12
|
62.5
|
2
|
1982
|
4
|
Missouri Valley
|
auto
|
Y
|
2
|
1
|
RF
|
28
|
7
|
80
|
4
|
1982
|
6
|
Independent
|
at-large
|
N
|
0
|
1
|
1st
|
19
|
8
|
70.4
|
2
|
Sorting
# Sort desc on FullWToLRatio
dsWBB_new_sorted <- arrange(dsWBB_new, desc(FullWToLRatio))
# Remove na
dsWBB_new_sorted$FullWToLRatio <- as.numeric(na.omit(dsWBB_new_sorted$FullWToLRatio))
# View sorted data
head(dsWBB_new_sorted) %>% kable() %>% kable_styling()
Year
|
Seed
|
Conference
|
HowQual
|
X1stGameAtHome
|
TourneyW
|
TourneyL
|
TourneyFinish
|
FullW
|
FullL
|
Full
|
FullWToLRatio
|
1986
|
1
|
Southwest
|
auto
|
Y
|
5
|
0
|
Champ
|
34
|
0
|
100
|
Inf
|
1995
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
35
|
0
|
100
|
Inf
|
2002
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
39
|
0
|
100
|
Inf
|
2009
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
39
|
0
|
100
|
Inf
|
2010
|
1
|
Big East
|
auto
|
N
|
6
|
0
|
Champ
|
39
|
0
|
100
|
Inf
|
2012
|
1
|
Big 12
|
auto
|
N
|
6
|
0
|
Champ
|
40
|
0
|
100
|
Inf
|
As we see in the above data, we have a divide by 0 error in column FullWToLRatio. We can replace all FullWToLRatio data with FullW where FullL is 0
#Create new calculated colum
dsWBB_new_sorted = dsWBB_new_sorted %>% mutate(FullWToLRatio = factor(ifelse(FullL == 0, FullW, FullWToLRatio)))
# View sorted data
head(dsWBB_new_sorted) %>% kable() %>% kable_styling()
Year
|
Seed
|
Conference
|
HowQual
|
X1stGameAtHome
|
TourneyW
|
TourneyL
|
TourneyFinish
|
FullW
|
FullL
|
Full
|
FullWToLRatio
|
1986
|
1
|
Southwest
|
auto
|
Y
|
5
|
0
|
Champ
|
34
|
0
|
100
|
34
|
1995
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
35
|
0
|
100
|
35
|
2002
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
39
|
0
|
100
|
39
|
2009
|
1
|
Big East
|
auto
|
Y
|
6
|
0
|
Champ
|
39
|
0
|
100
|
39
|
2010
|
1
|
Big East
|
auto
|
N
|
6
|
0
|
Champ
|
39
|
0
|
100
|
39
|
2012
|
1
|
Big 12
|
auto
|
N
|
6
|
0
|
Champ
|
40
|
0
|
100
|
40
|
Splits and Levels
# View Levels
FullWToLRatioLevels <- cut(as.numeric(dsWBB_new_sorted$FullWToLRatio), breaks = 10) %>% levels
FullWToLRatioLevels %>% kable() %>% kable_styling()
x
|
(0.971,3.9]
|
(3.9,6.8]
|
(6.8,9.7]
|
(9.7,12.6]
|
(12.6,15.5]
|
(15.5,18.4]
|
(18.4,21.3]
|
(21.3,24.2]
|
(24.2,27.1]
|
(27.1,30]
|
Summarise and visualize data
# Summarise
summarise(dsWBB_new_sorted, "MeanWLRatio" = mean(as.numeric(dsWBB_new_sorted$FullWToLRatio), na.rm = TRUE), "SDWLRatio" = sd(as.numeric(dsWBB_new_sorted$FullWToLRatio), na.rm = TRUE))
## MeanWLRatio SDWLRatio
## 1 3.970841 3.605831
# View hist
hist(x=as.numeric(dsWBB_new_sorted$FullWToLRatio), main = "Win to Loss Ratio")
