Tidyverse

Where did the Data come from?

I’m using a dataset from here: https://github.com/fivethirtyeight/data/tree/master/ncaa-womens-basketball-tournament

This folder contains the data behind the story ‘The Rise And Fall Of Women’s NCAA Tournament Dynasties’.

NCAA Women’s Basketball Tournament data contains information for every team that has participated in the NCAA Division I Women’s Basketball Tournament since it began in 1982. Every school is shown with its seed, conference record (when available), regular-season record, tournament record and full season record, including winning percentages.

Seed = The (OR) seeding designation in 1983 notes the eight teams that played an opening-round game to become the No. 8 seed in each region.

How qual = Whether the school qualified with an automatic bid (by winning its conference or conference tournament) or an at-large bid.

1st game at home = Whether the school played its first-round tournament games on its home court.

Tourney finish = The round of the final game for each team. OR=opening-round loss (1983 only); 1st=first-round loss; 2nd=second-round loss; RSF=loss in the Sweet 16; RF=loss in the Elite Eight; NSF=loss in the national semifinals; N2nd=national runner-up; Champ=national champions

library(tidyr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(kableExtra)

Load and check the data

# Read the tournament history CSV straight from the FiveThirtyEight repository.
# Strings are kept as character vectors (no automatic factor conversion).
# NOTE(review): the first column name comes back as "ï..Year" (see the
# colnames() output), which looks like an unhandled UTF-8 byte-order mark;
# fileEncoding = "UTF-8-BOM" would presumably avoid it -- confirm before changing.
DataSet <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/ncaa-womens-basketball-tournament/ncaa-womens-basketball-tournament-history.csv",
  sep = ",",
  stringsAsFactors = FALSE
)

# Preview the first rows in an interactive table
DT::datatable(head(DataSet))

# List the column names exactly as read.csv() produced them
colnames(DataSet)

##  [1] "ï..Year"            "School"             "Seed"              
##  [4] "Conference"         "Conf..W"            "Conf..L"           
##  [7] "Conf..."            "Conf..place"        "Reg..W"            
## [10] "Reg..L"             "Reg..."             "How.qual"          
## [13] "X1st.game.at.home." "Tourney.W"          "Tourney.L"         
## [16] "Tourney.finish"     "Full.W"             "Full.L"            
## [19] "Full.."

We don’t like some of the column names, so let’s rename them.

# Give every column a clean CamelCase name.
# The assignment is positional: one name per column, in the order the
# 19 columns appear in the data frame.
colnames(DataSet) <- c(
  "Year", "School", "Seed", "Conference",
  "ConfW", "ConfL", "Conf", "ConfPlace",
  "RegW", "RegL", "Reg",
  "HowQual", "X1stGameAtHome",
  "TourneyW", "TourneyL", "TourneyFinish",
  "FullW", "FullL", "Full"
)

# Preview the renamed data
DataSet %>%
  head() %>%
  DT::datatable()

Columns that we do not need should be dropped; keep only the ones we want.

# Keep only the columns used in the rest of the analysis; School and the
# conference / regular-season record columns are left out.
ShinyDataSet <- DataSet %>%
  select(
    Year, Seed, Conference, HowQual, X1stGameAtHome,
    TourneyW, TourneyL, TourneyFinish,
    FullW, FullL, Full
  )

# Preview the reduced data
ShinyDataSet %>%
  head() %>%
  DT::datatable()

Mutate

# Add FullWToLRatio: full-season wins divided by full-season losses, rounded
# to a whole number.
# Columns are referenced by bare name inside mutate() (data masking) rather
# than through ShinyDataSet$..., which ignores mutate()'s row/group context.
# Rows with zero losses produce Inf here; that case is dealt with further down.
ShinyDataSet <- ShinyDataSet %>%
  mutate(FullWToLRatio = round(FullW / FullL))

Filter

# Keep only the rows for the 2000 tournament.
# Year is compared against a number instead of the string "2000" so the
# match does not depend on implicit coercion of the whole column to character.
ShinyDataSet_2000 <- ShinyDataSet %>%
  filter(Year == 2000)

# Preview the filtered rows
DT::datatable(head(ShinyDataSet_2000))

Sort

# Sort by FullWToLRatio, largest first, and remove rows whose ratio is missing.
# Rows are filtered out instead of assigning na.omit() output back into the
# column: na.omit() returns a shorter vector whenever an NA is present, so that
# assignment would either fail or recycle values onto the wrong rows.
SortedShinyDataSet <- ShinyDataSet %>%
  filter(!is.na(FullWToLRatio)) %>%
  arrange(desc(FullWToLRatio))

# Preview the sorted data
DT::datatable(head(SortedShinyDataSet))

What do we do with the Inf values in the column FullWToLRatio, which result from division by zero (0)?

# If a team has zero losses (FullL == 0) the ratio is Inf, so use the win
# count (FullW) as the ratio instead. The column is stored as a factor.
SortedShinyDataSet <- SortedShinyDataSet %>%
  mutate(FullWToLRatio = factor(ifelse(FullL == 0, FullW, FullWToLRatio)))

# Split the range of ratios into 10 intervals and keep the interval labels.
# as.numeric() applied directly to a factor returns its level codes, not the
# values, so go through as.character() to bin the actual ratios.
FullWToLRatioLevels <- SortedShinyDataSet$FullWToLRatio %>%
  as.character() %>%
  as.numeric() %>%
  cut(breaks = 10) %>%
  levels()

# Preview the updated data
DT::datatable(head(SortedShinyDataSet))

Level-Up

# Show the interval labels produced by cut()
print(FullWToLRatioLevels)

##  [1] "(0.971,3.9]" "(3.9,6.8]"   "(6.8,9.7]"   "(9.7,12.6]"  "(12.6,15.5]"
##  [6] "(15.5,18.4]" "(18.4,21.3]" "(21.3,24.2]" "(24.2,27.1]" "(27.1,30]"

Summarise

# Mean and standard deviation of the win-to-loss ratio over all rows.
# FullWToLRatio is a factor at this point; as.numeric() alone would return its
# level codes, so convert through as.character() to get the real ratios.
# The column is referenced by bare name instead of SortedShinyDataSet$... so
# summarise() works on the data it was given.
SortedShinyDataSet %>%
  summarise(
    MeanWLRatio = mean(as.numeric(as.character(FullWToLRatio)), na.rm = TRUE),
    SDWLRatio = sd(as.numeric(as.character(FullWToLRatio)), na.rm = TRUE)
  )

##   MeanWLRatio SDWLRatio
## 1    3.970841  3.605831

Visualize This!

# Plot how many rows fall on each win-to-loss ratio value.
# - FullWToLRatio is a factor, so convert via as.character() to plot the real
#   ratios rather than the factor's level codes.
# - The column is referenced by bare name inside aes() instead of data$column.
# - geom_bar() counts rows per x value, which is what
#   geom_histogram(stat = "Count") was doing; the stray `color` argument to
#   ggplot() had no effect and is removed.
ggplot(
  data = SortedShinyDataSet,
  aes(x = as.numeric(as.character(FullWToLRatio)))
) +
  labs(title = "Win to loss Ratio Histogram", x = "Win to loss Ratio", y = "Count") +
  geom_bar(color = "gold")