I’m using a dataset from here: https://github.com/fivethirtyeight/data/tree/master/ncaa-womens-basketball-tournament
This folder contains the data behind the story ‘The Rise And Fall Of Women’s NCAA Tournament Dynasties’.
The NCAA Women’s Basketball Tournament data covers every team that has participated in the NCAA Division I Women’s Basketball Tournament since it began in 1982. Each school is listed with its seed, conference record (when available), regular-season record, tournament record, and full-season record, including winning percentages.
Seed = Tournament seed. The (OR) seeding designation, used in 1983, marks the eight teams that played an opening-round game to become the No. 8 seed in each region.
How qual = Whether the school qualified with an automatic bid (by winning its conference or conference tournament) or an at-large bid.
1st game at home = Whether the school played its first-round tournament games on its home court.
Tourney finish = The round of the final game for each team. OR=opening-round loss (1983 only); 1st=first-round loss; 2nd=second-round loss; RSF=loss in the Sweet 16; RF=loss in the Elite Eight; NSF=loss in the national semifinals; N2nd=national runner-up; Champ=national champions
library(tidyr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(kableExtra)
# Load the NCAA women's tournament history CSV straight from GitHub.
# Text columns are kept as character vectors; FALSE is spelled out because
# the `F` shorthand is an ordinary variable that can be reassigned.
DataSet <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/ncaa-womens-basketball-tournament/ncaa-womens-basketball-tournament-history.csv",
  stringsAsFactors = FALSE
)
# Preview the first rows as an interactive table.
DT::datatable(head(DataSet))
# List the headers exactly as R imported them.
colnames(DataSet)
## [1] "ï..Year" "School" "Seed"
## [4] "Conference" "Conf..W" "Conf..L"
## [7] "Conf..." "Conf..place" "Reg..W"
## [10] "Reg..L" "Reg..." "How.qual"
## [13] "X1st.game.at.home." "Tourney.W" "Tourney.L"
## [16] "Tourney.finish" "Full.W" "Full.L"
## [19] "Full.."
# Give every column a short, syntactic name (assigned by position).
colnames(DataSet) <- c(
  "Year", "School", "Seed", "Conference",
  "ConfW", "ConfL", "Conf", "ConfPlace",
  "RegW", "RegL", "Reg", "HowQual",
  "X1stGameAtHome", "TourneyW", "TourneyL", "TourneyFinish",
  "FullW", "FullL", "Full"
)
# Confirm the new headers took effect.
DT::datatable(head(DataSet))
# Keep only the columns that are used from here on.
ShinyDataSet <- DataSet %>%
  select(
    Year, Seed, Conference, HowQual, X1stGameAtHome,
    TourneyW, TourneyL, TourneyFinish, FullW, FullL, Full
  )
# Preview the trimmed table.
DT::datatable(head(ShinyDataSet))
# Add a whole-number full-season win-to-loss ratio.
# A team with wins and no losses (FullL == 0) gets Inf here.
ShinyDataSet <- ShinyDataSet %>%
  mutate(FullWToLRatio = round(FullW / FullL))
# Keep only the rows for the year 2000 (compared as a number, not a string).
ShinyDataSet_2000 <- ShinyDataSet %>%
  filter(Year == 2000)
DT::datatable(head(ShinyDataSet_2000))
# Sort by FullWToLRatio, highest first.
SortedShinyDataSet <- ShinyDataSet %>%
  arrange(desc(FullWToLRatio))
# Drop whole rows whose ratio is missing. Assigning na.omit() of the column
# back to the column would fail with a length mismatch as soon as any value
# was actually missing, so the rows are filtered instead.
SortedShinyDataSet <- SortedShinyDataSet %>%
  filter(!is.na(FullWToLRatio))
DT::datatable(head(SortedShinyDataSet))
# If a team has no losses (FullL == 0) the ratio is not finite, so use the
# win total instead. The column is kept numeric: storing it as a factor makes
# later as.numeric() calls return level codes rather than the ratios.
SortedShinyDataSet <- SortedShinyDataSet %>%
  mutate(FullWToLRatio = ifelse(FullL == 0, FullW, FullWToLRatio))
# Split the observed ratio range into 10 equal-width bins and keep the labels.
FullWToLRatioLevels <- SortedShinyDataSet$FullWToLRatio %>%
  cut(breaks = 10) %>%
  levels()
DT::datatable(head(SortedShinyDataSet))
# What are our levels?
FullWToLRatioLevels
# NOTE: the output below was recorded while the ratio was stored as a factor,
# so these bins describe level codes; re-run to refresh it.
## [1] "(0.971,3.9]" "(3.9,6.8]" "(6.8,9.7]" "(9.7,12.6]" "(12.6,15.5]"
## [6] "(15.5,18.4]" "(18.4,21.3]" "(21.3,24.2]" "(24.2,27.1]" "(27.1,30]"
# Mean and standard deviation of the win-to-loss ratio.
# The column goes through as.character() first so that a factor yields its
# labels (the ratios themselves) instead of integer level codes; whole-number
# numeric values pass through unchanged.
SortedShinyDataSet %>%
  mutate(Ratio = as.numeric(as.character(FullWToLRatio))) %>%
  summarise(
    MeanWLRatio = mean(Ratio, na.rm = TRUE),
    SDWLRatio = sd(Ratio, na.rm = TRUE)
  )
# NOTE: the output below was recorded when the statistics were computed from
# factor level codes; re-run to refresh it.
## MeanWLRatio SDWLRatio
## 1 3.970841 3.605831
# Plot how many teams fall on each whole-number win-to-loss ratio.
# - The ratio goes through as.character() so a factor column plots its labels
#   rather than its level codes.
# - geom_bar() counts rows per x value, which is what
#   geom_histogram(stat = "Count") was doing, minus the ignored-parameter
#   warning.
# - The gold outline is set on the layer; passing it to ggplot() has no effect.
SortedShinyDataSet %>%
  mutate(Ratio = as.numeric(as.character(FullWToLRatio))) %>%
  ggplot(aes(x = Ratio)) +
  geom_bar(color = "gold") +
  labs(
    title = "Win to loss Ratio Histogram",
    x = "Win to loss Ratio",
    y = "Count"
  )