HW05.2

Author

Ryan Li

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

wncaa <- read_csv("https://raw.githubusercontent.com/vaiseys/dav-course/main/Data/wncaa.csv")

Rows: 2092 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): school, conference, conf_place, how_qual, x1st_game_at_home, tourn...
dbl (13): year, seed, conf_w, conf_l, conf_percent, reg_w, reg_l, reg_percen...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(wncaa)

Rows: 2,092
Columns: 19
$ year              <dbl> 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982…
$ school            <chr> "Arizona St.", "Auburn", "Cheyney", "Clemson", "Drak…
$ seed              <dbl> 4, 7, 2, 5, 4, 6, 5, 8, 7, 7, 4, 8, 2, 1, 1, 2, 3, 6…
$ conference        <chr> "Western Collegiate", "Southeastern", "Independent",…
$ conf_w            <dbl> NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ conf_l            <dbl> NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ conf_percent      <dbl> NA, NA, NA, 66.7, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ conf_place        <chr> "-", "-", "-", "4th", "-", "-", "-", "-", "-", "-", …
$ reg_w             <dbl> 23, 24, 24, 20, 26, 19, 21, 14, 21, 28, 24, 17, 22, …
$ reg_l             <dbl> 6, 4, 2, 11, 6, 7, 8, 10, 8, 7, 5, 13, 7, 5, 1, 6, 4…
$ reg_percent       <dbl> 79.3, 85.7, 92.3, 64.5, 81.3, 73.1, 72.4, 58.3, 72.4…
$ how_qual          <chr> "at-large", "at-large", "at-large", "at-large", "aut…
$ x1st_game_at_home <chr> "Y", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "Y…
$ tourney_w         <dbl> 1, 0, 4, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 1, 5, 3, 1, 1…
$ tourney_l         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
$ tourney_finish    <chr> "RSF", "1st", "N2nd", "1st", "RF", "1st", "1st", "1s…
$ full_w            <dbl> 24, 24, 28, 20, 28, 19, 21, 14, 21, 28, 26, 17, 24, …
$ full_l            <dbl> 7, 5, 3, 12, 7, 8, 9, 11, 9, 8, 6, 14, 8, 6, 1, 7, 5…
$ full_percent      <dbl> 77.4, 82.8, 90.3, 62.5, 80.0, 70.4, 70.0, 56.0, 70.0…

Question 1

champs <- wncaa |>
  filter(tourney_finish == "Champ") |>
  group_by(school) |>
  summarise(championships = n()) |>
  mutate(
    total_tournaments = max(wncaa$year) - min(wncaa$year) + 1,
    percentage = (championships / total_tournaments) * 100
  ) 

ggplot(champs, aes(x = school, y = percentage)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Percentage of WNCAA Tournaments Won by School",
    x = "School Wining Championships",
    y = "Percentage of Tournaments Won (%)"
  )

Most team have a percentages that are around 5% except for UConn and UTennessee which are the two most winning teams, together they won over 50% championships.

Question 2

champ_names <- unique(champs$school)
winners <- wncaa |> 
  filter(school %in% champ_names)

winners |>
  ggplot(aes(x = reorder(school, seed, median), y = seed)) +
  geom_boxplot(outlier.shape = NA) +  
  geom_jitter(alpha = 0.6, width = 0.3) +
  coord_flip() +
  labs(
    title = "Distribution of Seeds for Championship Teams",
    x = "School",
    y = "Seed"
  )

I am not suprising at all to see UConn and UTenessee having many low seeds since they were indeed the most dominant teams. However, there are low seed schools that still win little championships like Baylor and Stanford, suggesting limitations for seed ranking.

winners |>
  ggplot(aes(x = reorder(school, seed, median), y = seed)) +
  geom_violin() +
  geom_jitter(alpha = 0.6, width = 0.2) +
  coord_flip() +
  labs(
    title = "Distribution of Seeds for Championship Teams",
    x = "School",
    y = "Seed"
  )

Boxplot is more effective as it shows the upper and lower 25% as well as the median. Violin shows density better but the dots could provide insights on that as well.

Question 3

ggplot(winners, aes(x = seed, y = reorder(school, seed, median))) +
  geom_point() +
  labs(title = "Distribution of Seeds for Championship Teams",
       x = "Seed",
       y = "School")

The plot is not effective as points would overlap since they each represent an integer value. This graph fails to capture the density, quartiles and median.

Question 4

winners_mean_sd <- winners |> 
  group_by(school) |> 
  summarize(across(where(is.numeric), list(avg = mean, sd = sd), .names = "{.col}_{.fn}"))

ggplot(winners_mean_sd, aes(x = reg_percent_avg, y = reorder(school, reg_percent_avg))) +
  geom_point(size = 3) +
  labs(
    title = "Average Regular Season Win Percentage for Championship Teams",
    x = "Average Regular Season Win Percentage",
    y = "School"
  )

Texas A&M has the lowest regular season win rate, UConn, Louisiana Tech, Baylor and Stanford have signifanctly higher win rate than other schools whose win rates are mostly between 75%-80%.

ggplot(winners_mean_sd, aes(x = reg_percent_avg, 
                           y = reorder(school, reg_percent_avg),
                           xmin = reg_percent_avg - reg_percent_sd,
                           xmax = reg_percent_avg + reg_percent_sd)) +
  geom_pointrange() +
  labs(
    title = "Average Regular Season Win Percentage for Championship Teams (±1 SD)",
    x = "Regular Season Win Percentage",
    y = "School"
  )

Texas A&M has the narrowest interval, meaning that its win rate is relatively stable, varing the least over years in comparison with other teams.

ggplot(winners_mean_sd, aes(x = reg_percent_avg, 
                                              y = reorder(school, reg_percent_avg),
                                              xmin = reg_percent_avg - reg_percent_sd,
                                              xmax = reg_percent_avg + reg_percent_sd)) +
  geom_linerange() +
  labs(
    title = "Regular Season Win Percentage for Championship Teams (Line Range)",
    x = "Regular Season Win Percentage",
    y = "School"
  )

we can indeed make this graph but it is a bad idea because linerange graph fails to capture the mean thereby hinder comparison among schools, and creates a false feelings that datas are spread evenly on the ranege covered by the line.