Import data
# Read the Excel workbook into a tibble named `teams`, then print it.
teams <- read_excel(
  path = "C:/Users/deleo/OneDrive/Desktop/PSU_DAT3000_IntroToDA/00_data/myData_updated.xlsx"
)
print(teams)
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Abil… 0.7 45 0.7 52 3 1 2 0.333 2
## 2 2 Akron -0.9 179 -1.1 187 4 0 4 0 4
## 3 3 Alab… -2.1 211 -2.9 220 10 5 5 0.5 5
## 4 4 Alba… -0.4 147 -0.3 138 3 0 3 0 3
## 5 6 Amer… -0.5 160 -0.4 150 3 0 3 0 3
## 6 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 7 9 Ariz… -2 209 -1.9 206 5 1 4 0.2 4
## 8 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 9 11 Arka… 0 76 0 78 1 0 1 0 1
## 10 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Introduction
Questions
Variation
Visualizing distributions
# Bar chart of GAMES: one bar per distinct value, height = number of rows.
ggplot(data = teams, mapping = aes(x = GAMES)) +
  geom_bar()

# Histogram of GAMES with half-unit bins.
ggplot(data = teams) +
  geom_histogram(mapping = aes(x = GAMES), binwidth = 0.5)

# Same histogram, restricted to rows where GAMES is below 3.
ggplot(
  data = filter(teams, GAMES < 3),
  mapping = aes(x = GAMES)
) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of GAMES, one coloured line per WINPERCENT band.
#
# WINPERCENT is numeric; mapping it straight to `color` gives ggplot2 no
# grouping to bin within, so the aesthetic is dropped during stat_bin().
# cut_width() converts it to discrete 0.25-wide intervals starting at 0,
# which gives each band its own line. An explicit binwidth replaces the
# default of 30 bins.
teams %>%
  ggplot(
    aes(
      x = GAMES,
      color = cut_width(WINPERCENT, width = 0.25, boundary = 0)
    )
  ) +
  geom_freqpoly(binwidth = 1) +
  # Keep a readable legend title instead of the full cut_width() expression.
  labs(color = "WINPERCENT")

Typical values
# Keep only rows with GAMES below 3, then draw a fine-grained histogram
# (bin width 0.1) of GAMES.
teams %>%
  filter(GAMES < 3) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = GAMES), binwidth = 0.1)

# Histogram of W using ggplot2's default binning.
ggplot(data = teams, mapping = aes(x = W)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values
# Histogram of WINPERCENT using ggplot2's default binning.
ggplot(data = teams) +
  geom_histogram(mapping = aes(x = WINPERCENT))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same WINPERCENT histogram, zoomed to counts between 0 and 50.
# coord_cartesian() changes the visible window only; no data are removed.
ggplot(data = teams, mapping = aes(x = WINPERCENT)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values
# Recode selected W values to NA (rather than dropping rows), then plot
# W against L.
#
# The recoded values used to be written to a new column `y` that the plot
# never referenced (it maps y = L), so the NA replacement had no visible
# effect. Writing the result back into W makes the plot reflect it; ggplot2
# will warn about the rows it removes because W is missing.
teams %>%
  # if_else() is type-strict, so the replacement must be a double NA.
  mutate(W = if_else(L < 3 | W > 2, NA_real_, W)) %>%
  ggplot(aes(x = W, y = L)) +
  geom_point()

Covariation
A categorical and continuous variable
# Box plots of WINPERCENT for each distinct GAMES value.
#
# geom_boxplot() needs a discrete x (or an explicit group) to split the
# data; a numeric GAMES yields a single box and a "Continuous x aesthetic"
# warning. Converting GAMES to a factor gives one box per value.
teams %>%
  ggplot(aes(x = factor(GAMES), y = WINPERCENT)) +
  geom_boxplot() +
  # Restore the plain column name as the axis title.
  labs(x = "GAMES")

Two categorical variables
# Heat map of how many rows share each (GAMES, W) combination.
#
# count() returns one row per (GAMES, W) pair with its frequency in `n`.
# The x axis must be GAMES: mapping W to both axes put every tile on the
# diagonal and overplotted pairs that share the same W.
teams %>%
  count(GAMES, W) %>%
  ggplot(aes(x = GAMES, y = W, fill = n)) +
  geom_tile()

Two continuous variables
Patterns and models