#7 Exploratory Data Analysis
#Sections: Introduction, Prerequisites, Variation, Visualizing Distributions, Typical Values, Unusual Values
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#Variation
#7.3.4 Exercises
# Exercise 1: marginal distribution of each of the x, y and z columns of
# diamonds. Every plot is a density curve with a rug underneath, so the
# individual observations are marked along the axis.

# x
diamonds %>%
  ggplot(aes(x = x)) +
  geom_density() +
  geom_rug() +
  ggtitle('Distribution of x(length)')

# y
diamonds %>%
  ggplot(aes(x = y)) +
  geom_density() +
  geom_rug() +
  ggtitle('Distribution of y(width)')

# z
diamonds %>%
  ggplot(aes(x = z)) +
  geom_density() +
  geom_rug() +
  ggtitle('Distribution of z(depth)')

#2 – Explore the distribution of price
# Histogram of price with a bin width of 20.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20)

#3 – How many diamonds are 0.99 carat? 
# Tally the diamonds at each distinct carat value from 0.96 to 1.05
# (inclusive), so the counts around 0.99 can be read off next to their
# neighbours.
diamonds %>%
  filter(carat >= .96, carat <= 1.05) %>%
  group_by(carat) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
#4 – Compare and contrast coord_cartesian() vs xlim() or ylim()
# Zooming with coord_cartesian(): only the visible window is restricted.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20) +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 700))

# Restricting with xlim()/ylim(): rows and bars that fall outside the limits
# are removed, which is what produces the "Removed ... rows" warnings.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20) +
  xlim(0, 5000) +
  ylim(0, 700)
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).

#7.4 Missing values
#1 – What happens to missing values in a histogram? 
# Histogram (50 bins) of 1000 standard-normal draws preceded by three NAs;
# stat_bin removes the missing values and warns about it.
tibble(value = c(rep(NA, 3), rnorm(1000))) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 50)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

# Bar chart of a small categorical column: two 'A', three 'B' and one NA.
data.frame(type = rep(c('A', 'B', NA), times = c(2, 3, 1))) %>%
  ggplot(aes(x = type)) +
  geom_bar()

#2 – What does na.rm = TRUE do in mean() and sum()?
# na.rm = TRUE drops the NA before aggregating, so the result is computed
# from the four remaining values. Both functions named in the exercise are
# shown.
mean(c(1, 2, 3, NA, 4), na.rm = TRUE)
## [1] 2.5
sum(c(1, 2, 3, NA, 4), na.rm = TRUE)
## [1] 10
#Sections: Covariation, A categorical and Continuous Variable
# Density of the scheduled departure time, one curve per cancellation status.
# A flight is flagged as cancelled when dep_time is missing.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    # hours (%/% 100) plus minutes (%% 100) expressed as a fraction of an hour
    sched_dep_time = sched_dep_time %/% 100 + (sched_dep_time %% 100) / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_density()

# Boxplots of the scheduled departure time for cancelled vs. non-cancelled
# flights (cancelled = dep_time is missing).
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    # hours (%/% 100) plus minutes (%% 100) expressed as a fraction of an hour
    sched_dep_time = sched_dep_time %/% 100 + (sched_dep_time %% 100) / 60
  ) %>%
  ggplot(aes(x = cancelled, y = sched_dep_time)) +
  geom_boxplot()

# cut, color, and clarity
# Correlation matrix over every column of diamonds. cut, color and clarity
# are converted with as.numeric() first so cor() receives only numeric
# columns; price is moved to the front so it is the first row and column.
diamonds %>%
  mutate(across(c(cut, color, clarity), as.numeric)) %>%
  select(price, everything()) %>%
  cor()
##               price       carat         cut       color     clarity       depth
## price    1.00000000  0.92159130 -0.05349066  0.17251093 -0.14680007 -0.01064740
## carat    0.92159130  1.00000000 -0.13496702  0.29143675 -0.35284057  0.02822431
## cut     -0.05349066 -0.13496702  1.00000000 -0.02051852  0.18917474 -0.21805501
## color    0.17251093  0.29143675 -0.02051852  1.00000000  0.02563128  0.04727923
## clarity -0.14680007 -0.35284057  0.18917474  0.02563128  1.00000000 -0.06738444
## depth   -0.01064740  0.02822431 -0.21805501  0.04727923 -0.06738444  1.00000000
## table    0.12713390  0.18161755 -0.43340461  0.02646520 -0.16032684 -0.29577852
## x        0.88443516  0.97509423 -0.12556524  0.27028669 -0.37199853 -0.02528925
## y        0.86542090  0.95172220 -0.12146187  0.26358440 -0.35841962 -0.02934067
## z        0.86124944  0.95338738 -0.14932254  0.26822688 -0.36695200  0.09492388
##              table           x           y           z
## price    0.1271339  0.88443516  0.86542090  0.86124944
## carat    0.1816175  0.97509423  0.95172220  0.95338738
## cut     -0.4334046 -0.12556524 -0.12146187 -0.14932254
## color    0.0264652  0.27028669  0.26358440  0.26822688
## clarity -0.1603268 -0.37199853 -0.35841962 -0.36695200
## depth   -0.2957785 -0.02528925 -0.02934067  0.09492388
## table    1.0000000  0.19534428  0.18376015  0.15092869
## x        0.1953443  1.00000000  0.97470148  0.97077180
## y        0.1837601  0.97470148  1.00000000  0.95200572
## z        0.1509287  0.97077180  0.95200572  1.00000000
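#3 – Install the ggstance package and create a horizontal boxplot; compare with coord_flip() (only the coord_flip() version is shown here)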
# Horizontal boxplots of the scheduled departure time by cancellation status,
# obtained by flipping the coordinate system of an ordinary boxplot.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    # hours (%/% 100) plus minutes (%% 100) expressed as a fraction of an hour
    sched_dep_time = sched_dep_time %/% 100 + (sched_dep_time %% 100) / 60
  ) %>%
  ggplot(aes(x = cancelled, y = sched_dep_time)) +
  geom_boxplot() +
  coord_flip()

# Four alternative views of price broken down by cut.

# Boxplot per cut
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()

# Histogram (bin width 50), one facet row per cut
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50) +
  facet_grid(rows = vars(cut))

# Violin per cut
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_violin()

# Overlaid frequency polygons (bin width 50), coloured by cut
ggplot(diamonds, aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 50)

# Jittered points of displ against drv from the mpg data set.
mpg %>%
  ggplot(aes(x = drv, y = displ)) +
  geom_jitter()

# Heatmap of the share of each cut within a color: the count of every
# (color, cut) pair is divided by the total count of its color.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile() +
  ggtitle('Distribution of cut within color')

# Heatmap of the mean departure delay for every destination/month pair.
# Missing delays are ignored when averaging.
nycflights13::flights %>%
  group_by(dest, month) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = dest, fill = avg_dep_delay)) +
  geom_tile()
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)

# Mean departure delay per destination and month, as above, with two
# readability tweaks: month is treated as a discrete axis, and destinations
# are ordered by the number of months in which they appear (n_month).
nycflights13::flights %>%
  group_by(dest, month) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  # One row per (dest, month) at this point, so n() per dest counts months.
  group_by(dest) %>%
  mutate(n_month = n()) %>%
  ggplot(aes(
    x = factor(month),
    y = reorder(dest, n_month),
    fill = avg_dep_delay
  )) +
  geom_tile() +
  scale_fill_gradient2(
    low = 'yellow', mid = 'orange', high = 'red',
    midpoint = 35
  )
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)

# Share of each cut within a color, drawn with cut on the x axis and color
# on the y axis.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = cut, y = color, fill = prop)) +
  geom_tile() +
  ggtitle('Distribution of cut within color')

# Frequency polygons of price (30 bins), coloured by a binned version of
# carat. Three binning strategies are compared.

# Fixed-width carat bins of 0.2
diamonds %>%
  ggplot(aes(x = price, colour = cut_width(carat, .2))) +
  geom_freqpoly(bins = 30)

# Fixed-width carat bins of 0.4
diamonds %>%
  ggplot(aes(x = price, colour = cut_width(carat, .4))) +
  geom_freqpoly(bins = 30)

# Ten carat bins produced by cut_number()
diamonds %>%
  ggplot(aes(x = price, colour = cut_number(carat, 10))) +
  geom_freqpoly(bins = 30)

# Density of carat, one curve per price band of width 5000 (bands start at 0).
diamonds %>%
  ggplot(aes(x = carat, colour = cut_width(price, 5000, boundary = 0))) +
  geom_density()

# Horizontal boxplots of price for ten carat bins built with cut_number().
# ggplot is called with explicit parentheses, matching every other pipeline
# in this file instead of relying on the pipe's bare-function-name shorthand.
diamonds %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = cut_number(carat, 10), y = price)) +
  coord_flip()

# Boxplots of price per cut, further split (by colour) into five carat bins
# built with cut_number().
diamonds %>%
  ggplot(aes(x = cut, y = price, colour = cut_number(carat, 5))) +
  geom_boxplot()

# Heatmap of the mean price for every combination of cut and carat bin
# (ten bins built with cut_number()).
diamonds %>%
  mutate(carat_group = cut_number(carat, 10)) %>%
  group_by(cut, carat_group) %>%
  summarise(avg_price = mean(price)) %>%
  ggplot(aes(x = cut, y = carat_group, fill = avg_price)) +
  geom_tile()
## `summarise()` regrouping output by 'cut' (override with `.groups` argument)

# 2-D binned counts of price against carat, one facet row per cut.
diamonds %>%
  ggplot(aes(x = carat, y = price)) +
  geom_bin2d() +
  facet_grid(rows = vars(cut))

# y against x, zoomed to the 4-11 window on both axes, drawn two ways.

# Raw scatterplot
diamonds %>%
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

# Finely binned 2-D counts (800 bins)
diamonds %>%
  ggplot(aes(x = x, y = y)) +
  geom_bin2d(bins = 800) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))