#7 Exploratory Data Analysis
#Sections: Introduction, Prerequisites, Variation, Visualizing Distributions, Typical Values, Unusual Values
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Variation
#7.3.4 Exercises
# 1 – Distribution of each diamond dimension column (x, y and z).
# Every plot is a density curve with a rug of the individual observations
# drawn along the axis.
diamonds %>%
  ggplot(aes(x = x)) +
  geom_density() +
  geom_rug() +
  labs(title = 'Distribution of x(length)')

diamonds %>%
  ggplot(aes(x = y)) +
  geom_density() +
  geom_rug() +
  labs(title = 'Distribution of y(width)')

diamonds %>%
  ggplot(aes(x = z)) +
  geom_density() +
  geom_rug() +
  labs(title = 'Distribution of z(depth)')

# 2 – Explore the distribution of price
# Histogram of price with bins 20 units wide on the price axis.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20)

# 3 – How many diamonds are 0.99 carat?
# Count diamonds per distinct carat value in the inclusive range 0.96–1.05,
# so the 0.99 count can be read next to its neighbouring values.
diamonds %>%
  filter(carat >= 0.96, carat <= 1.05) %>%
  group_by(carat) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# 4 – Compare and contrast coord_cartesian() vs xlim() or ylim()
# The same price histogram is drawn twice; only the way the axis ranges are
# restricted differs.

# (a) Limits handed to the coordinate system via coord_cartesian().
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20) +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 700))

# (b) Limits set with xlim()/ylim(). This is the variant that emits the
# "Removed ... rows" warnings recorded right after this plot.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 20) +
  xlim(0, 5000) +
  ylim(0, 700)
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).

#7.4 Missing values
# 1 – What happens to missing values in a histogram?
# Three NAs are placed in front of 1000 standard-normal draws and the column
# is binned into 50 bins. The draws are not seeded, so the exact shape varies
# from run to run.
ggplot(
  data.frame(value = c(NA, NA, NA, rnorm(1000))),
  aes(x = value)
) +
  geom_histogram(bins = 50)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

# Bar-chart counterpart of the histogram experiment: a categorical column
# with two 'A', three 'B' and one NA entry.
data.frame(type = c('A', 'A', 'B', 'B', 'B', NA)) %>%
  ggplot(aes(x = type)) +
  geom_bar()

# 2 – What does na.rm = TRUE do in mean() and sum()?
# With na.rm = TRUE the NA is dropped before averaging, so the mean is taken
# over the four remaining values (1, 2, 3, 4). Only mean() is demonstrated.
mean(c(1:3, NA, 4), na.rm = TRUE)
## [1] 2.5
#Sections: Covariation, A Categorical and Continuous Variable
# Density of scheduled departure time, one curve for cancelled flights and
# one for the rest.
nycflights13::flights %>%
  mutate(
    # A flight with no recorded dep_time is flagged as cancelled.
    cancelled = is.na(dep_time),
    # sched_dep_time is treated as an HHMM number: split it into hour and
    # minute, then recombine as fractional hours.
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_density()

# Boxplot of scheduled departure time for cancelled vs. non-cancelled
# flights.
nycflights13::flights %>%
  mutate(
    # A flight with no recorded dep_time is flagged as cancelled.
    cancelled = is.na(dep_time),
    # sched_dep_time is treated as an HHMM number and converted to
    # fractional hours.
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = cancelled, y = sched_dep_time)) +
  geom_boxplot()

# cut, color, and clarity
# Correlation matrix over every diamonds column. cut, color and clarity are
# first converted with as.numeric() so cor() receives only numeric input;
# price is moved to the front so it heads the first row and column.
diamonds %>%
  mutate(across(c(cut, color, clarity), as.numeric)) %>%
  select(price, everything()) %>%
  cor()
## price carat cut color clarity depth
## price 1.00000000 0.92159130 -0.05349066 0.17251093 -0.14680007 -0.01064740
## carat 0.92159130 1.00000000 -0.13496702 0.29143675 -0.35284057 0.02822431
## cut -0.05349066 -0.13496702 1.00000000 -0.02051852 0.18917474 -0.21805501
## color 0.17251093 0.29143675 -0.02051852 1.00000000 0.02563128 0.04727923
## clarity -0.14680007 -0.35284057 0.18917474 0.02563128 1.00000000 -0.06738444
## depth -0.01064740 0.02822431 -0.21805501 0.04727923 -0.06738444 1.00000000
## table 0.12713390 0.18161755 -0.43340461 0.02646520 -0.16032684 -0.29577852
## x 0.88443516 0.97509423 -0.12556524 0.27028669 -0.37199853 -0.02528925
## y 0.86542090 0.95172220 -0.12146187 0.26358440 -0.35841962 -0.02934067
## z 0.86124944 0.95338738 -0.14932254 0.26822688 -0.36695200 0.09492388
## table x y z
## price 0.1271339 0.88443516 0.86542090 0.86124944
## carat 0.1816175 0.97509423 0.95172220 0.95338738
## cut -0.4334046 -0.12556524 -0.12146187 -0.14932254
## color 0.0264652 0.27028669 0.26358440 0.26822688
## clarity -0.1603268 -0.37199853 -0.35841962 -0.36695200
## depth -0.2957785 -0.02528925 -0.02934067 0.09492388
## table 1.0000000 0.19534428 0.18376015 0.15092869
## x 0.1953443 1.00000000 0.97470148 0.97077180
## y 0.1837601 0.97470148 1.00000000 0.95200572
## z 0.1509287 0.97077180 0.95200572 1.00000000
# 3 – Install the ggstance package
# Horizontal boxplot of scheduled departure time by cancellation status,
# produced here by flipping a vertical boxplot with coord_flip(). No ggstance
# function is called in this snippet.
nycflights13::flights %>%
  mutate(
    # A flight with no recorded dep_time is flagged as cancelled.
    cancelled = is.na(dep_time),
    # sched_dep_time is treated as an HHMM number and converted to
    # fractional hours.
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = cancelled, y = sched_dep_time)) +
  geom_boxplot() +
  coord_flip()

# Four ways of showing how price varies with cut.

# (1) One boxplot per cut level.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()

# (2) Price histogram (bins 50 wide), one facet row per cut level.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50) +
  facet_grid(cut ~ .)

# (3) One violin per cut level.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_violin()

# (4) Overlaid frequency polygons (bins 50 wide), coloured by cut.
ggplot(diamonds, aes(x = price, color = cut)) +
  geom_freqpoly(binwidth = 50)

# Jittered points of displ for each drv category in mpg. The jitter is
# random and unseeded, so point positions differ between runs.
mpg %>%
  ggplot(aes(x = drv, y = displ)) +
  geom_jitter()

# Heat map of the share of each cut within a color: counts per (color, cut)
# pair are divided by the total for that color.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile() +
  labs(title = 'Distribution of cut within color')

# Heat map of the mean departure delay for every destination/month pair.
# Missing dep_delay values are ignored when averaging.
nycflights13::flights %>%
  group_by(dest, month) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = dest, fill = avg_dep_delay)) +
  geom_tile()
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)

# Same destination/month heat map, made easier to read:
#   * month is shown as a discrete factor,
#   * destinations are ordered by the number of months they appear in,
#   * the fill uses a yellow -> orange -> red gradient centred on 35.
nycflights13::flights %>%
  group_by(dest, month) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(dest) %>%
  # n() within a dest group = number of months with data for that dest.
  mutate(n_month = n()) %>%
  ggplot(
    aes(
      x = factor(month),
      y = reorder(dest, n_month),
      fill = avg_dep_delay
    )
  ) +
  geom_tile() +
  scale_fill_gradient2(
    low = 'yellow',
    mid = 'orange',
    high = 'red',
    midpoint = 35
  )
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)

# Same within-color proportions as the earlier cut/color heat map, but with
# the axes swapped: cut on x, color on y.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = cut, y = color, fill = prop)) +
  geom_tile() +
  labs(title = 'Distribution of cut within color')

# Price frequency polygons (30 bins) coloured by binned carat, using three
# different binning schemes for carat.

# Fixed-width carat bins, 0.2 wide.
ggplot(diamonds, aes(x = price, color = cut_width(carat, .2))) +
  geom_freqpoly(bins = 30)

# Fixed-width carat bins, 0.4 wide.
ggplot(diamonds, aes(x = price, color = cut_width(carat, .4))) +
  geom_freqpoly(bins = 30)

# Ten carat bins produced by cut_number().
ggplot(diamonds, aes(x = price, color = cut_number(carat, 10))) +
  geom_freqpoly(bins = 30)

# The reverse view: density of carat, one curve per price band. Bands are
# 5000 wide and aligned to start at 0.
ggplot(
  diamonds,
  aes(x = carat, color = cut_width(price, 5000, boundary = 0))
) +
  geom_density()

# Boxplots of price for each of ten carat bins from cut_number(), flipped so
# the bin labels read horizontally.
# ggplot() is called with parentheses, matching every other pipeline in this
# file; the bare `ggplot` form only works through magrittr's bare-function
# shorthand.
diamonds %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = cut_number(carat, 10), y = price)) +
  coord_flip()

# Price boxplots per cut, further split (by colour) into five carat bins
# from cut_number(), combining cut, carat and price in one plot.
ggplot(
  diamonds,
  aes(x = cut, y = price, color = cut_number(carat, 5))
) +
  geom_boxplot()

# Heat map of mean price for every combination of cut and carat bin (ten
# bins from cut_number()). The carat bin column is created directly inside
# group_by().
diamonds %>%
  group_by(cut, carat_group = cut_number(carat, 10)) %>%
  summarise(avg_price = mean(price)) %>%
  ggplot(aes(x = cut, y = carat_group, fill = avg_price)) +
  geom_tile()
## `summarise()` regrouping output by 'cut' (override with `.groups` argument)

# 2-D binned counts of carat against price, one facet row per cut level.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_bin2d() +
  facet_grid(cut ~ .)

# x against y, with both axes restricted to 4–11, drawn two ways.

# (a) Raw scatterplot.
diamonds %>%
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

# (b) 2-D binned counts over the same window, with bins = 800.
diamonds %>%
  ggplot(aes(x = x, y = y)) +
  geom_bin2d(bins = 800) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
