Part A:
Part B:
Part C:
source("create_datasets.R")
load('data/test_datasets.RData')
library(readr)
library(dplyr)
library(ggplot2)
library(purrr)
library(ggplot2movies)
library(viridis)
library(GGally)
library(ggtern)
library(ggthemes)
library(geomnet)
library(ggmap)
library(ggfortify)
# Create movies_small
# library(ggplot2movies)
set.seed(123)
movies_small <- movies[sample(nrow(movies), 1000), ]
movies_small$rating <- factor(round(movies_small$rating))
# Explore movies_small with str()
str(movies_small)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 24 variables:
## $ title : chr "Fair and Worm-er" "Shelf Life" "House: After Five Years of Living" "Three Long Years" ...
## $ year : int 1946 2000 1955 2003 1963 1992 1999 1972 1994 1985 ...
## $ length : int 7 4 11 76 103 107 87 84 127 94 ...
## $ budget : int NA NA NA NA NA NA NA NA NA NA ...
## $ rating : Factor w/ 10 levels "1","2","3","4",..: 7 7 6 8 8 5 4 8 5 5 ...
## $ votes : int 16 11 15 11 103 28 105 9 37 28 ...
## $ r1 : num 0 0 14.5 4.5 4.5 4.5 14.5 0 4.5 4.5 ...
## $ r2 : num 0 0 0 0 4.5 0 4.5 0 4.5 0 ...
## $ r3 : num 0 0 4.5 4.5 0 4.5 4.5 0 14.5 4.5 ...
## $ r4 : num 0 0 4.5 0 4.5 4.5 4.5 0 4.5 14.5 ...
## $ r5 : num 4.5 4.5 0 0 4.5 0 4.5 14.5 24.5 4.5 ...
## $ r6 : num 4.5 24.5 34.5 4.5 4.5 0 14.5 0 4.5 14.5 ...
## $ r7 : num 64.5 4.5 24.5 0 14.5 4.5 14.5 14.5 14.5 14.5 ...
## $ r8 : num 14.5 24.5 4.5 4.5 14.5 24.5 14.5 24.5 14.5 14.5 ...
## $ r9 : num 0 0 0 14.5 14.5 24.5 14.5 14.5 4.5 4.5 ...
## $ r10 : num 14.5 24.5 14.5 44.5 44.5 24.5 14.5 44.5 4.5 24.5 ...
## $ mpaa : chr "" "" "" "" ...
## $ Action : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : int 1 0 0 1 0 1 1 1 0 0 ...
## $ Drama : int 0 0 0 0 1 0 0 0 1 1 ...
## $ Documentary: int 0 0 1 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Short : int 1 1 1 0 0 0 0 0 0 0 ...
# Build a scatter plot with mean and 95% CI
ggplot(movies_small, aes(x = rating, y = votes)) +
geom_point() +
stat_summary(fun.data = "mean_cl_normal",
geom = "crossbar",
width = 0.2,
col = "red") +
scale_y_log10()
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Reproduce the plot
ggplot(diamonds, aes(x = carat, y = price, col = color)) +
geom_point(alpha = 0.5, size = 0.5, shape = 16) +
scale_x_log10(expression(log[10](Carat)), limits = c(0.1, 10)) +
scale_y_log10(expression(log[10](Price)), limits = c(100, 100000)) +
scale_color_brewer(palette = "YlOrRd") +
coord_equal() +
theme_classic()
# Add smooth layer and facet the plot
ggplot(diamonds, aes(x = carat, y = price, col = color)) +
stat_smooth(method = "lm") +
scale_x_log10(expression(log[10](Carat)), limits = c(0.1,10)) +
scale_y_log10(expression(log[10](Price)), limits = c(100,100000)) +
scale_color_brewer(palette = "YlOrRd") +
coord_equal() +
theme_classic()
# movies_small is available
# Add a boxplot geom
d <- ggplot(movies_small, aes(x = rating, y = votes)) +
geom_point() +
geom_boxplot() +
stat_summary(fun.data = "mean_cl_normal",
geom = "crossbar",
width = 0.2,
col = "red")
# Untransformed plot
d
# Transform the scale
d + scale_y_log10()
# Transform the coordinates
## coord_trans is different to scale transformations in that it occurs after statistical transformation and will affect the visual appearance of geoms - there is no guarantee that straight lines will continue to be straight.
## This does not work in my case.
## It likely is from the statitics having a zero value.
## d + coord_trans(y = "log10")
# It works fine without the stats layer
ggplot(movies_small, aes(x = rating, y = votes)) +
geom_point() +
geom_boxplot() +
coord_trans(y = "log10")
This is the example from the documentation which actually works
# Three ways of doing transformation in ggplot:
# * by transforming the data
ggplot(diamonds, aes(log10(carat), log10(price))) +
geom_point()
# * by transforming the scales
ggplot(diamonds, aes(carat, price)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
# * by transforming the coordinate system:
ggplot(diamonds, aes(carat, price)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# The difference between transforming the scales and
# transforming the coordinate system is that scale
# transformation occurs BEFORE statistics, and coordinate
# transformation afterwards. Coordinate transformation also
# changes the shape of geoms:
d <- subset(diamonds, carat > 0.5)
ggplot(d, aes(carat, price)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10()
ggplot(d, aes(carat, price)) +
geom_point() +
geom_smooth(method = "lm") +
coord_trans(x = "log10", y = "log10")
# Plot object p
p <- ggplot(diamonds, aes(x = carat, y = price))
# Use cut_interval
p + geom_boxplot(aes(group = cut_interval(carat, n = 10)))
# Use cut_number
p + geom_boxplot(aes(group = cut_number(carat, n = 10)))
# Use cut_width
p + geom_boxplot(aes(group = cut_width(carat, width = 0.25)))
cut_interval(x, n)
makes n groups from vector x with equal range.cut_number(x, n)
makes n groups from vector x with (approximately) equal numbers of observations.cut_width(x, width)
makes groups of width width from vector x.plot_quart <- function(n) {
set.seed(123)
playData <- data.frame(raw.values = rnorm(n, 1, 6))
quan.summary <- data.frame(t(sapply(1:9, function(x) quantile(playData$raw.values, type = x))))
names(quan.summary) <- c("Min", "Q1", "Median", "Q3", "Max")
quan.summary$Type <- as.factor(1:9)
library(reshape2)
quan.summary <- melt(quan.summary, id = "Type")
quan.summary <- list(quartiles = quan.summary, values = playData)
ggplot(quan.summary$quartiles, aes(x = Type, y = value, col = variable)) +
geom_point() +
geom_rug(data = quan.summary$values, aes(y = raw.values), sides = "l", inherit.aes = F)
}
plot_quart(4)
plot_quart(10)
plot_quart(50)
plot_quart(100)
# test_datasets.RData has been loaded
str(ch1_test_data)
## 'data.frame': 200 obs. of 3 variables:
## $ norm : num -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
## $ bimodal: num 0.199 -0.688 -2.265 -1.457 -2.414 ...
## $ uniform: num -0.117 -0.537 -1.515 -1.812 -0.949 ...
# Calculating density: d
d <- density(ch1_test_data$norm, bw = "nrd0", kernel = "gaussian")
# Use which.max() to calculate mode
mode <- d$x[which.max(d$y)]
# Finish the ggplot call
ggplot(ch1_test_data, aes(x = norm)) +
geom_density() +
geom_rug() +
geom_vline(xintercept = mode, col = "red")
# ch1_test_data is available
# Arguments you'll need later on
fun_args <- list(mean = mean(ch1_test_data$norm), sd = sd(ch1_test_data$norm))
# Finish the ggplot
ggplot(ch1_test_data, aes(x = norm)) +
geom_histogram(aes(y = ..density..)) +
geom_density(col = "red") +
stat_function(
fun = dnorm,
args = fun_args,
col = "blue")
There are three parameters that you may be tempted to adjust in a density plot:
# small_data is available
small_data <- structure(list(x = c(-3.5, 0, 0.5, 6)), .Names = "x", row.names = c(NA,
-4L), class = "data.frame")
# Get the bandwith
get_bw <- density(small_data$x)$bw
# Basic plotting object
p <- ggplot(small_data, aes(x = x)) +
geom_rug() +
coord_cartesian(ylim = c(0,0.5))
# Create three plots
p + geom_density()
p + geom_density(adjust = 0.25)
p + geom_density(bw = 0.25 * get_bw)
# Create two plots
## rectangular kernel
p + geom_density(kernel = "r")
## epanechnikov kernel
p + geom_density(kernel = "e")
# Finish the plot
ggplot(diamonds, aes(x = cut, y = price, col = color)) +
geom_boxplot(varwidth = T) +
facet_grid(. ~ color)
# ch1_test_data and ch1_test_data2 are available
str(ch1_test_data)
## 'data.frame': 200 obs. of 3 variables:
## $ norm : num -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
## $ bimodal: num 0.199 -0.688 -2.265 -1.457 -2.414 ...
## $ uniform: num -0.117 -0.537 -1.515 -1.812 -0.949 ...
str(ch1_test_data2)
## 'data.frame': 400 obs. of 2 variables:
## $ dist : Factor w/ 2 levels "norm","bimodal": 1 1 1 1 1 1 1 1 1 1 ...
## $ value: num -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
# Plot with ch1_test_data
ggplot(ch1_test_data, aes(x = norm)) +
geom_rug() +
geom_density()
# Plot two distributions with ch1_test_data2
ggplot(ch1_test_data2, aes(x = value, fill = dist, col = dist)) +
geom_rug(alpha = 0.6) +
geom_density(alpha = 0.6)
# Individual densities
ggplot(mammals[mammals$vore == "Insectivore", ],
aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# With faceting
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3)) +
facet_wrap( ~ vore, nrow = 2)
# Note that by default, the x ranges fill the scale
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# Trim each density plot individually
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35, trim = T) +
scale_x_continuous(limits=c(0,24)) +
coord_cartesian(ylim = c(0, 0.3))
# Unweighted density plot from before
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24)) +
coord_cartesian(ylim = c(0, 0.3))
# Unweighted violin plot
ggplot(mammals, aes(x = vore, y = sleep_total, fill = vore)) +
geom_violin()
# Calculate weighting measure
library(dplyr)
mammals2 <- mammals %>%
group_by(vore) %>%
mutate(n = n() / nrow(mammals)) -> mammals
str(mammals2, give.attr = F)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 76 obs. of 3 variables:
## $ vore : Factor w/ 4 levels "Carnivore","Herbivore",..: 1 4 2 4 2 2 1 1 2 2 ...
## $ sleep_total: num 12.1 17 14.4 14.9 4 14.4 8.7 10.1 3 5.3 ...
## $ n : num 0.25 0.263 0.421 0.263 0.421 ...
str(mammals, give.attr = F)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 76 obs. of 3 variables:
## $ vore : Factor w/ 4 levels "Carnivore","Herbivore",..: 1 4 2 4 2 2 1 1 2 2 ...
## $ sleep_total: num 12.1 17 14.4 14.9 4 14.4 8.7 10.1 3 5.3 ...
## $ n : num 0.25 0.263 0.421 0.263 0.421 ...
# Weighted density plot
## I remove the ylim because the y scale changes here
ggplot(mammals, aes(x = sleep_total, fill = vore)) +
geom_density(aes(weight = n), col = NA, alpha = 0.35) +
scale_x_continuous(limits = c(0, 24))
# Weighted violin plot
ggplot(mammals, aes(x = vore, y = sleep_total, fill = vore)) +
geom_violin(aes(weight = n), col = NA)
# Base layers
p <- ggplot(faithful, aes(x = waiting, y = eruptions)) +
scale_y_continuous(limits = c(1, 5.5), expand = c(0, 0)) +
scale_x_continuous(limits = c(40, 100), expand = c(0, 0)) +
coord_fixed(60 / 4.5)
# 1 - Use geom_density_2d()
p + geom_density_2d()
# 2 - Use stat_density_2d() with arguments
p + stat_density_2d(aes(col = ..level..), h = c(5, 0.5))
# Load in the viridis package
library(viridis)
# Add viridis color scale
ggplot(faithful, aes(x = waiting, y = eruptions)) +
scale_y_continuous(limits = c(1, 5.5), expand = c(0,0)) +
scale_x_continuous(limits = c(40, 100), expand = c(0,0)) +
coord_fixed(60/4.5) +
stat_density_2d(geom = "tile", aes(fill = ..density..), h=c(5,.5), contour = FALSE) +
scale_fill_viridis()
# pairs
pairs(iris[1:4])
# chart.Correlation
library(PerformanceAnalytics)
chart.Correlation(iris[1:4])
# ggpairs
# library(GGally)
mtcars_fact <- mtcars %>%
mutate(
cyl = as.factor(cyl),
vs = as.factor(vs),
am = as.factor(am),
gear = as.factor(gear),
carb = as.factor(carb)
)
ggpairs(mtcars_fact[1:3])
library(ggplot2)
library(reshape2)
cor_list <- function(x) {
L <- M <- cor(x)
M[lower.tri(M, diag = TRUE)] <- NA
M <- melt(M)
names(M)[3] <- "points"
L[upper.tri(L, diag = TRUE)] <- NA
L <- melt(L)
names(L)[3] <- "labels"
merge(M, L)
}
# Calculate xx with cor_list
library(dplyr)
xx <- iris %>%
group_by(Species) %>%
do(cor_list(.[1:4]))
# Finish the plot
ggplot(xx, aes(x = Var1, y = Var2)) +
geom_point(
aes(col = points, size = abs(points)),
shape = 16
) +
geom_text(
aes(col = labels, size = abs(labels), label = round(labels, 2))
) +
scale_size(range = c(0, 6)) +
scale_color_gradient2("r", limits = c(-1, 1)) +
scale_y_discrete("", limits = rev(levels(xx$Var1))) +
scale_x_discrete("") +
guides(size = FALSE) +
geom_abline(slope = -1, intercept = nlevels(xx$Var1) + 1) +
coord_fixed() +
facet_grid(. ~ Species) +
theme(axis.text.y = element_text(angle = 45, hjust = 1),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.background = element_blank())
# Explore africa
load('data/africa.RData')
africa_sample <- sample_n(africa, 50)
str(africa_sample)
## 'data.frame': 50 obs. of 3 variables:
## $ Sand: num 42 28 83 40 35 24 29 30 75 92 ...
## $ Silt: num 15 29 6 14 10 22 20 4 10 6 ...
## $ Clay: num 43 43 11 46 55 54 51 66 15 2 ...
head(africa_sample)
## Sand Silt Clay
## 34893 42 15 43
## 85261 28 29 43
## 64201 83 6 11
## 52595 40 14 46
## 44793 35 10 55
## 79175 24 22 54
# Add an ID column from the row.names
africa_sample$ID <- row.names(africa_sample)
# Gather africa_sample
library(tidyr)
africa_sample_tidy <- gather(africa_sample, key, value, -ID)
head(africa_sample_tidy)
## ID key value
## 1 34893 Sand 42
## 2 85261 Sand 28
## 3 64201 Sand 83
## 4 52595 Sand 40
## 5 44793 Sand 35
## 6 79175 Sand 24
# Finish the ggplot command
ggplot(africa_sample_tidy, aes(x = factor(ID), y = value, fill = key)) +
geom_col() +
coord_flip()
# The ggtern library is loaded
# Build ternary plot
str(africa)
## 'data.frame': 40093 obs. of 3 variables:
## $ Sand: num 24 36 56 52 65 43 42 47 57 51 ...
## $ Silt: num 12 14 18 21 3 14 22 19 15 14 ...
## $ Clay: num 64 50 26 27 32 43 36 34 28 35 ...
ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
geom_point(shape = 16, alpha = 0.2)
# ggtern and ggplot2 are loaded
# Plot 1
ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
geom_density_tern()
# Plot 2
ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
stat_density_tern(geom = 'polygon', aes(fill = ..level.., alpha = ..level..)) +
guides(fill = FALSE)
## I want to see all the points on there
ggtern(africa, aes(x = Sand, y = Silt, z = Clay)) +
geom_point(alpha = 0.1, color = "navyblue", size = .5) +
stat_density_tern(
geom = 'polygon',
aes(fill = ..level.., alpha = ..level..),
bins = 100
) +
guides(alpha = FALSE) +
scale_fill_viridis()
# Load geomnet & examine structure of madmen
# The geomnet library is loaded
str(madmen)
## List of 2
## $ edges :'data.frame': 39 obs. of 2 variables:
## ..$ Name1: Factor w/ 9 levels "Betty Draper",..: 1 1 2 2 2 2 2 2 2 2 ...
## ..$ Name2: Factor w/ 39 levels "Abe Drexler",..: 15 31 2 4 5 6 8 9 11 21 ...
## $ vertices:'data.frame': 45 obs. of 2 variables:
## ..$ label : Factor w/ 45 levels "Abe Drexler",..: 5 9 16 23 26 32 33 38 39 17 ...
## ..$ Gender: Factor w/ 2 levels "female","male": 1 2 2 1 2 1 2 2 2 2 ...
## This is a much better way to see whats in each list. Love it.
## library(purrr)
madmen %>% purrr::map(head)
## $edges
## Name1 Name2
## 1 Betty Draper Henry Francis
## 2 Betty Draper Random guy
## 3 Don Draper Allison
## 4 Don Draper Bethany Van Nuys
## 5 Don Draper Betty Draper
## 6 Don Draper Bobbie Barrett
##
## $vertices
## label Gender
## 1 Betty Draper female
## 2 Don Draper male
## 3 Harry Crane male
## 4 Joan Holloway female
## 5 Lane Pryce male
## 6 Peggy Olson female
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
# Examine structure of mmnet
head(mmnet)
## Name1 Name2 Gender
## 1 Betty Draper Henry Francis female
## 2 Betty Draper Random guy female
## 3 Don Draper Allison male
## 4 Don Draper Bethany Van Nuys male
## 5 Don Draper Betty Draper male
## 6 Don Draper Bobbie Barrett male
str(mmnet)
## 'data.frame': 75 obs. of 3 variables:
## $ Name1 : Factor w/ 45 levels "Betty Draper",..: 1 1 2 2 2 2 2 2 2 2 ...
## $ Name2 : Factor w/ 39 levels "Abe Drexler",..: 15 31 2 4 5 6 8 9 11 21 ...
## $ Gender: Factor w/ 2 levels "female","male": 1 1 2 2 2 2 2 2 2 2 ...
# geomnet is pre-loaded
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
head(mmnet)
## Name1 Name2 Gender
## 1 Betty Draper Henry Francis female
## 2 Betty Draper Random guy female
## 3 Don Draper Allison male
## 4 Don Draper Bethany Van Nuys male
## 5 Don Draper Betty Draper male
## 6 Don Draper Bobbie Barrett male
# Finish the ggplot command
ggplot(data = mmnet, aes(from_id = Name1, to_id = Name2)) +
geom_net(
aes(col = Gender),
size = 6,
linewidth = 1,
labelon = T,
fontsize = 3,
labelcolour = "black")
# geomnet is pre-loaded
# ggmap is already loaded
# Merge edges and vertices
mmnet <- merge(madmen$edges, madmen$vertices,
by.x = "Name1", by.y = "label",
all = TRUE)
# Tweak the network plot
ggplot(data = mmnet, aes(from_id = Name1, to_id = Name2)) +
geom_net(
aes(col = Gender),
size = 6,
linewidth = 1,
labelon = TRUE,
fontsize = 3,
labelcolour = "black",
directed = T) +
scale_color_manual(values = c("#FF69B4", "#0099ff")) +
xlim(c(-0.05, 1.05)) +
theme_nothing() +
theme(legend.key = element_blank())
# Create linear model: res
res <- lm(Volume ~ Girth, data = trees)
# Plot res
plot(res)
# Import ggfortify and use autoplot()
# library(ggfortify)
autoplot(res, ncol = 2)
# ggfortify and Canada are available
# Inspect structure of Canada
str(Canada)
## Time-Series [1:84, 1:4] from 1980 to 2001: 930 930 930 931 933 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "e" "prod" "rw" "U"
head(Canada)
## [1] 929.6105 929.8040 930.3184 931.4277 932.6620 933.5509
# Call plot() on Canada
plot(Canada)
# Call autoplot() on Canada
# autoplot is from the ggfortify library
autoplot(Canada)
?cmdscale
.# ggfortify and eurodist are available
str(eurodist)
## Class 'dist' atomic [1:210] 3313 2963 3175 3339 2762 ...
## ..- attr(*, "Size")= num 21
## ..- attr(*, "Labels")= chr [1:21] "Athens" "Barcelona" "Brussels" "Calais" ...
# Autoplot + ggplot2 tweaking
autoplot(eurodist) +
coord_fixed()
# Autoplot of MDS
autoplot(cmdscale(eurodist, eig = TRUE),
label = TRUE,
label.size = 3,
size = 0)
# Perform clustering
iris_k <- kmeans(iris[-5], 3)
# Autoplot: color according to cluster
autoplot(iris_k, data = iris, frame = T)
# Autoplot: above, plus shape according to species
autoplot(iris_k, data = iris, frame = T, shape = 'Species')