install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the diamonds example data set and preview its columns and types.
data("diamonds")
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Bubble chart of mtcars: weight on x, miles per gallon on y, and point size
# mapped to horsepower.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = hp)) +
  geom_point(colour = "steelblue", alpha = 0.7) +
  scale_size(range = c(3, 10)) +
  labs(
    x = "Weight (1000 lbs)",
    y = "Miles per Gallon",
    size = "Horsepower",
    title = "Fuel Efficiency vs Vehicle Weight",
    subtitle = "Bubble size represents horsepower"
  ) +
  theme_minimal()
Compare diamond price across cut quality.
# Plain boxplot of price for each cut grade.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

# Same plot with filled boxes and the x-axis labels rotated 45 degrees so
# they are easier to read.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Make the box fill semi-transparent.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(alpha = 0.6, fill = "steelblue") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Violin plot of price for each cut grade.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(alpha = 0.6, fill = "skyblue")

# Untrimmed violins with a narrow boxplot drawn on top.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(trim = FALSE, alpha = 0.4, fill = "lightblue") +
  geom_boxplot(alpha = 0.7, width = 0.1)

# Same overlay, with the boxplot outliers drawn as small red points.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(trim = FALSE, alpha = 0.4, fill = "lightblue") +
  geom_boxplot(
    alpha = 0.7,
    width = 0.1,
    outlier.colour = "red",
    outlier.size = 0.5
  )
# Highway mpg by vehicle class. The boxplot's own outlier markers are
# suppressed, and every observation is drawn as a red point jittered
# horizontally only (height = 0).
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    colour = "red",
    size = 2,
    alpha = 0.6,
    width = 0.2,
    height = 0
  ) +
  labs(
    x = "Vehicle Class",
    y = "Highway MPG",
    title = "Highway MPG by Vehicle Class with Jittered Outliers"
  ) +
  theme_minimal()
library(dplyr)
# Rows of mpg whose hwy value lies more than 1.5 * IQR below the first
# quartile or above the third quartile of its vehicle class.
# The helper columns q1, q3, iqr and is_outlier are kept in the result.
mpg_outliers <- mpg %>%
  group_by(class) %>%
  mutate(
    q1 = quantile(hwy, 0.25),
    q3 = quantile(hwy, 0.75),
    iqr = q3 - q1,
    is_outlier = hwy < (q1 - 1.5 * iqr) | hwy > (q3 + 1.5 * iqr)
  ) %>%
  filter(is_outlier) %>%
  # Drop the grouping so it cannot leak into later dplyr steps.
  ungroup()
# Overlay the pre-computed outliers (mpg_outliers) in red on per-class boxplots.
# height = 0 keeps the jitter horizontal, so every highlighted point stays at
# its true hwy value instead of being shifted up or down.

# Version 1: geom_boxplot() still draws its own outlier markers, so each
# outlier appears twice (boxplot marker plus red overlay point).
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  geom_jitter(
    data = mpg_outliers,
    width = 0.2,
    height = 0,
    color = "red",
    size = 2,
    alpha = 0.8
  ) +
  theme_minimal()

# Version 2: hide the boxplot's default outlier markers so only the red
# overlay points remain.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    data = mpg_outliers,
    width = 0.2,
    height = 0,
    color = "red",
    size = 2,
    alpha = 0.8
  ) +
  theme_minimal()
# Round the y-axis tick labels to the nearest 100.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::label_number(accuracy = 100))

# Zoom the y-axis to 0-5000 through the coordinate system.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5000))

# Limit the y scale itself to 0-5000; rows outside the limits are removed
# before the boxplot statistics are computed, which triggers a warning.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 5000))
## Warning: Removed 14714 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
scale_y_continuous(limits = …) → removes data outside the range before the boxplot statistics are computed; coord_cartesian(ylim = …) → zooms in without removing data.
# Fill each box by cut using the ColorBrewer "Set2" palette.
ggplot(
  data = diamonds,
  mapping = aes(x = cut, y = price, fill = cut)
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set2")

# Viridis palette (colorblind friendly); the _d suffix selects the discrete
# version of the scale.
ggplot(
  data = diamonds,
  mapping = aes(x = cut, y = price, fill = cut)
) +
  geom_boxplot() +
  scale_fill_viridis_d()
# Boxplot with a title, subtitle, axis labels and a source caption.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    x = "Cut Quality",
    y = "Price (USD)",
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset"
  )

# Same labels, plus explicit text styling for the title, subtitle and caption.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    x = "Cut Quality",
    y = "Price (USD)",
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(hjust = 1, size = 8)
  )
# Theme ordering demo: ORDER MATTERS.

# Variant 1: theme_minimal() is added LAST. Because it is a complete theme,
# it replaces the theme() adjustments added before it.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    x = "Cut Quality",
    y = "Price (USD)",
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(hjust = 1, size = 8)
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  theme(axis.text = element_text(size = 12)) +
  theme_minimal()

# Variant 2: theme_minimal() is added FIRST, so the theme() adjustments that
# follow are applied on top of it and remain in effect.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    x = "Cut Quality",
    y = "Price (USD)",
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(hjust = 1, size = 8)
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  theme(axis.text = element_text(size = 12))
In plain English:
Group the data by cut
Compute the median price within each cut
Reorder the cut categories from lowest median price to highest
Use that new order on the x-axis
# What if you don't like the order the factor has by default?

# Reorder the levels of cut by the median price within each cut, from the
# lowest median to the highest.
ggplot(
  data = diamonds,
  mapping = aes(x = reorder(cut, price, median), y = price)
) +
  geom_boxplot(fill = "steelblue") +
  coord_cartesian(ylim = c(0, 5000))

# Negating price reverses the ordering: highest median price first.
ggplot(
  data = diamonds,
  mapping = aes(x = reorder(cut, -price, median), y = price)
) +
  geom_boxplot(fill = "steelblue") +
  coord_cartesian(ylim = c(0, 5000))

# What if we try "decr"/desc() to reverse the order instead?
# Caution: desc() does not work inside base reorder(); negate the value as
# shown above.
# Polished comparison plot: untrimmed violins with narrow boxplots on top,
# filled by cut with the discrete viridis palette and zoomed to 0-12000.
ggplot(
  data = diamonds,
  mapping = aes(x = cut, y = price, fill = cut)
) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  geom_boxplot(alpha = 0.7, width = 0.1) +
  coord_cartesian(ylim = c(0, 12000)) +
  scale_fill_viridis_d() +
  labs(
    x = "Cut Quality",
    y = "Price (USD)",
    title = "Distribution of Diamond Prices by Cut",
    subtitle = "Violin + Boxplot Comparison",
    caption = "Source: ggplot2 diamonds dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(hjust = 1, angle = 45)
  )
# Treat cylinder and gear counts as categorical variables.
factor_cols <- c("cyl", "gear")
mtcars[factor_cols] <- lapply(mtcars[factor_cols], as.factor)

# Number of cars per cylinder group.
# geom_bar() uses stat = "count" by default: it counts the observations in
# each category, so no y variable is needed.
ggplot(data = mtcars, mapping = aes(x = cyl)) +
  geom_bar()
library(dplyr)
# To plot anything other than counts, build a summary table first:
# mean mpg for each cylinder group.
avg_mpg <- summarise(group_by(mtcars, cyl), mean_mpg = mean(mpg))

# geom_bar() -> counts rows; geom_col() -> uses the y values you provide.
# This distinction confuses students constantly.
ggplot(data = avg_mpg, mapping = aes(x = cyl, y = mean_mpg)) +
  geom_col()
# Stacked barplot: bar height is the number of cars per cylinder group and
# the segments show how those cars split across gear types.
# Caveats of stacked bars:
#   - middle segments are hard to compare,
#   - segments are hard to compare across bars,
#   - the stacking order affects readability.
ggplot(data = mtcars, mapping = aes(x = cyl, fill = gear)) +
  geom_bar()

# Proportions instead of counts: every bar is scaled to 100%, so the plot
# shows composition rather than count.
ggplot(data = mtcars, mapping = aes(x = cyl, fill = gear)) +
  geom_bar(position = "fill")
# Grouped barplot of mean mpg per cylinder group, dodged by gear.
# stat_summary() computes the summary inside ggplot; compare this with the
# pre-summarised table approach.
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg, fill = gear)) +
  stat_summary(geom = "col", fun = mean, position = "dodge")

# The addition of error bars: mean_se supplies the mean and standard-error
# limits, and both layers share the same dodge width.
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg, fill = gear)) +
  stat_summary(
    geom = "col",
    fun = mean,
    position = position_dodge(width = 0.9)
  ) +
  stat_summary(
    geom = "errorbar",
    fun.data = mean_se,
    width = 0.2,
    position = position_dodge(width = 0.9)
  )
# Highway mpg by vehicle class, with boxes filled by drivetrain (drv).
data(mpg)
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot() +
  labs(
    x = "Type of Car",
    y = "Highway MPG",
    title = "Highway MPG per Type of Car",
    caption = "Source: ggplot2 mpg dataset",
    subtitle = "Boxplot shows each class of car and their MPG on a highway"
  ) +
  theme_minimal() +
  # hjust = 1 right-aligns the rotated labels on their ticks, matching the
  # other rotated-label plots in this file.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Questions:
1. Which vehicle class has the highest median highway mpg? The class with the highest median highway mpg is compact, with a median of 29. 2. Which has the largest variability? Subcompact has the largest variability.
Create an alteration of this plot where the outliers are shown in a way in which they do not overlap (consider jittering and coloring).
# Flag outliers within each class/drv combination. The plot below maps
# fill = drv, so one box is drawn per class and drivetrain; computing the
# 1.5 * IQR fences on the same groups keeps the highlighted points consistent
# with the boxes whose built-in outlier markers are hidden.
mpg_out2 <- mpg %>%
  group_by(class, drv) %>%
  mutate(
    q1 = quantile(hwy, 0.25),
    q3 = quantile(hwy, 0.75),
    iqr = q3 - q1,
    is_outlier = hwy < (q1 - 1.5 * iqr) | hwy > (q3 + 1.5 * iqr)
  ) %>%
  filter(is_outlier) %>%
  ungroup()

# Boxplots without their default outlier markers, with the flagged outliers
# drawn as gold points on top.
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot(outlier.shape = NA) +
  labs(
    x = "Type of Car",
    y = "Highway MPG",
    title = "Highway MPG per Type of Car",
    caption = "Source: ggplot2 mpg dataset",
    subtitle = "Boxplot shows each class of car and their MPG on a highway"
  ) +
  # height = 0 keeps the jitter horizontal so points stay at their true hwy
  # value. NOTE: the points are jittered around the class position and are
  # not dodged per drv box.
  geom_jitter(
    data = mpg_out2,
    color = "gold",
    size = 2,
    width = 0.25,
    height = 0,
    alpha = 0.7
  ) +
  theme_minimal() +
  # hjust = 1 right-aligns the rotated labels on their ticks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# City mpg per vehicle class: untrimmed violins filled by class, with narrow
# boxplots (outlier markers hidden) drawn on top.
ggplot(data = mpg, mapping = aes(x = class, y = cty, fill = class)) +
  geom_violin(alpha = 0.4, trim = FALSE) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7, width = 0.1) +
  labs(
    x = "Class",
    y = "MPG (city)",
    title = "Distribution of City MPG per Class",
    subtitle = "Violin & Boxplot Comparison",
    caption = "Source: ggplot2 mpg dataset"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
Questions:
data(diamonds)

# Number of diamonds per cut.
# The `+` after theme_minimal() is required: without it labs() is evaluated
# as a separate statement, its labels object is merely printed to the
# console, and the title and axis labels never reach the plot.
ggplot(diamonds, aes(x = cut)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Basic Barplot Showing Diamonds by Cut",
    x = "Cut",
    y = "Amount"
  )
# Number of diamonds per cut, stacked by color.
# The `+` after theme_minimal() is required: without it labs() is evaluated
# as a separate statement, its labels object is merely printed to the
# console, and the title, axis and legend labels never reach the plot.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Stacked Barplot Showing Diamonds by Cut and Color",
    x = "Cut",
    y = "Amount",
    fill = "Color"
  )
# Number of diamonds per cut, with one dodged bar per color.
# The `+` after theme_minimal() is required: without it labs() is evaluated
# as a separate statement, its labels object is merely printed to the
# console, and the title, axis and legend labels never reach the plot.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Grouped Barplot Showing Diamonds by Cut and Color",
    x = "Cut",
    y = "Amount",
    fill = "Color"
  )
Questions:
Dataset: diamonds
# Mean price for each cut.
avg_price <- summarize(
  group_by(diamonds, cut),
  mean_price = mean(price, na.rm = TRUE)
)

# Sort the rows by mean price and rebuild cut as a factor whose levels follow
# that row order, so the bars appear from the lowest mean price to the highest.
avg_price2 <- arrange(avg_price, mean_price)
avg_price2 <- mutate(avg_price2, cut = factor(cut, levels = cut))

# Column chart of the pre-computed means.
ggplot(data = avg_price2, mapping = aes(x = cut, y = mean_price)) +
  geom_col(fill = "cyan") +
  labs(
    x = "Cut",
    y = "average price (USD)",
    title = "Average Diamond Price by Cut"
  ) +
  theme_minimal()
Create the same plot using the stat_summary approach. Add error bars to the plot.
# Mean price per cut computed inside ggplot with stat_summary(), with
# error bars taken from mean_se.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  stat_summary(geom = "col", fun = "mean") +
  stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.2) +
  labs(
    x = "Cut",
    y = "average price (USD)",
    title = "Average Diamond Price by Cut"
  )