# Install each package only when it is missing, so re-running or re-knitting
# this document does not reinstall it every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Bring the diamonds data set into the workspace, then preview its columns
# and their types.
data("diamonds")

glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

Something I want to add on from last class:

# Bubble chart of fuel efficiency against weight; point size encodes
# horsepower.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = hp)) +
  geom_point(colour = "steelblue", alpha = 0.7) +
  scale_size(range = c(3, 10)) +
  labs(
    title = "Fuel Efficiency vs Vehicle Weight",
    subtitle = "Bubble size represents horsepower",
    x = "Weight (1000 lbs)",
    y = "Miles per Gallon",
    size = "Horsepower"
  ) +
  theme_minimal()

Boxplots

Basic boxplot

Compare diamond price across cut quality.

# Baseline: one box per cut with default styling.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

# Improve readability: fill the boxes and tilt the x-axis labels 45 degrees.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Same plot with a semi-transparent fill.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(alpha = 0.6, fill = "steelblue") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Violin Plots

# Violin only: the outline shows the shape of the price distribution per cut.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(alpha = 0.6, fill = "skyblue")

# Violin with a narrow boxplot drawn on top. The violin layer comes first so
# the box is not hidden behind it.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(trim = FALSE, alpha = 0.4, fill = "lightblue") +
  geom_boxplot(alpha = 0.7, width = 0.1)

Outlier Formatting

# Violin + boxplot, with the boxplot's outlier points drawn small and red.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_violin(trim = FALSE, alpha = 0.4, fill = "lightblue") +
  geom_boxplot(
    outlier.colour = "red",
    outlier.size = 0.5,
    alpha = 0.7,
    width = 0.1
  )

# Hide the boxplot's own outlier points and overlay jittered points instead.
# Note: geom_jitter() is given no data of its own here, so it draws every
# observation in mpg, not just the outliers.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    colour = "red", size = 2, alpha = 0.6,
    width = 0.2, height = 0
  ) +
  labs(
    title = "Highway MPG by Vehicle Class with Jittered Outliers",
    x = "Vehicle Class",
    y = "Highway MPG"
  ) +
  theme_minimal()

library(dplyr)

# Keep only the observations that fall outside the 1.5 * IQR fences of their
# vehicle class: below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
mpg_outliers <- mpg %>%
  group_by(class) %>%
  mutate(
    q1 = quantile(hwy, 0.25),
    q3 = quantile(hwy, 0.75),
    iqr = q3 - q1,
    is_outlier = hwy < (q1 - 1.5 * iqr) | hwy > (q3 + 1.5 * iqr)
  ) %>%
  ungroup() %>% # drop the grouping so later verbs are not run per class
  filter(is_outlier)

# Plot
# Overlay the flagged outliers in red. The boxplot still draws its own outlier
# points in this version.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() + # note: this is keeping the default outliers already
  geom_jitter(
    data = mpg_outliers,
    width = 0.2, height = 0, # jitter sideways only so hwy values stay exact
    color = "red", size = 2, alpha = 0.8
  ) +
  theme_minimal()

# Same overlay, with the boxplot's own outlier points hidden.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot(outlier.shape = NA) + # hide default outliers
  geom_jitter(
    data = mpg_outliers,
    width = 0.2, height = 0, # jitter sideways only so hwy values stay exact
    color = "red", size = 2, alpha = 0.8
  ) +
  theme_minimal()

Additional Axis Formatting

# Format the y-axis labels with an accuracy of 100.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  scale_y_continuous(labels = scales::label_number(accuracy = 100)) +
  geom_boxplot()

# Restrict the visible y range through the coordinate system.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  coord_cartesian(ylim = c(0, 5000)) +
  geom_boxplot()

# Restrict the y range through the scale limits instead.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  scale_y_continuous(limits = c(0, 5000)) +
  geom_boxplot()
## Warning: Removed 14714 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

scale_y_continuous(limits = …) → removes data outside the range.
coord_cartesian() → zooms without removing data.

Color Palettes

# Fill each box by cut using the ColorBrewer "Set2" palette.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, fill = cut)) +
  scale_fill_brewer(palette = "Set2") +
  geom_boxplot()

# Viridis is colorblind friendly; the _d suffix selects the discrete scale.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, fill = cut)) +
  scale_fill_viridis_d() +
  geom_boxplot()

Subtitles and Captions

# Boxplot with a title, subtitle, axis titles and a source caption.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  labs(
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset",
    x = "Cut Quality",
    y = "Price (USD)"
  ) +
  geom_boxplot(fill = "steelblue")

Formatting Titles

# Same labelled boxplot, now styling the title (bold, 14), subtitle (12) and
# caption (8, right-aligned) through theme().
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  theme(
    plot.caption = element_text(hjust = 1, size = 8),
    plot.subtitle = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  labs(
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    caption = "Source: ggplot2 diamonds dataset",
    x = "Cut Quality",
    y = "Price (USD)"
  )

Improving Axis Readability

# Discuss the ordering of themes!
# The two plots below use the same layers, labels and theme() tweaks; the only
# difference is where theme_minimal() sits in the chain.

# Version 1: theme_minimal() is added LAST. It is a complete theme, so it
# replaces the theme() customisations added before it (title/subtitle/caption
# styling, rotated x labels, axis text size).
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    x = "Cut Quality",
    y = "Price (USD)",
    caption = "Source: ggplot2 diamonds dataset"
  ) + theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(size = 8, hjust = 1)
  ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text = element_text(size = 12)) +
  theme_minimal () ########## ORDER MATTERS

# Version 2: theme_minimal() is added FIRST, and the theme() calls that follow
# adjust it, so the customisations are kept.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot(fill = "steelblue") +
  labs(
    title = "Diamond Prices by Cut Quality",
    subtitle = "Boxplots show distribution of price across cut categories",
    x = "Cut Quality",
    y = "Price (USD)",
    caption = "Source: ggplot2 diamonds dataset"
  ) +   theme_minimal () +########## ORDER MATTERS
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(size = 8, hjust = 1)
  ) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text = element_text(size = 12)) 

Reordering Data

In plain English:

  1. Group the data by cut

  2. Compute the median price within each cut

  3. Reorder the cut categories from lowest median price to highest

  4. Use that new order on the x-axis

# What if you don't like the order that is set by default?

# Ascending: reorder the levels of cut based on the median value of price
# within each cut.
ggplot(
  data = diamonds,
  mapping = aes(x = reorder(cut, price, median), y = price)
) +
  coord_cartesian(ylim = c(0, 5000)) +
  geom_boxplot(fill = "steelblue")

# Descending: same idea, but negating price reverses the resulting order.
ggplot(
  data = diamonds,
  mapping = aes(x = reorder(cut, -price, median), y = price)
) +
  coord_cartesian(ylim = c(0, 5000)) +
  geom_boxplot(fill = "steelblue")

# What if we use desc() to reverse the order instead?
# Warning: desc() does not work inside base reorder().

Combining it all

# Everything together: violin + narrow boxplot filled by cut, viridis palette,
# y axis zoomed to 0-12000, full labelling, and a minimal theme with tilted
# x labels and a bold title.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, fill = cut)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  geom_boxplot(alpha = 0.7, width = 0.1) +
  coord_cartesian(ylim = c(0, 12000)) +
  scale_fill_viridis_d() +
  labs(
    title = "Distribution of Diamond Prices by Cut",
    subtitle = "Violin + Boxplot Comparison",
    caption = "Source: ggplot2 diamonds dataset",
    x = "Cut Quality",
    y = "Price (USD)"
  ) +
  # theme_minimal() must come before theme() so the tweaks below are kept.
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(hjust = 1, angle = 45)
  )

Barplots

# Convert the cylinder and gear columns of mtcars to factors in one step.
mtcars[c("cyl", "gear")] <- lapply(mtcars[c("cyl", "gear")], as.factor)

# geom_bar() uses stat = "count" by default: it counts the observations in
# each category, so no y variable is needed.
ggplot(data = mtcars, mapping = aes(x = cyl)) +
  geom_bar()

library(dplyr)

# For anything other than counts, build a summary table first.
avg_mpg <- mtcars %>%
  group_by(cyl) %>%
  summarise(
    mean_mpg = mean(mpg)
  )

# geom_col() plots the y values it is given.
ggplot(data = avg_mpg, mapping = aes(x = cyl, y = mean_mpg)) +
  geom_col()

# geom_bar() -> counts
# geom_col() -> uses provided y values
# This distinction confuses students constantly.

# Stacked barplot: bar height is the total count of cars per cylinder group,
# and the segments show how gear types are distributed within it.
ggplot(data = mtcars, mapping = aes(x = cyl, fill = gear)) +
  geom_bar()

# Caveats of stacking:
# - hard to compare middle segments
# - hard to compare across bars
# - order of stacking affects readability

# Show proportions instead.
ggplot(data = mtcars, mapping = aes(x = cyl, fill = gear)) +
  geom_bar(position = "fill")

# With position = "fill" each bar = 100%: it shows composition, not count.

# Grouped barplot: stat_summary() calculates the mean inside ggplot
# (compare with the pre-summarised approach) and dodges the bars by gear.
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg, fill = gear)) +
  stat_summary(
    geom = "col",
    fun = mean,
    position = "dodge"
  )

# The addition of error bars: a second stat_summary() layer draws mean_se
# error bars, dodged by the same width as the columns.
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg, fill = gear)) +
  stat_summary(
    geom = "col",
    fun = mean,
    position = position_dodge(width = 0.9)
  ) +
  stat_summary(
    geom = "errorbar",
    fun.data = mean_se,
    width = 0.2,
    position = position_dodge(width = 0.9)
  )

Homework :

Part 1: Boxplots (20 pts)

  • Dataset: mpg
  • Create a boxplot of hwy (highway mpg) for each vehicle class (class).
  • Rotate the x-axis text 45 degrees for readability.
  • Add a title, subtitle, and caption to your plot.
  • Color the boxes by drivetrain (drv).
data(mpg)
# Highway mpg per vehicle class, with one box per drivetrain within each class.
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot() +
  labs(
    x = "Type of Car",
    y = "Highway MPG",
    title = "Highway MPG per Type of Car",
    caption = "Source: ggplot2 mpg dataset",
    subtitle = "Boxplot shows each class of car and their MPG on a highway"
  ) +
  theme_minimal() +
  # hjust = 1 right-aligns the rotated labels, matching the other rotated axes
  # in this document.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Questions:

1. Which vehicle class has the highest median highway mpg? Compact vehicles have the highest median highway mpg, with a median of 29.
2. Which has the largest variability? Subcompact has the largest variability.

Create an alteration of this plot where the outliers are shown in a way in which they do not overlap (consider jittering and coloring).

# Flag outliers within each class x drivetrain group. The plot below draws one
# box per (class, drv) combination, so the 1.5 * IQR fences have to be computed
# on the same groups for the points to match the outliers the boxes hide.
mpg_out2 <- mpg %>%
  group_by(class, drv) %>%
  mutate(
    q1 = quantile(hwy, 0.25),
    q3 = quantile(hwy, 0.75),
    iqr = q3 - q1,
    is_outlier = hwy < (q1 - 1.5 * iqr) | hwy > (q3 + 1.5 * iqr)
  ) %>%
  ungroup() %>%
  filter(is_outlier)

# Boxplots with their own outlier points hidden; the flagged outliers are drawn
# as gold points jittered around the class position.
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot(outlier.shape = NA) +
  labs(
    x = "Type of Car",
    y = "Highway MPG",
    title = "Highway MPG per Type of Car",
    caption = "Source: ggplot2 mpg dataset",
    subtitle = "Boxplot shows each class of car and their MPG on a highway"
  ) +
  geom_jitter(
    data = mpg_out2,
    color = "gold", size = 2, alpha = 0.7,
    width = 0.25, height = 0 # jitter sideways only so hwy values stay exact
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Part 2: Violin Plots

  • Dataset: mpg
  • Create a violin plot of cty (city mpg) for each vehicle class.
  • Overlay boxplots on top of the violins. Change as necessary for aesthetics.
  • Use a transparent fill so that both distributions and boxplots are visible.
  • Add a descriptive title, subtitle, and labels.
# City mpg per class: semi-transparent violins with a narrow boxplot on top.
# The boxplot's outlier points are hidden.
ggplot(data = mpg, mapping = aes(x = class, y = cty, fill = class)) +
  geom_violin(alpha = 0.4, trim = FALSE) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7, width = 0.1) +
  # theme_minimal() first, then the axis-text tweak, so the tweak is kept.
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  labs(
    title = "Distribution of City MPG per Class",
    subtitle = "Violin & Boxplot Comparison",
    caption = "Source: ggplot2 mpg dataset",
    x = "Class",
    y = "MPG (city)"
  )

Questions:

  1. How does the violin plot help you understand the distribution compared to the boxplot alone? The violin plot shows how the data are spread out within each boxplot. Combining both gives a clearer picture of the data.
  2. Are there any classes with unusual distributions or outliers? The 2seater class has a very stubby boxplot with a large spread, and the subcompact class has an outlier far higher than the rest of its data.

Part 3: Barplots

  • Dataset: diamonds
  • Create a simple barplot showing the number of diamonds for each cut.
  • Create a stacked barplot showing the number of diamonds by cut and colored by color.
  • Create a grouped (side-by-side) barplot showing the same data.
data(diamonds)
# Number of diamonds per cut. labs() has to be chained onto the plot with `+`;
# called on its own it only prints a labels object and the plot keeps its
# default titles.
ggplot(diamonds, aes(x = cut)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Basic Barplot Showing Diamonds by Cut",
    x = "Cut",
    y = "Amount"
  )
# Number of diamonds per cut, stacked by color. labs() is chained with `+` so
# the title, axis titles and legend title are actually applied to the plot.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Stacked Barplot Showing Diamonds by Cut and Color",
    x = "Cut",
    y = "Amount",
    fill = "Color"
  )
# Number of diamonds per cut with one bar per color, side by side. labs() is
# chained with `+` so the title, axis titles and legend title are applied.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Grouped Barplot Showing Diamonds by Cut and Color",
    x = "Cut",
    y = "Amount",
    fill = "Color"
  )

Questions:

  1. What does the stacked barplot show that the simple barplot does not? The stacked barplot shows which colors are present in each cut, along with how many diamonds are in each cut.
  2. When is the grouped barplot more useful than the stacked barplot? The grouped barplot is more useful when you want to see the spread of the colors within each cut type. If that doesn’t matter to you, you can use the stacked barplot.

Part 4: Barplots Cont.

Dataset: diamonds

  • Compute the average price of diamonds for each cut using dplyr.
  • Create a barplot showing the average price for each cut (geom_col()).
  • Add meaningful axis labels, a title, subtitle, and caption.
  • Color the bars by color or clarity to show subgroup differences.
  • Rotate x-axis labels if needed for readability.
  • Reorder by average price to make comparisons clearer.
# Mean price per cut.
avg_price <- diamonds %>%
  group_by(cut) %>%
  summarise(mean_price = mean(price, na.rm = TRUE))

# Sort by mean price and rebuild cut as a factor whose levels follow that
# sorted order, so the bars are drawn in ascending order of mean price.
avg_price2 <- avg_price %>%
  arrange(mean_price) %>%
  mutate(cut = factor(cut, levels = cut))

ggplot(data = avg_price2, mapping = aes(x = cut, y = mean_price)) +
  geom_col(fill = "cyan") +
  labs(
    title = "Average Diamond Price by Cut",
    x = "Cut",
    y = "average price (USD)"
  ) +
  theme_minimal()

Create the same plot using the stat_summary approach. Add error bars to the plot.

# Same chart built with stat_summary() instead of a pre-computed table:
# - reorder() sorts cut by mean price, matching the ordering of the geom_col()
#   version above;
# - the first stat_summary() draws the mean as a column;
# - the second adds mean_se error bars.
ggplot(diamonds, aes(x = reorder(cut, price, FUN = mean), y = price)) +
  stat_summary(fun = "mean", geom = "col", fill = "cyan") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.2) +
  theme_minimal() +
  labs(
    title = "Average Diamond Price by Cut",
    x = "Cut",
    y = "average price (USD)"
  )