# Setup ----
# Install ggplot2 only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2) # load ggplot2 to create the plots
data(mpg) # load the mpg data
# Boxplot of highway mpg (hwy) per vehicle class, one box per drivetrain (drv).
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot() +
  labs(
    title = "Highway mpg per Vehicle Class",
    subtitle = "Boxplot shows distribution across vehicle classes",
    x = "Vehicle Class",
    y = "Highway mpg",
    caption = "Source: ggplot2 mpg dataset"
  ) +
  # theme_bw() is a complete theme: adding it replaces all earlier theme()
  # settings. It therefore has to come BEFORE the axis-text tweak, otherwise
  # the 45-degree rotation is silently discarded.
  theme_bw() +
  # hjust = 1 right-aligns the rotated labels so they end under their tick.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Questions:
1.Which vehicle class has the highest median highway mpg? Compact vehicles have the highest median highway mpg.
# Boxplot of highway mpg per vehicle class and drivetrain, with the individual
# observations drawn on top as semi-transparent points.
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  # The point layer below already draws every observation, so the boxplot's
  # own outlier markers are hidden to avoid plotting those cars twice.
  geom_boxplot(outlier.shape = NA) +
  # Jitter AND dodge the points so each one sits over the box of its own
  # drivetrain; a plain jitter of +/-0.5 would smear points across the full
  # class width and into the neighbouring classes.
  geom_point(
    aes(color = drv),
    alpha = 0.5,
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.75)
  ) +
  labs(
    title = "Highway mpg per Vehicle Class",
    subtitle = "Boxplot shows distribution across vehicle classes",
    x = "Vehicle Class",
    y = "Highway mpg",
    caption = "Source: ggplot2 mpg dataset"
  ) +
  # Complete theme first, tweaks afterwards: theme_bw() added last would
  # reset the axis-text rotation.
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Violin plot of city mpg (cty) per vehicle class with a narrow boxplot drawn
# on top of each violin.
ggplot(mpg, aes(x = class, y = cty)) +
  # trim = FALSE lets the violin tails extend past the observed min/max.
  geom_violin(fill = "purple1", alpha = 0.5, trim = FALSE) +
  # Narrow, semi-transparent boxplot so the violin stays visible behind it.
  geom_boxplot(width = 0.1, alpha = 0.5) +
  labs(
    x = "Vehicle Class",
    y = "City mpg",
    title = "City mpg per Vehicle Class",
    # The layers show the distribution (density, median, quartiles), not an
    # average, so the subtitle says so.
    subtitle = "Violin plot with overlaid boxplot showing the distribution of city mpg per vehicle class",
    caption = "Source: ggplot2 mpg dataset"
  )
Questions:
How does the violin plot help you understand the distribution compared to the boxplot alone? It gives us a better idea of the distribution, showing density across the boxplot.
Are there any classes with unusual distributions or outliers? Subcompact cars have a wider range/further-away outliers than any other vehicle class.
# Load the diamonds data.
data(diamonds)

# Simple barplot: one bar per cut, bar height = number of diamonds.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

# Stacked barplot: same counts per cut, each bar split by diamond color.
ggplot(data = diamonds, mapping = aes(x = cut, fill = color)) +
  geom_bar(position = "stack")
# Grouped barplot: bar height is the mean carat of each cut/color combination,
# with the bars for the different colors placed side by side.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat, fill = color)) +
  stat_summary(geom = "col", fun = mean, position = position_dodge())
Questions:
What does the stacked barplot show that the simple barplot does not? It shows the frequency of each color per cut category.
When is the grouped barplot more useful than the stacked barplot? When you need a clearer side-by-side comparison of the data.
# Install dplyr only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr) # load dplyr (group_by, summarise, arrange, %>%)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Average price for every cut/color combination.
# Group only by cut and color: grouping by price as well would put each
# distinct price in its own group, so mean(price) would just return that price
# and nothing would be averaged.
avg_ppc <- diamonds %>%
  group_by(cut, color) %>%
  # .groups = "drop" returns an ungrouped result, so later verbs are not
  # affected by leftover grouping.
  summarise(mean_pr = mean(price), .groups = "drop")

# Average price per cut, sorted from the lowest to the highest mean price.
order_pr <- diamonds %>%
  group_by(cut) %>%
  summarise(mean_pr = mean(price)) %>%
  arrange(mean_pr)

# Barplot of the average price per cut, one bar per diamond color.
ggplot(avg_ppc, aes(x = cut, y = mean_pr, fill = color)) +
  # Dodge the bars: the default stacking would add the per-color means on top
  # of each other, and the bar height would no longer be an average price.
  geom_col(position = "dodge") +
  # Rotate the x-axis labels 45 degrees and right-align them under the ticks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Cut",
    y = "Average Price",
    title = "Average Price per Diamond Cut",
    subtitle = "Barplot showing the relationship between diamond cut and price",
    caption = "Source: ggplot2 diamonds dataset"
  )
# Same average-price barplot, but computed by ggplot2 straight from the raw
# diamonds data instead of from a pre-summarised table.
ggplot(diamonds, aes(x = cut, y = price, fill = color)) +
  # stat_summary() is the only bar layer. An additional geom_col() here would
  # stack the raw price of every single diamond, stretching the y axis to the
  # sum of all prices and hiding the mean bars.
  stat_summary(
    fun = mean, # bar height = mean price of the cut/color group
    geom = "bar",
    position = "dodge" # colors side by side within each cut
  ) +
  # Rotate the x-axis labels 45 degrees and right-align them under the ticks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Cut",
    y = "Average Price",
    title = "Average Price per Diamond Cut",
    subtitle = "Barplot showing the relationship between diamond cut and price",
    caption = "Source: ggplot2 diamonds dataset"
  )