# Install ggplot2 only when it is not already available, so that re-running
# or knitting this script does not re-download the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2) #ensure ggplot installed
# Basic scatterplot: diamond price (y) against carat (x).
data(diamonds)
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  labs(
    title = "Diamond Carat vs. Price",
    x = "Carat",
    y = "Price"
  ) +
  geom_point() # one point per diamond
Question: What type of relationship appears between carat and price? Carat and price have a positive relationship: as one increases, so does the other.
# Same carat/price scatterplot, with each point coloured by the diamond's cut.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = cut)
) +
  labs(
    title = "Diamond Carat vs. Price",
    x = "Carat",
    y = "Price"
  ) +
  geom_point()
Question: Which cut appears to have higher prices at similar carat values? Premium and very good cuts appear to have higher prices at similar carat values.
# Carat/price scatterplot coloured by cut, with a linear fit overlaid.
# `color = cut` is set in the plot-level aes(), so it is inherited by both
# layers below.
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = cut)
) +
  labs(
    title = "Diamond Carat vs. Price",
    x = "Carat",
    y = "Price"
  ) +
  geom_point() + # scatterplot layer
  geom_smooth(method = "lm") # linear-model smoother layer
## `geom_smooth()` using formula = 'y ~ x'
Question: Does the relationship between carat and price appear linear? The relationship between carat and price appears positive and roughly linear.
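Question: What does the “lm” option do in the geom_smooth command? What are the other options and what do they do? The “lm” option fits a linear model, so the smoothing line is a straight regression line. The other options are “loess” (a local, curved smoother; the default for fewer than 1,000 observations), “gam” (a generalized additive model, also curved; the default for larger datasets), and “glm” (a generalized linear model). If no method is given, the default smoother is used and the line is curved.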
# Carat/price scatterplot tuned for overplotting: semi-transparent, jittered
# points coloured by cut, plus a single overall linear fit.
ggplot(diamonds, aes(x = carat, y = price)) + # x/y only; colour is per layer
  # geom_jitter() is geom_point() with position = "jitter". Adding both layers
  # drew every diamond twice (once in place, once jittered), which makes
  # overplotting worse, so the point styling is applied to one jittered layer.
  geom_jitter(aes(color = cut), shape = 17, size = 3, alpha = 0.5) +
  # No colour mapping reaches this layer, so one line is fitted to all
  # diamonds rather than one per cut.
  geom_smooth(method = "lm", color = "mediumblue", alpha = 0.5) +
  labs(
    x = "Carat", # x axis title
    y = "Price", # y axis title
    title = "Diamond Carat vs. Price" # figure title
  )
## `geom_smooth()` using formula = 'y ~ x'
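Question: Why is overplotting a concern with large datasets? When many points are drawn on top of one another, the plot becomes visually overwhelming and hard to read, and it is no longer possible to tell how many observations lie in a dense region.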
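Question: What does the alpha command do and how does it help with overplotting? The alpha argument controls opacity (0 is fully transparent, 1 is fully opaque). Making points semi-transparent keeps overlapping data visible, because areas where many points overlap appear darker.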
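Question: Based on what you see, what are the risks and benefits of using geom_jitter? The benefit is that geom_jitter can help distinguish points that would otherwise overlap. The risk is that it adds random noise, so points are no longer drawn at their exact values, and in very large datasets it may not make a huge difference.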
# Price against table, one panel per cut, with points coloured by clarity.
ggplot(data = diamonds, mapping = aes(x = table, y = price)) +
  labs(
    title = "Diamond Table vs. Price",
    x = "Table",
    y = "Price"
  ) +
  # semi-transparent points so overlapping diamonds remain visible
  geom_point(mapping = aes(color = clarity), alpha = 0.5) +
  facet_wrap(~cut) # one facet per cut level
Question: Does the relationship differ by cut? The relationship doesn’t appear to differ significantly by cut.
# Line plot of the number of unemployed people over time.
data(economics) # load the economics dataset shipped with ggplot2
ggplot(economics, aes(x = date, y = unemploy)) + # date on x, unemployed on y
  geom_line() + # draw the time series as a line
  labs(
    x = "Date", # x axis title
    # `unemploy` is the number of unemployed in thousands, not a rate, so the
    # axis is labelled as a count.
    y = "Unemployed (thousands)",
    title = "Unemployment Over Time" # figure title
  )
Question: Describe the overall trend over time. Overall, the number of unemployed people increases over time, although it rises and falls in cycles along the way rather than increasing steadily.
library(dplyr) #ensure package is installed
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) #ensure package is installed
# Reshape `economics` from wide to long: the `uempmed` and `psavert` columns
# are stacked so that the column name goes into `variable` and the
# measurement goes into `value`.
economics_long <- pivot_longer(
  data = economics,
  cols = c(uempmed, psavert), # columns to stack
  names_to = "variable",      # receives the original column names
  values_to = "value"         # receives the measurements
)
# Line plot of the two reshaped series, one coloured line per variable.
ggplot(economics_long, aes(x = date, y = value, color = variable)) +
  geom_line() + # one line per level of `variable`
  labs(
    x = "Date", # x axis title
    # The two series have different units (psavert is a percentage, uempmed
    # is in weeks), so the y axis keeps a generic label.
    y = "Value",
    # The plotted series are the personal savings rate and the median
    # duration of unemployment, not employment, so the title names them.
    title = "Savings Rate and Median Unemployment Duration Over Time"
  ) +
  theme_minimal() # minimal theme (ggplot2's default theme is theme_gray())
Question: Do these variables appear to move together over time? Both of these variables tend to move closely together/parallel over time.
# Polished version of the two-series line plot: thicker lines, a labelled
# date axis, and a descriptive title, subtitle and caption.
ggplot(economics_long, aes(x = date, y = value, color = variable)) +
  geom_line(linewidth = 0.7) + # slightly thicker lines
  scale_x_date(
    date_breaks = "5 years", # an axis break every five years
    date_labels = "%Y"       # label each break with the four-digit year
  ) +
  labs(
    x = "Date", # x axis title
    y = "Value", # y axis title (series have different units)
    # The series are the savings rate and unemployment duration, not
    # employment, so the title and subtitle name them explicitly.
    title = "Savings Rate and Median Unemployment Duration Over Time",
    subtitle = "psavert (personal savings rate, %) & uempmed (median unemployment duration, weeks)",
    # The data are monthly observations starting in 1967. Only the axis
    # breaks are five-yearly, so the caption says so.
    caption = "Monthly observations, 1967 - 2015; x axis labelled every five years."
  ) +
  theme_classic() # classic theme: axis lines, no grid