# List any packages you need to use here
packages <- c("ggplot2", "readr", "tidyverse", "dplyr", "ggpubr")
#Check to see if any of your listed packages need installed
#' Attach a package, installing it first when it is not available.
#'
#' @param pkg Package name as a single character string.
#' @return `NULL`, invisibly; called for its side effect of attaching `pkg`.
check_install_packages <- function(pkg) {
  # requireNamespace() only checks availability; attaching is left to
  # library(), which stops with an error if the package still cannot be loaded.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
  invisible(NULL)
}
# Download the packages and read in the libraries if necessary
sapply(packages, check_install_packages)
## $ggplot2
## NULL
##
## $readr
## NULL
##
## $tidyverse
## NULL
##
## $dplyr
## NULL
##
## $ggpubr
## NULL
data("USArrests")
view(USArrests)
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
?USArrests
USArrests data questions: 1: What are the variables available? murder, assault, urbanpop, and rape
2: How is each variable defined or calculated? Murder, Assault, and Rape are numeric values giving the number of arrests per 100,000 residents; UrbanPop is defined as the percent of the population living in urban areas
3: Is each one numerical or categorical? each is numerical
Horsepower Plot:
# libraries
library(ggplot2)
# General format: call ggplot() with the data frame name (mtcars), then define
# the x and y variables. Horsepower is the predictor (x) and fuel efficiency
# the response (y), which is what the title and axis labels below describe.
data("mtcars")
ggplot(mtcars, aes(x = hp, y = mpg, colour = cyl)) +
  # a single point layer: star-shaped points of size 2.4, coloured by cyl as a
  # gradient (an extra default geom_point() would draw every car twice)
  geom_point(size = 2.4, shape = 8) +
  # change theme to minimal
  theme_minimal() +
  # change the legend position (after the complete theme so it is not reset)
  theme(legend.position = "bottom") +
  # labeling the graph
  labs(
    title = "Effect of Horsepower on Fuel Efficiency",
    subtitle = "Categorized by Number of Cylinders",
    x = "Horsepower",
    y = "Fuel Efficiency (MPG)"
  )
Iris Plot:
# using a different demo data set to create another graph
library(ggplot2)
# the built-in data set is called "iris"
data("iris")
# create plot: one box of petal lengths per species, filled by species
ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  geom_boxplot() +
  # change theme
  theme_classic() +
  # labels
  labs(title = "Petal Length of Iris by Species") +
  # custom legend position, given as relative coordinates inside the plot
  # NOTE(review): ggplot2 >= 3.5.0 deprecates a numeric legend.position in
  # favour of legend.position = "inside" plus legend.position.inside; the
  # numeric form is kept because the installed version is not visible here.
  theme(legend.position = c(0.2, 0.8))
task 1: Scatter Plot and Regression Line:
#Load data
data("USArrests")
#view the data set
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
#load proper packages
library(ggplot2)
# create a scatter plot with the chosen x and y variables
ggplot(USArrests, aes(x = Murder, y = Assault)) +
  # linear model regression line with its confidence band, added first so the
  # points are drawn on top of it
  geom_smooth(method = "lm", color = "black", fill = "red") +
  # one point layer only: large light-blue filled circles (a default
  # geom_point() underneath would be completely hidden by these)
  geom_point(size = 3.5, shape = 21, fill = "lightblue") +
  # set theme
  theme_classic() +
  # add labels
  labs(
    title = "Scatter Plot of Assault vs. Murder Rates",
    x = "Murder Rate",
    y = "Assault Rate"
  ) +
  # change the font sizes a little
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
Task 2: Challenge Line Plot Creation:
# the state names only exist as row names; copy them into a regular column so
# they can be mapped to the x axis
USArrests[["State"]] <- row.names(USArrests)
# per-state mean of the three arrest-rate columns
USArrests[["AverageCrimeRate"]] <- rowMeans(
  USArrests[, c("Murder", "Assault", "Rape")]
)
# Create the line plot; group = 1 joins all states into a single line
ggplot(USArrests, aes(x = State, y = AverageCrimeRate, group = 1)) +
  # Line
  geom_line(size = 1, color = "steelblue") +
  # Points
  geom_point(size = 3, color = "red") +
  theme_classic() +
  # Rotate x-axis labels so the state names stay readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Line Plot of Average Crime Rate by State",
    x = "State",
    y = "Average Crime Rate"
  )
#notes: couldn't figure out how to add the states to the x axis the first time
Replica With Half Violin Plot:
library(readr)
CAM <- read_csv("Violin_Plot_Data.csv")
View(CAM)
#load all packages needed for this module
library("ggplot2")
library("readr")
library("tidyverse")
library("dplyr")
library("ggpubr")
library("see")
library("scales")
#view the data
head(CAM)
## # A tibble: 2 × 21
## F1Performance Repeat1 Repeat2 Repeat3 Repeat4 Repeat5 Repeat6 Repeat7 Repeat8
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SVMWithGradCA… 0.670 0.702 0.681 0.711 0.649 0.716 0.714 0.685
## 2 SVMWithDeepSh… 0.674 0.610 0.631 0.618 0.663 0.609 0.624 0.643
## # ℹ 12 more variables: Repeat9 <dbl>, Repeat10 <dbl>, Repeat11 <dbl>,
## # Repeat12 <dbl>, Repeat13 <dbl>, Repeat14 <dbl>, Repeat15 <dbl>,
## # Repeat16 <dbl>, Repeat17 <dbl>, Repeat18 <dbl>, Repeat19 <dbl>,
## # Repeat20 <dbl>
# give your newly formatted data a name you will recognize, in this case "data_long"
# (it has to exist before any min/max can be calculated from it)
data_long <- CAM %>%
  # Pivot the data from having many columns to many rows
  pivot_longer(
    cols = starts_with("Repeat"), # Select columns to pivot
    names_to = "Repeat",
    values_to = "values" # give the newly created column a name
  )
# view the resulting long-format data
head(data_long)
## # A tibble: 6 × 3
## F1Performance Repeat values
## <chr> <chr> <dbl>
## 1 SVMWithGradCAMMaps Repeat1 0.670
## 2 SVMWithGradCAMMaps Repeat2 0.702
## 3 SVMWithGradCAMMaps Repeat3 0.681
## 4 SVMWithGradCAMMaps Repeat4 0.711
## 5 SVMWithGradCAMMaps Repeat5 0.649
## 6 SVMWithGradCAMMaps Repeat6 0.716
# calculating min and max of the pivoted values; they set the axis limits and
# the break positions further down
min_value <- min(data_long$values, na.rm = TRUE)
max_value <- max(data_long$values, na.rm = TRUE)
# writing ggplot2 base data: classifier on x (flipped to the vertical axis
# below), F1 value on y, fill by classifier
ggplot(data_long, aes(x = F1Performance, y = values, fill = F1Performance)) +
  # raw data points, coloured by classifier
  geom_jitter(aes(color = F1Performance), size = 6, width = 0.1, height = 0) +
  # one translucent half violin with quartile lines; transparency is a fixed
  # setting, so it is passed to the layer instead of being mapped inside aes()
  geom_violinhalf(alpha = 0.2, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # flip the axis
  coord_flip() +
  # changing the color to orange and purple
  scale_fill_manual(values = c("purple4", "darkorange2")) +
  scale_color_manual(values = c("purple4", "darkorange2")) +
  # add summary statistic (the median) and highlight it
  stat_summary(
    fun = median, geom = "point", shape = 21, size = 3,
    fill = "white", color = "black", stroke = 1.5
  ) +
  # changing theme
  theme_minimal() +
  theme(
    # changing y axis: hide the category axis completely
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    # changing x axis
    axis.line.x = element_line(color = "black", size = 2),
    # Remove major grid lines for y axis
    panel.grid.major.y = element_blank(),
    # Remove minor grid lines for x axis
    panel.grid.minor.x = element_blank(),
    # changing the major x axis grid lines
    panel.grid.major.x = element_line(color = "grey", linetype = "dashed", size = 1.5),
    # centred, bold title
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "none"
  ) +
  # adding text labels; annotate() draws each label once, whereas geom_text()
  # with constant aesthetics repeats it for every row of data_long
  annotate(
    "text", x = "SVMWithGradCAMMaps", y = 0.64, label = "SVM + GRAD-CAM++",
    vjust = -3.5, color = "darkorange2", size = 4.5
  ) +
  # NOTE(review): assumes the second category is spelled "SVMWithDeepShapMaps"
  # (the printed tibble truncates it to "SVMWithDeepSh…") - confirm with
  # unique(data_long$F1Performance). A misspelled value here adds an empty
  # extra category to the axis. vjust now matches the other label because the
  # label sits on its own category; tune it if the placement looks off.
  annotate(
    "text", x = "SVMWithDeepShapMaps", y = 0.6, label = "SVM + Deep SHAP",
    vjust = -3.5, color = "purple4", size = 4.5
  ) +
  scale_y_continuous(
    limits = c(min_value, max_value),
    breaks = seq(min_value, max_value, by = 0.02)
  ) +
  # adding a title and changing axis name (bold face is set in theme() above)
  labs(
    title = "Fig. 7. Grad-CAM++ saliency maps capture unique predictive information.",
    y = "F1"
  )
#that was really difficult
Replica With Full Violin Plot:
library(readr)
CAM <- read_csv("Violin_Plot_Data.csv")
View(CAM)
#load all packages needed for this module
library("ggplot2")
library("readr")
library("tidyverse")
library("dplyr")
library("ggpubr")
library("see")
library("scales")
#view the data
head(CAM)
## # A tibble: 2 × 21
## F1Performance Repeat1 Repeat2 Repeat3 Repeat4 Repeat5 Repeat6 Repeat7 Repeat8
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SVMWithGradCA… 0.670 0.702 0.681 0.711 0.649 0.716 0.714 0.685
## 2 SVMWithDeepSh… 0.674 0.610 0.631 0.618 0.663 0.609 0.624 0.643
## # ℹ 12 more variables: Repeat9 <dbl>, Repeat10 <dbl>, Repeat11 <dbl>,
## # Repeat12 <dbl>, Repeat13 <dbl>, Repeat14 <dbl>, Repeat15 <dbl>,
## # Repeat16 <dbl>, Repeat17 <dbl>, Repeat18 <dbl>, Repeat19 <dbl>,
## # Repeat20 <dbl>
# give your newly formatted data a name you will recognize, in this case "data_long"
# (it has to exist before any min/max can be calculated from it)
data_long <- CAM %>%
  # Pivot the data from having many columns to many rows
  pivot_longer(
    cols = starts_with("Repeat"), # Select columns to pivot
    names_to = "Repeat",
    values_to = "values" # give the newly created column a name
  )
# view the resulting long-format data
head(data_long)
## # A tibble: 6 × 3
## F1Performance Repeat values
## <chr> <chr> <dbl>
## 1 SVMWithGradCAMMaps Repeat1 0.670
## 2 SVMWithGradCAMMaps Repeat2 0.702
## 3 SVMWithGradCAMMaps Repeat3 0.681
## 4 SVMWithGradCAMMaps Repeat4 0.711
## 5 SVMWithGradCAMMaps Repeat5 0.649
## 6 SVMWithGradCAMMaps Repeat6 0.716
# calculating min and max of the pivoted values; they set the axis limits and
# the break positions further down
min_value <- min(data_long$values, na.rm = TRUE)
max_value <- max(data_long$values, na.rm = TRUE)
# writing ggplot2 base data: classifier on x (flipped to the vertical axis
# below), F1 value on y, fill by classifier
ggplot(data_long, aes(x = F1Performance, y = values, fill = F1Performance)) +
  # raw data points, coloured by classifier
  geom_jitter(aes(color = F1Performance), size = 6, width = 0.1, height = 0) +
  # one translucent violin with quartile lines; transparency is a fixed
  # setting, so it is passed to the layer instead of being mapped inside aes()
  geom_violin(alpha = 0.2, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # flip the axis
  coord_flip() +
  # changing the color to orange and purple
  scale_fill_manual(values = c("purple4", "darkorange2")) +
  scale_color_manual(values = c("purple4", "darkorange2")) +
  # add summary statistic (the median) and highlight it
  stat_summary(
    fun = median, geom = "point", shape = 21, size = 3,
    fill = "white", color = "black", stroke = 1.5
  ) +
  # changing theme
  theme_minimal() +
  theme(
    # changing y axis: hide the category axis completely
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    # changing x axis
    axis.line.x = element_line(color = "black", size = 2),
    # Remove major grid lines for y axis
    panel.grid.major.y = element_blank(),
    # Remove minor grid lines for x axis
    panel.grid.minor.x = element_blank(),
    # changing the major x axis grid lines
    panel.grid.major.x = element_line(color = "grey", linetype = "dashed", size = 1.5),
    # centred, bold title
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "none"
  ) +
  # adding text labels; annotate() draws each label once, whereas geom_text()
  # with constant aesthetics repeats it for every row of data_long
  annotate(
    "text", x = "SVMWithGradCAMMaps", y = 0.64, label = "SVM + GRAD-CAM++",
    vjust = -3.5, color = "darkorange2", size = 4.5
  ) +
  # NOTE(review): assumes the second category is spelled "SVMWithDeepShapMaps"
  # (the printed tibble truncates it to "SVMWithDeepSh…") - confirm with
  # unique(data_long$F1Performance). A misspelled value here adds an empty
  # extra category to the axis. vjust now matches the other label because the
  # label sits on its own category; tune it if the placement looks off.
  annotate(
    "text", x = "SVMWithDeepShapMaps", y = 0.6, label = "SVM + Deep SHAP",
    vjust = -3.5, color = "purple4", size = 4.5
  ) +
  scale_y_continuous(
    limits = c(min_value, max_value),
    breaks = seq(min_value, max_value, by = 0.02)
  ) +
  # adding a title and changing axis name (bold face is set in theme() above)
  labs(
    title = "Fig. 7. Grad-CAM++ saliency maps capture unique predictive information.",
    y = "F1"
  )
Module 3 Challenge Violin Plus Box Plot:
# libraries
library("rmarkdown")
library("knitr")
# libraries
library("ggplot2")
library("readr")
library("tidyverse")
library("dplyr")
library("ggpubr")
library("see")
library("scales")
library("hrbrthemes")
library(readr)
CAM <- read_csv("Violin_Plot_Data.csv")
View(CAM)
# make the data long format
data_long <- CAM %>%
  # Pivot the data from having many columns to many rows
  pivot_longer(
    cols = starts_with("Repeat"), # Select columns to pivot
    names_to = "Repeat",
    values_to = "values" # give the newly created column a name
  )
# calculate min and max values (only possible once data_long exists)
min_value <- min(data_long$values, na.rm = TRUE)
max_value <- max(data_long$values, na.rm = TRUE)
head(data_long)
## # A tibble: 6 × 3
## F1Performance Repeat values
## <chr> <chr> <dbl>
## 1 SVMWithGradCAMMaps Repeat1 0.670
## 2 SVMWithGradCAMMaps Repeat2 0.702
## 3 SVMWithGradCAMMaps Repeat3 0.681
## 4 SVMWithGradCAMMaps Repeat4 0.711
## 5 SVMWithGradCAMMaps Repeat5 0.649
## 6 SVMWithGradCAMMaps Repeat6 0.716
# create the base ggplot
ggplot(data_long, aes(x = F1Performance, y = values)) +
  # add data points
  geom_jitter(size = 5, width = 0.1, height = 0) +
  # add violin and boxplot; transparency and the grey box fill are fixed
  # settings, so they are passed to the layers instead of mapped inside aes()
  geom_violin(aes(fill = F1Performance), alpha = 0.2) +
  geom_boxplot(width = 0.3, fill = "grey", alpha = 0.2) +
  # add a median point
  stat_summary(
    fun = median, geom = "point", shape = 21, size = 3,
    fill = "white", color = "black", stroke = 1.5
  ) +
  # changing theme
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank()) +
  labs(title = "Fig. 7. Grad-CAM++ saliency maps capture unique predictive information.")
# doing this a second time was a little easier
mtcars Barplot with Error Bars:
# libraries
library(ggplot2)
library(dplyr)
mean_mpg <- mean(mtcars$mpg)
# Task 1
# load data set mtcars
data(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
group_data <- mtcars %>% group_by(cyl)
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# calculating mean (a trailing "+" here would add the printed value on top
# and store twice the mean)
mean_mpg <- mean(mtcars$mpg)
print(mean_mpg)
## [1] 20.09062
# mean_mpg = 20.0906
# calculating Standard Error from SD
sd_mpg <- sd(mtcars$mpg, na.rm = TRUE)
# creating a sample size variable
n <- sum(!is.na(mtcars$mpg))
se_mpg <- sd_mpg / sqrt(n)
print(se_mpg)
## [1] 1.065424
# SE_mpg = 1.065
# summarize data in new dataframe: the mean and the standard error are both
# computed inside each cylinder group (sd and sample size of that group), not
# from the whole-data sd_mpg and n above
data_summary <- mtcars %>%
  group_by(cyl) %>%
  summarise(
    se_mpg = sd(mpg, na.rm = TRUE) / sqrt(sum(!is.na(mpg))),
    mean_mpg = mean(mpg, na.rm = TRUE)
  )
print(data_summary)
## # A tibble: 3 × 3
## cyl se_mpg mean_mpg
## <dbl> <dbl> <dbl>
## 1 4 1.36 26.7
## 2 6 0.549 19.7
## 3 8 0.684 15.1
# Task 2: bar plot of mean MPG per cylinder count, with error bars spanning
# one standard error either side of the mean
ggplot(data_summary, aes(x = factor(cyl), y = mean_mpg)) +
  geom_col(width = 0.7, fill = c("skyblue", "orange", "green")) +
  geom_errorbar(
    aes(ymax = mean_mpg + se_mpg, ymin = mean_mpg - se_mpg),
    size = 2,
    width = 0.25
  ) +
  labs(
    title = "Bar Plot of Mean MPG by Cylinders",
    x = "Number of Cylinders",
    y = "Mean MPG"
  ) +
  theme_classic()
#messed up on the summary stats at first but fixed it
2D Density Graphics Plot:
# load packages
library(readr)
library(ggplot2)
library(dplyr)
# read in csv and rename it via import dataset function
library(readr)
population_data <- read_csv("log_population_data.csv")
View(population_data)
head(population_data)
## # A tibble: 6 × 2
## Log10_Current_Population Log10_Past_Population
## <dbl> <dbl>
## 1 4.29 5.67
## 2 3.82 5.91
## 3 4.67 6.10
## 4 3.54 5.20
## 5 4.60 6.39
## 6 4.84 6.19
# create the density plot: filled 2D density contours of past vs. current
# (log10) population size
ggplot(population_data, aes(x = Log10_Current_Population, y = Log10_Past_Population)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon", color = "white") +
  # I couldn't figure out how to use the scale_fill_distiller function so I
  # used a different one
  scale_fill_gradient(low = "darkblue", high = "skyblue", name = "level") +
  theme_minimal() +
  labs(
    title = "2D Density Plot of Population Sizes",
    x = "Log10(Current population size N0)",
    y = "Log10(Past population size N1)"
  )
Adding Density Plot To The Margins of A Different Plot:
library(readr)
library(ggplot2)
library(dplyr)
library(ggExtra)
library(readr)
longevity_data <- read_csv("longevity_data.csv")
View(longevity_data)
head(longevity_data)
## # A tibble: 6 × 9
## species class order maximum_lifespan_yr mass_g volancy fossoriallity
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Dicrostonyx_groe… Mamm… Rode… 3.3 66 nonvol… semifossorial
## 2 Didelphis_virgin… Mamm… Dide… 6.6 3000 nonvol… nonfossorial
## 3 Diphylla_ecaudata Mamm… Chir… 8 28 volant nonfossorial
## 4 Dipodillus_campe… Mamm… Rode… 7.3 28.4 nonvol… semifossorial
## 5 Dipodomys_merria… Mamm… Rode… 9.7 42 nonvol… semifossorial
## 6 Dendrolagus_good… Mamm… Dipr… 23.6 7400 nonvol… nonfossorial
## # ℹ 2 more variables: foraging_environment <chr>, daily_activity <chr>
# put the data into log format
# "long" is the longevity data with three added columns: the log10 of body
# mass, the log10 of maximum lifespan, and the number of rows in each order.
long <- longevity_data %>%
  # log-transform the mass values
  mutate(log_mass = log10(mass_g)) %>%
  # log-transform the lifespan values
  mutate(log_lifespan = log10(maximum_lifespan_yr)) %>%
  # everything after this is calculated separately for each order
  group_by(order) %>%
  # sample size of each order (the result stays grouped by order)
  mutate(order_size = n())
# Each row now carries its order's sample size plus log-scale mass and lifespan.
head(long)
## # A tibble: 6 × 12
## # Groups: order [4]
## species class order maximum_lifespan_yr mass_g volancy fossoriallity
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Dicrostonyx_groe… Mamm… Rode… 3.3 66 nonvol… semifossorial
## 2 Didelphis_virgin… Mamm… Dide… 6.6 3000 nonvol… nonfossorial
## 3 Diphylla_ecaudata Mamm… Chir… 8 28 volant nonfossorial
## 4 Dipodillus_campe… Mamm… Rode… 7.3 28.4 nonvol… semifossorial
## 5 Dipodomys_merria… Mamm… Rode… 9.7 42 nonvol… semifossorial
## 6 Dendrolagus_good… Mamm… Dipr… 23.6 7400 nonvol… nonfossorial
## # ℹ 5 more variables: foraging_environment <chr>, daily_activity <chr>,
## # log_mass <dbl>, log_lifespan <dbl>, order_size <int>
# create a dotplot (bubble chart): log mass vs. log lifespan, coloured by
# class, with point size showing the sample size of the order
p <- ggplot(long, aes(x = log_mass, y = log_lifespan, color = class, size = order_size)) +
  # making the points transparent (alpha = 0.3)
  geom_point(alpha = 0.3) +
  # regression line per class (the colour mapping is inherited from ggplot())
  geom_smooth(method = "lm", se = FALSE, linetype = "solid") +
  # change the color scheme
  scale_color_manual(values = c("lightgreen", "darkslategray")) +
  # change labels and theme
  labs(
    title = "Bubble Chart of Longevity and Body Mass",
    x = "Log (Body Mass [g])",
    y = "Log (Maximum Lifespan [yr])"
  ) +
  theme_minimal() +
  theme(
    # remove legends
    legend.position = "none",
    # changing axis and title labels theme
    plot.title = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold")
  ) +
  # add text annotations inside the graph; text layers take "fontface"
  # ("face" only exists for theme elements)
  annotate("text", label = "Aves", x = 5.6, y = 1.9, color = "lightgreen", size = 5, fontface = "bold") +
  annotate("text", label = "Mammals", x = 6.5, y = 1.4, color = "darkslategray", size = 5, fontface = "bold")
# add density plots of each class to both margins
ggExtra::ggMarginal(p, type = "density", groupFill = TRUE, alpha = 0.4)
Interpretation Questions: 1) adding the density plots in the margins adds another visualization tool that shows how the different classes are distributed in the x and y axis.
1: log mass plotted on the x axis. 2: log lifespan plotted on the y axis. 3: making the classes different colors. 4: order size depicts the sample size of the populations. 5: transparency adds a visual aid allowing you to see more dense populations. 6: marginal density plots add another visual to determine where populations are the most dense. 7: regression lines show the trend of the two classes.
the more body mass = longer lifespan. More extreme in Aves as seen by the trendlines.
For Aves, there are more samples in the lower left of the graph (low body mass and short lifespan) compared to other areas of the graph. This could be a possible bias, since most of the data collected was in that low body mass and short lifespan category. The same can be said for the Mammals, where the data is more concentrated towards a high lifespan and body mass.
We could use the dodge function to create 2 side-by-side graphs that might make the graph easier to read. Or, we could change “se = TRUE” to show error along the trend lines.
Create Your Own Plot Challenge:
# load packages
library(ggplot2)
library(dplyr)
library(ggExtra)
# using CO2 dataset
data("CO2")
head(CO2)
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# CO2 uptake against concentration, coloured by plant, with fill and line type
# by treatment
p1 <- ggplot(CO2, aes(x = conc, y = uptake, color = Plant, fill = Treatment, linetype = Treatment)) +
  # points plus a line plot for trends
  geom_point(aes(group = interaction(Plant, Treatment)), size = 1, alpha = 0.8) +
  geom_line() +
  # set theme
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold"),
    # the axis titles are the "axis.title.*" theme elements
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold")
  ) +
  # set labels
  labs(
    title = "CO2 Uptake Across Plant Origin, Treatment, and Concentration",
    x = "Concentration of CO2(mL/L)",
    y = "Uptake of CO2 (μmol/m^2 sec)"
  )
# add density plots to the margins
ggMarginal(p1, type = "density", groupFill = TRUE, alpha = 0.4)
Module 6 Notes: to export a figure as a pdf, eps, or png: ggexport(figure, filename = "figure1.pdf", nrow = 2, ncol = 1)
To save a plot in R: save_plot("plot_name.pdf", plot_name, ncol = x, nrow = 2, base_aspect_ratio = 1.4) # always 1.4 to add room for legend
Chick Weight Plot Code:
# load libraries
library(ggplot2) # facet_grid(), facet_wrap() for creating multiple figures with the same axis
library(cowplot) # plot_grid() and draw_plot(plot, x = , y = , width = , height = )
library(gridExtra) # grid.arrange()
library(ggpubr) # ggarrange() easiest to use
library(patchwork) # plot_layout()
library(dplyr)
# load dataset
data(ChickWeight)
head(ChickWeight)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
# summary statistics
summary(ChickWeight)
## weight Time Chick Diet
## Min. : 35.0 Min. : 0.00 13 : 12 1:220
## 1st Qu.: 63.0 1st Qu.: 4.00 9 : 12 2:120
## Median :103.0 Median :10.00 20 : 12 3:120
## Mean :121.8 Mean :10.72 10 : 12 4:118
## 3rd Qu.:163.8 3rd Qu.:16.00 17 : 12
## Max. :373.0 Max. :21.00 19 : 12
## (Other):506
# create plot: one growth line per chick, one panel per diet
p <- ggplot(ChickWeight, aes(x = Time, y = weight, color = Chick)) +
  # transparency is a fixed setting, so it is passed to the layer instead of
  # being mapped inside aes()
  geom_line(alpha = 0.1) +
  # smoothed trend line, drawn in black
  geom_smooth(color = "black", size = 1.2) +
  # labels
  labs(title = "Chick Growth by Diet Type", x = "Time (Days)", y = "Weight (Grams)") +
  # theme and legend
  theme_minimal() +
  theme(legend.position = "none") +
  # facet wrap: the four diets side by side in one row
  facet_wrap(vars(Diet), ncol = 4, nrow = 1)
Output of Chick Plot:
plot(p)
Multipanel Figure Plot:
# load packages
library(dplyr)
library(ggplot2) # facet_grid(), facet_wrap() for creating multiple figures with the same axis
library(cowplot) # plot_grid() and draw_plot(plot, x = , y = , width = , height = )
library(gridExtra) # grid.arrange()
library(ggpubr) # ggarrange() easiest to use
library(patchwork) # plot_layout()
# load data befor each plot
data(CO2)
head(CO2)
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# 1. violin plot: CO2 uptake per treatment, split by plant type
v <- ggplot(CO2, aes(x = Treatment, y = uptake, fill = Type)) +
  geom_jitter(size = 3, position = position_jitterdodge(jitter.width = 0.2, dodge.width = 1)) +
  # transparency is a fixed setting, so it goes outside aes()
  geom_violin(position = position_dodge(1), alpha = 0.2) +
  # grey boxes: a fixed fill replaces the Type fill mapping on this layer, so
  # the split by Type is kept with an explicit group
  geom_boxplot(
    aes(group = interaction(Treatment, Type)),
    width = 0.3, position = position_dodge(1), fill = "grey", alpha = 0.2
  ) +
  # labels
  labs(
    title = "CO2 Uptake of Different Plant Type by Treatment",
    x = "Treatment",
    y = "CO2 Uptake (μmol/m^2 sec)"
  ) +
  # theme
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold")
  )
v
# 2. Line plot
l <- ggplot(CO2, aes(x = conc, y = uptake)) +
  # one line per plant, coloured by treatment; without the Plant grouping all
  # plants of a treatment are joined into a single zig-zag path. Line width
  # and transparency are fixed settings, so they go outside aes().
  geom_line(aes(group = Plant, color = Treatment), size = 2, alpha = 0.8) +
  # facet: one panel per plant origin
  facet_grid(cols = vars(Type)) +
  # labels
  labs(
    title = "Ambient CO2 Uptake in Different Areas and CO2 Concentration by Treatment",
    x = "CO2 Concentration (mL/L)",
    y = "CO2 Uptake (μmol/m^2 sec)"
  ) +
  # themes
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )
l
# 3. density plot with a vertical dashed line at mean
# create the plot
d <- ggplot(CO2, aes(x = uptake, fill = Treatment)) +
  geom_density(alpha = 0.5) +
  # add a vertical line at the overall mean uptake; the value is computed once
  # here, because mapping it inside aes() draws the same line for every row
  geom_vline(
    xintercept = mean(CO2$uptake, na.rm = TRUE),
    linetype = "dashed", color = "black", size = 1
  ) +
  # labels
  labs(
    title = "Density Plot of CO2 Uptake by Treatment with Mean Line",
    x = "CO2 Uptake",
    y = "Density"
  ) +
  # theme; the axis titles are the "axis.title" theme element
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )
d
# combine plots into one figure
# The violin (v), line (l) and density (d) plots are stacked in a single
# column, each with its title and axis titles blanked out.
multipanel_plot1 <- plot_grid(
  plotlist = list(
    v + theme(
      plot.title = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank()
    ),
    l + theme(plot.title = element_blank(), axis.title = element_blank()),
    d + theme(plot.title = element_blank(), axis.title = element_blank())
  ),
  # Arrange in 1 column, with relative panel heights
  ncol = 1,
  rel_heights = c(1.2, 1, 0.9),
  # Add labels to each plot
  labels = c("A", "B", "C"),
  label_size = 13,
  label_y = c(1.03, 1.05, 1.12)
)
# print multipanel_plot1
multipanel_plot1
Link to data: https://opendata.maryland.gov/d/tm86-dujs
# libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(readr)
library(tidyverse)
library(ggExtra)
# load dataset
data <- read.csv("mdcovid_cases_by_county.csv")
# convert DATE from month/day/year text to Date values
data[["DATE"]] <- as.Date(data[["DATE"]], format = "%m/%d/%Y")
# select relevant columns: the date and the three counties of interest
selected_data <- data[c("DATE", "St_Marys", "Calvert", "Charles")]
# convert data to long format: every column except DATE is a county
long_data <- pivot_longer(
  selected_data,
  cols = -DATE,
  names_to = "County",
  values_to = "Cases"
)
# create the plot: cases over time, one colour per county
countyp <- ggplot(long_data, aes(x = DATE, y = Cases)) +
  geom_point(aes(color = County)) +
  # theme
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = c(0.1, 0.81),
    legend.title = element_text(size = 13),
    legend.background = element_rect(fill = "white", color = "black")
  ) +
  # labels
  labs(
    title = "COVID-19 Cases in St. Mary's, Calvert, and Charles Counties",
    x = "Date",
    y = "Number of Cases",
    color = "County"
  )
# add margin density plot on the y axis, grouped by county
ggMarginal(countyp, type = "density", margins = "y", groupColour = TRUE, groupFill = TRUE)
Link to data: https://opendata.maryland.gov/d/sjqg-bqsu
# libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(readr)
library(tidyverse)
library(ggExtra)
# read in the data
data <- read.csv("mdcovid_cases_by_age.csv")
# reshape into a long format for ggplot2, dropping the bookkeeping columns and
# the "Age_Unknown" column in the same step (removing it in a separate object
# and then reshaping `data` again would bring the column back)
age <- data %>%
  select(-c(OBJECTID, DATE), -any_of("Age_Unknown")) %>%
  pivot_longer(cols = everything(), names_to = "Age_Group", values_to = "Cases")
# create the bar chart
# NOTE(review): each bar stacks every row (date) of its age group; if the
# source file holds cumulative counts this sums them across dates - confirm
# against the data dictionary.
agep <- ggplot(age, aes(x = Age_Group, y = Cases, fill = Age_Group)) +
  geom_col() +
  # add labels
  labs(title = "COVID-19 Cases by Age Group", x = "Age Group", y = "Number of Cases") +
  # change theme
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
# print plot
agep
Link to data: https://opendata.maryland.gov/d/py3p-2bgq
# libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(readr)
library(tidyverse)
library(ggExtra)
# read in the dataset
data <- read.csv("mdcovid_cases_by_gender.csv")
# drop the "Unknown" column because it has no data
data <- data[, setdiff(names(data), "Unknown")]
# convert the data into a long format: one row per (gender, count) pair, after
# removing the OBJECTID and DATE columns
long_data <- pivot_longer(
  select(data, -c(OBJECTID, DATE)),
  cols = everything(),
  names_to = "Gender",
  values_to = "Cases"
)
# make sure data is in long format
long_data %>%
  head() %>%
  print()
## # A tibble: 6 × 2
## Gender Cases
## <chr> <int>
## 1 Male 6
## 2 Female 6
## 3 Male 9
## 4 Female 6
## 5 Male 16
## 6 Female 10
# violin plot combined with a boxplot and jittered points, one per gender
pgender <- ggplot(long_data, aes(x = Gender, y = Cases, fill = Gender)) +
  # layers are drawn in this order: points, then violin, then box
  geom_jitter(alpha = 0.9, width = 0.1, size = 2) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(alpha = 0.6, width = 0.2, outlier.shape = NA) +
  # theme
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"), legend.position = "none") +
  # labels
  labs(
    title = "COVID-19 Cases by Gender",
    x = "Gender",
    y = "Number of Cases"
  )
# Print the gender plot
pgender
Link to data: https://opendata.maryland.gov/d/xnfm-sgpt
# libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(readr)
library(tidyverse)
library(ggExtra)
# read in the dataset
data <- read.csv("mdcovid_cases_by_ethnicity.csv")
# keep only the most recent record (the last row of the dataset)
latest_data <- tail(data, n = 1)
# keep the five ethnicity columns used in the chart
ethnicity_data <- latest_data[, c("African_American", "White", "Hispanic", "Asian", "Other")]
# share of each ethnicity, as a percentage of the total over those columns
ethnicity_percent <- colSums(ethnicity_data) / sum(ethnicity_data) * 100
# prepare data for plotting: one row per ethnicity
ethnicity_df <- data.frame(
  Ethnicity = names(ethnicity_percent),
  Percentage = ethnicity_percent
)
# create the pie chart: a single stacked bar ...
ethp <- ggplot(ethnicity_df, aes(x = "", y = Percentage, fill = Ethnicity)) +
  geom_bar(width = 1, stat = "identity") +
  # ... bent into a circle, which is what turns the bar into a pie
  coord_polar("y", start = 0) +
  # percentage labels, centred in each slice
  geom_text(
    aes(label = sprintf("%.1f%%", Percentage)),
    size = 4,
    position = position_stack(vjust = 0.5)
  ) +
  scale_fill_brewer(palette = "Set3") +
  # theme
  theme_void() +
  theme(plot.title = element_text(face = "bold"), legend.position = "right") +
  # labels
  labs(title = "COVID-19 Cases by Ethnicity")
# print plot
ethp
# libraries
library(ggplot2)
library(tidyr)
library(cowplot)
# Combine the age, ethnicity and gender plots into a two-column figure, each
# with its title and axis titles blanked out.
# NOTE(review): the labels are given as "A", "C", "B" for the age, ethnicity
# and gender panels in that order - confirm this lettering is intended.
multipanel_plot <- plot_grid(
  plotlist = list(
    agep + theme(
      plot.title = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank()
    ),
    ethp + theme(plot.title = element_blank(), axis.title = element_blank()),
    pgender + theme(plot.title = element_blank(), axis.title = element_blank())
  ),
  # Arrange 2 column
  ncol = 2,
  rel_heights = c(1, 1, 1),
  # Add labels to each plot
  labels = c("A", "C", "B"),
  label_size = 12,
  label_y = c(1, 1, 1.05)
)
# print multipanel plot
multipanel_plot