# Install any packages this midterm needs that are not already available.
# Installing only what is missing avoids re-downloading every package (and
# requiring network access) on each run; the library() calls that follow
# attach the packages.
required_packages <- c("dplyr", "tidyverse", "ggplot2")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ ggplot2 4.0.2 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Part 1 ----
# Load the built-in iris data, keep only flowers whose petal length is
# strictly greater than 3.5 (rows at exactly 3.5 are dropped as well), and
# add the sepal length-to-width ratio as a new column.
data(iris)
new <- iris |>
  filter(Petal.Length > 3.5) |>
  mutate(sepal_ratio = Sepal.Length / Sepal.Width)
# Flag outliers within each species using the 1.5 * IQR rule and keep only
# the flagged rows, so they can be drawn as their own layer on the plot.
new_outliers <- new %>%
  group_by(Species) %>% # quartiles are computed separately per species
  mutate(
    q1 = quantile(sepal_ratio, 0.25),
    q3 = quantile(sepal_ratio, 0.75),
    iqr = q3 - q1,
    is_outlier = sepal_ratio < (q1 - 1.5 * iqr) |
      sepal_ratio > (q3 + 1.5 * iqr)
  ) %>%
  filter(is_outlier) %>%
  ungroup() # drop the grouping so later steps are not silently per-species
# Violin + boxplot of the sepal ratio per species, with the manually
# identified outliers overlaid in red.
ggplot(new, aes(Species, sepal_ratio)) +
  # Light blue, semi-transparent violins with untrimmed tails.
  geom_violin(fill = "lightblue", alpha = 0.4, trim = FALSE) +
  # Boxplot drawn on top; its own outlier points are suppressed because the
  # outliers are drawn by the next layer instead.
  geom_boxplot(width = 0.4, alpha = 0.7, outliers = FALSE) +
  # Jitter horizontally only: height = 0 keeps every outlier at its true
  # sepal_ratio value (the default would also shift points vertically).
  geom_jitter(data = new_outliers, color = "red", height = 0) +
  labs(
    title = "Distribution of Sepal Length to Width Ratio Between Species",
    x = "Flower Species",
    # A ratio of two lengths has no unit, so none is shown.
    y = "Sepal Length to Width Ratio",
    # The retained row count is computed rather than hard-coded so the
    # caption stays correct if the filter changes.
    caption = paste0(
      "Created using the iris dataset; flowers with petal length of 3.5 cm ",
      "or less were excluded (n = ", nrow(new), " retained)."
    )
  ) +
  theme_bw()

# Part 2 ----
# 2a ----
# Preview the first six rows of the long-format economics data.
head(economics_long, n = 6)
## # A tibble: 6 × 4
## date variable value value01
## <date> <chr> <dbl> <dbl>
## 1 1967-07-01 pce 507. 0
## 2 1967-08-01 pce 510. 0.000265
## 3 1967-09-01 pce 516. 0.000762
## 4 1967-10-01 pce 512. 0.000471
## 5 1967-11-01 pce 517. 0.000916
## 6 1967-12-01 pce 525. 0.00157
# Quick base-graphics look at value against date. Every row of the
# long-format table is passed in, so all variables are drawn as one series.
plot(
  x = economics_long$date,
  y = economics_long$value,
  type = "b", # draw both points and connecting lines
  lwd = 0.1   # thinner than the default line width
)

# 2b ----
# Line plot of every economic variable over time, one coloured line per
# variable.
ggplot(economics_long, aes(date, value, color = variable)) +
  # linewidth is the ggplot2 aesthetic for line thickness (lwd is only a
  # base-graphics alias for it).
  geom_line(linewidth = 0.8) +
  labs(
    title = "Comparison of Economic Variables Over Time",
    x = "Date",
    y = "Value",
    color = "Variable" # legend title
  ) +
  theme_bw() # cleaner theme for readability

# 2c ----
# Zoom in on the lower-valued series without altering the data.
# coord_cartesian() only changes the visible window. Setting limits on
# scale_y_continuous() instead removes every row outside the range before
# drawing, which is not a zoom.
ggplot(economics_long, aes(date, value, color = variable)) +
  geom_line(linewidth = 0.8) +
  labs(
    title = "Comparison of Economic Variables Over Time",
    x = "Date",
    y = "Value",
    color = "Variable", # legend title
    caption = "Zoomed in on Y-axis to better analyze lower variables"
  ) +
  theme_bw() + # cleaner theme for readability
  coord_cartesian(ylim = c(0, 20000)) # show only the 0-20000 band of the y-axis

# Part 3 ----
# 3a ----
# Load the penguins data and drop rows with a missing body mass so the
# density estimate is computed on complete values only.
data("penguins")
p_new <- filter(penguins, !is.na(body_mass))

# Overlapping, semi-transparent body-mass density curves, one per species,
# each with a hand-picked fill colour.
ggplot(data = p_new, mapping = aes(x = body_mass, fill = species)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(
    values = c(
      "Adelie" = "darkseagreen3",
      "Chinstrap" = "mistyrose3",
      "Gentoo" = "darkslategrey"
    )
  ) +
  ggtitle("Body Mass Comparison Between Species") +
  xlab("Body Mass") +
  ylab("Density") +
  labs(caption = "Created using the penguins dataset in ggplot with all NA Mass variables removed ") +
  theme_bw()

# Part 4 ----
# 4a ----
# Stacked bar chart of diamond colour within each cut, scaled so that every
# bar has the same total height.
data("diamonds") # load the diamonds dataset
ggplot(diamonds, aes(cut, fill = color)) +
  # position = "fill" rescales each bar to a total of 1, so the y-axis shows
  # the share of each colour rather than raw counts.
  geom_bar(position = "fill") +
  scale_fill_viridis_d() + # colour-blind-friendly palette
  labs(
    title = "Stacked Barplot Showing Proportion of Color When Comparing to Cut",
    subtitle = "Source = ggplot Diamonds Dataset",
    x = "Diamond Cut",
    y = "Proportion", # matches the 0-1 axis produced by position = "fill"
    fill = "Color"
  ) +
  theme_bw()

# 4b ----
# Grouped bar chart: for each cut, one bar per diamond colour showing the
# number of diamonds.
ggplot(diamonds, aes(cut, fill = color)) +
  geom_bar(position = "dodge") + # place the colour bars side by side
  scale_fill_viridis_d() + # colour-blind-friendly palette
  labs(
    title = "Grouped Barplot Showing Color Amounts by Cut",
    subtitle = "Source = ggplot Diamonds Dataset",
    x = "Diamond Cut",
    y = "Count",
    fill = "Color"
  ) +
  theme_bw()
