# this entire coding chunk installs and loads the packages needed to complete this midterm 
install.packages("dplyr") 
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

Part 1

# Part 1: distribution of the sepal length-to-width ratio by species.
data(iris) # make the built-in iris data frame available in the workspace

# Derive the ratio, then keep only flowers whose petal length is strictly
# greater than 3.5 (rows equal to 3.5 are dropped as well).
new <- iris %>%
  mutate(sepal_ratio = Sepal.Length / Sepal.Width) %>%
  filter(Petal.Length > 3.5)

# Flag outliers within each species using the 1.5 * IQR rule, which is the
# default whisker rule of geom_boxplot(), and keep only the flagged rows.
new_outliers <- new %>%
  group_by(Species) %>%
  mutate(
    q1 = quantile(sepal_ratio, 0.25),
    q3 = quantile(sepal_ratio, 0.75),
    iqr = q3 - q1,
    is_outlier = sepal_ratio < (q1 - 1.5 * iqr) |
      sepal_ratio > (q3 + 1.5 * iqr)
  ) %>%
  filter(is_outlier) %>%
  ungroup() # drop the grouping so the result is a plain data frame

ggplot(new, aes(Species, sepal_ratio)) +
  # Untrimmed, semi-transparent violin showing the full density shape.
  geom_violin(fill = "lightblue", alpha = 0.4, trim = FALSE) +
  # Boxplot drawn on top; its own outlier points are suppressed because the
  # outliers are drawn by the next layer instead.
  geom_boxplot(width = 0.4, alpha = 0.7, outliers = FALSE) +
  # Spread the outliers horizontally only. height = 0 is required: the default
  # also jitters vertically, which would plot each outlier at a ratio it does
  # not actually have.
  geom_jitter(data = new_outliers, color = "red", height = 0) +
  labs(
    title = "Distribution of Sepal Length to Width Ratio Between Species",
    x = "Flower Species",
    # The ratio of two lengths is unitless, so no unit is shown.
    y = "Sepal Length to Width Ratio",
    caption = "Created using the iris dataset, excluding all flowers with petal length of 3.5 cm or less (n = 95)."
  ) +
  theme_bw()

Part 2

2a

# Preview the first six rows of the long-format economics data.
head(economics_long, n = 6L)
## # A tibble: 6 × 4
##   date       variable value  value01
##   <date>     <chr>    <dbl>    <dbl>
## 1 1967-07-01 pce       507. 0       
## 2 1967-08-01 pce       510. 0.000265
## 3 1967-09-01 pce       516. 0.000762
## 4 1967-10-01 pce       512. 0.000471
## 5 1967-11-01 pce       517. 0.000916
## 6 1967-12-01 pce       525. 0.00157

# Base-graphics plot of value against date for every row of the data set.
# NOTE(review): all variables are drawn as one joined series here - confirm
# that is the intended "before" picture for the ggplot versions that follow.
plot(
  x = economics_long$date,  # horizontal axis: observation date
  y = economics_long$value, # vertical axis: recorded value
  type = "b",               # draw both points and connecting lines
  lwd = 0.1                 # line width thinner than the default
)

2b

# Line chart of value over time, one coloured line per economic variable.
ggplot(economics_long, aes(x = date, y = value, color = variable)) +
  # linewidth is the ggplot2 aesthetic for line thickness (lwd is only a
  # base-graphics alias that ggplot2 translates to it).
  geom_line(linewidth = 0.8) +
  labs(
    title = "Comparison of Economic Variables Over Time",
    x = "Date",
    y = "Value",
    color = "Variable" # legend title
  ) +
  theme_bw() # plain black-and-white theme for readability

2c

# Zoom in on the lower value range without altering the data.
ggplot(economics_long, aes(x = date, y = value, color = variable)) +
  geom_line(linewidth = 0.8) + # one line per variable
  labs(
    title = "Comparison of Economic Variables Over Time",
    x = "Date",
    y = "Value",
    color = "Variable", # legend title
    caption = "Zoomed in on Y-axis to better analyze lower variables"
  ) +
  theme_bw() + # plain black-and-white theme for readability
  # coord_cartesian() only changes the visible window. Setting limits on
  # scale_y_continuous() instead would drop every row outside 0-20000 before
  # drawing (the "Removed ... rows" warning), which does alter the data.
  coord_cartesian(ylim = c(0, 20000))

Part 3

3a

# Part 3a: body-mass density curves, one per penguin species.
data("penguins") # make the penguins data frame available in the workspace

# Keep only the rows that have a recorded body mass.
p_new <- penguins %>%
  filter(!is.na(body_mass))

ggplot(data = p_new, mapping = aes(x = body_mass, fill = species)) +
  # Semi-transparent fills so overlapping curves stay visible.
  geom_density(alpha = 0.6) +
  theme_bw() +
  labs(
    title = "Body Mass Comparison Between Species",
    x = "Body Mass",
    y = "Density",
    caption = "Created using the penguins dataset in ggplot with all NA Mass variables removed "
  ) +
  # Hand-picked fill colour for each species.
  scale_fill_manual(
    values = c(
      Adelie = "darkseagreen3",
      Chinstrap = "mistyrose3",
      Gentoo = "darkslategrey"
    )
  )

Part 4

4a

# Part 4a: share of each diamond colour within every cut.
data("diamonds") # make the diamonds data frame available in the workspace

ggplot(diamonds, aes(cut, fill = color)) +
  # position = "fill" stacks the bars and rescales each one to a total of 1,
  # so the y axis shows proportions rather than raw counts.
  geom_bar(position = "fill") +
  scale_fill_viridis_d() + # colour-blind-friendly discrete palette
  labs(
    title = "Stacked Barplot Showing Proportion of Color When Comparing to Cut",
    subtitle = "Source = ggplot Diamonds Dataset",
    x = "Diamond Cut",
    y = "Proportion", # matches the rescaled axis produced by position = "fill"
    fill = "Color"
  ) +
  theme_bw()

4b

# Part 4b: number of diamonds of each colour, grouped side by side per cut.
ggplot(diamonds, aes(cut, fill = color)) +
  geom_bar(position = "dodge") + # one bar per colour within each cut
  scale_fill_viridis_d() + # colour-blind-friendly discrete palette
  labs(
    title = "Grouped Barplot Showing Color Amounts by Cut",
    subtitle = "Source = ggplot Diamonds Dataset",
    x = "Diamond Cut",
    y = "Count",
    fill = "Color"
  ) +
  theme_bw()