Take Home Midterm

Part 1

library(ggplot2) ### load in ggplot package
library(dplyr) ### load in dplyr package

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Build the data used for the sepal plot: start from iris, drop the flowers
# whose petal length is 3.5, and add the sepal length-to-width ratio.
sepal_data <- iris %>%
  # near() compares doubles with a small tolerance, which is safer than
  # testing floating-point values with != / ==.
  filter(!near(Petal.Length, 3.5)) %>%
  # S.ratio is the column name the plotting code reads, so it is kept as-is.
  mutate(S.ratio = Sepal.Length / Sepal.Width)

# Compare the sepal length-to-width ratio across species. Three layers are
# stacked: a violin for the overall shape, a narrow boxplot for the quartiles,
# and the individual observations as jittered points.
ggplot(sepal_data, aes(x = Species, y = S.ratio, fill = Species)) +
  # trim = FALSE keeps the density tails instead of cutting them at the
  # data range.
  geom_violin(trim = FALSE, alpha = 0.6) +
  # Narrow box so it fits inside the violin; its outlier markers are hidden
  # because every observation is drawn by the jitter layer below.
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.8) +
  # All observations (not only outliers). height = 0 restricts the jitter to
  # the horizontal direction so the plotted ratio values are not perturbed.
  geom_jitter(width = 0.1, height = 0, size = 1.0, alpha = 0.5) +
  labs(
    # Length (cm) divided by width (cm) has no unit, so no "(cm)" here.
    title = "Sepal Length-to-Width Ratio across Species",
    x = "Species",
    y = "Sepal Length / Width (unitless ratio)"
  ) +
  theme_minimal()

Figure 1: Distribution of the sepal length-to-width ratio for setosa, versicolor, and virginica plants (n = 148). Sepal length and width were both measured in centimeters, so the ratio itself is unitless. Plants with a petal length of 3.5 cm were excluded from the figure.

Part 2:

Part 2a:

# Base R time-series plot of a single series from the long-format data.
data(economics_long) # load the long-format economics data
# Keep only the "unemploy" rows so one series remains to plot.
ec_data <- economics_long[economics_long$variable == "unemploy", ]
# The filtered series is the number of unemployed (in thousands, per the
# ggplot2 documentation of the economics data), so the labels say
# unemployment rather than employment.
plot(ec_data$date, ec_data$value,
     type = "b", # draw both the points and the connecting line
     xlab = "Date",
     ylab = "Unemployed (thousands)",
     main = "Change in Unemployment Over Time")

Figure 2: A base R plot displaying the number of unemployed people over time, with the series showing many clear peaks and troughs as unemployment rises and falls.

Part 2b:

# Line plot of every series in the long-format data. Mapping `variable` to
# colour makes ggplot draw one line per series from the single value column.
ggplot(data = economics_long) +
  geom_line(mapping = aes(x = date, y = value, colour = variable)) +
  ggtitle("Economics Over Time", subtitle = "Colored by variable") +
  xlab("Date") +
  ylab("Value") +
  labs(colour = "Variable") + # legend title
  theme_minimal()

## Figure 3: A line plot showing all the variables of the original data. As population increases through the years, our other variables appear essentially flat, because all series share one y-axis and population is far larger than the rest.

Part 2c:

# Same multi-series line plot as before, but with the y-axis view restricted
# so the smaller-valued series become readable.
ggplot(data = economics_long) +
  geom_line(mapping = aes(x = date, y = value, colour = variable)) +
  ggtitle("Economics Over Time", subtitle = "Colored by variable") +
  xlab("Date") +
  ylab("Value") +
  labs(colour = "Variable") + # legend title
  theme_minimal() +
  # coord_cartesian() only changes the visible window; no rows are dropped,
  # so lines that leave the window are clipped rather than removed.
  coord_cartesian(ylim = c(2500, 15000))

Figure 3: A line plot showing all the variables of the original data, with the y-axis view limited to 2,500–15,000. The zoom only changes the visible range; the underlying data are unaltered. With this closer view, you can see that while the pce variable increases steadily, unemployment continues to show varying troughs and peaks, with a general upward trend.

Part 3:

library(palmerpenguins) ## making sure the data installed

## 
## Attaching package: 'palmerpenguins'

## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw

# Density of penguin body mass, one filled curve per species.
data(penguins) # load the penguins data
# Keep only the rows that have a recorded body mass.
pengs <- filter(penguins, !is.na(body_mass_g))
ggplot(data = pengs) +
  # Semi-transparent fills so overlapping species curves stay visible.
  geom_density(mapping = aes(x = body_mass_g, fill = species), alpha = 0.6) +
  # Fixed colour for each species, matched by name.
  scale_fill_manual(
    values = c(
      Adelie = "darkseagreen3",
      Chinstrap = "mistyrose3",
      Gentoo = "darkslategrey"
    )
  ) +
  ggtitle("Distribution of Penguin Body Mass by Species") +
  xlab("Body Mass(g)") +
  ylab("Density") +
  labs(fill = "Species") + # legend title
  theme_minimal()

Figure 4: A density plot displaying the distribution of body mass (g) for each penguin species. Gentoo penguins appear to have the highest average body mass of the three species.

Part 4:

Part 4a:

# Proportional stacked bars: within each diamond color, the share of each cut.
ggplot(data = diamonds) +
  # position_fill() rescales every bar to height 1, so the y-axis is a
  # proportion rather than a count.
  geom_bar(mapping = aes(x = color, fill = cut), position = position_fill()) +
  scale_fill_brewer(palette = "Set2") + # ColorBrewer "Set2" palette
  ggtitle("The Relationship Between Diamond Cut and Color") +
  xlab("Diamond Color") +
  ylab("Proportion") +
  labs(fill = "Cut") + # legend title
  theme_minimal()

Figure 5: A proportional stacked barplot displaying the proportion of each diamond cut category within each color category.

Part 4b:

# Grouped bars: for each diamond color, one bar per cut showing its raw count.
ggplot(data = diamonds) +
  # position_dodge() places the cut bars side by side instead of stacking.
  geom_bar(mapping = aes(x = color, fill = cut), position = position_dodge()) +
  scale_fill_brewer(palette = "Set2") + # ColorBrewer "Set2" palette
  ggtitle("The Relationship Between Diamond Cut and Color") +
  xlab("Diamond Color") +
  ylab("Count") +
  labs(fill = "Cut") + # legend title
  theme_minimal()

Figure 6: A grouped barplot displaying the raw count of each diamond cut category within each color category.

DataVis_Midterm

Leo Colburn

2026-03-22