Load library tidyverse

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the dataset into global environment

# Load the airquality data set into the global environment so it can be inspected and modified below.
data("airquality")

## Look at the structure of the data

In the global environment, click on the row with the airquality dataset and it will take you to a “spreadsheet” view of the data.

## View the data using the “head” function

The function, head, will only display the first 6 rows of the dataset. Notice that in the global environment to the right, there are 153 observations (rows).

# Print the first six rows to check the column names and spot missing values (NA).
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

##Calculate Summary Statistics

# Mean of the Temp column, selected by name with `$`.
mean(airquality$Temp)
## [1] 77.88235
# Same column selected by position: Temp is the 4th column, so the result is identical.
mean(airquality[,4]) 
## [1] 77.88235

##Calculate Median, Standard Deviation, and Variance

# Median of Temp.
median(airquality$Temp)
## [1] 79
# Standard deviation of Wind.
sd(airquality$Wind)
## [1] 3.523001
# Variance of Wind; this is the standard deviation squared (3.523001^2 is about 12.41154).
var(airquality$Wind)
## [1] 12.41154

## Rename the Months from numbers to names

Sometimes we prefer the months to be numerical, but here we need them as month names. There are MANY ways to do this. Here is one way to convert the numbers 5 - 9 to May through September.

# Recode the numeric month codes 5-9 to month names with a single lookup.
# match() gives each row's position in `month_codes` (NA when the value is not
# one of the codes); only the matched rows are overwritten. Assigning strings
# turns the Month column into a character vector.
month_codes <- 5:9
month_names <- c("May", "June", "July", "August", "September")
idx <- match(airquality$Month, month_codes)
hit <- !is.na(idx)
airquality$Month[hit] <- month_names[idx[hit]]

## Now look at the summary statistics of the dataset

See how Month has changed to hold characters instead of numbers (it is now classified as “character” rather than “integer”).

# For a character vector summary() reports only Length, Class and Mode, which confirms Month is no longer numeric.
summary(airquality$Month)
##    Length     Class      Mode 
##       153 character character

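## Month is a categorical variable

R stores categorical variables as factors, and the categories of a factor are called levels. Setting the levels explicitly, as below, is one way to put the Months in calendar order so they do not default to alphabetical order. You will also see the month names passed to `labels` in the chunk that creates the plot below in Plot #1; note that `labels` only renames the legend entries and does not reorder them, so the factor levels set here are what actually control the order.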

# Convert Month to a factor whose levels follow calendar order, so that plots
# and tables list the months May..September instead of alphabetically.
month_order <- c("May", "June", "July", "August", "September")
airquality$Month <- factor(airquality$Month, levels = month_order)

## Plot 1: Create a histogram categorized by Month

## Plot 1 code

# Plot 1: histograms of Temp, one fill colour per Month, drawn on top of each
# other (position = "identity") rather than stacked. No binwidth is given, so
# ggplot2 picks its default number of bins.
p1 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(position = "identity") +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    # The caption names the data source.
    caption = "New York State Department of Conservation and the National Weather Service"
  )

Plot 1 output

# Printing the object draws the plot; ggplot2 reports that it fell back to 30 bins because no binwidth was set.
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Plot 2: Improve the histogram of Average Temperature by Month

Outline the bars in white using the color = "white" argument

Use alpha to add some transparency (values between 0 and 1)

Change the binwidth

Add some transparency and white borders around the histogram bars.

##Plot 2 Code

# Plot 2: overlaid histograms of Temp by Month with half-transparent bars
# (alpha = 0.5), bins 5 units wide, and a white outline around every bar.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    position = "identity",
    binwidth = 5,
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    caption = "New York State Department of Conservation and the National Weather Service"
  )

Plot 2 output

# Draw Plot 2.
p2

## Plot 3: Create side-by-side boxplots categorized by Month

# Plot 3: one boxplot of Temp per Month, filled by Month.
p3 <- ggplot(data = airquality, mapping = aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Side-by-Side Boxplot of Monthly Temperatures",
    x = "Months from May through September",
    y = "Temperatures",
    caption = "New York State Department of Conservation and the National Weather Service"
  )

Plot 3 output

# Draw Plot 3.
p3

##Plot 4: Side by Side Boxplots in Gray Scale

# Plot 4: the same per-month boxplots of Temp, filled with a grey scale
# instead of the default colours.
p4 <- airquality |>
  ggplot(aes(Month, Temp, fill = Month)) +
  # The x aesthetic is Month, so the x-axis label names the months (it
  # previously read "Monthly Temperatures", which describes the y axis).
  labs(x = "Months from May through September", y = "Temperatures",
       title = "Side-by-Side Boxplot of Monthly Temperatures",
       caption = "New York State Department of Conservation and the National Weather Service") +
  geom_boxplot() +
  scale_fill_grey(name = "Month", labels = c("May", "June", "July", "August", "September"))

Plot 4 output

# Draw Plot 4.
p4

## Plot 5:Create side-by-side Average Wind speed boxplots categorized by Month

# Plot 5: one boxplot of Wind per Month, filled by Month, then drawn.
p5 <- ggplot(data = airquality, mapping = aes(x = Month, y = Wind, fill = Month)) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Side-by-Side Boxplot of Monthly Average Wind Speed",
    x = "Months from May through September",
    y = "Wind",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p5

Brief essay

Describe the plot type you have created

I have created a side-by-side box plot.

Insights that the plot shows

This side-by-side box plot shows, for each month from May to September 1973, the distribution of the wind speeds that were recorded daily. The source for this dataset is the New York State Department of Conservation and the National Weather Service.

Description of special code you used to make this plot

p5 <- airquality |>
  ggplot(aes(Month, Wind, fill = Month)) +
  labs(x = "Months from May through September", y = "Wind",
       title = "Side-by-Side Boxplot of Monthly Average Wind Speed",
       caption = "New York State Department of Conservation and the National Weather Service") +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = c("May", "June","July", "August", "September"))

I replaced the variable Temp (labelled "Temperatures") with the variable Wind.