library(tidyverse)
library(knitr)

# Air Quality (HW week 2)
# Load in the dataset
# Load the built-in `airquality` data frame into the workspace.
data("airquality")

# Head shows us the first 6 rows
head(airquality)
#>   Ozone Solar.R Wind Temp Month Day
#> 1    41     190  7.4   67     5   1
#> 2    36     118  8.0   72     5   2
#> 3    12     149 12.6   74     5   3
#> 4    18     313 11.5   62     5   4
#> 5    NA      NA 14.3   56     5   5
#> 6    28      NA 14.9   66     5   6
# Stats summary
# Temp is the 4th column, so `$Temp` and `[, 4]` select the same vector.
mean(airquality$Temp)
#> [1] 77.88235
mean(airquality[, 4])
#> [1] 77.88235
median(airquality$Temp)
#> [1] 79
sd(airquality$Wind)
#> [1] 3.523001
var(airquality$Wind)
#> [1] 12.41154
# Renames months from number to name.
# The first assignment coerces the whole column to character, so the later
# comparisons match the remaining month numbers as strings.
airquality$Month[airquality$Month == 5] <- "May"
airquality$Month[airquality$Month == 6] <- "June"
airquality$Month[airquality$Month == 7] <- "July"
airquality$Month[airquality$Month == 8] <- "August"
airquality$Month[airquality$Month == 9] <- "September"

# Shows months are now strings instead of ints
summary(airquality$Month)
#>    Length     Class      Mode
#>       153 character character

# Order the months (default is alphabetical)
airquality$Month <- factor(
  airquality$Month,
  levels = c("May", "June", "July", "August", "September")
)

# Histogram by month
# First-pass histogram of temperature, filled by month. The bars are drawn at
# their own heights (position = "identity") with the default bin count.
p1 <- airquality |>
  ggplot(aes(x = Temp, fill = Month)) +
  geom_histogram(position = "identity") +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    # provide the data source
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p1
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Better formatted histogram (transparency, outline)
# Semi-transparent fills keep overlapping months visible, a 5-unit binwidth
# replaces the default 30 bins, and white borders separate the bars.
p2 <- airquality |>
  ggplot(aes(x = Temp, fill = Month)) +
  geom_histogram(
    position = "identity", alpha = 0.5, binwidth = 5, color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p2

# Boxplots for each month
# One temperature boxplot per month, filled with the default discrete palette.
p3 <- airquality |>
  ggplot(aes(Month, Temp, fill = Month)) +
  labs(
    x = "Months from May through September",
    y = "Temperatures",
    title = "Side-by-Side Boxplot of Monthly Temperatures",
    caption = "New York State Department of Conservation and the National Weather Service"
  ) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p3

# The same boxplots but greyscale
# Greyscale version of the monthly temperature boxplots.
# The x axis shows Month, so it is labelled as months rather than temperatures.
p4 <- airquality |>
  ggplot(aes(Month, Temp, fill = Month)) +
  labs(
    x = "Months from May through September",
    y = "Temperatures",
    title = "Side-by-Side Boxplot of Monthly Temperatures",
    caption = "New York State Department of Conservation and the National Weather Service"
  ) +
  geom_boxplot() +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p4

# Ozone level by temperature
# Scatterplot of ozone against temperature, coloured by month, with one
# linear-model fit line per month.
#
# `formula = y ~ x` is spelled out so that printing the plot does not emit the
# "using formula" message. Wrapping only the assignment in suppressMessages()
# could not silence it, because ggplot2 computes the layers when the plot is
# printed, not when it is assigned.
#
# NOTE(review): ylim() discards observations outside 0-150 before the fit is
# computed (see the warnings below); coord_cartesian(ylim = c(0, 150)) would
# zoom without dropping data -- confirm which is intended.
p5 <- airquality |>
  ggplot(aes(x = Temp, y = Ozone, color = Month)) +
  labs(
    y = "Ozone level",
    x = "Temperature",
    title = "Scatterplot of Ozone levels by Temperature",
    caption = "New York State Department of Conservation and the National Weather Service"
  ) +
  geom_point(size = 2) +
  ylim(0, 150) +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE, linewidth = 0.4, alpha = 0.1
  ) +
  scale_color_manual(values = c("red", "orange", "green", "blue", "purple")) +
  theme(legend.position = "right")
p5
#> Warning: Removed 38 rows containing non-finite values (`stat_smooth()`).
#> Warning: Removed 38 rows containing missing values (`geom_point()`).
#> Warning: Removed 2 rows containing missing values (`geom_smooth()`).
# The plot I chose to create is a scatterplot of ozone levels by temperature.
# The colors show which month each observation is from, and the lines are
# per-month regression lines. The plot shows a positive correlation between
# temperature and ozone levels.