Airquality HW

Author

E Higgs

Load the library

library(tidyverse)

Load the dataset into your global environment

data("airquality")

Look at the structure of the data

head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6

Calculate summary statistics

# Mean of the Temp column, selected by name.
mean(airquality$Temp)
[1] 77.88235
# Same mean, this time selecting Temp by position (it is the fourth column).
mean(airquality[,4])
[1] 77.88235

Calculate median, standard deviation, and variance

# Median of the Temp column.
median(airquality$Temp)
[1] 79
# Standard deviation of the Wind column.
sd(airquality$Wind)
[1] 3.523001
# Variance of the Wind column (the square of the standard deviation above).
var(airquality$Wind)
[1] 12.41154

Rename the months from numbers to names

# Replace the numeric month codes (5-9) with month names in a single
# vectorised lookup: month.name is base R's vector of English month names,
# so indexing it by the month number yields "May" ... "September".
airquality$Month <- month.name[airquality$Month]

Check the distinct month names now stored in the Month column

unique(airquality$Month)
[1] "May"       "June"      "July"      "August"    "September"

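Month is a categorical variable. In R, a categorical variable is stored as a factor, and its distinct categories are called levels. Listing the levels explicitly keeps the months in calendar order instead of alphabetical order.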

# Convert Month to a factor. The levels are spelled out in calendar order so
# that legends and axes list the months chronologically.
airquality$Month <- factor(
  airquality$Month,
  levels = c("May", "June", "July", "August", "September")
)

Plot 1: Create a histogram categorized by Month

# Plot 1: overlapping histograms of Temp, one fill colour per month.
# Bars are drawn on top of each other (position = "identity") and the bin
# settings are left at ggplot2's defaults.
p1 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(position = "identity") +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    # The caption credits the data source.
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p1
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot 2: Improve the histogram of Average Temperature by Month

# Plot 2: the same per-month histogram of Temp, made easier to read with
# semi-transparent bars (alpha), an explicit bin width of 5, and white
# outlines separating the bars.
p2 <- ggplot(airquality, aes(x = Temp, fill = Month)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    binwidth = 5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p2

Plot 3: Create side-by-side boxplots categorized by Month

# Plot 3: one boxplot of Temp per month, each box filled with its month's
# colour from the default discrete palette.
p3 <- ggplot(airquality, aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Months from May through September",
    y = "Temperatures",
    title = "Side-by-Side Boxplot of Monthly Temperatures",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p3

Plot 4: Side by Side Boxplots in Gray Scale

# Plot 4: the side-by-side boxplots of Temp per month, filled with a grey
# scale instead of colours.
# The x-axis shows the months, so it is labelled as months (matching Plot 3)
# rather than as temperatures.
p4 <- airquality |>
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Months from May through September",
    y = "Temperatures",
    title = "Side-by-Side Boxplot of Monthly Temperatures",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p4

Plot 5:

# Plot 5: per-month histogram of Temp with semi-transparent bars, a bin
# width of 5, and white bar outlines.
# The legend labels list only the five months present in the data; the
# previous call supplied twelve labels, seven of which had no matching
# factor level.
p5 <- airquality |>
  ggplot(aes(x = Temp, fill = Month)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    binwidth = 5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    x = "Monthly Temperatures from May - Sept",
    y = "Frequency of Temps",
    title = "Histogram of Monthly Temperatures from May - Sept, 1973",
    caption = "New York State Department of Conservation and the National Weather Service"
  )
p5

# Keep only the rows recorded in May; the plot of May's solar readings
# further down is built from this subset.
a <- airquality |>
  filter(Month == "May")

# Print the subset to inspect it.
a
   Ozone Solar.R Wind Temp Month Day
1     41     190  7.4   67   May   1
2     36     118  8.0   72   May   2
3     12     149 12.6   74   May   3
4     18     313 11.5   62   May   4
5     NA      NA 14.3   56   May   5
6     28      NA 14.9   66   May   6
7     23     299  8.6   65   May   7
8     19      99 13.8   59   May   8
9      8      19 20.1   61   May   9
10    NA     194  8.6   69   May  10
11     7      NA  6.9   74   May  11
12    16     256  9.7   69   May  12
13    11     290  9.2   66   May  13
14    14     274 10.9   68   May  14
15    18      65 13.2   58   May  15
16    14     334 11.5   64   May  16
17    34     307 12.0   66   May  17
18     6      78 18.4   57   May  18
19    30     322 11.5   68   May  19
20    11      44  9.7   62   May  20
21     1       8  9.7   59   May  21
22    11     320 16.6   73   May  22
23     4      25  9.7   61   May  23
24    32      92 12.0   61   May  24
25    NA      66 16.6   57   May  25
26    NA     266 14.9   58   May  26
27    NA      NA  8.0   57   May  27
28    23      13 12.0   67   May  28
29    45     252 14.9   81   May  29
30   115     223  5.7   79   May  30
31    37     279  7.4   76   May  31
# Plot 8: Solar.R for each day of May, one bar per day.
# geom_col() draws bars whose heights are the y values themselves. The
# previous geom_histogram(stat = "identity") produced the same bars but
# warned that its binning parameters were being ignored.
# na.rm = TRUE drops the four May days with a missing Solar.R reading
# explicitly instead of leaving ggplot2 to warn about them.
# Bars are shaded by Solar.R using the default continuous fill gradient.
# The earlier scale_colour_brewer() call was removed: no colour aesthetic is
# mapped in this plot, so that scale never affected it.
p8 <- a |>
  ggplot(aes(x = Day, y = Solar.R, fill = Solar.R)) +
  geom_col(na.rm = TRUE) +
  labs(x = "Day", y = "Solar Rays", title = "Solar Rays by Day in May")
p8

Essay

To make my graph, I first had to make a new version of the “airquality” dataset that only had data for the month of May. To do this, I made a new data frame named “a”, using the “filter” command to keep only the rows whose Month was “May”. Afterwards, I made a bar chart where the x-axis was the day of the month and the y-axis was the solar rays measured on that day. I colored the graph so that the darker the bar was, the lower the y value. I think that the overall graph is good, but the y-axis could use more detail: there is too much space between the displayed y values, so more frequent axis labels would make the bars easier to read.