library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Structure of the data

# Compactly display the structure (column names, types, first values).
utils::str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Calculating Summary Statistics

Two ways to find the mean:

The first call finds the mean of the variable named Temp; the second finds the mean of the 4th variable, which is also Temp.

# Select the column by name.
mean(airquality[["Temp"]])
## [1] 77.88235
# Select the same column by position (column 4).
mean(airquality[[4]])
## [1] 77.88235

Calculating the Median, Standard Dev, and Variance

First the median of “Temp”, second the standard deviation of “Wind”, third the variance of “Wind”.

# Median of the Temp column.
stats::median(airquality[["Temp"]])
## [1] 79
# Standard deviation of the Wind column.
stats::sd(airquality[["Wind"]])
## [1] 3.523001
# Variance of the Wind column.
stats::var(airquality[["Wind"]])
## [1] 12.41154

Giving numerical month values the corresponding month names

# Replace the numeric month codes (5-9) with month names using one vectorised
# lookup into base R's `month.name`, rather than five separate assignments
# that compare an already character-coerced column against numbers.
# Only values that are still codes are touched, so re-running is a no-op.
is_code <- airquality$Month %in% 5:9
airquality$Month[is_code] <- month.name[as.integer(airquality$Month[is_code])]

The changes can be observed when recalling the structure of the dataset

# Display the structure again to check the type of the Month column.
utils::str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : chr  "May" "May" "May" "May" ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Months are now associated with characters rather than numbers:

# Per-column summary of the data frame.
base::summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##     Month                Day      
##  Length:153         Min.   : 1.0  
##  Class :character   1st Qu.: 8.0  
##  Mode  :character   Median :16.0  
##                     Mean   :15.8  
##                     3rd Qu.:23.0  
##                     Max.   :31.0  
## 

Month is a categorical variable with a fixed set of levels; in R such a variable is called a “factor”.

By default the months are ordered alphabetically. Since alphabetical order has no meaning for this dataset, reorder them chronologically.

# Convert Month to a factor whose levels are listed in calendar order.
airquality[["Month"]] <- factor(
  x = airquality[["Month"]],
  levels = c("May", "June", "July", "August", "September")
)

Plot 1: Create a histogram of the data by Month using qplot

qplot = quick plot

# Histogram of Temp with bars filled by Month, using 30 bins.
p1 <- qplot(
  x = Temp,
  data = airquality,
  geom = "histogram",
  fill = Month,
  bins = 30
)
p1

# Same histogram with 40 bins; this overwrites the previous p1.
p1 <- qplot(
  x = Temp,
  data = airquality,
  geom = "histogram",
  fill = Month,
  bins = 40
)
p1

“bins” sets how many equal-width intervals the range of Temp is divided into: the higher the number of bins, the narrower each bar.

Plot 2: Create a histogram of the data using ggplot

First outline the bars in white, then label the legend entries with the months in their natural (chronological) order

# Overlaid, semi-transparent histograms of Temp per Month, 5-unit-wide bins.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

# Same plot with opaque bars and 1-unit-wide bins; this overwrites p2.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 1,
    position = "identity",
    alpha = 1.0,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

“binwidth” sets the width of each bin in units of the x variable (here Temp), instead of specifying how many bins to use

Plot 3: Create side by side boxplots of the data categorized by Month

“fill = Month” colors each boxplot differently according to the Month. “scale_fill_discrete” sets the title and labels of the legend on the right for the discrete colors.

# One boxplot of Temp per Month, each filled with its own colour.
p3 <- ggplot(
  data = airquality,
  mapping = aes(x = Month, y = Temp, fill = Month)
) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Summary Stats of Temperature Each Month",
    x = "Months",
    y = "Temperature"
  ) +
  theme(plot.title = element_text(hjust = 0.6))
p3

Plot 4: Create the same boxplots but in grayscale

Similar to before, “scale_fill_grey” forms the legend, but in grayscale

# Same boxplots as before, filled from a grey palette, with a bold title.
p4 <- ggplot(
  data = airquality,
  mapping = aes(x = Month, y = Temp, fill = Month)
) +
  geom_boxplot() +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Summary Stats of Temperature Each Month",
    x = "Months",
    y = "Temperature"
  ) +
  theme(plot.title = element_text(face = "bold", hjust = 0.6))
p4

“theme(plot.title = element_text(...))” changes the appearance of the plot title text: 1) “face = "bold"” makes the title bold; 2) “hjust = 0.6” places the title slightly to the right of the panel's center (0.5 is exactly centered), presumably so that it looks centered relative to the entire plot, including the legend.

Plot 5: Summary Stats of Ozone Levels Relative to Each Month Using Boxplots

# One boxplot of Ozone per Month, each filled with its own colour.
# Rows with a missing Ozone value are dropped by the boxplot stat, which
# emits the warning shown below the chunk.
p5 <- ggplot(
  data = airquality,
  mapping = aes(x = Month, y = Ozone, fill = Month)
) +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  ) +
  labs(
    title = "Ozone Levels Relative to Each Month",
    x = "Months",
    y = "Ozone Levels"
  ) +
  theme(plot.title = element_text(face = "bold", hjust = 0.6))
p5
## Warning: Removed 37 rows containing non-finite values (stat_boxplot).