Load in the Dataset
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Show the data
Look at the structure of the data
# Print a compact summary of the airquality data frame: one line per column
# with its type and first few values.
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
Calculating Summary Statistics
# Mean of the Temp column over all observations (exact-name extraction).
mean(airquality[["Temp"]])
## [1] 77.88235
Change the Months from 5 - 9 to May through September
# Recode the numeric month codes 5-9 as a factor of month names whose levels
# are in calendar order, in a single step. Assigning the names element by
# element would silently coerce the integer column to character and then rely
# on comparing those strings against numbers.
# The is.numeric() guard keeps this step safe to re-run: once Month is already
# a factor, recoding it again would turn every value into NA.
if (is.numeric(airquality$Month)) {
  airquality$Month <- factor(
    airquality$Month,
    levels = 5:9,
    labels = c("May", "June", "July", "August", "September")
  )
}
Plot 1: Create a histogram categorized by Month with qplot
# Histogram of Temp in 20 bins, with bars filled by Month.
# NOTE(review): qplot() is deprecated in newer ggplot2 releases (3.4.0+); it is
# kept here because this plot is specifically the qplot version.
p1 <- qplot(
  x = Temp,
  data = airquality,
  geom = "histogram",
  fill = Month,
  bins = 20
)
p1

Plot 2: Make a histogram using ggplot
# Overlaid, semi-transparent histograms of Temp (5-unit bins), one per Month.
# position = "identity" draws the months on top of each other instead of
# stacking them; the white outline separates adjacent bars.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

Plot 3: Create side-by-side boxplots categorized by Month
# Side-by-side boxplots of Temp for each Month, one fill colour per month.
# The y axis maps Temp, so it is labelled "Temperature"; a boxplot has no
# frequency axis.
p3 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  labs(title = "Temperatures", x = "Months", y = "Temperature") +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p3

Plot 4: Make the same side-by-side boxplots, but in grey-scale
# Grey-scale version of the per-month temperature boxplots.
# Axis labels follow the mapped variables: Month on x, Temp on y.
p4 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  labs(title = "Temperatures", x = "Months", y = "Temperature") +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p4

Plot 5: Scatterplot of Temperature against Solar Radiation, faceted by Month
# Count the missing values in Temp
sum(is.na(airquality[["Temp"]]))
## [1] 0
# Count the missing values in Solar.R
sum(is.na(airquality[["Solar.R"]]))
## [1] 7
# Fill each missing Solar.R value with the median of the observed values
airquality$Solar.R[is.na(airquality$Solar.R)] <-
  median(airquality$Solar.R, na.rm = TRUE)
airquality$Solar.R
## [1] 190 118 149 313 205 205 299 99 19 194 205 256 290 274 65 334 307 78
## [19] 322 44 8 320 25 92 66 266 205 13 252 223 279 286 287 242 186 220
## [37] 264 127 273 291 323 259 250 148 332 322 191 284 37 120 137 150 59 91
## [55] 250 135 127 47 98 31 138 269 248 236 101 175 314 276 267 272 175 139
## [73] 264 175 291 48 260 274 285 187 220 7 258 295 294 223 81 82 213 275
## [91] 253 254 83 24 77 205 205 205 255 229 207 222 137 192 273 157 64 71
## [109] 51 115 244 190 259 36 255 212 238 215 153 203 225 237 188 167 197 183
## [127] 189 95 92 252 220 230 259 236 259 238 24 112 237 224 27 238 201 238
## [145] 14 139 49 20 193 145 191 131 223
# Plot 5: Temp against Solar.R as points, one panel per Month laid out in
# two rows.
p5 <- ggplot(data = airquality, mapping = aes(x = Solar.R, y = Temp)) +
  geom_point() +
  facet_wrap(~ Month, nrow = 2) +
  labs(
    title = "Temperature and Solar Radiation each Month",
    x = "Solar Radiation",
    y = "Temperature"
  )
p5
