# Load the dataset and look at the data structure.
# Take a fresh copy straight from the datasets package so that re-running the
# script always starts from the unmodified data instead of whatever object
# named `airquality` already exists in the workspace.
airquality <- datasets::airquality
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Mean temperature, extracted by column name ...
mean(airquality[["Temp"]])
## [1] 77.88235
# ... and by column position (Temp is the 4th column); both give the same value.
mean(airquality[[4]])
## [1] 77.88235
# Median of Temp, then standard deviation and variance of Wind
median(airquality[["Temp"]])
## [1] 79
sd(airquality[["Wind"]])
## [1] 3.523001
var(airquality[["Wind"]])
## [1] 12.41154
# Change the Months from 5 - 9 to May through September.
# `month.name` is the base R vector of English month names, so indexing it with
# the numeric month converts every row in one vectorized step. This avoids
# comparing numbers against a column that has already been turned into text.
# The guard makes the step safe to re-run: once Month is no longer numeric it
# is left untouched.
if (is.numeric(airquality$Month)) {
  airquality$Month <- month.name[airquality$Month]
}
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : chr "May" "May" "May" "May" ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Summarise every column. Month is a character column at this point, so its
# summary shows only Length/Class/Mode rather than per-month counts.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Length:153 Min. : 1.0
## Class :character 1st Qu.: 8.0
## Mode :character Median :16.0
## Mean :15.8
## 3rd Qu.:23.0
## Max. :31.0
##
# Turn Month into a factor with its levels in calendar order (May..September);
# without explicit levels they would be sorted alphabetically.
airquality[["Month"]] <- factor(airquality[["Month"]], levels = month.name[5:9])
# Inspect the structure again to confirm the new column type
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : Factor w/ 5 levels "May","June","July",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Summarise again: with Month stored as a factor, the summary now reports the
# number of observations in each month.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## May :31 Min. : 1.0
## June :30 1st Qu.: 8.0
## July :31 Median :16.0
## August :31 Mean :15.8
## September:30 3rd Qu.:23.0
## Max. :31.0
##
# Preview the first six rows to confirm Month now shows month names.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 May 1
## 2 36 118 8.0 72 May 2
## 3 12 149 12.6 74 May 3
## 4 18 313 11.5 62 May 4
## 5 NA NA 14.3 56 May 5
## 6 28 NA 14.9 66 May 6
# Histogram of Temp with bars filled by Month, using 20 bins.
# Built with ggplot() + geom_histogram() because qplot() has been deprecated
# since ggplot2 3.4.0; the resulting stacked histogram is the same and the
# deprecation warning is no longer emitted.
p1 <- ggplot(airquality, aes(x = Temp, fill = Month)) +
  geom_histogram(bins = 20)
p1
# Overlaid histograms of Temp, one semi-transparent layer per Month.
# position = "identity" draws the months on top of each other instead of
# stacking them; alpha = 0.5 keeps the overlapping bars visible.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2
# Side-by-side boxplots of temperature, one box per month.
# The axis labels name what is actually plotted: months on the x axis and
# temperature on the y axis (a boxplot has no frequency axis). Temp in the
# airquality dataset is recorded in degrees Fahrenheit.
p3 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  ggtitle("Temperatures") +
  xlab("Month") +
  ylab("Temperature (degrees F)") +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p3
# Side-by-side boxplots of temperature by month, drawn with a grey-scale fill.
# The axis labels name what is actually plotted: months on the x axis and
# temperature on the y axis (a boxplot has no frequency axis). Temp in the
# airquality dataset is recorded in degrees Fahrenheit.
p4 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  ggtitle("Monthly Temperature Variations") +
  xlab("Month") +
  ylab("Temperature (degrees F)") +
  geom_boxplot() +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p4
# Horizontal boxplots of Ozone, one box per Month.
# Mapping Ozone to x and Month to y is what lays the boxes out horizontally.
p5 <- ggplot(data = airquality, mapping = aes(x = Ozone, y = Month, fill = Month)) +
  geom_boxplot() +
  labs(title = "Monthly Ozone Levels", x = "Ozone Level", y = "Month") +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p5
## Warning: Removed 37 rows containing non-finite values (`stat_boxplot()`).
Plot 5, or “Monthly Ozone Levels,” uses side-by-side boxplots to show Ozone levels categorized by month. The months are color-coded to help with visualization. As you can see, July and August are clearly the months with the highest Ozone levels and the months in which Ozone levels varied the most. On the other hand, September, June, and May have very low levels, all under 50, except for a few outliers.
For my code, I modified the ggplot call by changing the arguments of aes() from (Month, Temp, fill = Month) to (Ozone, Month, fill = Month). This changed the x value from Month to Ozone and the y value from Temp to Month; together with keeping fill = Month, this rotated the boxplots from vertical to horizontal, an easier way to view this data. The next modifications I made were to ggtitle, xlab, and ylab: I created a more appropriate title, “Monthly Ozone Levels,” and labeled the x and y axes “Ozone Level” and “Month.” These were all the changes needed to generate plot 5.