This is the published document for Week 2 solutions
#Load the summer air pollution readings and summarise every column.
#The first CSV column is skipped ("NULL" column class); the remaining five
#columns are read as numeric.
airpollutionurl <- "https://raw.githubusercontent.com/john-grando/Masters/master/Workshop/R/Week2/assignment/summer.csv"
#Arguments are passed by name with `=`. Writing `header <- TRUE` inside the call
#would create a global variable `header` and pass the value positionally.
airpollutiondata <- read.csv(
  file = airpollutionurl,
  header = TRUE,
  sep = ",",
  colClasses = c("NULL", rep("numeric", 5))
)
#`digits` has to be named: the second positional argument of the data frame
#summary method is `maxsum`, so an unnamed 4 would never reach `digits`.
summary(airpollutiondata, digits = 4)
## O3 NO2 NO SO2
## Min. : 8 Min. : 9.00 Min. : 4.00 Min. : 0.00
## 1st Qu.:26 1st Qu.: 30.00 1st Qu.: 30.00 1st Qu.: 5.00
## Median :31 Median : 36.00 Median : 47.00 Median : 8.00
## Mean :32 Mean : 37.63 Mean : 55.20 Mean : 17.37
## 3rd Qu.:36 3rd Qu.: 45.00 3rd Qu.: 71.75 3rd Qu.: 15.00
## Max. :84 Max. :105.00 Max. :256.00 Max. :313.00
## PM10
## Min. : 9.00
## 1st Qu.: 26.00
## Median : 34.00
## Mean : 41.12
## 3rd Qu.: 50.00
## Max. :185.00
#Median and mean of the O3 and NO2 columns over all readings, printed as one
#named numeric vector.
print(with(airpollutiondata, c(
  O3.Median = median(O3),
  O3.Mean = mean(O3),
  NO2.Median = median(NO2),
  NO2.Mean = mean(NO2)
)))
## O3.Median O3.Mean NO2.Median NO2.Mean
## 31.00000 31.99654 36.00000 37.62803
#Keep only the readings whose PM10 value is above 34 (the PM10 median shown in
#the summary output), then give the columns their "new" names.
newDF <- subset(airpollutiondata, subset = PM10 > 34)
names(newDF) <- c("newO3", "newNO2", "NewNO", "newSO2", "newPM10")
#Show the first six rows of the renamed subset.
head(newDF, n = 6L)
## newO3 newNO2 NewNO newSO2 newPM10
## 11 27 53 102 16 46
## 12 23 58 153 35 46
## 13 28 29 54 13 44
## 14 34 29 43 4 67
## 18 26 45 106 15 44
## 20 16 49 111 14 49
#Summarise the PM10 > 34 subset. `digits` is passed by name: `digits <- 4`
#would assign a global `digits` and bind the 4 positionally to `maxsum`.
summary(newDF, digits = 4)
## newO3 newNO2 NewNO newSO2
## Min. : 8.00 Min. : 9.00 Min. : 4.0 Min. : 1.0
## 1st Qu.:26.00 1st Qu.: 35.00 1st Qu.: 34.0 1st Qu.: 7.0
## Median :32.00 Median : 41.00 Median : 56.0 Median : 13.0
## Mean :34.31 Mean : 42.37 Mean : 65.1 Mean : 26.2
## 3rd Qu.:40.00 3rd Qu.: 48.00 3rd Qu.: 86.0 3rd Qu.: 23.5
## Max. :84.00 Max. :105.00 Max. :256.0 Max. :313.0
## newPM10
## Min. : 35.00
## 1st Qu.: 40.00
## Median : 50.00
## Mean : 56.91
## 3rd Qu.: 65.00
## Max. :185.00
#Median and mean of the newO3 and newNO2 columns of the PM10 > 34 subset,
#printed as one named numeric vector.
print(with(newDF, c(
  newO3.Median = median(newO3),
  newO3.Mean = mean(newO3),
  newNO2.Median = median(newNO2),
  newNO2.Mean = mean(newNO2)
)))
## newO3.Median newO3.Mean newNO2.Median newNO2.Mean
## 32.00000 34.31010 41.00000 42.36934
#It appears that the mean and median values for all attributes are approximately the same between the upper half of the PM10 readings and all readings. Without further analysis, it cannot be determined whether or not these differences are statistically significant. More outputs have been included for side by side comparisons.
#Create copy data tables with an extra column for the data set type
#library() stops with an error if ggplot2 is not installed, whereas require()
#only warns and returns FALSE, letting the script fail later at the first plot.
library(ggplot2)
#Every reading, labelled "All".
cpyDF <- cbind(airpollutiondata, DataSet = "All")
#Readings with PM10 above 34, labelled "PM10_GT_34". The subset is taken from
#the original table so the column names match cpyDF.
cpynewDF <- subset(airpollutiondata, PM10 > 34)
cpynewDF <- cbind(cpynewDF, DataSet = "PM10_GT_34")
#Combine the tables into one for box plots
combinedDF <- rbind(cpyDF, cpynewDF)
#One box plot per pollutant, comparing the "All" and "PM10_GT_34" data sets.
#Box plots of O3
ggplot(combinedDF, aes(x = DataSet, y = O3)) +
  geom_boxplot()
#Box plots of NO2
ggplot(combinedDF, aes(x = DataSet, y = NO2)) +
  geom_boxplot()
#Box plots of NO
ggplot(combinedDF, aes(x = DataSet, y = NO)) +
  geom_boxplot()
#Box plots of PM10
ggplot(combinedDF, aes(x = DataSet, y = PM10)) +
  geom_boxplot()
#First six rows before the in-place edits.
head(newDF, n = 6L)
## newO3 newNO2 NewNO newSO2 newPM10
## 11 27 53 102 16 46
## 12 23 58 153 35 46
## 13 28 29 54 13 44
## 14 34 29 43 4 67
## 18 26 45 106 15 44
## 20 16 49 111 14 49
#Set newSO2 values below 10 to 0.
newDF$newSO2 <- replace(newDF$newSO2, newDF$newSO2 < 10, 0)
#Change newNO2 values equal to 29 to 31.
newDF$newNO2 <- replace(newDF$newNO2, newDF$newNO2 == 29, 31)
#Halve newO3 wherever newNO2 is above 30. This runs after the newNO2 edit, so
#rows just changed from 29 to 31 are halved as well.
newDF$newO3 <- with(newDF, replace(newO3, newNO2 > 30, newO3[newNO2 > 30] / 2))
#First six rows after the edits.
head(newDF, n = 6L)
## newO3 newNO2 NewNO newSO2 newPM10
## 11 13.5 53 102 16 46
## 12 11.5 58 153 35 46
## 13 14.0 31 54 13 44
## 14 17.0 31 43 0 67
## 18 13.0 45 106 15 44
## 20 8.0 49 111 14 49
#Additionally, I was experimenting with facet_wrap and melt to combine all the box plots into one statement. With a single shared y axis the result did not turn out as well as I hoped, so it is not part of the main outputs. However, since I already wrote the code, I figured I would show it.
#library() stops with an error if reshape2 is not installed, whereas require()
#only warns and returns FALSE.
library(reshape2)
#Reshape to long format: one row per (DataSet, pollutant, value).
combinedDF.m <- melt(
  data = combinedDF,
  id.vars = "DataSet",
  measure.vars = c("O3", "NO2", "NO", "PM10")
)
#scales = "free_y" gives each pollutant panel its own y axis, which addresses
#the y-axis scaling problem described above.
ggplot(data = combinedDF.m, aes(y = value, x = DataSet)) +
  facet_wrap(~variable, scales = "free_y") +
  geom_boxplot()
#Examples have been provided within each step to make review easier