Week 2 Solutions

This is the published document for Week 2 solutions

Problem 1 - Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes

Problem 7 - BONUS – place the original .csv in a GitHub repository and have R read it from the link. This will be a very useful skill as you progress in your data science education and career.

# Raw GitHub link to the air-pollution CSV, so the data is read straight from
# the repository instead of from a local file.
airpollutionurl <- "https://raw.githubusercontent.com/john-grando/Masters/master/Workshop/R/Week2/assignment/summer.csv"
# Arguments are bound with `=`. Writing `header <- TRUE` / `sep <- ","` inside
# the call creates stray global variables and only works because the values
# happen to land in the matching positional slots.
# colClasses: "NULL" skips the first CSV column; the other five are numeric.
airpollutiondata <- read.csv(
  file = airpollutionurl,
  header = TRUE,
  sep = ",",
  colClasses = c("NULL", "numeric", "numeric", "numeric", "numeric", "numeric")
)
# `digits` must be named: a positional 4 is matched to `maxsum` (the second
# formal of summary.data.frame), not to `digits`.
summary(airpollutiondata, digits = 4)
##        O3          NO2               NO              SO2        
##  Min.   : 8   Min.   :  9.00   Min.   :  4.00   Min.   :  0.00  
##  1st Qu.:26   1st Qu.: 30.00   1st Qu.: 30.00   1st Qu.:  5.00  
##  Median :31   Median : 36.00   Median : 47.00   Median :  8.00  
##  Mean   :32   Mean   : 37.63   Mean   : 55.20   Mean   : 17.37  
##  3rd Qu.:36   3rd Qu.: 45.00   3rd Qu.: 71.75   3rd Qu.: 15.00  
##  Max.   :84   Max.   :105.00   Max.   :256.00   Max.   :313.00  
##       PM10       
##  Min.   :  9.00  
##  1st Qu.: 26.00  
##  Median : 34.00  
##  Mean   : 41.12  
##  3rd Qu.: 50.00  
##  Max.   :185.00
# Median and mean of O3 and NO2 over the full data set, as one named vector.
print(with(airpollutiondata, c(
  O3.Median = median(O3),
  O3.Mean = mean(O3),
  NO2.Median = median(NO2),
  NO2.Mean = mean(NO2)
)))
##  O3.Median    O3.Mean NO2.Median   NO2.Mean 
##   31.00000   31.99654   36.00000   37.62803

Problem 2 - Create a new data frame with a subset of the columns and rows. Make sure to rename it

# Keep only the rows whose PM10 reading exceeds 34; every column is retained.
newDF <- subset(airpollutiondata, subset = PM10 > 34)

Problem 3 - Create new column names for the new data frame.

# Give the subset its own column names, then show the first six rows.
names(newDF) <- c("newO3", "newNO2", "NewNO", "newSO2", "newPM10")
head(newDF, n = 6L)
##    newO3 newNO2 NewNO newSO2 newPM10
## 11    27     53   102     16      46
## 12    23     58   153     35      46
## 13    28     29    54     13      44
## 14    34     29    43      4      67
## 18    26     45   106     15      44
## 20    16     49   111     14      49

Problem 4 - Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Overview of the PM10 > 34 subset. `digits` is passed by name: with
# `digits <- 4` the value 4 was assigned to a global variable and then matched
# positionally to `maxsum`, so the digits setting was never applied.
summary(newDF, digits = 4)
##      newO3           newNO2           NewNO           newSO2     
##  Min.   : 8.00   Min.   :  9.00   Min.   :  4.0   Min.   :  1.0  
##  1st Qu.:26.00   1st Qu.: 35.00   1st Qu.: 34.0   1st Qu.:  7.0  
##  Median :32.00   Median : 41.00   Median : 56.0   Median : 13.0  
##  Mean   :34.31   Mean   : 42.37   Mean   : 65.1   Mean   : 26.2  
##  3rd Qu.:40.00   3rd Qu.: 48.00   3rd Qu.: 86.0   3rd Qu.: 23.5  
##  Max.   :84.00   Max.   :105.00   Max.   :256.0   Max.   :313.0  
##     newPM10      
##  Min.   : 35.00  
##  1st Qu.: 40.00  
##  Median : 50.00  
##  Mean   : 56.91  
##  3rd Qu.: 65.00  
##  Max.   :185.00
# Median and mean of the same two attributes, computed on the subset.
print(with(newDF, c(
  newO3.Median = median(newO3),
  newO3.Mean = mean(newO3),
  newNO2.Median = median(newNO2),
  newNO2.Mean = mean(newNO2)
)))
##  newO3.Median    newO3.Mean newNO2.Median   newNO2.Mean 
##      32.00000      34.31010      41.00000      42.36934
# The mean and median values of each attribute appear broadly similar between the subset with PM10 > 34 (roughly the upper half of the PM10 readings) and the full data set. Without further analysis, it cannot be determined whether the differences are statistically significant. Box plots are included below for side-by-side comparison.
# Create copies of both data sets with an extra column for the data set type.
# library() stops with an error if ggplot2 is not installed; require() would
# only return FALSE and let the script fail later at the first ggplot() call.
library(ggplot2)
cpyDF <- cbind(airpollutiondata, DataSet = "All")
# The PM10 > 34 subset is re-derived from the original data (not taken from
# newDF, whose columns were renamed) so both tables share column names.
cpynewDF <- subset(airpollutiondata, subset = PM10 > 34)
cpynewDF <- cbind(cpynewDF, DataSet = "PM10_GT_34")
# Combine the tables into one for box plots
combinedDF <- rbind(cpyDF, cpynewDF)

# O3: one box per data set
ggplot(combinedDF, aes(x = DataSet, y = O3)) +
  geom_boxplot()

# NO2: one box per data set
ggplot(combinedDF, aes(x = DataSet, y = NO2)) +
  geom_boxplot()

# NO: one box per data set
ggplot(combinedDF, aes(x = DataSet, y = NO)) +
  geom_boxplot()

# PM10: one box per data set
ggplot(combinedDF, aes(x = DataSet, y = PM10)) +
  geom_boxplot()

Problem 5 - For at least 3 values in a column, rename them so that every occurrence of each value in that column is renamed. For example, suppose I have 20 values of the letter "e" in one column. Rename those values so that all 20 would show as "excellent".

# First six rows before any values are replaced.
head(newDF, n = 6L)
##    newO3 newNO2 NewNO newSO2 newPM10
## 11    27     53   102     16      46
## 12    23     58   153     35      46
## 13    28     29    54     13      44
## 14    34     29    43      4      67
## 18    26     45   106     15      44
## 20    16     49   111     14      49
# Every SO2 reading below 10 becomes 0.
newDF$newSO2 <- replace(newDF$newSO2, newDF$newSO2 < 10, 0)
# Every NO2 reading equal to 29 becomes 31.
newDF$newNO2 <- replace(newDF$newNO2, newDF$newNO2 == 29, 31)
# O3 is halved wherever NO2 exceeds 30. This runs after the NO2 change above,
# so rows that were just moved from 29 to 31 are included.
newDF$newO3 <- replace(
  newDF$newO3,
  newDF$newNO2 > 30,
  newDF$newO3[newDF$newNO2 > 30] / 2
)
# Same six rows after the replacements.
head(newDF, n = 6L)
##    newO3 newNO2 NewNO newSO2 newPM10
## 11  13.5     53   102     16      46
## 12  11.5     58   153     35      46
## 13  14.0     31    54     13      44
## 14  17.0     31    43      0      67
## 18  13.0     45   106     15      44
## 20   8.0     49   111     14      49
# Additionally, I experimented with melt() and facet_wrap() to combine all the
# box plots into one statement. With a single shared y axis the panels were
# hard to read because the pollutants cover very different ranges, so each
# panel gets its own y scale via scales = "free_y".

# library() fails immediately if reshape2 is missing; require() would only
# return FALSE and defer the failure to the melt() call.
library(reshape2)
# Long format: one row per (DataSet, pollutant, reading).
combinedDF.m <- melt(
  data = combinedDF,
  id.vars = "DataSet",
  measure.vars = c("O3", "NO2", "NO", "PM10")
)
ggplot(data = combinedDF.m, aes(y = value, x = DataSet)) +
  facet_wrap(~variable, scales = "free_y") +
  geom_boxplot()

Problem 6 - Display enough rows to see examples of all of steps 1-5 above.

#Examples have been provided within each step to make review easier