I have provided two data sets: a training data set and a prediction data set. The training data set (full description below) contains ozone measurements as well as other factors. The prediction data set contains the same factors but is missing the ozone measurements.
# Training data: read from the smogData.csv download link.
train <- read.csv(
  file = "https://www.dropbox.com/s/zvsxuvywvqvw3zc/smogData.csv?dl=1"
)
# Prediction data: read from the smogDataPredict.csv download link.
smogDataPredict <- read.csv(
  file = "https://www.dropbox.com/s/a21lbe38ehq5ugn/smogDataPredict.csv?dl=1"
)
library(tree); library(randomForest);library(rpart); library(rpart.plot); library(dplyr)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Reproducible split of `train` into a fitting subset and a held-out test
# subset.
set.seed(1234)
# Number of rows used for model fitting; every remaining row goes to the
# test subset.
n_fit <- 190
# Fail early rather than index past the end of `train`, which would silently
# produce all-NA rows in the fitting subset.
stopifnot(nrow(train) > n_fit)
# Sample from the actual row count instead of a hard-coded 255, so every row
# of `train` is eligible and no out-of-range index can be drawn.
ind <- sample(seq_len(nrow(train)), n_fit)
dat.train <- train[ind, ]
dat.test <- train[-ind, ]
# Regression tree fitted with rpart: O3 modelled on every column except X.
# This fit is only used for the plot below, so it gets its own name instead
# of being stored in `tree` and immediately overwritten.
rpart_fit <- rpart(O3 ~ . - X, data = dat.train)
rpart.plot(
  rpart_fit,
  branch.lty = 3,
  extra = 101,
  box.palette = "RdYlGn",
  main = "Regression Tree with O3 outcome"
)
# Second regression tree, fitted with the tree package; this is the model
# used for the test error and the predictions below. The calls are
# namespace-qualified because the result is stored under the name `tree`,
# which shadows the function of the same name. `nobs` is taken from the
# fitting data rather than hard-coded.
tree <- tree::tree(
  O3 ~ . - X,
  data = dat.train,
  control = tree::tree.control(nobs = nrow(dat.train), mindev = 0.01)
)
# Regression tree & test subset of the training data
# (mean squared error of the predictions on the held-out rows)
mean((predict(tree, dat.test) - dat.test$O3)^2)
## [1] 25.76112
# Use tree to predict missing ozone values from smogDataPredict.csv
tree.predict <- predict(tree, smogDataPredict)
# Random forest for O3 on every column except X, keeping the variable
# importance measures so they can be plotted.
forest <- randomForest(
  O3 ~ . - X,
  data = dat.train,
  importance = TRUE
)
# TODO: with more time, use this importance information for purposeful
# variable selection to build a better model. Next time!
varImpPlot(forest)
# Forest model and test subset of the training data:
# mean squared error of the predictions on the held-out rows.
forest_test_pred <- predict(forest, newdata = dat.test)
forest_sq_err <- (forest_test_pred - dat.test$O3)^2
mean(forest_sq_err)
## [1] 14.11043
# Use random forest model to predict missing ozone values from smogDataPredict.csv
randomForest.predict <- predict(forest, newdata = smogDataPredict)
# Start from the prediction data without its O3 column. Dropping by name
# with setdiff() also works when the file has no O3 column at all.
copy <- smogDataPredict[, setdiff(names(smogDataPredict), "O3"), drop = FALSE]
# Attach the predictions from both models.
copy$Tree.O3 <- tree.predict
copy$RF.O3 <- randomForest.predict
colnames(copy)
## [1] "X" "vh" "wind" "humidity" "temp" "ibh"
## [7] "dpg" "ibt" "vis" "doy" "Tree.O3" "RF.O3"
# Move X and the two prediction columns to the front, keeping the remaining
# columns in their existing order. Selecting by name (not by position) means
# a change in the input columns cannot silently reorder the wrong fields.
lead_cols <- c("X", "Tree.O3", "RF.O3")
output <- copy[, c(lead_cols, setdiff(names(copy), lead_cols)), drop = FALSE]
# NOTE(review): this is an absolute, machine-specific path; adjust before
# running elsewhere. write.csv() also writes row names as an extra leading
# column by default -- confirm that is wanted alongside X.
write.csv(output, file = "//Users/sofia/Desktop/SofiaBello_O3_Predictions.csv")