Step 1: Load the Data

train <- read.csv("https://www.dropbox.com/s/zvsxuvywvqvw3zc/smogData.csv?dl=1")
test <- read.csv("https://www.dropbox.com/s/a21lbe38ehq5ugn/smogDataPredict.csv?dl=1")

#Subsetting the Data
set.seed(1234)
rows_for_testing <- sort(sample(1:nrow(train), 140)) 
testsubset <- train[rows_for_testing,] 
trainsubset <- train[-rows_for_testing,]

Step 2: Look at some Plots

From the tree, we see that Temp and IBT are two variables driving the differences within the data.

library(tree)
library(rpart)
library(rpart.plot)
tree <- rpart(O3 ~ ., data=trainsubset)
rpart.plot(tree)

Forward Model Selection

Unclear from BIC which variables to keep. They all are directionally similar.

library(leaps)
fwd_O3<-regsubsets(O3 ~ . -X , data = trainsubset, method="forward")
fwd_O3
## Subset selection object
## Call: regsubsets.formula(O3 ~ . - X, data = trainsubset, method = "forward")
## 9 Variables  (and intercept)
##          Forced in Forced out
## vh           FALSE      FALSE
## wind         FALSE      FALSE
## humidity     FALSE      FALSE
## temp         FALSE      FALSE
## ibh          FALSE      FALSE
## dpg          FALSE      FALSE
## ibt          FALSE      FALSE
## vis          FALSE      FALSE
## doy          FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
reg.summary <- summary(fwd_O3)
reg.summary$bic
## [1]  -90.38523 -115.88853 -127.09696 -123.84667 -120.49085 -116.78034
## [7] -112.57299 -107.95117

CART/Tree Prediction

The MSE here is not great. I think we can do better with PCR or a Random Forest.

The tree with the lowest cp value is better than a normal tree without the cp specified.

set.seed(1234)
train$fold <- sample(1:5,255,replace=TRUE)
cp <- seq(0.01,0.02,length=20)
cpList<-list()
for (j in 1:20){print(j)
sse <- list()
for (i in 1:5){
tree_K <- rpart(O3 ~ .-X, data=train[!(train$fold == i),], control = rpart.control(cp = cp[j] ,minsplit=10,minbucket=5))
#Predicted values
yhat<- predict(tree_K,train[(train$fold == i),])
#Actual Values
y <- train$O3[(train$fold == i)]
sse[[i]]<-sum((y-yhat)^2)
}
cpList[[j]]<-sum(unlist(sse))/nrow(train)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
plot(cp,unlist(cpList),pch=16)

#True Values for O3
O3_true <- testsubset$O3

#Tree1
tree1 <- rpart(O3 ~. -X, data=trainsubset, control = rpart.control(cp = 0.01,minsplit=10,minbucket=5, mindev=0.01))

#MSE
O3hat1 <- predict(tree1,testsubset)
mean((O3_true-O3hat1)^2)
## [1] 24.67823
#Tree2 
tree2<-tree(O3~.-X,data=trainsubset, control = tree.control(nrow(trainsubset), mincut = 10, minsize = 30, mindev = 0.01))

#MSE2
O3hat2 <- predict(tree2,testsubset)
mean((O3_true-O3hat2)^2)
## [1] 24.94491
summary(tree1)
## Call:
## rpart(formula = O3 ~ . - X, data = trainsubset, control = rpart.control(cp = 0.01, 
##     minsplit = 10, minbucket = 5, mindev = 0.01))
##   n= 115 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.55879508      0 1.0000000 1.0089652 0.10308471
## 2 0.10003580      1 0.4412049 0.4963357 0.06147158
## 3 0.06900352      2 0.3411691 0.4285137 0.05486474
## 4 0.04607001      3 0.2721656 0.3950922 0.05090696
## 5 0.02553228      4 0.2260956 0.3505338 0.05067233
## 6 0.01387208      5 0.2005633 0.3503543 0.05044127
## 7 0.01127670      6 0.1866912 0.3923886 0.05857371
## 8 0.01000000      7 0.1754145 0.4021067 0.06058050
## 
## Variable importance
##     temp      ibt       vh      ibh humidity      doy      dpg      vis 
##       26       21       15       14        9        7        4        3 
##     wind 
##        1 
## 
## Node number 1: 115 observations,    complexity param=0.5587951
##   mean=12.33043, MSE=59.03864 
##   left son=2 (64 obs) right son=3 (51 obs)
##   Primary splits:
##       temp     < 65.5   to the left,  improve=0.5587951, (0 missing)
##       ibt      < 177.5  to the left,  improve=0.4882956, (0 missing)
##       ibh      < 3567.5 to the right, improve=0.4075666, (0 missing)
##       vh       < 5825   to the left,  improve=0.2553016, (0 missing)
##       humidity < 37.5   to the left,  improve=0.2334339, (0 missing)
##   Surrogate splits:
##       ibt      < 195.5  to the left,  agree=0.835, adj=0.627, (0 split)
##       vh       < 5795   to the left,  agree=0.774, adj=0.490, (0 split)
##       ibh      < 2452   to the right, agree=0.722, adj=0.373, (0 split)
##       doy      < 148.5  to the left,  agree=0.687, adj=0.294, (0 split)
##       humidity < 59.5   to the left,  agree=0.678, adj=0.275, (0 split)
## 
## Node number 2: 64 observations,    complexity param=0.06900352
##   mean=7.203125, MSE=19.31812 
##   left son=4 (37 obs) right son=5 (27 obs)
##   Primary splits:
##       ibt      < 142.5  to the left,  improve=0.3789315, (0 missing)
##       ibh      < 1989.5 to the right, improve=0.3696774, (0 missing)
##       temp     < 61.5   to the left,  improve=0.1902221, (0 missing)
##       dpg      < -18    to the left,  improve=0.1848832, (0 missing)
##       humidity < 37.5   to the left,  improve=0.1822094, (0 missing)
##   Surrogate splits:
##       ibh  < 3567.5 to the right, agree=0.875, adj=0.704, (0 split)
##       temp < 58.5   to the left,  agree=0.766, adj=0.444, (0 split)
##       vh   < 5705   to the left,  agree=0.703, adj=0.296, (0 split)
##       vis  < 55     to the right, agree=0.672, adj=0.222, (0 split)
##       wind < 5.5    to the right, agree=0.656, adj=0.185, (0 split)
## 
## Node number 3: 51 observations,    complexity param=0.1000358
##   mean=18.76471, MSE=34.49366 
##   left son=6 (29 obs) right son=7 (22 obs)
##   Primary splits:
##       ibt      < 227    to the left,  improve=0.3860826, (0 missing)
##       ibh      < 2785   to the right, improve=0.2749432, (0 missing)
##       humidity < 79.5   to the left,  improve=0.1793164, (0 missing)
##       temp     < 79.5   to the left,  improve=0.1773606, (0 missing)
##       vis      < 65     to the right, improve=0.1745971, (0 missing)
##   Surrogate splits:
##       temp < 79.5   to the left,  agree=0.843, adj=0.636, (0 split)
##       vh   < 5845   to the left,  agree=0.824, adj=0.591, (0 split)
##       ibh  < 1124.5 to the right, agree=0.824, adj=0.591, (0 split)
##       dpg  < 20     to the right, agree=0.725, adj=0.364, (0 split)
##       vis  < 65     to the right, agree=0.706, adj=0.318, (0 split)
## 
## Node number 4: 37 observations
##   mean=4.891892, MSE=5.663988 
## 
## Node number 5: 27 observations,    complexity param=0.04607001
##   mean=10.37037, MSE=20.67764 
##   left son=10 (8 obs) right son=11 (19 obs)
##   Primary splits:
##       dpg      < -18.5  to the left,  improve=0.5602576, (0 missing)
##       humidity < 47     to the left,  improve=0.4484480, (0 missing)
##       vis      < 175    to the right, improve=0.2441716, (0 missing)
##       ibh      < 1989.5 to the right, improve=0.2343829, (0 missing)
##       vh       < 5795   to the right, improve=0.2144687, (0 missing)
##   Surrogate splits:
##       humidity < 31.5   to the left,  agree=0.926, adj=0.750, (0 split)
##       vis      < 175    to the right, agree=0.852, adj=0.500, (0 split)
##       vh       < 5805   to the right, agree=0.815, adj=0.375, (0 split)
##       ibh      < 4224   to the right, agree=0.778, adj=0.250, (0 split)
##       doy      < 67.5   to the left,  agree=0.778, adj=0.250, (0 split)
## 
## Node number 6: 29 observations,    complexity param=0.02553228
##   mean=15.58621, MSE=16.79429 
##   left son=12 (8 obs) right son=13 (21 obs)
##   Primary splits:
##       ibh  < 2785   to the right, improve=0.3559295, (0 missing)
##       doy  < 268    to the right, improve=0.2524730, (0 missing)
##       wind < 5.5    to the right, improve=0.2448898, (0 missing)
##       dpg  < 8.5    to the left,  improve=0.1422407, (0 missing)
##       ibt  < 208.5  to the left,  improve=0.1270193, (0 missing)
##   Surrogate splits:
##       humidity < 33     to the left,  agree=0.793, adj=0.250, (0 split)
##       dpg      < -16.5  to the left,  agree=0.793, adj=0.250, (0 split)
##       doy      < 268    to the right, agree=0.793, adj=0.250, (0 split)
##       vis      < 200    to the right, agree=0.759, adj=0.125, (0 split)
## 
## Node number 7: 22 observations,    complexity param=0.01387208
##   mean=22.95455, MSE=26.95248 
##   left son=14 (16 obs) right son=15 (6 obs)
##   Primary splits:
##       humidity < 79     to the left,  improve=0.15883800, (0 missing)
##       vis      < 70     to the right, improve=0.14352630, (0 missing)
##       vh       < 5845   to the right, improve=0.08835389, (0 missing)
##       ibt      < 284.5  to the right, improve=0.08279846, (0 missing)
##       temp     < 82     to the right, improve=0.08099436, (0 missing)
##   Surrogate splits:
##       ibh < 860.5  to the left,  agree=0.773, adj=0.167, (0 split)
## 
## Node number 10: 8 observations
##   mean=5.125, MSE=2.109375 
## 
## Node number 11: 19 observations
##   mean=12.57895, MSE=12.03324 
## 
## Node number 12: 8 observations
##   mean=11.625, MSE=14.23438 
## 
## Node number 13: 21 observations
##   mean=17.09524, MSE=9.514739 
## 
## Node number 14: 16 observations,    complexity param=0.0112767
##   mean=21.6875, MSE=29.33984 
##   left son=28 (8 obs) right son=29 (8 obs)
##   Primary splits:
##       vis      < 70     to the right, improve=0.16309410, (0 missing)
##       ibt      < 261.5  to the right, improve=0.11333600, (0 missing)
##       humidity < 72.5   to the right, improve=0.08351307, (0 missing)
##       dpg      < -2     to the left,  improve=0.08106656, (0 missing)
##       vh       < 5875   to the right, improve=0.08035833, (0 missing)
##   Surrogate splits:
##       humidity < 71.5   to the right, agree=0.688, adj=0.375, (0 split)
##       temp     < 75     to the right, agree=0.688, adj=0.375, (0 split)
##       dpg      < -18    to the right, agree=0.688, adj=0.375, (0 split)
##       doy      < 267    to the left,  agree=0.688, adj=0.375, (0 split)
##       vh       < 5845   to the right, agree=0.625, adj=0.250, (0 split)
## 
## Node number 15: 6 observations
##   mean=26.33333, MSE=4.888889 
## 
## Node number 28: 8 observations
##   mean=19.5, MSE=29.75 
## 
## Node number 29: 8 observations
##   mean=23.875, MSE=19.35938

PCR

PCR did a better job here. I think a random forest would do the best though.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
fit.pcr <- pcr(O3 ~ .-X, data = trainsubset, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")

predicted_pcr <- predict(fit.pcr, testsubset , ncomp = 4)
mean((predicted_pcr - testsubset$O3)^2)
## [1] 21.33016

GBM

Trying to find the best shrinkage parameter. It seems like its 0.02 based on the purple line.

library(gbm)
## Warning: package 'gbm' was built under R version 3.5.2
## Loaded gbm 2.1.5
nt <- 250
gbm1<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.1, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm4<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.2, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm5<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0., n.trees=nt)
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,100),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")

boostkajalset <- gbm(O3 ~ . -X, data=trainsubset, distribution="gaussian", n.trees=250, interaction.depth=4, shrinkage=0.02 )

y_hat <- predict(boostkajalset, newdata=testsubset, n.trees=250 )
training_set_mse<- mean(( y_hat - testsubset$O3)^2 )
training_set_mse
## [1] 19.82269

Random Forest

Lowest MSE here using Random Forest. I am not too surprised. I think my MSE would be even lower if I used a larger percentage of my data in the training subset.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
O3_RF_all <-randomForest(O3 ~., data=trainsubset, importance =TRUE, ntree = 250)
O3_RF <-randomForest(O3 ~ ibt + temp + ibh + humidity + doy ,data=trainsubset,importance=TRUE, ntree=250)

plot(O3_RF_all)

plot(O3_RF)

varImpPlot(O3_RF_all)

varImpPlot(O3_RF)

#Using All Variables
predicted_O3_all <- predict(O3_RF_all, newdata=testsubset, OOB=T)
mean((predicted_O3_all-testsubset$O3)^2)
## [1] 18.1743
#Using Some Variables 
predicted_O3<-predict(O3_RF, newdata=testsubset, OOB=T)
mean((predicted_O3-testsubset$O3)^2)
## [1] 17.08361

Applying Model to Test Data

test$O3 <- predict(O3_RF, newdata=test)
head(test)
##     X        O3   vh wind humidity temp  ibh dpg ibt vis doy
## 1 118  5.203090 5680    6       77   41 5000  75  49 120 156
## 2 283  5.381314 5710    4       67   49 5000  24  55 200 341
## 3  23 10.143133 5680    3       64   53  111 -10 153  50  55
## 4 108 18.270067 5780    7       78   68 1479  40 200 100 144
## 5 310  8.104933 5760    0       32   62  826 -16 182 300 368
## 6 148  9.569644 5650    6       66   61 2670  54 130 120 188

CSV Creation

kajalfile = cbind(test$X, test$O3)
write.csv(kajalfile, file="KajalChokshiHW5", fileEncoding = "macroman")