train <- read.csv("https://www.dropbox.com/s/zvsxuvywvqvw3zc/smogData.csv?dl=1")
test <- read.csv("https://www.dropbox.com/s/a21lbe38ehq5ugn/smogDataPredict.csv?dl=1")
#Subsetting the Data
set.seed(1234)
rows_for_testing <- sort(sample(1:nrow(train), 140))
testsubset <- train[rows_for_testing,]
trainsubset <- train[-rows_for_testing,]
From the tree, we see that Temp and IBT are two variables driving the differences within the data.
library(tree)
library(rpart)
library(rpart.plot)
tree <- rpart(O3 ~ ., data=trainsubset)
rpart.plot(tree)
Unclear from BIC which variables to keep. They all are directionally similar.
library(leaps)
fwd_O3<-regsubsets(O3 ~ . -X , data = trainsubset, method="forward")
fwd_O3
## Subset selection object
## Call: regsubsets.formula(O3 ~ . - X, data = trainsubset, method = "forward")
## 9 Variables (and intercept)
## Forced in Forced out
## vh FALSE FALSE
## wind FALSE FALSE
## humidity FALSE FALSE
## temp FALSE FALSE
## ibh FALSE FALSE
## dpg FALSE FALSE
## ibt FALSE FALSE
## vis FALSE FALSE
## doy FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
reg.summary <- summary(fwd_O3)
reg.summary$bic
## [1] -90.38523 -115.88853 -127.09696 -123.84667 -120.49085 -116.78034
## [7] -112.57299 -107.95117
The MSE here is not great. I think we can do better with PCR or a Random Forest.
The tree with the lowest cp value is better than a normal tree without the cp specified.
set.seed(1234)
train$fold <- sample(1:5,255,replace=TRUE)
cp <- seq(0.01,0.02,length=20)
cpList<-list()
for (j in 1:20){print(j)
sse <- list()
for (i in 1:5){
tree_K <- rpart(O3 ~ .-X, data=train[!(train$fold == i),], control = rpart.control(cp = cp[j] ,minsplit=10,minbucket=5))
#Predicted values
yhat<- predict(tree_K,train[(train$fold == i),])
#Actual Values
y <- train$O3[(train$fold == i)]
sse[[i]]<-sum((y-yhat)^2)
}
cpList[[j]]<-sum(unlist(sse))/nrow(train)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
plot(cp,unlist(cpList),pch=16)
#True Values for O3
O3_true <- testsubset$O3
#Tree1
tree1 <- rpart(O3 ~. -X, data=trainsubset, control = rpart.control(cp = 0.01,minsplit=10,minbucket=5, mindev=0.01))
#MSE
O3hat1 <- predict(tree1,testsubset)
mean((O3_true-O3hat1)^2)
## [1] 24.67823
#Tree2
tree2<-tree(O3~.-X,data=trainsubset, control = tree.control(nrow(trainsubset), mincut = 10, minsize = 30, mindev = 0.01))
#MSE2
O3hat2 <- predict(tree2,testsubset)
mean((O3_true-O3hat2)^2)
## [1] 24.94491
summary(tree1)
## Call:
## rpart(formula = O3 ~ . - X, data = trainsubset, control = rpart.control(cp = 0.01,
## minsplit = 10, minbucket = 5, mindev = 0.01))
## n= 115
##
## CP nsplit rel error xerror xstd
## 1 0.55879508 0 1.0000000 1.0089652 0.10308471
## 2 0.10003580 1 0.4412049 0.4963357 0.06147158
## 3 0.06900352 2 0.3411691 0.4285137 0.05486474
## 4 0.04607001 3 0.2721656 0.3950922 0.05090696
## 5 0.02553228 4 0.2260956 0.3505338 0.05067233
## 6 0.01387208 5 0.2005633 0.3503543 0.05044127
## 7 0.01127670 6 0.1866912 0.3923886 0.05857371
## 8 0.01000000 7 0.1754145 0.4021067 0.06058050
##
## Variable importance
## temp ibt vh ibh humidity doy dpg vis
## 26 21 15 14 9 7 4 3
## wind
## 1
##
## Node number 1: 115 observations, complexity param=0.5587951
## mean=12.33043, MSE=59.03864
## left son=2 (64 obs) right son=3 (51 obs)
## Primary splits:
## temp < 65.5 to the left, improve=0.5587951, (0 missing)
## ibt < 177.5 to the left, improve=0.4882956, (0 missing)
## ibh < 3567.5 to the right, improve=0.4075666, (0 missing)
## vh < 5825 to the left, improve=0.2553016, (0 missing)
## humidity < 37.5 to the left, improve=0.2334339, (0 missing)
## Surrogate splits:
## ibt < 195.5 to the left, agree=0.835, adj=0.627, (0 split)
## vh < 5795 to the left, agree=0.774, adj=0.490, (0 split)
## ibh < 2452 to the right, agree=0.722, adj=0.373, (0 split)
## doy < 148.5 to the left, agree=0.687, adj=0.294, (0 split)
## humidity < 59.5 to the left, agree=0.678, adj=0.275, (0 split)
##
## Node number 2: 64 observations, complexity param=0.06900352
## mean=7.203125, MSE=19.31812
## left son=4 (37 obs) right son=5 (27 obs)
## Primary splits:
## ibt < 142.5 to the left, improve=0.3789315, (0 missing)
## ibh < 1989.5 to the right, improve=0.3696774, (0 missing)
## temp < 61.5 to the left, improve=0.1902221, (0 missing)
## dpg < -18 to the left, improve=0.1848832, (0 missing)
## humidity < 37.5 to the left, improve=0.1822094, (0 missing)
## Surrogate splits:
## ibh < 3567.5 to the right, agree=0.875, adj=0.704, (0 split)
## temp < 58.5 to the left, agree=0.766, adj=0.444, (0 split)
## vh < 5705 to the left, agree=0.703, adj=0.296, (0 split)
## vis < 55 to the right, agree=0.672, adj=0.222, (0 split)
## wind < 5.5 to the right, agree=0.656, adj=0.185, (0 split)
##
## Node number 3: 51 observations, complexity param=0.1000358
## mean=18.76471, MSE=34.49366
## left son=6 (29 obs) right son=7 (22 obs)
## Primary splits:
## ibt < 227 to the left, improve=0.3860826, (0 missing)
## ibh < 2785 to the right, improve=0.2749432, (0 missing)
## humidity < 79.5 to the left, improve=0.1793164, (0 missing)
## temp < 79.5 to the left, improve=0.1773606, (0 missing)
## vis < 65 to the right, improve=0.1745971, (0 missing)
## Surrogate splits:
## temp < 79.5 to the left, agree=0.843, adj=0.636, (0 split)
## vh < 5845 to the left, agree=0.824, adj=0.591, (0 split)
## ibh < 1124.5 to the right, agree=0.824, adj=0.591, (0 split)
## dpg < 20 to the right, agree=0.725, adj=0.364, (0 split)
## vis < 65 to the right, agree=0.706, adj=0.318, (0 split)
##
## Node number 4: 37 observations
## mean=4.891892, MSE=5.663988
##
## Node number 5: 27 observations, complexity param=0.04607001
## mean=10.37037, MSE=20.67764
## left son=10 (8 obs) right son=11 (19 obs)
## Primary splits:
## dpg < -18.5 to the left, improve=0.5602576, (0 missing)
## humidity < 47 to the left, improve=0.4484480, (0 missing)
## vis < 175 to the right, improve=0.2441716, (0 missing)
## ibh < 1989.5 to the right, improve=0.2343829, (0 missing)
## vh < 5795 to the right, improve=0.2144687, (0 missing)
## Surrogate splits:
## humidity < 31.5 to the left, agree=0.926, adj=0.750, (0 split)
## vis < 175 to the right, agree=0.852, adj=0.500, (0 split)
## vh < 5805 to the right, agree=0.815, adj=0.375, (0 split)
## ibh < 4224 to the right, agree=0.778, adj=0.250, (0 split)
## doy < 67.5 to the left, agree=0.778, adj=0.250, (0 split)
##
## Node number 6: 29 observations, complexity param=0.02553228
## mean=15.58621, MSE=16.79429
## left son=12 (8 obs) right son=13 (21 obs)
## Primary splits:
## ibh < 2785 to the right, improve=0.3559295, (0 missing)
## doy < 268 to the right, improve=0.2524730, (0 missing)
## wind < 5.5 to the right, improve=0.2448898, (0 missing)
## dpg < 8.5 to the left, improve=0.1422407, (0 missing)
## ibt < 208.5 to the left, improve=0.1270193, (0 missing)
## Surrogate splits:
## humidity < 33 to the left, agree=0.793, adj=0.250, (0 split)
## dpg < -16.5 to the left, agree=0.793, adj=0.250, (0 split)
## doy < 268 to the right, agree=0.793, adj=0.250, (0 split)
## vis < 200 to the right, agree=0.759, adj=0.125, (0 split)
##
## Node number 7: 22 observations, complexity param=0.01387208
## mean=22.95455, MSE=26.95248
## left son=14 (16 obs) right son=15 (6 obs)
## Primary splits:
## humidity < 79 to the left, improve=0.15883800, (0 missing)
## vis < 70 to the right, improve=0.14352630, (0 missing)
## vh < 5845 to the right, improve=0.08835389, (0 missing)
## ibt < 284.5 to the right, improve=0.08279846, (0 missing)
## temp < 82 to the right, improve=0.08099436, (0 missing)
## Surrogate splits:
## ibh < 860.5 to the left, agree=0.773, adj=0.167, (0 split)
##
## Node number 10: 8 observations
## mean=5.125, MSE=2.109375
##
## Node number 11: 19 observations
## mean=12.57895, MSE=12.03324
##
## Node number 12: 8 observations
## mean=11.625, MSE=14.23438
##
## Node number 13: 21 observations
## mean=17.09524, MSE=9.514739
##
## Node number 14: 16 observations, complexity param=0.0112767
## mean=21.6875, MSE=29.33984
## left son=28 (8 obs) right son=29 (8 obs)
## Primary splits:
## vis < 70 to the right, improve=0.16309410, (0 missing)
## ibt < 261.5 to the right, improve=0.11333600, (0 missing)
## humidity < 72.5 to the right, improve=0.08351307, (0 missing)
## dpg < -2 to the left, improve=0.08106656, (0 missing)
## vh < 5875 to the right, improve=0.08035833, (0 missing)
## Surrogate splits:
## humidity < 71.5 to the right, agree=0.688, adj=0.375, (0 split)
## temp < 75 to the right, agree=0.688, adj=0.375, (0 split)
## dpg < -18 to the right, agree=0.688, adj=0.375, (0 split)
## doy < 267 to the left, agree=0.688, adj=0.375, (0 split)
## vh < 5845 to the right, agree=0.625, adj=0.250, (0 split)
##
## Node number 15: 6 observations
## mean=26.33333, MSE=4.888889
##
## Node number 28: 8 observations
## mean=19.5, MSE=29.75
##
## Node number 29: 8 observations
## mean=23.875, MSE=19.35938
PCR did a better job here. I think a random forest would do the best though.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
fit.pcr <- pcr(O3 ~ .-X, data = trainsubset, scale = TRUE, validation = "CV")
validationplot(fit.pcr, val.type = "MSEP")
predicted_pcr <- predict(fit.pcr, testsubset , ncomp = 4)
mean((predicted_pcr - testsubset$O3)^2)
## [1] 21.33016
Trying to find the best shrinkage parameter. It seems like its 0.02 based on the purple line.
library(gbm)
## Warning: package 'gbm' was built under R version 3.5.2
## Loaded gbm 2.1.5
nt <- 250
gbm1<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.1, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm4<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0.2, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm5<-gbm(O3~., data=trainsubset, cv.folds = 3, shrinkage=0., n.trees=nt)
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,100),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")
boostkajalset <- gbm(O3 ~ . -X, data=trainsubset, distribution="gaussian", n.trees=250, interaction.depth=4, shrinkage=0.02 )
y_hat <- predict(boostkajalset, newdata=testsubset, n.trees=250 )
training_set_mse<- mean(( y_hat - testsubset$O3)^2 )
training_set_mse
## [1] 19.82269
Lowest MSE here using Random Forest. I am not too surprised. I think my MSE would be even lower if I used a larger percentage of my data in the training subset.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
O3_RF_all <-randomForest(O3 ~., data=trainsubset, importance =TRUE, ntree = 250)
O3_RF <-randomForest(O3 ~ ibt + temp + ibh + humidity + doy ,data=trainsubset,importance=TRUE, ntree=250)
plot(O3_RF_all)
plot(O3_RF)
varImpPlot(O3_RF_all)
varImpPlot(O3_RF)
#Using All Variables
predicted_O3_all <- predict(O3_RF_all, newdata=testsubset, OOB=T)
mean((predicted_O3_all-testsubset$O3)^2)
## [1] 18.1743
#Using Some Variables
predicted_O3<-predict(O3_RF, newdata=testsubset, OOB=T)
mean((predicted_O3-testsubset$O3)^2)
## [1] 17.08361
test$O3 <- predict(O3_RF, newdata=test)
head(test)
## X O3 vh wind humidity temp ibh dpg ibt vis doy
## 1 118 5.203090 5680 6 77 41 5000 75 49 120 156
## 2 283 5.381314 5710 4 67 49 5000 24 55 200 341
## 3 23 10.143133 5680 3 64 53 111 -10 153 50 55
## 4 108 18.270067 5780 7 78 68 1479 40 200 100 144
## 5 310 8.104933 5760 0 32 62 826 -16 182 300 368
## 6 148 9.569644 5650 6 66 61 2670 54 130 120 188
kajalfile = cbind(test$X, test$O3)
write.csv(kajalfile, file="KajalChokshiHW5", fileEncoding = "macroman")