library(MASS)
library(pls)
library(factoextra)
# Loading the data
train = as.data.frame(read.table("XGtrainRain.txt",sep=",",header=T))
X.train = as.matrix(train[,-366])
Z.train = as.matrix(train[,366])
test = as.data.frame(read.table("XGtestRain.txt",sep=",",header=T))
X.test = as.matrix(test[,-366])
Z.test = as.matrix(test[,366])
#X.train$G<-as.factor(X.train$G)
ntrain = 150
ntest = 41
p = 365
#predictors = head(colnames(X.train),-1)
dim(X.train)
## [1] 150 365
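A quick look at the class balance of the response may also be useful before fitting classifiers (a small sketch; output not shown):
table(Z.train)   # counts of the two classes in the training set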
#qda.fit<-qda(G ~ .,data=train)
#qda.fit
The QDA fit above is commented out because it throws an error: with p = 365 predictors but only 150 observations, the class covariance matrices cannot be estimated (they are singular).
logreg<-glm(G~.,data=train, family=binomial(link="logit"),maxit = 100)
#summary(logreg)
The (commented-out) summary shows NA coefficients for predictors beyond roughly the first 150, since only 150 observations are available and the model is not identifiable beyond that. Should we prefer a logistic classifier?
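As a quick check of the identifiability claim, the NA coefficients in the fitted model can be counted directly (a minimal sketch using the logreg object above):
sum(is.na(coef(logreg)))   # number of coefficients that could not be estimated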
X.train_cent<-scale(X.train, scale = F)
set.seed(1)
PLX <- plsr(G~.,data=train,validation="LOO")
summary(PLX)
## Data: X dimension: 150 365
## Y dimension: 150 1
## Fit method: kernelpls
## Number of components considered: 148
##
## VALIDATION: RMSEP
## Cross-validated using 150 leave-one-out segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.4903 0.3968 0.3737 0.3163 0.2932 0.2843 0.2761
## adjCV 0.4903 0.3968 0.3737 0.3162 0.2931 0.2840 0.2758
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 0.2738 0.2872 0.2990 0.3063 0.3121 0.3223 0.3354
## adjCV 0.2734 0.2867 0.2984 0.3056 0.3114 0.3215 0.3346
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps 20 comps
## CV 0.3507 0.3524 0.3623 0.3695 0.3751 0.3831 0.3881
## adjCV 0.3496 0.3514 0.3612 0.3684 0.3740 0.3818 0.3868
## 21 comps 22 comps 23 comps 24 comps 25 comps 26 comps 27 comps
## CV 0.3922 0.3959 0.3987 0.4003 0.4024 0.4047 0.4068
## adjCV 0.3909 0.3946 0.3974 0.3989 0.4011 0.4034 0.4054
## 28 comps 29 comps 30 comps 31 comps 32 comps 33 comps 34 comps
## CV 0.4077 0.4080 0.4086 0.4087 0.4086 0.4083 0.4083
## adjCV 0.4063 0.4066 0.4073 0.4073 0.4072 0.4069 0.4069
## 35 comps 36 comps 37 comps 38 comps 39 comps 40 comps 41 comps
## CV 0.4081 0.4078 0.4076 0.4073 0.4073 0.4072 0.4070
## adjCV 0.4068 0.4065 0.4062 0.4060 0.4059 0.4058 0.4057
## 42 comps 43 comps 44 comps 45 comps 46 comps 47 comps 48 comps
## CV 0.4071 0.4071 0.4070 0.4070 0.4069 0.4069 0.4069
## adjCV 0.4057 0.4057 0.4057 0.4056 0.4056 0.4055 0.4055
## 49 comps 50 comps 51 comps 52 comps 53 comps 54 comps 55 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4055 0.4055 0.4056 0.4056 0.4056 0.4056 0.4056
## 56 comps 57 comps 58 comps 59 comps 60 comps 61 comps 62 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 63 comps 64 comps 65 comps 66 comps 67 comps 68 comps 69 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 70 comps 71 comps 72 comps 73 comps 74 comps 75 comps 76 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 77 comps 78 comps 79 comps 80 comps 81 comps 82 comps 83 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 84 comps 85 comps 86 comps 87 comps 88 comps 89 comps 90 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 91 comps 92 comps 93 comps 94 comps 95 comps 96 comps 97 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 98 comps 99 comps 100 comps 101 comps 102 comps 103 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 104 comps 105 comps 106 comps 107 comps 108 comps 109 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 110 comps 111 comps 112 comps 113 comps 114 comps 115 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 116 comps 117 comps 118 comps 119 comps 120 comps 121 comps
## CV 0.4069 0.4069 0.4069 0.4069 0.4069 0.4069
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4056 0.4056
## 122 comps 123 comps 124 comps 125 comps 126 comps 127 comps
## CV 0.4069 0.4069 0.4070 0.4070 0.4072 0.4094
## adjCV 0.4056 0.4056 0.4056 0.4056 0.4058 0.4080
## 128 comps 129 comps 130 comps 131 comps 132 comps 133 comps
## CV 0.4370 0.8140 4.940 35.12 199.5 1015
## adjCV 0.4355 0.8113 4.924 35.00 198.8 1012
## 134 comps 135 comps 136 comps 137 comps 138 comps 139 comps
## CV 7575 58446 316228 1994640 9949199 23840288
## adjCV 7550 58251 315172 1987980 9915980 23760688
## 140 comps 141 comps 142 comps 143 comps 144 comps 145 comps
## CV 26596559 29291519 28981085 28986132 28986050 28986061
## adjCV 26507756 29193718 28884320 28889350 28889268 28889279
## 146 comps 147 comps 148 comps
## CV 28986305 28986178 28986116
## adjCV 28889522 28889396 28889334
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 54.94 78.91 83.56 85.37 86.52 88.20 88.88 89.70
## G 37.59 46.46 64.43 72.51 80.63 84.14 88.20 90.41
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.22 90.70 91.27 91.65 92.17 92.35 92.65
## G 92.27 93.77 94.76 95.88 96.52 97.67 98.15
## 16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps
## X 92.91 93.17 93.35 93.49 93.63 93.80 94.04
## G 98.59 98.88 99.18 99.44 99.60 99.73 99.80
## 23 comps 24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
## X 94.23 94.39 94.55 94.74 94.90 95.17 95.28
## G 99.86 99.90 99.93 99.95 99.97 99.97 99.98
## 30 comps 31 comps 32 comps 33 comps 34 comps 35 comps 36 comps
## X 95.42 95.53 95.65 95.75 95.9 96.03 96.13
## G 99.99 99.99 99.99 100.00 100.0 100.00 100.00
## 37 comps 38 comps 39 comps 40 comps 41 comps 42 comps 43 comps
## X 96.23 96.33 96.45 96.53 96.63 96.74 96.84
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 44 comps 45 comps 46 comps 47 comps 48 comps 49 comps 50 comps
## X 96.91 96.99 97.08 97.16 97.24 97.33 97.38
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 51 comps 52 comps 53 comps 54 comps 55 comps 56 comps 57 comps
## X 97.46 97.53 97.59 97.63 97.69 97.75 97.81
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 58 comps 59 comps 60 comps 61 comps 62 comps 63 comps 64 comps
## X 97.87 97.92 97.97 98.03 98.12 98.18 98.23
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 65 comps 66 comps 67 comps 68 comps 69 comps 70 comps 71 comps
## X 98.28 98.33 98.38 98.42 98.46 98.49 98.53
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 72 comps 73 comps 74 comps 75 comps 76 comps 77 comps 78 comps
## X 98.57 98.6 98.65 98.67 98.71 98.73 98.77
## G 100.00 100.0 100.00 100.00 100.00 100.00 100.00
## 79 comps 80 comps 81 comps 82 comps 83 comps 84 comps 85 comps
## X 98.8 98.84 98.87 98.9 98.93 98.95 98.98
## G 100.0 100.00 100.00 100.0 100.00 100.00 100.00
## 86 comps 87 comps 88 comps 89 comps 90 comps 91 comps 92 comps
## X 99.01 99.04 99.07 99.1 99.12 99.15 99.17
## G 100.00 100.00 100.00 100.0 100.00 100.00 100.00
## 93 comps 94 comps 95 comps 96 comps 97 comps 98 comps 99 comps
## X 99.2 99.22 99.24 99.26 99.29 99.31 99.33
## G 100.0 100.00 100.00 100.00 100.00 100.00 100.00
## 100 comps 101 comps 102 comps 103 comps 104 comps 105 comps 106 comps
## X 99.34 99.37 99.39 99.41 99.43 99.44 99.46
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 107 comps 108 comps 109 comps 110 comps 111 comps 112 comps 113 comps
## X 99.48 99.5 99.51 99.53 99.55 99.57 99.58
## G 100.00 100.0 100.00 100.00 100.00 100.00 100.00
## 114 comps 115 comps 116 comps 117 comps 118 comps 119 comps 120 comps
## X 99.6 99.61 99.63 99.65 99.66 99.68 99.69
## G 100.0 100.00 100.00 100.00 100.00 100.00 100.00
## 121 comps 122 comps 123 comps 124 comps 125 comps 126 comps 127 comps
## X 99.71 99.72 99.73 99.75 99.76 99.78 99.79
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
## 128 comps 129 comps 130 comps 131 comps 132 comps 133 comps 134 comps
## X 99.8 99.81 99.82 99.83 99.84 99.85 99.86
## G 100.0 100.00 100.00 100.00 100.00 100.00 100.00
## 135 comps 136 comps 137 comps 138 comps 139 comps 140 comps 141 comps
## X 99.87 99.88 99.89 99.9 99.91 99.92 99.93
## G 100.00 100.00 100.00 100.0 100.00 100.00 100.00
## 142 comps 143 comps 144 comps 145 comps 146 comps 147 comps 148 comps
## X 99.94 99.95 99.96 99.97 99.98 99.99 99.99
## G 100.00 100.00 100.00 100.00 100.00 100.00 100.00
plot(RMSEP(PLX), legendpos = "topright")
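Based on the cross-validated RMSEP above, the error is minimised at a fairly small number of components; one way to pick that number programmatically is sketched below (assuming the PLX fit above). The pls package's selectNcomp() offers a more formal alternative.
cv.rmsep <- RMSEP(PLX, estimate = "CV")
ncomp.best <- which.min(cv.rmsep$val["CV", 1, -1])   # drop the intercept-only model
ncomp.best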
# Scores returned by the plsr() fit
PLStrain = PLX$scores
## Use the projection matrix Phi to compute the components manually
Phi = PLX$projection
T.scores = as.matrix(X.train_cent) %*% Phi
# The manually computed scores should match PLX$scores up to floating-point error
if (max(abs(PLStrain - T.scores)) > 1e-8) {
  print("Error: manual scores differ from PLX$scores")
}
dim(T.scores)
## [1] 150 148
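The same centring and projection can be applied to the test data so that test observations live in the same component space (a minimal sketch; X.test and Phi as defined above):
X.test_cent <- scale(X.test, center = colMeans(X.train), scale = FALSE)
T.test <- X.test_cent %*% Phi   # test observations expressed in the PLS components
dim(T.test)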
## PCA ##
covX = cov(X.train)
gamma = eigen(covX)$vectors
# Manual PCA: project the column-centred training data onto the eigenvectors of covX
PCX.manual = (X.train - matrix(rep(1, ntrain), nrow = ntrain) %*% colMeans(X.train)) %*% gamma
# the proportion of variance (y-axis) for each PC (x-axis)
PCX = prcomp(X.train)
fviz_eig(PCX, xlab="Index (PC)",ylab="Proportion of Variance (%)",
linecolor = "black",barfill = "lightgrey",barcolor = "lightgrey",
main = "Scree plot: Proportion of Variance of each PC")+
theme(axis.ticks = element_blank())
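The proportions behind the scree plot can also be computed directly from the prcomp object (a small sketch):
prop.var <- PCX$sdev^2 / sum(PCX$sdev^2)
round(cumsum(prop.var)[1:10], 3)   # cumulative proportion of variance for the first 10 PCs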
## Check the first PLS direction manually (commented out):
# X.train_cent is the column-centred X from above
#Ztrain_center = train$G - mean(train$G) # center Z
#proj_vec_1_unnorm <- t(X.train_cent)%*%Ztrain_center
#proj_vec_1_unnorm_length = sqrt(sum(proj_vec_1_unnorm^2))
# after renormalization this should match the first column of PLX$projection (Phi)
#round(Phi[, 1] - as.numeric(proj_vec_1_unnorm/proj_vec_1_unnorm_length), 4)
PCX.train <- PCX$x[, 1:6]
dim(PCX.train)
## [1] 150 6
PCX.train <- cbind(PCX.train, as.numeric(train$G))
colnames(PCX.train)[7] <- "G"
PCX.train <- as.data.frame(PCX.train)
PCX.qda <- qda(G ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data = PCX.train)
# Project the test data onto the same principal components before predicting
PCX.test <- as.data.frame(predict(PCX, newdata = X.test)[, 1:6])
predictions.qda <- predict(PCX.qda, PCX.test)$class
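With the test data projected onto the training PCs, the QDA performance on the test set can be summarised (a minimal sketch; predictions.qda and Z.test as above):
table(predicted = predictions.qda, actual = Z.test)            # confusion matrix
mean(as.character(predictions.qda) != as.character(Z.test))    # test misclassification rate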
library(randomForest)
train$G <- as.factor(train$G)
m.rf <- randomForest(formula=G ~. , data=train, ntree=2000,importance=TRUE)
head(importance(m.rf))
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## X1 4.775216 4.782717 6.124046 0.6823443
## X2 2.544173 3.763945 4.382670 0.1836947
## X3 4.654183 7.103073 7.597488 0.5494819
## X4 0.896448 3.740100 4.023360 0.1446955
## X5 2.023874 4.016781 4.043635 0.1266095
## X6 3.898899 4.687041 5.787929 0.4115724
#week 10
# Higher importance values indicate more important predictors.
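To see which predictors the forest ranks highest, the importance matrix can be sorted by mean decrease in accuracy (a small sketch using m.rf from above):
imp <- importance(m.rf)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ], 10)   # ten most important predictors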
## OOB predictions (no newdata) and test-set predictions
OOBpred = predict(m.rf)
ClassPred = predict(m.rf, newdata = test)
sum(ClassPred != Z.test)   # number of misclassified test observations
## [1] 3
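For comparison, the out-of-bag error rate of the fitted forest can be set against the test error rate (a minimal sketch; OOBpred and ClassPred as above):
mean(OOBpred != train$G)            # OOB misclassification rate
m.rf$err.rate[m.rf$ntree, "OOB"]    # OOB error stored by randomForest (should agree)
mean(ClassPred != Z.test)           # test-set misclassification rate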
get_error <- function(B){
  # Refit the forest with B trees and return the number of misclassified test observations
  m.rf <- randomForest(formula = G ~ ., data = train, ntree = B, importance = TRUE)
  ClassPred <- predict(m.rf, newdata = test)
  error <- sum(ClassPred != Z.test)
  return(c(B, error))
}
results = c()
for (i in seq(50, 1000, by = 50)){
  output = get_error(i)
  results = rbind(results, output)
}
test_error = as.data.frame(results)
colnames(test_error) <- c("B", "test_errors")
plot(test_error)
Choosing a large \(B\) will not cause the random forest to overfit, but choosing \(B\) too small can limit the model. The right compromise is therefore to choose \(B\) large enough that the error has converged (asymptotic, law-of-large-numbers behaviour is visible), while keeping it small enough to be computationally feasible.
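This convergence can also be inspected without refitting for each \(B\): the fitted forest stores the cumulative OOB error after each tree, so plotting it shows where the curve flattens (a small sketch using the m.rf fit with 2000 trees from above).
plot(m.rf$err.rate[, "OOB"], type = "l",
     xlab = "Number of trees B", ylab = "OOB error rate",
     main = "OOB error as trees are added")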
varImpPlot(m.rf)
## classification error