May 20, 2016

Data

library(glmnet);library(reshape2);library(lm.beta);library(e1071);library(ggplot2)
#dta <- readRDS("ML_NIRS.Rdata")

Preprocessing

# PCA & Correlation
dta <- readRDS("MLNIRS.Rdata")
#saveRDS(dta,"MLNIRS.Rdata")
dta_word <-dta[,c(2,18:223)]
dim(dta_word)
[1]  20 207

Regularization

x <- model.matrix(Word~.,data = dta_word)[, -1]
y <- dta_word$Word
model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = 10,type.measure = "deviance")
yhat <- predict(model, s = model$lambda.min , newx =x,type="response")
beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients")
Est <- data.frame(Est = beta[,1][beta[,1]!=0])

Regularization

row.names(Est)[-1]
[1] "Ch1_PC19" "Ch6_PC19"
Est
                  Est
(Intercept) 98.300000
Ch1_PC19     7.728836
Ch6_PC19    -4.700825
1-(mean((y - yhat)^2)/var(y))
[1] 0.4096143

Regularization

Estarray <- array(NA,c(18,100,60))
Rsmat <- matrix(NA,18,60)
dta_word <-dta[,c(2,18:223)]
for (i in 1 :60){
        set.seed(i)
        for (j in 3: 20){
               x <- model.matrix(Word~.,data = dta_word)[, -1]
               y <- dta_word$Word
               model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = j,type.measure = "deviance")
               yhat <- predict(model, s = model$lambda.min , newx =x,type="response")
               beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients")
               Est <- data.frame(Est = beta[,1][beta[,1]!=0])
               for (k in 1:length(row.names(Est))){
                       Estarray[(j-2),k,i] <- row.names(Est)[k]
               }
               Rsmat[j-2,i] <- (1-(mean((y - yhat)^2)/var(y)))
        }
}

Regularization

table(Estarray)
Estarray
(Intercept)    Ch1_PC19    Ch6_PC19 
       1080        1074        1010 
quantile(Rsmat,prob=c(0.01,0.99))
       1%       99% 
0.1672032 0.4552263 
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
        geom_density()+
        geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
                   quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
        theme_bw()+
        labs(list(title = "Lasso : Word ~ . ", x = "R-Squared"))

Regularization

# Ch1_PC19+Ch6_PC19
Rsmat <- matrix(NA,18,100)
dta_word <-dta[,c(2,18:223)]
for (i in 1 :100){
        set.seed(i)
        for (j in 3: 20){
               x <- model.matrix(Word~Ch1_PC19+Ch6_PC19,data = dta_word)[,-1]
               y <- dta_word$Word
               model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = j,type.measure = "deviance")
               yhat <- predict(model, s = model$lambda.min , newx =x,type="response")
               #beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients")
               #Est <- data.frame(Est = beta[,1][beta[,1]!=0])
               #for (k in 1:length(row.names(Est))){
               #   Estarray[(j-2),k,i] <- row.names(Est)[k]
               #}
               Rsmat[j-2,i] <- (1-(mean((y - yhat)^2)/var(y)))
        }
}

Regularization

quantile(Rsmat,prob=c(0.01,0.99))
       1%       99% 
0.5829798 0.5968034 
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
        geom_density()+
        geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
                   quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
        theme_bw()+
        labs(list(title = "Lasso : Word ~ Ch1_PC19 + Ch6_PC19", x = "R-Squared"))

library(lm.beta)
summary(lm.beta(fit <- lm(Word~Ch1_PC19+Ch6_PC19,data = dta_word)))
Call:
lm(formula = Word ~ Ch1_PC19 + Ch6_PC19, data = dta_word)

Residuals:
    Min      1Q  Median      3Q     Max 
-40.762  -8.220   1.813   8.676  29.816 

Coefficients:
            Estimate Standardized Std. Error t value Pr(>|t|)    
(Intercept)  98.3000       0.0000     4.0567  24.232 1.27e-14 ***
Ch1_PC19     14.7997       0.4896     7.9151   1.870   0.0788 .  
Ch6_PC19    -19.5837      -0.3084    16.6289  -1.178   0.2551    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.14 on 17 degrees of freedom
Multiple R-squared:  0.5756,    Adjusted R-squared:  0.5257 
F-statistic: 11.53 on 2 and 17 DF,  p-value: 0.0006854

SVM

How to solve overfitting problem

library(e1071)
dta_word <-dta[,c(2,18:223)]
svm_tune <- tune(svm, Word~., data =dta_word, kernel="radial",
                 ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
                 tunecontrol = tune.control(sampling = "cross", cross = 10))
rst_svm <- svm(Word ~., data = dta_word, type = "eps-regression",
               cost = svm_tune$best.model$cost, kernel = "radial",
               gamma = svm_tune$best.model$gamma)
yhat_svm <- predict(rst_svm, newdata = dta_word )

SVM

(1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
[1] 0.9498113

SVM

Rsmat <- matrix(NA,18,60)
for (i in 1:60){
        set.seed(i)
        for (j in 3:20){
        svm_tune <- tune(svm, Word~., data =dta_word, kernel="radial",
                 ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
                 tunecontrol = tune.control(sampling = "cross", cross = j))
        rst_svm <- svm(Word ~., data = dta[,c(2,18:223)], type = "eps-regression",
                       cost = svm_tune$best.model$cost, kernel = "radial",
                       gamma = svm_tune$best.model$gamma)
        yhat_svm <- predict(rst_svm, newdata = dta_word )
        Rsmat[j-2,i] <- (1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
}
}

SVM

quantile(Rsmat,prob=c(0.01,0.99))
       1%       99% 
0.2169494 0.9735125 
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
        geom_density()+
        geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
                   quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
        theme_bw()+
        labs(list(title = " SVM : Word ~ . ", x = "R-Squared"))

## SVM

#Ch1_PC19 Ch6_PC19
Rsmat2 <- matrix(NA,18,60)
for (i in 1:60){
        set.seed(i)
        for (j in 3:20){
        svm_tune <- tune(svm, Word~Ch1_PC19+Ch6_PC19, data =dta_word, kernel="radial",
                 ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
                 tunecontrol = tune.control(sampling = "cross", cross = 10))
        rst_svm <- svm(Word ~Ch1_PC19+Ch6_PC19, data = dta[,c(2,18:223)], type = "eps-regression",
                       cost = svm_tune$best.model$cost, kernel = "radial",
                       gamma = svm_tune$best.model$gamma)
        yhat_svm <- predict(rst_svm, newdata = dta_word )
        Rsmat2[j-2,i] <- (1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
}
}

SVM

quantile(Rsmat2,prob=c(0.01,0.99))
       1%       99% 
0.2851531 0.5957688 
draw2 <- melt(Rsmat2)
ggplot(draw2,aes(x = value))+
        geom_density()+
        geom_vline(xintercept = c(quantile(Rsmat2,prob=c(0.01,0.99))[1],
                   quantile(Rsmat2,prob=c(0.01,0.99))[2]),col="red")+
        theme_bw()+
        labs(list(title = "SVM : Word ~ Ch1_PC19 + Ch6_PC19", x = "R-Squared"))

Boosting

Tree Based