library(glmnet);library(reshape2);library(lm.beta);library(e1071);library(ggplot2)
#dta <- readRDS("ML_NIRS.Rdata")
May 20, 2016
library(glmnet);library(reshape2);library(lm.beta);library(e1071);library(ggplot2)
#dta <- readRDS("ML_NIRS.Rdata")
# PCA & Correlation
dta <- readRDS("MLNIRS.Rdata")
#saveRDS(dta,"MLNIRS.Rdata")
dta_word <-dta[,c(2,18:223)]
dim(dta_word)
[1] 20 207
x <- model.matrix(Word~.,data = dta_word)[, -1] y <- dta_word$Word model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = 10,type.measure = "deviance") yhat <- predict(model, s = model$lambda.min , newx =x,type="response") beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients") Est <- data.frame(Est = beta[,1][beta[,1]!=0])
row.names(Est)[-1]
[1] "Ch1_PC19" "Ch6_PC19"
Est
Est (Intercept) 98.300000 Ch1_PC19 7.728836 Ch6_PC19 -4.700825
1-(mean((y - yhat)^2)/var(y))
[1] 0.4096143
Estarray <- array(NA,c(18,100,60))
Rsmat <- matrix(NA,18,60)
dta_word <-dta[,c(2,18:223)]
for (i in 1 :60){
set.seed(i)
for (j in 3: 20){
x <- model.matrix(Word~.,data = dta_word)[, -1]
y <- dta_word$Word
model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = j,type.measure = "deviance")
yhat <- predict(model, s = model$lambda.min , newx =x,type="response")
beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients")
Est <- data.frame(Est = beta[,1][beta[,1]!=0])
for (k in 1:length(row.names(Est))){
Estarray[(j-2),k,i] <- row.names(Est)[k]
}
Rsmat[j-2,i] <- (1-(mean((y - yhat)^2)/var(y)))
}
}
table(Estarray)
Estarray
(Intercept) Ch1_PC19 Ch6_PC19
1080 1074 1010
quantile(Rsmat,prob=c(0.01,0.99))
1% 99% 0.1672032 0.4552263
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
geom_density()+
geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
theme_bw()+
labs(list(title = "Lasso : Word ~ . ", x = "R-Squared"))
# Ch1_PC19+Ch6_PC19
Rsmat <- matrix(NA,18,100)
dta_word <-dta[,c(2,18:223)]
for (i in 1 :100){
set.seed(i)
for (j in 3: 20){
x <- model.matrix(Word~Ch1_PC19+Ch6_PC19,data = dta_word)[,-1]
y <- dta_word$Word
model <- cv.glmnet(x, y, alpha = 1,family = "gaussian", nfolds = j,type.measure = "deviance")
yhat <- predict(model, s = model$lambda.min , newx =x,type="response")
#beta <- predict(model, s = model$lambda.min , newx =x,type="coefficients")
#Est <- data.frame(Est = beta[,1][beta[,1]!=0])
#for (k in 1:length(row.names(Est))){
# Estarray[(j-2),k,i] <- row.names(Est)[k]
#}
Rsmat[j-2,i] <- (1-(mean((y - yhat)^2)/var(y)))
}
}
quantile(Rsmat,prob=c(0.01,0.99))
1% 99% 0.5829798 0.5968034
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
geom_density()+
geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
theme_bw()+
labs(list(title = "Lasso : Word ~ Ch1_PC19 + Ch6_PC19", x = "R-Squared"))
library(lm.beta) summary(lm.beta(fit <- lm(Word~Ch1_PC19+Ch6_PC19,data = dta_word)))
Call:
lm(formula = Word ~ Ch1_PC19 + Ch6_PC19, data = dta_word)
Residuals:
Min 1Q Median 3Q Max
-40.762 -8.220 1.813 8.676 29.816
Coefficients:
Estimate Standardized Std. Error t value Pr(>|t|)
(Intercept) 98.3000 0.0000 4.0567 24.232 1.27e-14 ***
Ch1_PC19 14.7997 0.4896 7.9151 1.870 0.0788 .
Ch6_PC19 -19.5837 -0.3084 16.6289 -1.178 0.2551
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 18.14 on 17 degrees of freedom
Multiple R-squared: 0.5756, Adjusted R-squared: 0.5257
F-statistic: 11.53 on 2 and 17 DF, p-value: 0.0006854
library(e1071)
dta_word <-dta[,c(2,18:223)]
svm_tune <- tune(svm, Word~., data =dta_word, kernel="radial",
ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
tunecontrol = tune.control(sampling = "cross", cross = 10))
rst_svm <- svm(Word ~., data = dta_word, type = "eps-regression",
cost = svm_tune$best.model$cost, kernel = "radial",
gamma = svm_tune$best.model$gamma)
yhat_svm <- predict(rst_svm, newdata = dta_word )
(1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
[1] 0.9498113
Rsmat <- matrix(NA,18,60)
for (i in 1:60){
set.seed(i)
for (j in 3:20){
svm_tune <- tune(svm, Word~., data =dta_word, kernel="radial",
ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
tunecontrol = tune.control(sampling = "cross", cross = j))
rst_svm <- svm(Word ~., data = dta[,c(2,18:223)], type = "eps-regression",
cost = svm_tune$best.model$cost, kernel = "radial",
gamma = svm_tune$best.model$gamma)
yhat_svm <- predict(rst_svm, newdata = dta_word )
Rsmat[j-2,i] <- (1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
}
}
quantile(Rsmat,prob=c(0.01,0.99))
1% 99% 0.2169494 0.9735125
draw <- melt(Rsmat)
ggplot(draw,aes(x = value))+
geom_density()+
geom_vline(xintercept = c(quantile(Rsmat,prob=c(0.01,0.99))[1],
quantile(Rsmat,prob=c(0.01,0.99))[2]),col="red")+
theme_bw()+
labs(list(title = " SVM : Word ~ . ", x = "R-Squared"))
## SVM
#Ch1_PC19 Ch6_PC19
Rsmat2 <- matrix(NA,18,60)
for (i in 1:60){
set.seed(i)
for (j in 3:20){
svm_tune <- tune(svm, Word~Ch1_PC19+Ch6_PC19, data =dta_word, kernel="radial",
ranges = list(cost = seq(.1, 2, .1), gamma = seq(.1, 2, .1)),
tunecontrol = tune.control(sampling = "cross", cross = 10))
rst_svm <- svm(Word ~Ch1_PC19+Ch6_PC19, data = dta[,c(2,18:223)], type = "eps-regression",
cost = svm_tune$best.model$cost, kernel = "radial",
gamma = svm_tune$best.model$gamma)
yhat_svm <- predict(rst_svm, newdata = dta_word )
Rsmat2[j-2,i] <- (1-(mean((dta$Word - yhat_svm)^2)/var(dta$Word)))
}
}
quantile(Rsmat2,prob=c(0.01,0.99))
1% 99% 0.2851531 0.5957688
draw2 <- melt(Rsmat2)
ggplot(draw2,aes(x = value))+
geom_density()+
geom_vline(xintercept = c(quantile(Rsmat2,prob=c(0.01,0.99))[1],
quantile(Rsmat2,prob=c(0.01,0.99))[2]),col="red")+
theme_bw()+
labs(list(title = "SVM : Word ~ Ch1_PC19 + Ch6_PC19", x = "R-Squared"))