library(ggplot2)
library(tidyverse)
library(ISLR)
library(reshape2)
library(GGally)
library(boot)
library(leaps)
library(infer)
library(boot)
library(base)
library(gridExtra)
twelve <- read.csv("Twelve.csv")
head(twelve)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 6.7 0.28 0.28 2.4 0.012 36 100 0.99064 3.26 0.39 11.7 7
## 2 6.7 0.28 0.28 2.4 0.012 36 100 0.99064 3.26 0.39 11.7 7
## 3 5.1 0.47 0.02 1.3 0.034 18 44 0.99210 3.90 0.62 12.8 6
## 4 9.3 0.37 0.44 1.6 0.038 21 42 0.99526 3.24 0.81 10.8 7
## 5 6.4 0.38 0.14 2.2 0.038 15 25 0.99514 3.44 0.65 11.1 6
## 6 9.7 0.53 0.60 2.0 0.039 5 19 0.99585 3.30 0.86 12.4 6
any(is.na(twelve))
## [1] FALSE
All Values seem to be numeric at first glance. Importance notes: - V3 is between 0 and 1 - V12 is a discrete value between 3 and 8. It could be considered as a factor for later analysis.
stats <- apply(twelve,2, function(x){
return (c(mean(x), sd(x), median(x), range(x), typeof(x)))
})
row.names(stats) <- c("Mean","SD","Median","Lower Range", "Upper Range", "Type")
stats
## V1 V2 V3
## Mean "8.31963727329581" "0.527820512820513" "0.270975609756098"
## SD "1.7410963181277" "0.179059704153535" "0.194801137405319"
## Median "7.9" "0.52" "0.26"
## Lower Range "4.6" "0.12" "0"
## Upper Range "15.9" "1.58" "1"
## Type "double" "double" "double"
## V4 V5 V6
## Mean "2.53880550343965" "0.0874665415884928" "15.8749218261413"
## SD "1.40992805950728" "0.0470653020100901" "10.4601569698097"
## Median "2.2" "0.079" "14"
## Lower Range "0.9" "0.012" "1"
## Upper Range "15.5" "0.611" "72"
## Type "double" "double" "double"
## V7 V8 V9
## Mean "46.4677923702314" "0.996746679174484" "3.31111319574734"
## SD "32.8953244782991" "0.00188733395384256" "0.154386464903543"
## Median "38" "0.99675" "3.31"
## Lower Range "6" "0.99007" "2.74"
## Upper Range "289" "1.00369" "4.01"
## Type "double" "double" "double"
## V10 V11 V12
## Mean "0.658148843026892" "10.4229831144465" "5.63602251407129"
## SD "0.16950697959011" "1.0656675818564" "0.807569439734705"
## Median "0.62" "10.2" "6"
## Lower Range "0.33" "8.4" "3"
## Upper Range "2" "14.9" "8"
## Type "double" "double" "double"
After this stage, analysis will be done on twelve & twelve.filtered which refer to the unfiltered data and the filtered data respectively. There are several ways to detect a value as an Outlier. I choose Outlier is (>1.5IQR+Q3 || <Q1-1.5IQR) A Strong Outlier is (>3IQR+Q3 || <Q1-3IQR)
### Observe the Summary of the Data to take an Idea
summary(twelve)
## V1 V2 V3 V4
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## V5 V6 V7 V8
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## V9 V10 V11 V12
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
length(twelve$V1)
## [1] 1599
### It seems like there will be several Outliers
### (The Max Value looks far from 3rd Q. for the several Factors)
### We Continue:
### Note that quantile(x) returns 0% - 25% - 50% - 75% - 100% Quantiles
check_outliers <- function (data, IQR_coeff){
outl <- rep(0:(length(data)-1))
for (i in seq(from=1, to=ncol(data))){
quant <- quantile(data[,i])
c1 <- length(which(data[,i]>(IQR_coeff*(quant[4]-quant[1])+quant[4])))
c2 <- length(which(data[,i]<(quant[1]-IQR_coeff*(quant[4]-quant[1]))))
outl[i] <- c1+c2
}
names(outl) <- colnames(data)
return (outl)
}
## Outliers:
outl <- check_outliers(twelve, 1.5)
outl
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 0 1 0 78 39 12 15 0 0 12 0 0
## Strong Outliers: (>3IQR+Q3 || <Q1-3IQR)
strong_outl <- check_outliers(twelve, 3)
strong_outl
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 0 0 0 26 22 0 2 0 0 4 0 0
### Since there are a lot of samples we remove the Outliers
### (Weak & Strong) as they are relatively few.
twelve.filtered <- twelve
for (i in seq(from=1, to=ncol(twelve.filtered))){
quant <- quantile(twelve[,i])
upper_outl <- which(twelve.filtered[,i]>(1.5*(quant[4]-quant[1])+quant[4]))
if(length(upper_outl)!=0){
twelve.filtered <- twelve.filtered[-upper_outl,]
}
lower_outl <- which(twelve.filtered[,i]<(quant[1]-1.5*(quant[4]-quant[1])))
if(length(lower_outl)!=0){
twelve.filtered <- twelve.filtered[-lower_outl,]
}
}
## Observing the SD of the Data
## Calculating how many SD are the max and min measurements far from the Mean
## 1. Before Removing The Outliers
tb_org <- apply(twelve, 2, function(x){return(c(sd(x),(mean(x)-min(x))/sd(x),
(max(x)-mean(x))/sd(x)))})
rownames(tb_org) <- c("SD","kSD(min,mean)","kSD(max,mean)")
tb_org
## V1 V2 V3 V4 V5 V6
## SD 1.741096 0.1790597 0.1948011 1.409928 0.0470653 10.460157
## kSD(min,mean) 2.136377 2.2775672 1.3910371 1.162333 1.6034433 1.422055
## kSD(max,mean) 4.353787 5.8761378 3.7424031 9.192806 11.1235546 5.365606
## V7 V8 V9 V10 V11 V12
## SD 32.895324 0.001887334 0.1543865 0.169507 1.065668 0.8075694
## kSD(min,mean) 1.230199 3.537624680 3.6992439 1.935902 1.898325 3.2641435
## kSD(max,mean) 7.372847 3.678904208 4.5268658 7.916200 4.201138 2.9272746
## 2. After Removing The Outliers
tb <- apply(twelve.filtered, 2, function(x){return(c(sd(x),(mean(x)-min(x))/sd(x),
(max(x)-mean(x))/sd(x)))})
rownames(tb) <- c("SD","kSD(min,mean)","kSD(max,mean)")
tb
## V1 V2 V3 V4 V5 V6
## SD 1.749929 0.1760607 0.1917241 0.6406856 0.01991076 9.581720
## kSD(min,mean) 2.120943 2.3064656 1.3669682 2.1726062 3.45315531 1.496118
## kSD(max,mean) 4.165027 4.5661628 2.7535360 4.4609123 5.98897331 3.722152
## V7 V8 V9 V10 V11 V12
## SD 29.183437 0.001831386 0.1521618 0.1349886 1.058969 0.7959249
## kSD(min,mean) 1.299927 3.590994625 3.0139669 2.3250373 1.927917 3.3160014
## kSD(max,mean) 3.463048 3.578440686 4.5437789 4.2681122 3.360246 2.9659981
## By Removing the Outliers, the SD of some of the Variables decreased
## which shows measurements are more close to the mean.
data <- twelve.filtered
summary(data)
## V1 V2 V3 V4
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. :0.900
## 1st Qu.: 7.100 1st Qu.:0.3900 1st Qu.:0.0900 1st Qu.:1.900
## Median : 7.900 Median :0.5200 Median :0.2500 Median :2.200
## Mean : 8.311 Mean :0.5261 Mean :0.2621 Mean :2.292
## 3rd Qu.: 9.200 3rd Qu.:0.6350 3rd Qu.:0.4200 3rd Qu.:2.500
## Max. :15.600 Max. :1.3300 Max. :0.7900 Max. :5.150
## V5 V6 V7 V8
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9955
## Median :0.07900 Median :13.00 Median : 37.00 Median :0.9966
## Mean :0.08075 Mean :15.34 Mean : 43.94 Mean :0.9966
## 3rd Qu.:0.08900 3rd Qu.:21.00 3rd Qu.: 59.00 3rd Qu.:0.9977
## Max. :0.20000 Max. :51.00 Max. :145.00 Max. :1.0032
## V9 V10 V11 V12
## Min. :2.860 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.220 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.319 Mean :0.6439 Mean :10.44 Mean :5.639
## 3rd Qu.:3.410 3rd Qu.:0.7200 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :1.2200 Max. :14.00 Max. :8.000
## V11 is the response
predictors <- data[,-11]
quantile(data$V11)
## 0% 25% 50% 75% 100%
## 8.4 9.5 10.2 11.1 14.0
colors_byQ <- rep(1:length(data$V11))
colors_byQ[data$V11>=8.4 & data$V11<9.5] <- 0
colors_byQ[data$V11>=9.5 & data$V11<10.2] <- 1
colors_byQ[data$V11>=10.2 & data$V11<11.1] <- 2
colors_byQ[data$V11>=11.1 & data$V11<=14] <- 3
## Colors: RGB - R for lower values of V11, and B for higher values of V11 (Response)
ggp = ggpairs(data[,c(seq(from=1, to=4),11)],
aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)
### There appears to be a correlation between: V3-V1[0.7]
ggp = ggpairs(data[,c(seq(from=5, to=8),11)],
aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)
### There appears to be a correlation between: V6-V7[0.65], V8-V11[-0.5]
ggp = ggpairs(data[,c(9,10,12,11)],
aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)
### There appears to be a correlation between: V11-V12[0.48]
The Latter Approach Doesn’t consider the correlation between all variables (Only Between every other 4 variables & response).
Note that ggpairs() in the previous Visualization achieves this task.
>> Note: Tried a loop, some errors rose, so went for manual
pred <- seq(1:12)
v1 <- (ggplot(data, aes(x=V1, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V1") +
ylab("V11"))
v2 <- (ggplot(data, aes(x=V2, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V2") +
ylab("V11"))
v3 <- (ggplot(data, aes(x=V3, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V3") +
ylab("V11"))
v4 <- (ggplot(data, aes(x=V4, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V4") +
ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v1,v2,v3,v4, ncol=2)
v5 <- (ggplot(data, aes(x=V5, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V5") +
ylab("V11"))
v6 <- (ggplot(data, aes(x=V6, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V6") +
ylab("V11"))
v7 <- (ggplot(data, aes(x=V7, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V7") +
ylab("V11"))
v8 <- (ggplot(data, aes(x=V8, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V8") +
ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v5,v6,v7,v8, ncol=2)
v9 <- (ggplot(data, aes(x=V9, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V9") +
ylab("V11"))
v10 <- (ggplot(data, aes(x=V10, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V10") +
ylab("V11"))
v12 <- (ggplot(data, aes(x=V12, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V12") +
ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v9,v10,v12, ncol=2)
We observe a very low R^2 and AdjR^2 = 0.001 No more than approximately 1% of the response can be explained by a variation in V6.
Estimates: an increase in a unit of V6 will only lead to a decrease in V11 equivalent to B1 = -0.004
P-Value: The p-value is not significant = 0.12
In summary, the model is very weak and V6 is not enought to predict V11.
lm.fit <- lm(V11~V6, data=data)
vis <- ggplot(data, aes(x=V6, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V6") +
ylab("V11")
vis
summary(lm.fit)
##
## Call:
## lm(formula = V11 ~ V6, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0881 -0.8970 -0.2656 0.6704 3.6153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.510520 0.052273 201.071 <2e-16 ***
## V6 -0.004494 0.002891 -1.554 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.058 on 1459 degrees of freedom
## Multiple R-squared: 0.001653, Adjusted R-squared: 0.0009691
## F-statistic: 2.416 on 1 and 1459 DF, p-value: 0.1203
The model has only one significant predictor (by p-value) which is 12.
The latter is logical as V12 and V11 were seen to be slightly correlated (corr = 0.48).
As of other variables, they don’t seem to have any significance.
R-squared and AdjR^2 are still very low: less than 25%
of the Response is explained by these Predictors.
V6 is still insignificant.
mult.fit <- lm(V11~V2+V6+V10+V12, data=data)
summary(mult.fit)
##
## Call:
## lm(formula = V11 ~ V2 + V6 + V10 + V12, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3353 -0.6287 -0.1920 0.5543 3.6961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.721008 0.251285 26.747 <2e-16 ***
## V2 0.022375 0.152854 0.146 0.884
## V6 -0.002720 0.002533 -1.074 0.283
## V10 0.316442 0.197686 1.601 0.110
## V12 0.628943 0.034296 18.339 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9251 on 1456 degrees of freedom
## Multiple R-squared: 0.2389, Adjusted R-squared: 0.2368
## F-statistic: 114.2 on 4 and 1456 DF, p-value: < 2.2e-16
V6 appears to be significant in this case.
However, still V6 explains less than 0.5% (R^2) of the response!
lm.fit <- lm(V11~V6, data=twelve)
vis <- ggplot(twelve, aes(x=V6, y=V11)) +
stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
geom_point(alpha=1, color="blue", size = 1) +
xlab("V6") +
ylab("V11")
vis
summary(lm.fit)
##
## Call:
## lm(formula = V11 ~ V6, data = twelve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0999 -0.8999 -0.2585 0.6779 4.5203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.535238 0.048345 217.92 < 2e-16 ***
## V6 -0.007071 0.002543 -2.78 0.00549 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.063 on 1597 degrees of freedom
## Multiple R-squared: 0.004818, Adjusted R-squared: 0.004194
## F-statistic: 7.731 on 1 and 1597 DF, p-value: 0.005492
V6 is less significant after adding the new predictors, the Estimate is approximately the same.
The model is still weak as the predictors cannot explain more than 25% of the variation in V11.
mult.fit <- lm(V11~V2+V6+V10+V12, data=twelve)
summary(mult.fit)
##
## Call:
## lm(formula = V11 ~ V2 + V6 + V10 + V12, data = twelve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3037 -0.6335 -0.1965 0.5502 4.9071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.196874 0.237290 30.329 <2e-16 ***
## V2 -0.157667 0.144561 -1.091 0.2756
## V6 -0.004514 0.002248 -2.008 0.0448 *
## V10 -0.185035 0.145489 -1.272 0.2036
## V12 0.621497 0.032038 19.399 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9363 on 1594 degrees of freedom
## Multiple R-squared: 0.2299, Adjusted R-squared: 0.228
## F-statistic: 119 on 4 and 1594 DF, p-value: < 2.2e-16
mlcv.fit <- glm(V11 ~ V2+V6+V10+V12, data = data)
cv.error <- cv.glm(data, mlcv.fit, K = 10)$delta[1]
cv.error
## [1] 0.8600801
start_time <- Sys.time()
glm.quad <- glm(V10 ~ poly(V4, 2, raw = TRUE), data = data)
cv.error <- cv.glm(data, glm.quad)$delta[1]
end_time <- Sys.time()
time_loocv <- end_time - start_time
time_loocv
## Time difference of 4.20779 secs
We notice LOOCV is slower by ~ 200 folds than CV with K=5.
start_time <- Sys.time()
glm.quad <- glm(V10 ~ poly(V4, 2, raw = TRUE), data = data)
cv.error <- cv.glm(data, glm.quad, K=5)$delta[1]
end_time <- Sys.time()
time_cv <- end_time - start_time
time_diff <- c(time_cv, time_loocv, time_loocv-time_cv)
names(time_diff) <- c("Time for CV in seconds", "Time for LOOCV in seconds", "Difference")
time_diff
## Time differences in secs
## Time for CV in seconds Time for LOOCV in seconds Difference
## 0.01702499 4.20778990 4.19076490
set.seed(1234)
bestfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="exhaustive")
summ = summary(bestfit.reg)
names(summ)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
summ$outmat
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V12
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*"
## 3 ( 1 ) "*" " " " " " " " " " " " " "*" "*" " " " "
## 4 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" " " " "
## 5 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7 ( 1 ) "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(summ$adjr2, xlab = "num of variables", ylab = "AdjR^2", type = "b", col = "purple")
points(which.max(summ$adjr2), summ$adjr2[which.max(summ$adjr2)], col="red", cex=1, pch=20)
### Then, it is the model with 9 predictors that performs best according to AdjR^2
### From the Outmat above, we can see that the model belongs to that without V5,V6
bestfit.model <- glm(V11~.- V5 - V6, data = data)
summary(bestfit.model)
##
## Call:
## glm(formula = V11 ~ . - V5 - V6, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39437 -0.35751 -0.04548 0.32463 2.12005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.833e+02 1.328e+01 43.934 < 2e-16 ***
## V1 4.634e-01 2.021e-02 22.926 < 2e-16 ***
## V2 4.650e-01 1.125e-01 4.132 3.80e-05 ***
## V3 7.694e-01 1.319e-01 5.835 6.61e-09 ***
## V4 5.657e-01 2.496e-02 22.661 < 2e-16 ***
## V7 -2.068e-03 5.368e-04 -3.853 0.000122 ***
## V8 -5.935e+02 1.354e+01 -43.834 < 2e-16 ***
## V9 3.396e+00 1.473e-01 23.054 < 2e-16 ***
## V10 9.998e-01 1.236e-01 8.087 1.28e-15 ***
## V12 2.231e-01 2.251e-02 9.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3114849)
##
## Null deviance: 1637.27 on 1460 degrees of freedom
## Residual deviance: 451.96 on 1451 degrees of freedom
## AIC: 2454
##
## Number of Fisher Scoring iterations: 2
forwardfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="forward")
summ_forward = summary(forwardfit.reg)
names(summ_forward )
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
summ_forward$outmat
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V12
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*"
## 3 ( 1 ) " " " " " " "*" " " " " " " "*" " " " " "*"
## 4 ( 1 ) "*" " " " " "*" " " " " " " "*" " " " " "*"
## 5 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7 ( 1 ) "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(summ_forward$cp, xlab = "num of variables", ylab = "Cp", type = "b", col = "purple")
points(which.min(summ_forward$cp), summ_forward$cp[which.min(summ_forward$cp)],
col="red", cex=1, pch=20)
### Then, it is the model with 9 predictors also that performs best according to Cp
### From the Outmat above, we can see that the model belongs to that without V5,V6
### This is the same model as that of the bestfit.model
forwardfit.model <- bestfit.model
summary(forwardfit.model)
##
## Call:
## glm(formula = V11 ~ . - V5 - V6, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39437 -0.35751 -0.04548 0.32463 2.12005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.833e+02 1.328e+01 43.934 < 2e-16 ***
## V1 4.634e-01 2.021e-02 22.926 < 2e-16 ***
## V2 4.650e-01 1.125e-01 4.132 3.80e-05 ***
## V3 7.694e-01 1.319e-01 5.835 6.61e-09 ***
## V4 5.657e-01 2.496e-02 22.661 < 2e-16 ***
## V7 -2.068e-03 5.368e-04 -3.853 0.000122 ***
## V8 -5.935e+02 1.354e+01 -43.834 < 2e-16 ***
## V9 3.396e+00 1.473e-01 23.054 < 2e-16 ***
## V10 9.998e-01 1.236e-01 8.087 1.28e-15 ***
## V12 2.231e-01 2.251e-02 9.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3114849)
##
## Null deviance: 1637.27 on 1460 degrees of freedom
## Residual deviance: 451.96 on 1451 degrees of freedom
## AIC: 2454
##
## Number of Fisher Scoring iterations: 2
forwardfit.model
##
## Call: glm(formula = V11 ~ . - V5 - V6, data = data)
##
## Coefficients:
## (Intercept) V1 V2 V3 V4 V7
## 5.833e+02 4.634e-01 4.650e-01 7.694e-01 5.657e-01 -2.068e-03
## V8 V9 V10 V12
## -5.935e+02 3.396e+00 9.998e-01 2.231e-01
##
## Degrees of Freedom: 1460 Total (i.e. Null); 1451 Residual
## Null Deviance: 1637
## Residual Deviance: 452 AIC: 2454
backfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="backward")
summ_back = summary(backfit.reg)
names(summ_back)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
summ_back$outmat
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V12
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " " "*" "*" " " " "
## 4 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" " " " "
## 5 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7 ( 1 ) "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(summ_back$bic, xlab = "num of variables", ylab = "BIC", type = "b", col = "purple")
points(which.min(summ_back$bic), summ_back$bic[which.min(summ_back$bic)],
col="red", cex=1, pch=20)
### Also the model with 9 Predictors is the best model obtained from backward subset selection