Loading Required Libraries

library(ggplot2)
library(tidyverse)
library(ISLR)
library(reshape2)
library(GGally)
library(boot)
library(leaps)
library(infer)
library(boot)
library(base)
library(gridExtra)

1- Importing & Exploring the DataSet

a- Reading the Data

twelve <- read.csv("Twelve.csv")
head(twelve)

##    V1   V2   V3  V4    V5 V6  V7      V8   V9  V10  V11 V12
## 1 6.7 0.28 0.28 2.4 0.012 36 100 0.99064 3.26 0.39 11.7   7
## 2 6.7 0.28 0.28 2.4 0.012 36 100 0.99064 3.26 0.39 11.7   7
## 3 5.1 0.47 0.02 1.3 0.034 18  44 0.99210 3.90 0.62 12.8   6
## 4 9.3 0.37 0.44 1.6 0.038 21  42 0.99526 3.24 0.81 10.8   7
## 5 6.4 0.38 0.14 2.2 0.038 15  25 0.99514 3.44 0.65 11.1   6
## 6 9.7 0.53 0.60 2.0 0.039  5  19 0.99585 3.30 0.86 12.4   6

b- Check for Missing Data

any(is.na(twelve))

## [1] FALSE

c- Basic Statistics Exploration

All Values seem to be numeric at first glance. Importance notes: - V3 is between 0 and 1 - V12 is a discrete value between 3 and 8. It could be considered as a factor for later analysis.

stats <- apply(twelve,2, function(x){
  return (c(mean(x), sd(x), median(x), range(x), typeof(x)))
})
row.names(stats) <- c("Mean","SD","Median","Lower Range", "Upper Range", "Type")
stats

##             V1                 V2                  V3                 
## Mean        "8.31963727329581" "0.527820512820513" "0.270975609756098"
## SD          "1.7410963181277"  "0.179059704153535" "0.194801137405319"
## Median      "7.9"              "0.52"              "0.26"             
## Lower Range "4.6"              "0.12"              "0"                
## Upper Range "15.9"             "1.58"              "1"                
## Type        "double"           "double"            "double"           
##             V4                 V5                   V6                
## Mean        "2.53880550343965" "0.0874665415884928" "15.8749218261413"
## SD          "1.40992805950728" "0.0470653020100901" "10.4601569698097"
## Median      "2.2"              "0.079"              "14"              
## Lower Range "0.9"              "0.012"              "1"               
## Upper Range "15.5"             "0.611"              "72"              
## Type        "double"           "double"             "double"          
##             V7                 V8                    V9                 
## Mean        "46.4677923702314" "0.996746679174484"   "3.31111319574734" 
## SD          "32.8953244782991" "0.00188733395384256" "0.154386464903543"
## Median      "38"               "0.99675"             "3.31"             
## Lower Range "6"                "0.99007"             "2.74"             
## Upper Range "289"              "1.00369"             "4.01"             
## Type        "double"           "double"              "double"           
##             V10                 V11                V12                
## Mean        "0.658148843026892" "10.4229831144465" "5.63602251407129" 
## SD          "0.16950697959011"  "1.0656675818564"  "0.807569439734705"
## Median      "0.62"              "10.2"             "6"                
## Lower Range "0.33"              "8.4"              "3"                
## Upper Range "2"                 "14.9"             "8"                
## Type        "double"            "double"           "double"

d- Dealing with Outliers:

After this stage, analysis will be done on twelve & twelve.filtered which refer to the unfiltered data and the filtered data respectively. There are several ways to detect a value as an Outlier. I choose Outlier is (>1.5IQR+Q3 || <Q1-1.5IQR) A Strong Outlier is (>3IQR+Q3 || <Q1-3IQR)

### Observe the Summary of the Data to take an Idea
summary(twelve)

##        V1              V2               V3              V4        
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##        V5                V6              V7               V8        
##  Min.   :0.01200   Min.   : 1.00   Min.   :  6.00   Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00   1st Qu.: 22.00   1st Qu.:0.9956  
##  Median :0.07900   Median :14.00   Median : 38.00   Median :0.9968  
##  Mean   :0.08747   Mean   :15.87   Mean   : 46.47   Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00   3rd Qu.: 62.00   3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00   Max.   :289.00   Max.   :1.0037  
##        V9             V10              V11             V12       
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

length(twelve$V1)

## [1] 1599

### It seems like there will be several Outliers
### (The Max Value looks far from 3rd Q. for the several Factors)
### We Continue:
### Note that quantile(x) returns 0% - 25% - 50% - 75% - 100% Quantiles

check_outliers <- function (data, IQR_coeff){
    outl <- rep(0:(length(data)-1))
    for (i in seq(from=1, to=ncol(data))){
        quant <- quantile(data[,i])
        c1 <- length(which(data[,i]>(IQR_coeff*(quant[4]-quant[1])+quant[4])))
        c2 <- length(which(data[,i]<(quant[1]-IQR_coeff*(quant[4]-quant[1]))))
        outl[i] <- c1+c2
    }
    names(outl) <- colnames(data)
    return (outl)
}


## Outliers:
outl <- check_outliers(twelve, 1.5)
outl

##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 
##   0   1   0  78  39  12  15   0   0  12   0   0

## Strong Outliers: (>3IQR+Q3 || <Q1-3IQR)
strong_outl <- check_outliers(twelve, 3)
strong_outl

##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 
##   0   0   0  26  22   0   2   0   0   4   0   0

### Since there are a lot of samples we remove the Outliers
### (Weak & Strong) as they are relatively few.
twelve.filtered <- twelve
for (i in seq(from=1, to=ncol(twelve.filtered))){
    quant <- quantile(twelve[,i])
    
    upper_outl <- which(twelve.filtered[,i]>(1.5*(quant[4]-quant[1])+quant[4]))
    if(length(upper_outl)!=0){
       twelve.filtered <- twelve.filtered[-upper_outl,]
    }
    
    
    lower_outl <- which(twelve.filtered[,i]<(quant[1]-1.5*(quant[4]-quant[1])))
    if(length(lower_outl)!=0){
       twelve.filtered <- twelve.filtered[-lower_outl,]
    }
     
}

## Observing the SD of the Data
## Calculating how many SD are the max and min measurements far from the Mean
## 1. Before Removing The Outliers
tb_org <- apply(twelve, 2, function(x){return(c(sd(x),(mean(x)-min(x))/sd(x),
                                                (max(x)-mean(x))/sd(x)))})
rownames(tb_org) <- c("SD","kSD(min,mean)","kSD(max,mean)")
tb_org

##                     V1        V2        V3       V4         V5        V6
## SD            1.741096 0.1790597 0.1948011 1.409928  0.0470653 10.460157
## kSD(min,mean) 2.136377 2.2775672 1.3910371 1.162333  1.6034433  1.422055
## kSD(max,mean) 4.353787 5.8761378 3.7424031 9.192806 11.1235546  5.365606
##                      V7          V8        V9      V10      V11       V12
## SD            32.895324 0.001887334 0.1543865 0.169507 1.065668 0.8075694
## kSD(min,mean)  1.230199 3.537624680 3.6992439 1.935902 1.898325 3.2641435
## kSD(max,mean)  7.372847 3.678904208 4.5268658 7.916200 4.201138 2.9272746

## 2. After Removing The Outliers
tb <- apply(twelve.filtered, 2, function(x){return(c(sd(x),(mean(x)-min(x))/sd(x),
                                                     (max(x)-mean(x))/sd(x)))})
rownames(tb) <- c("SD","kSD(min,mean)","kSD(max,mean)")
tb

##                     V1        V2        V3        V4         V5       V6
## SD            1.749929 0.1760607 0.1917241 0.6406856 0.01991076 9.581720
## kSD(min,mean) 2.120943 2.3064656 1.3669682 2.1726062 3.45315531 1.496118
## kSD(max,mean) 4.165027 4.5661628 2.7535360 4.4609123 5.98897331 3.722152
##                      V7          V8        V9       V10      V11       V12
## SD            29.183437 0.001831386 0.1521618 0.1349886 1.058969 0.7959249
## kSD(min,mean)  1.299927 3.590994625 3.0139669 2.3250373 1.927917 3.3160014
## kSD(max,mean)  3.463048 3.578440686 4.5437789 4.2681122 3.360246 2.9659981

## By Removing the Outliers, the SD of some of the Variables decreased
## which shows measurements are more close to the mean. 
data <- twelve.filtered
summary(data)

##        V1               V2               V3               V4       
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   :0.900  
##  1st Qu.: 7.100   1st Qu.:0.3900   1st Qu.:0.0900   1st Qu.:1.900  
##  Median : 7.900   Median :0.5200   Median :0.2500   Median :2.200  
##  Mean   : 8.311   Mean   :0.5261   Mean   :0.2621   Mean   :2.292  
##  3rd Qu.: 9.200   3rd Qu.:0.6350   3rd Qu.:0.4200   3rd Qu.:2.500  
##  Max.   :15.600   Max.   :1.3300   Max.   :0.7900   Max.   :5.150  
##        V5                V6              V7               V8        
##  Min.   :0.01200   Min.   : 1.00   Min.   :  6.00   Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00   1st Qu.: 22.00   1st Qu.:0.9955  
##  Median :0.07900   Median :13.00   Median : 37.00   Median :0.9966  
##  Mean   :0.08075   Mean   :15.34   Mean   : 43.94   Mean   :0.9966  
##  3rd Qu.:0.08900   3rd Qu.:21.00   3rd Qu.: 59.00   3rd Qu.:0.9977  
##  Max.   :0.20000   Max.   :51.00   Max.   :145.00   Max.   :1.0032  
##        V9             V10              V11             V12       
##  Min.   :2.860   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.220   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.319   Mean   :0.6439   Mean   :10.44   Mean   :5.639  
##  3rd Qu.:3.410   3rd Qu.:0.7200   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :1.2200   Max.   :14.00   Max.   :8.000

e- Correlation between variables

## V11 is the response
predictors <- data[,-11]
quantile(data$V11)

##   0%  25%  50%  75% 100% 
##  8.4  9.5 10.2 11.1 14.0

colors_byQ <- rep(1:length(data$V11))

colors_byQ[data$V11>=8.4 & data$V11<9.5] <- 0
colors_byQ[data$V11>=9.5 & data$V11<10.2] <- 1
colors_byQ[data$V11>=10.2 & data$V11<11.1] <- 2
colors_byQ[data$V11>=11.1 & data$V11<=14] <- 3


## Colors: RGB - R for lower values of V11, and B for higher values of V11 (Response)
ggp = ggpairs(data[,c(seq(from=1, to=4),11)],
              aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)

### There appears to be a correlation between: V3-V1[0.7]

ggp = ggpairs(data[,c(seq(from=5, to=8),11)],
              aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)

### There appears to be a correlation between: V6-V7[0.65], V8-V11[-0.5]


ggp = ggpairs(data[,c(9,10,12,11)],
              aes(color=as.factor(colors_byQ)), progress = ggmatrix_progress())
print(ggp)

### There appears to be a correlation between: V11-V12[0.48]

The Latter Approach Doesn’t consider the correlation between all variables (Only Between every other 4 variables & response).

i- Extracting Highest Correlated Variables with V11 as Response

corr <- cor(data)
highest_corr = reshape2::melt(corr, na.rm = TRUE, value.name = "corr", diag=FALSE)
highest_corr <- highest_corr[order(abs(highest_corr$corr),decreasing=TRUE),]
highest_corr <- highest_corr[-seq(1:12),]
highest_corr <- highest_corr[-seq(from=2, to=length(highest_corr$corr), by=2),]
head(highest_corr, n=10)

##     Var1 Var2       corr
## 9     V9   V1 -0.7148997
## 3     V3   V1  0.6997252
## 8     V8   V1  0.6904176
## 67    V7   V6  0.6554051
## 15    V3   V2 -0.5771667
## 33    V9   V3 -0.5305481
## 95   V11   V8 -0.5112825
## 132  V12  V11  0.4868390
## 32    V8   V3  0.3855120
## 24   V12   V2 -0.3850204

Highest Correlations with Response: V8-V11[-0.5] && V11-V12[0.48] Extracting More Correlations with Response:

ii- Confirming Highest Correlated Variables with the Response

corr <- cor(data)
corr <- subset(corr, select=-diag(corr))

format = reshape2::melt(corr, na.rm = TRUE, value.name = "corr")
cor_v11 <- subset(format, format$Var1 == "V11")
cor_v11[order(abs(cor_v11$corr),decreasing=TRUE),c(2,3)]

##     Var2        corr
## 119  V11  1.00000000
## 83    V8 -0.51128254
## 131  V12  0.48683897
## 47    V5 -0.24743020
## 71    V7 -0.22155407
## 107  V10  0.20964396
## 95    V9  0.20012820
## 11    V2 -0.19126235
## 35    V4  0.11981652
## 23    V3  0.11526138
## 59    V6 -0.04066149

Most significant Notes so far:
The response is correlated with V8,V12
V1 is highly correlated with other Factors V3,V8,V9

2- Visualizing the Distribution & Trends of Variables in Response to V11

Note that ggpairs() in the previous Visualization achieves this task.
>> Note: Tried a loop, some errors rose, so went for manual

pred <- seq(1:12)

v1 <- (ggplot(data, aes(x=V1, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V1") +
              ylab("V11"))
v2 <- (ggplot(data, aes(x=V2, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V2") +
              ylab("V11"))
v3 <- (ggplot(data, aes(x=V3, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V3") +
              ylab("V11"))
v4 <- (ggplot(data, aes(x=V4, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V4") +
              ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v1,v2,v3,v4, ncol=2)

v5 <- (ggplot(data, aes(x=V5, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V5") +
              ylab("V11"))
v6 <- (ggplot(data, aes(x=V6, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V6") +
              ylab("V11"))
v7 <- (ggplot(data, aes(x=V7, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V7") +
              ylab("V11"))
v8 <- (ggplot(data, aes(x=V8, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V8") +
              ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v5,v6,v7,v8, ncol=2)

v9 <- (ggplot(data, aes(x=V9, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V9") +
              ylab("V11"))
v10 <- (ggplot(data, aes(x=V10, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V10") +
              ylab("V11"))
v12 <- (ggplot(data, aes(x=V12, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V12") +
              ylab("V11"))
##-----------------------------------------------------------------
grid.arrange(v9,v10,v12, ncol=2)

3- Linear Regression Model V6 as Predictor

We observe a very low R^2 and AdjR^2 = 0.001 No more than approximately 1% of the response can be explained by a variation in V6.

Estimates: an increase in a unit of V6 will only lead to a decrease in V11 equivalent to B1 = -0.004
P-Value: The p-value is not significant = 0.12

In summary, the model is very weak and V6 is not enought to predict V11.

lm.fit <- lm(V11~V6, data=data)
vis <- ggplot(data, aes(x=V6, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V6") +
              ylab("V11")
vis

summary(lm.fit)

## 
## Call:
## lm(formula = V11 ~ V6, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0881 -0.8970 -0.2656  0.6704  3.6153 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.510520   0.052273 201.071   <2e-16 ***
## V6          -0.004494   0.002891  -1.554     0.12    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.058 on 1459 degrees of freedom
## Multiple R-squared:  0.001653,   Adjusted R-squared:  0.0009691 
## F-statistic: 2.416 on 1 and 1459 DF,  p-value: 0.1203

4- Multiple Linear Regression V2,6,10,12 as predictors

The model has only one significant predictor (by p-value) which is 12.
The latter is logical as V12 and V11 were seen to be slightly correlated (corr = 0.48).
As of other variables, they don’t seem to have any significance.

R-squared and AdjR^2 are still very low: less than 25%
of the Response is explained by these Predictors.

V6 is still insignificant.

mult.fit <- lm(V11~V2+V6+V10+V12, data=data)
summary(mult.fit)

## 
## Call:
## lm(formula = V11 ~ V2 + V6 + V10 + V12, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3353 -0.6287 -0.1920  0.5543  3.6961 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.721008   0.251285  26.747   <2e-16 ***
## V2           0.022375   0.152854   0.146    0.884    
## V6          -0.002720   0.002533  -1.074    0.283    
## V10          0.316442   0.197686   1.601    0.110    
## V12          0.628943   0.034296  18.339   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9251 on 1456 degrees of freedom
## Multiple R-squared:  0.2389, Adjusted R-squared:  0.2368 
## F-statistic: 114.2 on 4 and 1456 DF,  p-value: < 2.2e-16

A- Linear Regression: Using the Data with the Outliers

V6 appears to be significant in this case.
However, still V6 explains less than 0.5% (R^2) of the response!

lm.fit <- lm(V11~V6, data=twelve)
vis <- ggplot(twelve, aes(x=V6, y=V11)) +
              stat_smooth(method=lm, formula = y ~ x, colour="magenta")+
              geom_point(alpha=1, color="blue", size = 1) +
              xlab("V6") +
              ylab("V11")
vis

summary(lm.fit)

## 
## Call:
## lm(formula = V11 ~ V6, data = twelve)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0999 -0.8999 -0.2585  0.6779  4.5203 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.535238   0.048345  217.92  < 2e-16 ***
## V6          -0.007071   0.002543   -2.78  0.00549 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.063 on 1597 degrees of freedom
## Multiple R-squared:  0.004818,   Adjusted R-squared:  0.004194 
## F-statistic: 7.731 on 1 and 1597 DF,  p-value: 0.005492

B- Multiple Regression: Using the Data with the Outliers

V6 is less significant after adding the new predictors, the Estimate is approximately the same.
The model is still weak as the predictors cannot explain more than 25% of the variation in V11.

mult.fit <- lm(V11~V2+V6+V10+V12, data=twelve)
summary(mult.fit)

## 
## Call:
## lm(formula = V11 ~ V2 + V6 + V10 + V12, data = twelve)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3037 -0.6335 -0.1965  0.5502  4.9071 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.196874   0.237290  30.329   <2e-16 ***
## V2          -0.157667   0.144561  -1.091   0.2756    
## V6          -0.004514   0.002248  -2.008   0.0448 *  
## V10         -0.185035   0.145489  -1.272   0.2036    
## V12          0.621497   0.032038  19.399   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9363 on 1594 degrees of freedom
## Multiple R-squared:  0.2299, Adjusted R-squared:  0.228 
## F-statistic:   119 on 4 and 1594 DF,  p-value: < 2.2e-16

5- Multiple Linear Regression with CV k=10

mlcv.fit <- glm(V11 ~ V2+V6+V10+V12, data = data)
cv.error <- cv.glm(data, mlcv.fit, K = 10)$delta[1]
cv.error

## [1] 0.8600801

6- Quadratic Regression Model V10~V4^2 using LOOCV: Highly Expesnive

start_time <- Sys.time()
glm.quad <- glm(V10 ~ poly(V4, 2, raw = TRUE), data = data)
cv.error <- cv.glm(data, glm.quad)$delta[1]
end_time <- Sys.time()
time_loocv <- end_time - start_time
time_loocv

## Time difference of 4.20779 secs

7- Comparing the Computation Time for K=5 and K=1(LOOCV):

We notice LOOCV is slower by ~ 200 folds than CV with K=5.

start_time <- Sys.time()
glm.quad <- glm(V10 ~ poly(V4, 2, raw = TRUE), data = data)
cv.error <- cv.glm(data, glm.quad, K=5)$delta[1]
end_time <- Sys.time()

time_cv <- end_time - start_time
time_diff <- c(time_cv, time_loocv, time_loocv-time_cv)
names(time_diff) <- c("Time for CV in seconds", "Time for LOOCV in seconds", "Difference")
time_diff

## Time differences in secs
##    Time for CV in seconds Time for LOOCV in seconds                Difference 
##                0.01702499                4.20778990                4.19076490

8- Subset Selection with V11 as Response:

A- Best Subset Selection based on Adjusted R^2

set.seed(1234)
bestfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="exhaustive")
summ = summary(bestfit.reg)
names(summ)

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

summ$outmat

##           V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V12
## 1  ( 1 )  " " " " " " " " " " " " " " "*" " " " " " "
## 2  ( 1 )  " " " " " " " " " " " " " " "*" " " " " "*"
## 3  ( 1 )  "*" " " " " " " " " " " " " "*" "*" " " " "
## 4  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" " " " "
## 5  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7  ( 1 )  "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

plot(summ$adjr2, xlab = "num of variables", ylab = "AdjR^2", type = "b", col = "purple")
points(which.max(summ$adjr2), summ$adjr2[which.max(summ$adjr2)], col="red", cex=1, pch=20)

### Then, it is the model with 9 predictors that performs best according to AdjR^2
### From the Outmat above, we can see that the model belongs to that without V5,V6
bestfit.model <- glm(V11~.- V5 - V6, data = data)
summary(bestfit.model)

## 
## Call:
## glm(formula = V11 ~ . - V5 - V6, data = data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.39437  -0.35751  -0.04548   0.32463   2.12005  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.833e+02  1.328e+01  43.934  < 2e-16 ***
## V1           4.634e-01  2.021e-02  22.926  < 2e-16 ***
## V2           4.650e-01  1.125e-01   4.132 3.80e-05 ***
## V3           7.694e-01  1.319e-01   5.835 6.61e-09 ***
## V4           5.657e-01  2.496e-02  22.661  < 2e-16 ***
## V7          -2.068e-03  5.368e-04  -3.853 0.000122 ***
## V8          -5.935e+02  1.354e+01 -43.834  < 2e-16 ***
## V9           3.396e+00  1.473e-01  23.054  < 2e-16 ***
## V10          9.998e-01  1.236e-01   8.087 1.28e-15 ***
## V12          2.231e-01  2.251e-02   9.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3114849)
## 
##     Null deviance: 1637.27  on 1460  degrees of freedom
## Residual deviance:  451.96  on 1451  degrees of freedom
## AIC: 2454
## 
## Number of Fisher Scoring iterations: 2

B- Forward Subset Selection based on Cp

forwardfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="forward")
summ_forward = summary(forwardfit.reg)
names(summ_forward )

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

summ_forward$outmat

##           V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V12
## 1  ( 1 )  " " " " " " " " " " " " " " "*" " " " " " "
## 2  ( 1 )  " " " " " " " " " " " " " " "*" " " " " "*"
## 3  ( 1 )  " " " " " " "*" " " " " " " "*" " " " " "*"
## 4  ( 1 )  "*" " " " " "*" " " " " " " "*" " " " " "*"
## 5  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7  ( 1 )  "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

plot(summ_forward$cp, xlab = "num of variables", ylab = "Cp", type = "b", col = "purple")
points(which.min(summ_forward$cp), summ_forward$cp[which.min(summ_forward$cp)],
       col="red", cex=1, pch=20)

### Then, it is the model with 9 predictors also that performs best according to Cp
### From the Outmat above, we can see that the model belongs to that without V5,V6
### This is the same model as that of the bestfit.model
forwardfit.model <- bestfit.model
summary(forwardfit.model)

## 
## Call:
## glm(formula = V11 ~ . - V5 - V6, data = data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.39437  -0.35751  -0.04548   0.32463   2.12005  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.833e+02  1.328e+01  43.934  < 2e-16 ***
## V1           4.634e-01  2.021e-02  22.926  < 2e-16 ***
## V2           4.650e-01  1.125e-01   4.132 3.80e-05 ***
## V3           7.694e-01  1.319e-01   5.835 6.61e-09 ***
## V4           5.657e-01  2.496e-02  22.661  < 2e-16 ***
## V7          -2.068e-03  5.368e-04  -3.853 0.000122 ***
## V8          -5.935e+02  1.354e+01 -43.834  < 2e-16 ***
## V9           3.396e+00  1.473e-01  23.054  < 2e-16 ***
## V10          9.998e-01  1.236e-01   8.087 1.28e-15 ***
## V12          2.231e-01  2.251e-02   9.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3114849)
## 
##     Null deviance: 1637.27  on 1460  degrees of freedom
## Residual deviance:  451.96  on 1451  degrees of freedom
## AIC: 2454
## 
## Number of Fisher Scoring iterations: 2

forwardfit.model

## 
## Call:  glm(formula = V11 ~ . - V5 - V6, data = data)
## 
## Coefficients:
## (Intercept)           V1           V2           V3           V4           V7  
##   5.833e+02    4.634e-01    4.650e-01    7.694e-01    5.657e-01   -2.068e-03  
##          V8           V9          V10          V12  
##  -5.935e+02    3.396e+00    9.998e-01    2.231e-01  
## 
## Degrees of Freedom: 1460 Total (i.e. Null);  1451 Residual
## Null Deviance:       1637 
## Residual Deviance: 452   AIC: 2454

C- Backward Subset Selection based on BIC

backfit.reg <- regsubsets(V11~., data = data, nvmax = ncol(data), method="backward")
summ_back = summary(backfit.reg)
names(summ_back)

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

summ_back$outmat

##           V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V12
## 1  ( 1 )  " " " " " " " " " " " " " " "*" " " " " " "
## 2  ( 1 )  "*" " " " " " " " " " " " " "*" " " " " " "
## 3  ( 1 )  "*" " " " " " " " " " " " " "*" "*" " " " "
## 4  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" " " " "
## 5  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" " " "*"
## 6  ( 1 )  "*" " " " " "*" " " " " " " "*" "*" "*" "*"
## 7  ( 1 )  "*" " " "*" "*" " " " " " " "*" "*" "*" "*"
## 8  ( 1 )  "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
## 9  ( 1 )  "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

plot(summ_back$bic, xlab = "num of variables", ylab = "BIC", type = "b", col = "purple")
points(which.min(summ_back$bic), summ_back$bic[which.min(summ_back$bic)],
       col="red", cex=1, pch=20)

### Also the model with 9 Predictors is the best model obtained from backward subset selection

Basic Data Cleaning & Modeling