ICC-cost-analysis.R

# Libraries
library(glmnet)     #  This library contains the functions for training regularised linear regression models.

## Loading required package: Matrix

## Loaded glmnet 4.1-8

library(DescTools)  #  Descriptive statistics library.
library(ggplot2)
data <- read.csv("data.csv")

set.seed(2)  #  Setting random seed for random sample reproducibility

# Gathering training indices: 67% for training/cross-val, the rest for test.
train_indices <- sample(1:nrow(data), nrow(data) * 0.67, replace = FALSE)

# Now we'll partition the data as per the test design/plan.
train <- data[train_indices,]
test <- data[-train_indices,]

################## Data Understanding Code #####################
Str(data)     # Display the structure of the data

## 'data.frame':    369 obs. of  9 variables:
##  1 $ V.1: int  1 1 1 1 1 1 1 1 1 1 ...
##  2 $ V.2: num  685 750 1150 1450 1460 1480 1590 1810 1830 1860 ...
##  3 $ V.3: num  202 200 380 370 380 380 350 492 410 480 ...
##  4 $ V.4: num  13.7 90 575 406 627.8 ...
##  5 $ V.5: int  20 120 500 280 430 150 420 640 270 290 ...
##  6 $ V.6: num  460 846 591 702 733 ...
##  7 $ V.7: int  12 18 18 15 15 15 18 33 18 12 ...
##  8 $ V.8: int  140 1100 5300 3800 4700 1700 4900 5700 3700 3300 ...
##  9 $ Y  : int  30 150 600 370 590 190 620 900 300 380 ...

# View(data)
summary(data)   # Summary statistics of the data

##       V.1             V.2             V.3              V.4        
##  Min.   : 1.00   Min.   :  200   Min.   :  60.0   Min.   :   3.7  
##  1st Qu.: 4.00   1st Qu.:  715   1st Qu.: 190.0   1st Qu.:  67.2  
##  Median : 8.00   Median : 1195   Median : 300.0   Median : 163.8  
##  Mean   : 9.75   Mean   : 1648   Mean   : 407.7   Mean   : 302.2  
##  3rd Qu.:17.00   3rd Qu.: 2035   3rd Qu.: 480.0   3rd Qu.: 354.6  
##  Max.   :20.00   Max.   :15670   Max.   :5000.0   Max.   :7208.2  
##  NA's   :1       NA's   :1       NA's   :1        NA's   :1       
##       V.5             V.6              V.7             V.8        
##  Min.   : 10.0   Min.   : 193.1   Min.   : 5.00   Min.   :  40.0  
##  1st Qu.: 80.0   1st Qu.: 391.7   1st Qu.:15.00   1st Qu.: 437.5  
##  Median :140.0   Median : 518.1   Median :18.00   Median : 795.0  
##  Mean   :161.6   Mean   : 552.8   Mean   :18.47   Mean   :1081.9  
##  3rd Qu.:230.0   3rd Qu.: 660.7   3rd Qu.:21.00   3rd Qu.:1300.0  
##  Max.   :640.0   Max.   :3436.9   Max.   :48.00   Max.   :5700.0  
##  NA's   :1       NA's   :1        NA's   :1       NA's   :1       
##        Y        
##  Min.   : 20.0  
##  1st Qu.:107.5  
##  Median :190.0  
##  Mean   :228.8  
##  3rd Qu.:330.0  
##  Max.   :900.0  
##  NA's   :1

dim(data)       # Dimensions of the data (rows and columns)

## [1] 369   9

nrow(data)      # Number of rows in the data

## [1] 369

ncol(data)     # Number of columns in the data

## [1] 9

## DescTools::
# We've used Desc() to gather descriptive statistics and visualisations of data before. 
Desc(data)

## ------------------------------------------------------------------------------ 
## Describe data (data.frame):
## 
## data frame:  369 obs. of  9 variables
##      368 complete cases (99.7%)
## 
##   Nr  ColName  Class    NAs       Levels
##   1   V.1      integer  1 (0.3%)        
##   2   V.2      numeric  1 (0.3%)        
##   3   V.3      numeric  1 (0.3%)        
##   4   V.4      numeric  1 (0.3%)        
##   5   V.5      integer  1 (0.3%)        
##   6   V.6      numeric  1 (0.3%)        
##   7   V.7      integer  1 (0.3%)        
##   8   V.8      integer  1 (0.3%)        
##   9   Y        integer  1 (0.3%)        
## 
## 
## ------------------------------------------------------------------------------ 
## 1 - V.1 (integer)
## 
##   length      n    NAs  unique     0s   mean  meanCI'
##      369    368      1      20      0   9.75    9.08
##           99.7%   0.3%           0.0%          10.42
##                                                     
##      .05    .10    .25  median    .75    .90     .95
##     1.00   2.00   4.00    8.00  17.00  19.00   20.00
##                                                     
##    range     sd  vcoef     mad    IQR   skew    kurt
##    19.00   6.55   0.67    8.90  13.00   0.24   -1.45
##                                                     
## lowest : 1 (29), 2 (25), 3 (29), 4 (33), 5 (20)
## highest: 16 (7), 17 (21), 18 (20), 19 (20), 20 (32)
## 
## heap(?): remarkable frequency (9.0%) for the mode(s) (= 4)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 2 - V.2 (numeric)
## 
##      length         n     NAs    unique        0s      mean    meanCI'
##         369       368       1       213         0  1'648.35  1'483.95
##                 99.7%    0.3%                0.0%            1'812.76
##                                                                      
##         .05       .10     .25    median       .75       .90       .95
##      370.00    450.00  715.00  1'195.00  2'035.00  3'166.00  4'295.00
##                                                                      
##       range        sd   vcoef       mad       IQR      skew      kurt
##   15'470.00  1'603.86    0.97    896.97  1'320.00      4.10     27.08
##                                                                      
## lowest : 200.0 (2), 220.0, 230.0, 270.0, 280.0 (3)
## highest: 6'740.0, 6'930.0, 8'800.0, 14'500.0, 15'670.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 3 - V.3 (numeric)
## 
##     length       n     NAs  unique      0s    mean  meanCI'
##        369     368       1     101       0  407.73  363.79
##              99.7%    0.3%            0.0%          451.67
##                                                           
##        .05     .10     .25  median     .75     .90     .95
##     100.00  120.00  190.00  300.00  480.00  723.00  936.50
##                                                           
##      range      sd   vcoef     mad     IQR    skew    kurt
##   4'940.00  428.64    1.05  207.56  290.00    5.55   45.62
##                                                           
## lowest : 60.0 (2), 70.0, 80.0 (5), 90.0 (4), 100.0 (8)
## highest: 1'780.0, 2'500.0, 2'960.0, 3'440.0, 5'000.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 4 - V.4 (numeric)
## 
##      length        n     NAs   unique       0s     mean     meanCI'
##         369      368       1      333        0  302.210    250.610
##                99.7%    0.3%              0.0%             353.810
##                                                                   
##         .05      .10     .25   median      .75      .90        .95
##      20.140   32.220  67.200  163.800  354.650  675.920  1'034.385
##                                                                   
##       range       sd   vcoef      mad      IQR     skew       kurt
##   7'204.500  503.371   1.666  174.280  287.450    7.870     96.099
##                                                                   
## lowest : 3.7, 4.5, 6.0, 8.0, 8.6
## highest: 1'798.0, 1'800.0, 1'854.400, 2'668.0, 7'208.200
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 5 - V.5 (integer)
## 
##   length       n    NAs  unique      0s    mean  meanCI'
##      369     368      1      50       0  161.58  150.10
##            99.7%   0.3%            0.0%          173.05
##                                                        
##      .05     .10    .25  median     .75     .90     .95
##    30.00   40.00  80.00  140.00  230.00  300.00  370.00
##                                                        
##    range      sd  vcoef     mad     IQR    skew    kurt
##   630.00  111.91   0.69  103.78  150.00    1.16    1.57
##                                                        
## lowest : 10 (5), 20 (9), 30 (15), 40 (17), 50 (10)
## highest: 500, 530, 540, 580 (2), 640
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 6 - V.6 (numeric)
## 
##       length         n       NAs    unique        0s      mean    meanCI'
##          369       368         1       310         0  552.8204  524.6332
##                  99.7%      0.3%                0.0%            581.0075
##                                                                         
##          .05       .10       .25    median       .75       .90       .95
##     273.7285  306.4060  391.6775  518.0950  660.7250  793.5190  877.6435
##                                                                         
##        range        sd     vcoef       mad       IQR      skew      kurt
##   3'243.8500  274.9744    0.4974  203.3979  269.0475    4.6295   40.4373
##                                                                         
## lowest : 193.08 (2), 196.07 (2), 205.31, 206.95, 216.22
## highest: 1'362.9200, 1'446.6700, 1'495.0500, 2'648.0400, 3'436.9300
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 7 - V.7 (integer)
## 
##   length      n    NAs  unique     0s   mean  meanCI'
##      369    368      1      18      0  18.47   17.88
##           99.7%   0.3%           0.0%          19.06
##                                                     
##      .05    .10    .25  median    .75    .90     .95
##    12.00  12.00  15.00   18.00  21.00  24.00   30.00
##                                                     
##    range     sd  vcoef     mad    IQR   skew    kurt
##    43.00   5.74   0.31    4.45   6.00   1.41    4.39
##                                                     
## lowest : 5, 6 (3), 7, 8, 9 (11)
## highest: 36, 39 (3), 42, 45, 48
## 
## heap(?): remarkable frequency (30.2%) for the mode(s) (= 18)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 8 - V.8 (integer)
## 
##     length       n     NAs  unique        0s      mean    meanCI'
##        369     368       1     121         0  1'081.88    979.79
##              99.7%    0.3%              0.0%            1'183.96
##                                                                 
##        .05     .10     .25  median       .75       .90       .95
##     170.00  217.00  437.50  795.00  1'300.00  2'300.00  3'300.00
##                                                                 
##      range      sd   vcoef     mad       IQR      skew      kurt
##   5'660.00  995.88    0.92  600.45    862.50      2.07      4.71
##                                                                 
## lowest : 40, 80 (2), 90, 100 (2), 110 (2)
## highest: 4'800, 4'900 (2), 5'000, 5'300, 5'700
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 9 - Y (integer)
## 
##   length       n     NAs  unique      0s    mean  meanCI'
##      369     368       1      61       0  228.79  212.75
##            99.7%    0.3%            0.0%          244.83
##                                                         
##      .05     .10     .25  median     .75     .90     .95
##    50.00   67.00  107.50  190.00  330.00  430.00  486.50
##                                                         
##    range      sd   vcoef     mad     IQR    skew    kurt
##   880.00  156.43    0.68  148.26  222.50    1.10    1.36
##                                                         
## lowest : 20 (3), 30 (8), 40 (4), 50 (13), 60 (9)
## highest: 620, 630, 650 (3), 700 (3), 900 (2)
## 
## ' 95%-CI (classic)

# Calculate the mean for each variable
mean_values <- sapply(data, mean, na.rm = TRUE)
print(mean_values)

##        V.1        V.2        V.3        V.4        V.5        V.6        V.7 
##    9.75000 1648.35462  407.72962  302.20984  161.57609  552.82035   18.47011 
##        V.8          Y 
## 1081.87500  228.79076

# Calculate the median for each variable
median_values <- sapply(data, median, na.rm = TRUE)
print(median_values)

##      V.1      V.2      V.3      V.4      V.5      V.6      V.7      V.8 
##    8.000 1195.000  300.000  163.800  140.000  518.095   18.000  795.000 
##        Y 
##  190.000

# Function to calculate mode
calculate_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Calculate the mode for each variable
mode_values <- sapply(data, calculate_mode)
print(mode_values)

##     V.1     V.2     V.3     V.4     V.5     V.6     V.7     V.8       Y 
##    4.00 1540.00  250.00  154.00  100.00  499.25   18.00 1100.00  110.00

# Calculate the range for each variable
range_values <- sapply(data, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
print(range_values)

##      V.1      V.2      V.3      V.4      V.5      V.6      V.7      V.8 
##    19.00 15470.00  4940.00  7204.50   630.00  3243.85    43.00  5660.00 
##        Y 
##   880.00

# Calculate the variance for each variable
variance_values <- sapply(data, var, na.rm = TRUE)
print(variance_values)

##          V.1          V.2          V.3          V.4          V.5          V.6 
## 4.290736e+01 2.572359e+06 1.837289e+05 2.533826e+05 1.252394e+04 7.561089e+04 
##          V.7          V.8            Y 
## 3.299093e+01 9.917673e+05 2.447163e+04

# Calculate the standard deviation for each variable
std_dev_values <- sapply(data, sd, na.rm = TRUE)
print(std_dev_values)

##         V.1         V.2         V.3         V.4         V.5         V.6 
##    6.550371 1603.857419  428.636121  503.371240  111.910409  274.974351 
##         V.7         V.8           Y 
##    5.743773  995.875153  156.434096

# Count missing values for each variable
missing_values <- colSums(is.na(data))
print(missing_values)

## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8   Y 
##   1   1   1   1   1   1   1   1   1

# Identify rows with missing values
rows_with_missing <- which(rowSums(is.na(data)) > 0)
print(rows_with_missing)

## [1] 369

# View the row with missing values
print(data[369, ])

##     V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8  Y
## 369  NA  NA  NA  NA  NA  NA  NA  NA NA

# Remove the row with missing values
data <- data[-369, ]

### Correlation###
# Exploring associations using Pearson's correlation statistic 
# is important if we might wish to explore the data using
# linear regression. cor(data) below facilitates calculations 
# of correlation matrices based on input matrices or data
# frames.
train.corr <- cor(train)
print(train.corr)

##            V.1        V.2        V.3         V.4         V.5        V.6
## V.1  1.0000000 -0.3668054 -0.3444946 -0.32308120 -0.32995247 -0.5764518
## V.2 -0.3668054  1.0000000  0.9549185  0.82890755  0.29772735  0.3161668
## V.3 -0.3444946  0.9549185  1.0000000  0.76838678  0.27227474  0.2889082
## V.4 -0.3230812  0.8289075  0.7683868  1.00000000  0.58715391  0.3649887
## V.5 -0.3299525  0.2977273  0.2722747  0.58715391  1.00000000  0.3252373
## V.6 -0.5764518  0.3161668  0.2889082  0.36498870  0.32523730  1.0000000
## V.7  0.1389064 -0.1197643 -0.1608562 -0.09320812  0.04491195  0.1173662
## V.8 -0.4305652  0.3031388  0.2758476  0.49795034  0.79720694  0.1349534
## Y   -0.2790013  0.2781865  0.2509129  0.56824306  0.97575557  0.3006600
##              V.7          V.8          Y
## V.1  0.138906413 -0.430565227 -0.2790013
## V.2 -0.119764316  0.303138792  0.2781865
## V.3 -0.160856247  0.275847617  0.2509129
## V.4 -0.093208115  0.497950339  0.5682431
## V.5  0.044911951  0.797206940  0.9757556
## V.6  0.117366208  0.134953431  0.3006600
## V.7  1.000000000 -0.001755917  0.1551566
## V.8 -0.001755917  1.000000000  0.7802423
## Y    0.155156620  0.780242301  1.0000000

ggcorrplot::ggcorrplot(train.corr)  #  Correlation heat map

# Faceted scatter plots of Total Floor Area (V.2) and Preliminary Estimated Construction Cost (V.4) by Project Locality (V.1)
ggplot(data, aes(x = V.2, y = V.4)) + 
  geom_point() +
  facet_wrap(~V.1) +
  labs(title = "Scatter Plots of V.2 vs V.4 by Project Locality", x = "Total Floor Area (m^2)", y = "Preliminary Estimated Construction Cost (IRR)")

# Correlation matrix
correlation_matrix <- cor(data[, c("V.1","V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")], use = "complete.obs")
print(correlation_matrix)

##             V.1        V.2        V.3         V.4         V.5        V.6
## V.1  1.00000000 -0.2679524 -0.1905237 -0.31618372 -0.30916504 -0.6137182
## V.2 -0.26795239  1.0000000  0.9504365  0.71839832  0.20041915  0.2307232
## V.3 -0.19052374  0.9504365  1.0000000  0.58660991  0.13807650  0.1635075
## V.4 -0.31618372  0.7183983  0.5866099  1.00000000  0.59254692  0.3541253
## V.5 -0.30916504  0.2004191  0.1380765  0.59254692  1.00000000  0.3094863
## V.6 -0.61371820  0.2307232  0.1635075  0.35412534  0.30948629  1.0000000
## V.7  0.01812362 -0.1216557 -0.1782655 -0.04741677  0.06221763  0.1411306
## V.8 -0.42444770  0.2459270  0.1791519  0.53392526  0.81448707  0.1656995
## Y   -0.29125587  0.1995680  0.1359838  0.58009348  0.97307794  0.2975429
##             V.7         V.8          Y
## V.1  0.01812362 -0.42444770 -0.2912559
## V.2 -0.12165574  0.24592705  0.1995680
## V.3 -0.17826546  0.17915194  0.1359838
## V.4 -0.04741677  0.53392526  0.5800935
## V.5  0.06221763  0.81448707  0.9730779
## V.6  0.14113060  0.16569948  0.2975429
## V.7  1.00000000  0.02892702  0.1837235
## V.8  0.02892702  1.00000000  0.8004003
## Y    0.18372346  0.80040025  1.0000000

# Identify potential multicollinearity (absolute correlation > 0.7)
col_to_check <- which(abs(correlation_matrix) > 0.7 & correlation_matrix != 1, arr.ind = TRUE)
if (any(col_to_check)) {
  print(paste0("High correlation detected between: ", rownames(correlation_matrix)[col_to_check[, 1]], " and ", 
               rownames(correlation_matrix)[col_to_check[, 2]]))
  # Further investigation needed to decide if variable removal or other techniques are necessary
} else {
  print("No high correlations (> 0.7) detected among relevant variables.")
}

##  [1] "High correlation detected between: V.3 and V.2"
##  [2] "High correlation detected between: V.4 and V.2"
##  [3] "High correlation detected between: V.2 and V.3"
##  [4] "High correlation detected between: V.2 and V.4"
##  [5] "High correlation detected between: V.8 and V.5"
##  [6] "High correlation detected between: Y and V.5"  
##  [7] "High correlation detected between: V.5 and V.8"
##  [8] "High correlation detected between: Y and V.8"  
##  [9] "High correlation detected between: V.5 and Y"  
## [10] "High correlation detected between: V.8 and Y"

#################### Data Preparation Code ######################

library(tidyverse)  #  Streamlined data manipulation, visualization and analysis

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)      #  Building, training, evaluating and tuning models

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE

head(train, 4)

##     V.1 V.2 V.3  V.4 V.5    V.6 V.7 V.8   Y
## 341  20 810 250 32.4  40 274.06  15 570  50
## 198   9 520 120 62.4 120 667.98  27 360 170
## 262  15 430 110 73.1 170 409.62  21 650 210
## 273  16 810 210 48.6  60 432.44  18 260  80

tail(train, 4)

##     V.1  V.2 V.3   V.4 V.5    V.6 V.7  V.8   Y
## 31    2  450 120  58.5 130 913.31  21  690 160
## 330  19 1530 390  15.3  10 193.08  15  120  40
## 19    1 3420 880 547.2 160 890.64  12 2000 180
## 30    2  370 120  40.7 110 585.22  15 1200 140

class(train) # show the data type

## [1] "data.frame"

str(train)  # Checking the structure/format of the data.

## 'data.frame':    247 obs. of  9 variables:
##  $ V.1: int  20 9 15 16 20 10 18 8 3 5 ...
##  $ V.2: num  810 520 430 810 1060 450 450 450 3280 2630 ...
##  $ V.3: num  250 120 110 210 330 120 120 120 720 580 ...
##  $ V.4: num  32.4 62.4 73.1 48.6 21.2 ...
##  $ V.5: int  40 120 170 60 20 120 180 230 140 120 ...
##  $ V.6: num  274 668 410 432 205 ...
##  $ V.7: int  15 27 21 18 12 24 21 21 18 18 ...
##  $ V.8: int  570 360 650 260 210 500 920 1700 1200 850 ...
##  $ Y  : int  50 170 210 80 30 210 270 390 150 160 ...

summary(train)

##       V.1              V.2             V.3              V.4        
##  Min.   : 1.000   Min.   :  200   Min.   :  60.0   Min.   :   3.7  
##  1st Qu.: 4.000   1st Qu.:  705   1st Qu.: 190.0   1st Qu.:  67.6  
##  Median : 8.000   Median : 1150   Median : 290.0   Median : 160.5  
##  Mean   : 9.534   Mean   : 1603   Mean   : 387.7   Mean   : 317.2  
##  3rd Qu.:15.500   3rd Qu.: 1905   3rd Qu.: 445.0   3rd Qu.: 352.4  
##  Max.   :20.000   Max.   :15670   Max.   :3440.0   Max.   :7208.2  
##       V.5             V.6              V.7             V.8             Y      
##  Min.   : 10.0   Min.   : 193.1   Min.   : 5.00   Min.   :  80   Min.   : 20  
##  1st Qu.: 80.0   1st Qu.: 396.5   1st Qu.:15.00   1st Qu.: 465   1st Qu.:110  
##  Median :140.0   Median : 523.6   Median :18.00   Median : 810   Median :190  
##  Mean   :165.4   Mean   : 563.2   Mean   :18.53   Mean   :1113   Mean   :232  
##  3rd Qu.:230.0   3rd Qu.: 665.7   3rd Qu.:21.00   3rd Qu.:1350   3rd Qu.:330  
##  Max.   :640.0   Max.   :3436.9   Max.   :42.00   Max.   :5700   Max.   :900

train <- mutate_at(train, vars(V.1, V.5, V.7, V.8, Y), as.numeric) # convert total to numeric variable
str(train)                                 # let`s check the data structure again

## 'data.frame':    247 obs. of  9 variables:
##  $ V.1: num  20 9 15 16 20 10 18 8 3 5 ...
##  $ V.2: num  810 520 430 810 1060 450 450 450 3280 2630 ...
##  $ V.3: num  250 120 110 210 330 120 120 120 720 580 ...
##  $ V.4: num  32.4 62.4 73.1 48.6 21.2 ...
##  $ V.5: num  40 120 170 60 20 120 180 230 140 120 ...
##  $ V.6: num  274 668 410 432 205 ...
##  $ V.7: num  15 27 21 18 12 24 21 21 18 18 ...
##  $ V.8: num  570 360 650 260 210 500 920 1700 1200 850 ...
##  $ Y  : num  50 170 210 80 30 210 270 390 150 160 ...

is.na(train)                                     # classic way to check NA`s

##       V.1   V.2   V.3   V.4   V.5   V.6   V.7   V.8     Y
## 341 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 198 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 262 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 273 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 349 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 204 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 297 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 178 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 75  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 131 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 306 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 311 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 63  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 136 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 231 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 289 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 54  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 112 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 171 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 38  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 110 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 144 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 45  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 238 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 208 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 134 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 339 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 9   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 130 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 244 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 129 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 304 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 363 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 301 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 354 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 274 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 8   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 164 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 37  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 226 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 149 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 205 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 242 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 44  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 276 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 156 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 342 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 106 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 175 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 359 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 182 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 224 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 271 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 13  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 189 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 96  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 166 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 265 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 53  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 143 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 36  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 17  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 241 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 80  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 127 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 267 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 79  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 78  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 364 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 153 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 192 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 23  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 28  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 196 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 185 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 221 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 102 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 151 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 71  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 275 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 212 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 239 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 269 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 133 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 154 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 163 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 91  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 225 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 191 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 142 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 25  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 361 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 197 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 350 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 295 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 172 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 356 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 94  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 34  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 72  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 77  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 81  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 176 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 355 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 29  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 51  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 200 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 263 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 82  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 287 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 109 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 362 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 12  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 222 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 314 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 150 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 345 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 353 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 278 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 298 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 76  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 195 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 310 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 59  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 229 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 320 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 146 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 326 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 313 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 118 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 282 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 181 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 210 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 180 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 107 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 14  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 98  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 213 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 56  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 254 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 247 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 124 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 281 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 139 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 62  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 35  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 86  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 217 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 335 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 88  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 70  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 89  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 95  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 252 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 103 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 43  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 16  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 344 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 333 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 257 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 227 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 11  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 50  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 108 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 155 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 209 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 42  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 135 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 319 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 332 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 87  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 308 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 193 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 100 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 138 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 67  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 177 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 218 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 169 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 145 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 234 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 141 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 184 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 283 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 327 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 256 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 233 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 343 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 148 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 117 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 55  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 315 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 346 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 255 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 250 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 41  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 48  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 173 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 61  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 111 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 90  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 165 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 85  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 243 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 57  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 6   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 161 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 199 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 261 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 20  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 328 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 104 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 290 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 321 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 264 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 235 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 360 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 122 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 277 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 338 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 168 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 83  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 317 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 215 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 337 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 206 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 240 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 232 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 93  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 347 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 286 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 167 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 201 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 219 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 7   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 211 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 251 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 312 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 322 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 31  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 330 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 19  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 30  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

sum(is.na(train))                                # counting NA`s

## [1] 0

apply(is.na(train),2, which)                     # which indexes of NA`s (df only)

## integer(0)

which(complete.cases(train))                     # identify observed complete values

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## [235] 235 236 237 238 239 240 241 242 243 244 245 246 247

train <- na.omit(train)

clean.vector <- na.omit(list(train))               # clean/remove a vector NA`s
clean.df <- na.omit(train)                         # clean/remove a dataframe NA`s
apply(is.na(clean.df),2, which)                   # make sure if there are missing values

## integer(0)

any(is.na(clean.vector))

## [1] FALSE

any(is.na(clean.df))

## [1] FALSE

train %>% pull() %>% head()                 # extract column values of `state` as a vector

## [1]  50 170 210  80  30 210

print(train)

##     V.1     V.2    V.3     V.4 V.5     V.6 V.7  V.8   Y
## 341  20   810.0  250.0   32.40  40  274.06  15  570  50
## 198   9   520.0  120.0   62.40 120  667.98  27  360 170
## 262  15   430.0  110.0   73.10 170  409.62  21  650 210
## 273  16   810.0  210.0   48.60  60  432.44  18  260  80
## 349  20  1060.0  330.0   21.20  20  205.31  12  210  30
## 204  10   450.0  120.0   54.00 120  490.57  24  500 210
## 297  18   450.0  120.0   81.00 180  324.97  21  920 270
## 178   8   450.0  120.0  103.50 230  505.25  21 1700 390
## 75    3  3280.0  720.0  459.20 140  805.03  18 1200 150
## 131   5  2630.0  580.0  315.60 120  599.10  18  850 160
## 306  18  1120.0  250.0  246.40 220  339.07  21 1200 350
## 311  18  1380.0  360.0   82.80  60  245.28  15  430  90
## 63    3  1820.0  400.0  655.20 360  474.74  15 3400 450
## 136   5  6740.0 1300.0  674.00 100  603.07  18  780 130
## 231  13   730.0  230.0   14.60  20  397.43  15  150  30
## 289  17  1410.0  310.0  239.70 170  338.79  18  960 270
## 54    2  5540.0  850.0 1551.20 280  957.42  27 1200 440
## 112   4  2120.0  470.0  508.80 240  817.74  18 1100 350
## 171   7  2110.0  470.0  189.90  90  620.86  15  490 110
## 38    2   900.0  300.0  351.00 390  755.51  18 2600 600
## 110   4  2100.0  380.0  819.00 390  601.08  12 2800 510
## 144   6  1020.0  270.0  142.80 140  829.60  18  840 180
## 45    2  1710.0  440.0  171.00 100  798.68  15  630 120
## 238  14   310.0   80.0   31.00 100  487.86  27  770 170
## 208  11   220.0   70.0   30.80 140  478.71  21  770 210
## 134   5  3650.0  940.0 1168.00 320  802.83  15 1500 420
## 339  20   580.0  150.0  104.40 180  348.70  21 1400 300
## 9     1  1830.0  410.0  494.10 270  650.57  18 3700 300
## 130   5  2500.0  640.0  300.00 120  599.10  12  810 140
## 244  14   580.0  150.0   75.40 130  442.94  21 1100 190
## 3     1  1150.0  380.0  575.00 500  590.68  18 5300 600
## 129   5  2370.0  730.0   71.10  30  400.73   9  210  90
## 304  18   950.0  250.0  133.00 140  337.33  18  610 160
## 363  20  2350.0  610.0  423.00 180  324.97  12  860 220
## 301  18   610.0  160.0   73.20 120  302.12  21  440 160
## 354  20  1370.0  360.0  191.80 140  271.21  15 1200 220
## 274  16   940.0  250.0  244.40 260  400.72  15 1400 370
## 8     1  1810.0  492.0 1158.40 640  843.98  33 5700 900
## 164   7  1060.0  240.0  190.80 180  784.87  21 1100 250
## 37    2   820.0  250.0  303.40 370  928.27  21 2400 580
## 226  13   520.0  140.0   78.00 150  466.12  21 1200 220
## 149   6  1450.0  320.0  377.00 260  443.35  21 3100 410
## 205  10   490.0  160.0   14.70  30  467.30  15  100  50
## 242  14   470.0  120.0   32.90  70  493.59  21  380  90
## 44    2  1540.0  590.0  215.60 140  698.95   6  820 160
## 276  17   370.0   80.0   51.80 140  337.33  42  470 270
## 156   7   350.0  143.0  122.50 350  442.61   9 1800 420
## 342  20   810.0  250.0   32.40  40  275.94  12  390  50
## 106   4  1900.0  550.0  437.00 230 1362.92  21  550 320
## 175   7  3270.0  510.0  327.00 100  579.84  27  230  90
## 359  20  1840.0  420.0  220.80 120  263.61  15 1500 170
## 182   8   550.0  140.0   44.00  80  578.67  21  530 100
## 224  12  2390.0  530.0  239.00 100  499.25  18  400 140
## 271  16   450.0  120.0    4.50  10  193.08  21  180  50
## 13    1  2620.0  670.0 1231.40 470  724.38  15 3900 650
## 189   8  1050.0  270.0  115.50 110  612.31  18  840 140
## 96    4  1500.0  330.0  540.00 360  867.43  21 1300 440
## 166   7  1380.0  310.0  193.20 140  744.83  21  700 200
## 265  15   750.0  200.0   45.00  60  423.07  18  360  80
## 53    2  4330.0 1110.0   86.60  20  421.26  12  170  30
## 143   6   920.0  210.0  119.60 130  649.02  24 1300 200
## 36    2   760.0  200.0  106.40 140  616.47  18  940 190
## 17    1  3040.0  670.0  881.60 290  561.79  15 4900 440
## 241  14   370.0  100.0   74.00 200  481.91  24  650 280
## 80    3  4840.0  930.0 1452.00 300  541.61  21 3300 430
## 127   5  2330.0  600.0  256.30 110  792.82  12  390 120
## 267  15  1120.0  290.0  358.40 320  421.99  18 1600 430
## 79    3  4600.0 1000.0 2668.00 580 3436.93  33 1300 900
## 78    3  4090.0  900.0 1104.30 270  655.93  15 2200 370
## 364  20  2500.0  510.0  325.00 130  251.84  18 1100 200
## 153   6  2880.0  740.0  662.40 230  579.07  15 2300 300
## 192   8  1510.0  390.0  241.60 160  654.09  15 1100 210
## 23    1  4080.0  790.0  530.40 130  759.17  21 1800 190
## 28    1  6930.0 1780.0  485.10  70  718.57  15  590  90
## 1     1   685.0  202.0   13.70  20  459.54  12  140  30
## 196   8  1580.0  410.0   47.40  30  585.11  15  210  70
## 185   8   670.0  180.0  154.10 230  577.03  18 1700 350
## 221  12  1000.0  260.0   20.00  20  436.43  15   80  30
## 102   4  1660.0  430.0  282.20 170  728.63  15 1200 240
## 151   6  1820.0  400.0  546.00 300  541.61  21 2600 430
## 71    3  2530.0  560.0  759.00 300  722.86  18 1800 330
## 275  16  6700.0 2500.0  603.00  90  279.67  27  580 150
## 212  12   230.0   60.0   18.40  80  460.02  30  350 110
## 239  14   330.0   90.0   23.10  70  414.80  27  320 100
## 269  16   200.0  100.0   46.00 230  445.56   9  820 300
## 133   5  3180.0  700.0 1049.40 330  732.16  15 1600 430
## 154   6  3050.0  670.0 1006.50 330  508.61  15 4300 460
## 163   7   940.0  250.0   94.00 100  723.33  18  550 120
## 91    4  1200.0 1100.0  300.00 250  451.35  21  980 370
## 225  13   430.0  110.0   81.70 190  476.68  24  820 310
## 191   8  1300.0  290.0  247.00 190  735.11  21 1100 300
## 142   6   850.0  220.0   68.00  80  548.12  15  870 100
## 25    1  4880.0 1070.0 1854.40 380  614.00  15 4100 520
## 361  20  2100.0  600.0  210.00 100  288.37  30  640 190
## 197   9   500.0  130.0  145.00 290  523.56  21  920 420
## 350  20  1070.0  280.0  160.50 150  290.58  18 1500 240
## 295  17  2170.0  480.0  195.30  90  439.07  18  340 130
## 172   7  2190.0  420.0  459.90 210  652.57  21 1000 300
## 356  20  1520.0  350.0  212.80 140  310.61  39 1200 300
## 94    4  1450.0  336.0  507.50 350  442.61   6 1800 400
## 34    2   690.0  160.0   89.70 130  770.35  24  990 190
## 72    3  3100.0  640.0 1798.00 580  647.79  21 4600 700
## 77    3  4070.0 1040.0  895.40 220  634.42  12 1800 290
## 81    3  5110.0  980.0 1430.80 280  505.51  21 2800 410
## 176   8   270.0   90.0   24.30  90  693.15  21  480 100
## 355  20  1390.0  360.0   41.70  30  206.95  12  350  40
## 29    1 15670.0 3440.0 7208.20 460 1010.50   5 3000 630
## 51    2  3600.0  980.0  612.00 170 2648.04  21  250 145
## 200   9  1680.0  370.0  285.60 170  490.23  18  800 260
## 263  15   460.0  120.0   96.60 210  461.32  21 1300 360
## 82    3  5200.0  800.0  208.00  40  958.39  39  120 130
## 287  17  1020.0  230.0   71.40  70  408.78  21  290 100
## 109   4  2070.0  400.0  269.10 130  723.64  21  730 170
## 362  20  2310.0  600.0  531.30 230  303.30  12 1400 280
## 12    1  2500.0  640.0 1050.00 420  647.32  36 4800 700
## 222  12  1370.0  300.0  109.60  80  476.68  21  640 100
## 314  18  1700.0  290.0  323.00 190  417.38  27  820 340
## 150   6  1800.0  400.0  378.00 210  659.84  18 2000 300
## 345  20   870.0  230.0  208.80 240  308.05  15 1300 320
## 353  20  1350.0  350.0  108.00  80  251.37  27  830 150
## 278  17   470.0  130.0  126.90 270  416.13  21  750 430
## 298  18   480.0  130.0   62.40 130  403.97  21  420 190
## 76    3  4000.0  880.0  400.00 100  750.70  15  540 110
## 195   8  1580.0  363.0  284.40 180  615.49  24  770 250
## 310  18  1380.0  360.0  289.80 210  339.32  15  690 300
## 59    3  1380.0  360.0  110.40  80  735.87  15  630 100
## 229  13   670.0  180.0   46.90  70  539.11  18  290  80
## 320  19   510.0  130.0   20.40  40  300.28  21  150  50
## 146   6  1070.0  210.0   42.80  40  772.31  27  430 110
## 326  19   940.0  240.0  141.00 150  377.65  18  430 220
## 313  18  1540.0  340.0  338.80 220  339.07  18  660 330
## 118   5   990.0  260.0  297.00 300  581.16  18 1700 470
## 282  17   540.0  140.0  113.40 210  406.81  21  610 350
## 181   8   530.0  140.0  148.40 280  542.42  21 1800 470
## 210  11   870.0  230.0  174.00 200  481.91  18 1300 220
## 180   8   510.0  160.0   15.30  30  654.64  15  150  50
## 107   4  1900.0  420.0  190.00 100  575.02  18  720 110
## 2     1   750.0  200.0   90.00 120  846.15  18 1100 150
## 14    1  2750.0  610.0  357.50 130  916.66  18 1300 160
## 98    4  1560.0  400.0  171.60 110  878.55  15  420 130
## 213  12   280.0   80.0   22.40  80  444.03  27  650 140
## 56    3   820.0  260.0   49.20  60  644.47  12  630  90
## 254  14   940.0  250.0   56.40  60  462.80  15  190  70
## 247  14   720.0  190.0  165.60 230  354.48  18 1200 350
## 124   5  2140.0  470.0  235.40 110  663.38  18  660 140
## 281  17   530.0  140.0   31.80  60  350.38  21  450  90
## 139   6   540.0  140.0   70.20 130  759.17  21 1400 180
## 62    3  1710.0  440.0  119.70  70  525.49  15  880  80
## 35    2   730.0  190.0  226.30 310  893.95  18 1800 450
## 86    4   720.0  190.0   72.00 100  798.68  18  490 120
## 217  12   810.0  210.0  194.40 240  532.48  18  900 330
## 335  19  2620.0  580.0  786.00 300  395.62  18 1000 400
## 88    4  1052.0  250.0  168.32 160  948.12  18  770 200
## 70    3  2260.0  440.0  158.20  70  569.44  21  620  80
## 89    4  1130.0  260.0  339.00 300  511.56   9 1400 390
## 95    4  1450.0  336.0  536.50 370  371.34  12 2300 390
## 252  14   850.0  220.0   59.50  70  422.15  15  520  80
## 103   4  1800.0  500.0  954.00 530  626.12  21 1900 650
## 43    2  1540.0  400.0  154.00 100  771.33  15  700 110
## 16    1  3030.0  930.0  515.10 170 1007.38   9 1500 190
## 344  20   840.0  190.0   67.20  80  273.55  24  650 120
## 333  19  1820.0  470.0  382.20 210  379.13  15  720 300
## 257  14  1110.0  250.0   88.80  80  390.29  21  900 120
## 227  13   550.0  140.0   77.00 140  439.90  21 1000 210
## 11    1  2110.0  540.0  189.90  90  732.14  15  690 110
## 50    2  3160.0  700.0  537.20 170  694.97  18 1100 300
## 108   4  2050.0  700.0  492.00 240  527.22  30 2100 450
## 155   6  3260.0  720.0  489.00 150  642.91  15 1500 220
## 209  11   600.0  160.0   12.00  20  392.14  18  110  60
## 42    2  1350.0  350.0  270.00 200  576.74  15 1700 270
## 135   5  5280.0 1160.0 1584.00 300  659.02  15 1500 420
## 319  19   410.0  110.0   20.50  50  301.54  24  400  70
## 332  19  1650.0  430.0  297.00 180  433.72  15  490 200
## 87    4   930.0  240.0   93.00 100  770.16  15  460 110
## 308  18  1210.0  270.0  193.60 160  318.86  21  680 260
## 193   8  1540.0  340.0  154.00 100  592.57  21  710 140
## 100   4  1650.0  432.0  264.00 160  851.23  15  550 200
## 138   6   360.0  100.0   86.40 240  409.25  24 3300 390
## 67    3  2120.0  470.0  572.40 270  650.57  18 2900 300
## 177   8   400.0  110.0   40.00 100  499.25  21  880 160
## 218  12   960.0  250.0   67.20  70  417.09  15  350  80
## 169   7  1980.0  440.0  495.00 250  549.19  18 1700 360
## 145   6  1040.0  270.0   93.60  90  718.81  15  810 110
## 234  13   960.0  220.0   67.20  70  405.89  21  140  60
## 141   6   840.0  220.0  100.80 120  690.02  18  770 130
## 184   8   640.0  250.0   96.00 150  654.06   6 1100 170
## 283  17   630.0  170.0  163.80 260  342.87  18 1100 350
## 327  19  1000.0  260.0  160.00 160  309.95  15  860 250
## 256  14  1080.0  240.0  205.20 190  343.02  18 1000 270
## 233  13   930.0  240.0   46.50  50  513.26  18  220  80
## 343  20   830.0  260.0   41.50  50  287.51  12  600  60
## 148   6  1330.0  300.0  172.90 130  649.02  21 1100 190
## 117   5   870.0  230.0   69.60  80  650.79  18  320 100
## 55    3   580.0  150.0   17.40  30  631.89  18  240  70
## 315  18  1910.0  420.0  305.60 160  388.70  18  810 210
## 346  20   890.0  230.0   35.60  40  237.03  18  350  50
## 255  14  1070.0  280.0   74.90  70  539.11  18  260  80
## 250  14   800.0  210.0   56.00  70  372.41  18  440 100
## 41    2  1260.0  330.0  189.00 150  875.96  15  790 210
## 48    2  2710.0  600.0  867.20 320  637.72  18 2300 490
## 173   7  2240.0  430.0  134.40  60  434.00  24  390  90
## 61    3  1610.0  410.0  177.10 110  585.22  15 1700 140
## 111   4  2100.0  450.0  546.00 260  816.95  15  600 330
## 90    4  1190.0  310.0  130.90 110  642.37  15  870 150
## 165   7  1310.0  290.0  104.80  80  578.67  21  480 100
## 4     1  1450.0  370.0  406.00 280  702.48  15 3800 370
## 85    4   440.0  120.0   13.20  30  779.57  24  160  60
## 243  14   520.0  140.0  109.20 210  323.66  18 1000 320
## 57    3  1090.0  280.0  152.60 140  777.06  15 1500 180
## 6     1  1480.0  380.0  222.00 150  731.79  15 1700 190
## 161   7   900.0  150.0   63.00  70  751.88  18  210 100
## 199   9   970.0  250.0   97.00 100  583.97  18  360 140
## 261  15   400.0  110.0   40.00 100  428.61  24  740 160
## 20    1  3560.0  690.0  925.60 260  885.89  21 2500 360
## 328  19  1150.0  260.0  161.00 140  307.54  21  990 240
## 104   4  1800.0  560.0   36.00  20  459.54  12  110  30
## 290  17  1510.0  330.0  166.10 110  341.82  18  640 140
## 321  19   520.0  140.0   57.20 110  341.82  21  420 160
## 264  15   620.0  160.0  136.40 220  375.15  18 1100 330
## 235  13   982.5  202.5  353.70 360  554.84  15 1900 500
## 360  20  1900.0  430.0  285.00 150  364.41  21  640 220
## 122   5  1810.0  400.0  488.70 270  538.07  18 2000 420
## 277  17   420.0  110.0   79.80 190  307.00  24  750 310
## 338  20   560.0  220.0   22.40  40  319.47   9  350  50
## 168   7  1540.0  400.0  154.00 100  555.04  15  870 140
## 83    3  6000.0 1320.0 1800.00 300  752.65  15 2800 400
## 317  19   280.0   90.0   11.20  40  325.39  18  180  50
## 215  12   540.0  140.0   75.60 140  478.71  18  900 200
## 337  20   510.0  160.0   30.60  60  245.28  27  790 110
## 206  10   660.0  170.0  112.20 170  490.23  18  680 260
## 240  14   370.0  120.0    3.70  10  196.07  15  180  20
## 232  13   740.0  190.0   37.00  50  361.67  18  340  70
## 93    4  1430.0  320.0   42.90  30  477.90  18  170  30
## 347  20   900.0  250.0   45.00  50  278.32  21  350  70
## 286  17   970.0  250.0    9.70  10  196.07  15  100  20
## 167   7  1450.0  320.0  203.00 140  600.05  21  840 250
## 201  10   280.0   90.0   44.80 160  388.70  18  710 210
## 219  12   980.0  250.0  107.80 110  471.47  15  780 160
## 7     1  1590.0  350.0  667.80 420  678.64  18 4900 620
## 211  11  2650.0  590.0  238.50  90  449.32  15  810 110
## 251  14   840.0  215.0  218.40 260 1495.05  30  440 350
## 312  18  1510.0  330.0  392.60 260  400.72  18  960 390
## 322  19   540.0  140.0   37.80  70  372.41  21  420 100
## 31    2   450.0  120.0   58.50 130  913.31  21  690 160
## 330  19  1530.0  390.0   15.30  10  193.08  15  120  40
## 19    1  3420.0  880.0  547.20 160  890.64  12 2000 180
## 30    2   370.0  120.0   40.70 110  585.22  15 1200 140

#Histogram of a numerical variable
hist(train$V.2, 
     main = "Histogram of V.2 (Total Floor Area)", 
     xlab = "Total Floor Area (m^2)", ylab = "Frequency", 
     col = "skyblue", border = "black")
abline(v = mean(train$V.2), 
       col = "red", 
       lwd = 2) # Add a vertical line for the mean
legend("topright", 
       legend = paste("Mean:", 
                      round(mean(train$V.2), 2)), 
       col = "red", 
       lwd = 2) # Add a legend for the mean

#Boxplot of a numerical variable to identify outliers
boxplot(train$V.4, train$V.2,
        main = "Boxplot of Preliminary Estimated Construction Cost (IRR)",
        xlab = "Variable V.4",
        ylab = "Construction Cost (IRR)",
        col = "skyblue",     # Customize colors
        border = "black",
        notch = TRUE,        # Add a notch
        pch = 19,            # Adjust outlier symbol
        horizontal = FALSE, # Horizontal orientation
        grid = TRUE         # Add grid lines
)

boxplot(train$V.2,
        main = "Boxplot of Preliminary Estimated Construction Cost (IRR)",
        xlab = "Variable V.4",
        ylab = "Construction Cost (IRR)",
        col = "skyblue",     # Customize colors
        border = "black",
        notch = TRUE,        # Add a notch
        pch = 19,            # Adjust outlier symbol
        horizontal = FALSE, # Horizontal orientation
        grid = TRUE         # Add grid lines
)

# scatter plot
plot(train$V.2, train$V.4, 
     main = "Scatter Plot: Total Floor Area vs Preliminary Estimated Construction Cost",
     xlab = "Total Floor Area (m^2)",
     ylab = "Preliminary Estimated Construction Cost (IRR)",
     col = "blue",
     pch = 19)

#bar plot
barplot(table(train$V.1), 
        main = "Bar Plot of Project Locality",
        xlab = "Project Locality",
        ylab = "Frequency",
        col = "skyblue")

# Time series plot of Actual Construction Costs (Y) over time (V.7)
plot(train$V.7, train$Y,
     type = "l",
     main = "Line Plot: Duration of Construction vs Actual Construction Costs",
     xlab = "Duration of Construction (Months)",
     ylab = "Actual Construction Costs (IRR)",
     col = "red")

# heatmap
heatmap(cor(train[, c("V.1","V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")]),
        main = "Correlation Heatmap",
        xlab = "Variables",
        ylab = "Variables")

# Example density plot
plot(density(train$V.8),
     main = "Density Plot of Price per Unit Area",
     xlab = "Price per Unit Area (IRR)",
     col = "blue")

pairs(train[, c("V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")])

heatmap(cor(train[, c("V.2", "V.3", "V.4", "Y")]), 
        main = "Correlation Heatmap (Subset of Variables)", 
        xlab = "Variables", 
        ylab = "Variables")

#### Handling of the outliers########
#by examining each variable by boxplot, I have found outliers in 2 variables i.e. V.2 and V.4

# Boxplot before removing outliers
par(mfrow=c(1, 2))
boxplot(data$V.2, main="Boxplot of V.2 (Before)")
boxplot(data$V.4, main="Boxplot of V.4 (Before)")

# Identify outliers for V.2 and V.4 variables
outliers_V2 <- boxplot.stats(data$V.2)$out
outliers_V4 <- boxplot.stats(data$V.4)$out

# Remove outliers from the dataset
cleaned_data <- data[!(data$V.2 %in% outliers_V2 | data$V.4 %in% outliers_V4), ]

# Boxplot after removing outliers
par(mfrow=c(1, 2))
boxplot(cleaned_data$V.2, main="Boxplot of V.2 (After)")
boxplot(cleaned_data$V.4, main="Boxplot of V.4 (After)")

# Summary of removed outliers
cat("Outliers removed from V.2:", outliers_V2, "\n")

## Outliers removed from V.2: 4040 4080 4800 4880 5020 5030 6930 15670 4230 4330 5540 4070 4090 4600 4840 5110 5200 6000 5280 6740 6700 8800 5500 14500

cat("Outliers removed from V.4:", outliers_V4, "\n")

## Outliers removed from V.4: 1158.4 1050 1231.4 1230 881.6 925.6 962 1212 1854.4 1706.8 7208.2 867.2 1551.2 810 813.2 1798 895.4 1104.3 2668 1452 1430.8 1800 954 819 804 1049.4 1168 1584 1006.5 1100

# Remove rows with NA values
cleaned_data <- na.omit(cleaned_data)

# Summary of cleaned dataset
cat("Summary of cleaned dataset:\n")

## Summary of cleaned dataset:

summary(cleaned_data)

##       V.1             V.2            V.3              V.4       
##  Min.   : 1.00   Min.   : 200   Min.   :  60.0   Min.   :  3.7  
##  1st Qu.: 4.00   1st Qu.: 670   1st Qu.: 170.0   1st Qu.: 60.8  
##  Median :10.00   Median :1060   Median : 270.0   Median :136.4  
##  Mean   :10.47   Mean   :1265   Mean   : 316.2   Mean   :195.1  
##  3rd Qu.:17.00   3rd Qu.:1705   3rd Qu.: 415.0   3rd Qu.:277.1  
##  Max.   :20.00   Max.   :4000   Max.   :1100.0   Max.   :786.0  
##       V.5             V.6              V.7             V.8        
##  Min.   : 10.0   Min.   : 193.1   Min.   : 6.00   Min.   :  40.0  
##  1st Qu.: 80.0   1st Qu.: 378.4   1st Qu.:15.00   1st Qu.: 420.0  
##  Median :130.0   Median : 490.2   Median :18.00   Median : 750.0  
##  Mean   :146.3   Mean   : 532.6   Mean   :18.34   Mean   : 925.9  
##  3rd Qu.:210.0   3rd Qu.: 650.6   3rd Qu.:21.00   3rd Qu.:1200.0  
##  Max.   :500.0   Max.   :2648.0   Max.   :48.00   Max.   :5300.0  
##        Y        
##  Min.   : 20.0  
##  1st Qu.:100.0  
##  Median :180.0  
##  Mean   :206.6  
##  3rd Qu.:300.0  
##  Max.   :620.0

### IQR Method - to identify & handle outliers ###

# The IQR is used to identify and deal with outliers. It is a measure of 
# the spread of the data values. It is a reliable measure of 
# dispersion because it is not affected by extreme values of outliers.
#
# In the IQR method a range is defined by using first and third quartile and a 
# multiplier which is usually set as 1.5. All the values below the lower  
# limit and above the upper limit are considered as outliers.

# Calculate IQR for each variable
Q1 <- apply(train, 2, quantile, probs = 0.25)
Q3 <- apply(train, 2, quantile, probs = 0.75)
IQR <- Q3 - Q1

outliers <- apply(train, 2, function(x) x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length

## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length

cleaned_data <- train
cleaned_data[outliers] <- NA
cleaned_data <- na.omit(cleaned_data)

summary(cleaned_data)

##       V.1             V.2            V.3              V.4        
##  Min.   : 1.00   Min.   : 230   Min.   :  60.0   Min.   :  3.70  
##  1st Qu.: 7.00   1st Qu.: 470   1st Qu.: 127.5   1st Qu.: 32.77  
##  Median :13.00   Median : 820   Median : 225.0   Median : 75.95  
##  Mean   :11.97   Mean   : 958   Mean   : 251.6   Mean   :151.42  
##  3rd Qu.:18.00   3rd Qu.:1238   3rd Qu.: 305.0   3rd Qu.:204.45  
##  Max.   :20.00   Max.   :2710   Max.   :1100.0   Max.   :867.20  
##       V.5             V.6             V.7             V.8        
##  Min.   : 10.0   Min.   :193.1   Min.   : 9.00   Min.   :  80.0  
##  1st Qu.: 70.0   1st Qu.:341.2   1st Qu.:15.00   1st Qu.: 387.5  
##  Median :120.0   Median :447.7   Median :18.00   Median : 670.0  
##  Mean   :138.3   Mean   :459.6   Mean   :18.75   Mean   : 800.0  
##  3rd Qu.:210.0   3rd Qu.:543.8   3rd Qu.:21.00   3rd Qu.: 992.5  
##  Max.   :390.0   Max.   :798.7   Max.   :30.00   Max.   :2600.0  
##        Y        
##  Min.   : 20.0  
##  1st Qu.: 97.5  
##  Median :160.0  
##  Mean   :198.4  
##  3rd Qu.:300.0  
##  Max.   :600.0

##################  Modelling Code ###################

# Create Predictor matrix & Response vector for both train & test set
x_train <- as.matrix(train[ , -1])   # predictor matrix for the training set.
y_train <- train[ , 1]            # response vector for the training set.

x_test <- as.matrix(test[ , -1])   # predictor matrix for the test set.
y_test <- test[ , 1]              # response vector for the test set.

# Excluding variables V.1 and V.3 from the model and adding predictor matrix
# in the training & test set 

x_train_excl_V3 <- as.matrix(x_train[ , -3])
# View(x_train_excl_V3)      # checking the correct variable has been removed
x_test_excl_V3 <- x_test[ , -3]
# View(x_test_excl_V3)      # checking the correct variable has been removed

# a matrix is a two-dimensionsal collection of elements of the same data type 
# (numeric, character, or logical) arranged into a fixed number of rows and columns. 

## Standardizing the predictor variables ##

# Standardizing the predictor variables to have mean 
# zero and unit variance. This is important for ridge regression 
# because it ensures that the penalty term is applied equally to 
# all the coefficients. Using scale function to do this.

x_train <- scale(x_train)  # standardize the training predictors
x_test <- scale(x_test)    # standardize the test predictors

x_train_excl_V3 <- scale(x_train_excl_V3)


##### Build Training Model - Cross Validation using Ridge Regression ####

# To perform ridge regression, I’ll use functions from the glmnet package. 

# I’ll use the glmnet() function to fit the ridge regression model 
# and specify alpha=0 to select Ridge Regression

# Setting alpha equal to 1 is equivalent to using Lasso Regression and 
# setting alpha to some value between 0 and 1 is equivalent to using an elastic net.

# I’ll use the default values of alpha and lambda and 
# let the function choose the optimal values for us. 

# Alpha = 0 corresponds to ridge regression, 
# Alpha = 1 corresponds to lasso regression and 
# 0 < alpha < 1 corresponds to elastic net regression, 
# a combination of ridge and lasso.

# Ideally, producing multiple models and based on the lowest mean squared 
# error and decide on the final model for implementation.

## Fitting the ridge regression model ##

# Model 1 for ridge regression
ridge_model <- glmnet(x_train, y_train, alpha = 0, standardize = FALSE) 

# Model 2 for Lasso regression
lasso_model <- glmnet(x_train, y_train, alpha = 1, standardize = FALSE)  

# Model 3 removing variable 3
ridge_model_excl_V3 <- glmnet(x_train_excl_V3, y_train, alpha = 0, standardize = FALSE)   

# (The predictor value/ input features will need to be amended to exclude V3 in this example)

# Model 4 - Ridge Regression with lambda_min
ridge_model_lambda_min <- glmnet(x_train, y_train, alpha=0, lambda = ridge_model$lambda.min, standardize = FALSE)

# Model 5 Lasso regression with lambda_min
lasso_model_lambda.min <- glmnet(x_train, y_train, alpha = 1, lambda = lasso_model$lambda.min, standardize = FALSE) 

# Note that by default, the glmnet() function standardizes the
# variables so that they are on the same scale. To turn off this default setting,
# use the argument standardize=FALSE.

# View summary model which will show  the length, class, mode, and dimensions of the elements.
summary(ridge_model)

##           Length Class     Mode   
## a0        100    -none-    numeric
## beta      800    dgCMatrix S4     
## df        100    -none-    numeric
## dim         2    -none-    numeric
## lambda    100    -none-    numeric
## dev.ratio 100    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        5    -none-    call   
## nobs        1    -none-    numeric

summary(ridge_model_lambda_min)

##           Length Class     Mode   
## a0        100    -none-    numeric
## beta      800    dgCMatrix S4     
## df        100    -none-    numeric
## dim         2    -none-    numeric
## lambda    100    -none-    numeric
## dev.ratio 100    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        6    -none-    call   
## nobs        1    -none-    numeric

summary(lasso_model_lambda.min)

##           Length Class     Mode   
## a0         79    -none-    numeric
## beta      632    dgCMatrix S4     
## df         79    -none-    numeric
## dim         2    -none-    numeric
## lambda     79    -none-    numeric
## dev.ratio  79    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        6    -none-    call   
## nobs        1    -none-    numeric

# The corresponding values of lambda, beta, df, dev. ratio, and a0. 
# The beta element is a sparse matrix, which means that it only stores 
# the non-zero values of the coefficients. 
# The df element is the degrees of freedom, the number of non-zero coefficients. 
# The dev.ratio element is the fraction of deviance the model explains. 
# The a0 element is the intercept term.

# Associated with each value of  λ is a vector of ridge regression coefficients, 
# stored in a matrix that can be accessed by coef()
# coef(model)

# To check the coefficients using the dim()
dim(coef(ridge_model))

## [1]   9 100

ridge_model

## 
## Call:  glmnet(x = x_train, y = y_train, alpha = 0, standardize = FALSE) 
## 
##     Df  %Dev Lambda
## 1    8  0.00 3724.0
## 2    8  0.41 3393.0
## 3    8  0.45 3092.0
## 4    8  0.49 2817.0
## 5    8  0.54 2567.0
## 6    8  0.59 2339.0
## 7    8  0.64 2131.0
## 8    8  0.71 1942.0
## 9    8  0.77 1769.0
## 10   8  0.85 1612.0
## 11   8  0.93 1469.0
## 12   8  1.02 1338.0
## 13   8  1.11 1219.0
## 14   8  1.22 1111.0
## 15   8  1.33 1012.0
## 16   8  1.46  922.4
## 17   8  1.60  840.5
## 18   8  1.74  765.8
## 19   8  1.91  697.8
## 20   8  2.08  635.8
## 21   8  2.28  579.3
## 22   8  2.48  527.8
## 23   8  2.71  481.0
## 24   8  2.96  438.2
## 25   8  3.22  399.3
## 26   8  3.51  363.8
## 27   8  3.82  331.5
## 28   8  4.16  302.1
## 29   8  4.52  275.2
## 30   8  4.91  250.8
## 31   8  5.32  228.5
## 32   8  5.77  208.2
## 33   8  6.25  189.7
## 34   8  6.76  172.8
## 35   8  7.30  157.5
## 36   8  7.88  143.5
## 37   8  8.50  130.8
## 38   8  9.14  119.1
## 39   8  9.83  108.6
## 40   8 10.55   98.9
## 41   8 11.30   90.1
## 42   8 12.09   82.1
## 43   8 12.91   74.8
## 44   8 13.76   68.2
## 45   8 14.64   62.1
## 46   8 15.55   56.6
## 47   8 16.49   51.6
## 48   8 17.45   47.0
## 49   8 18.43   42.8
## 50   8 19.43   39.0
## 51   8 20.44   35.5
## 52   8 21.47   32.4
## 53   8 22.51   29.5
## 54   8 23.56   26.9
## 55   8 24.61   24.5
## 56   8 25.67   22.3
## 57   8 26.74   20.3
## 58   8 27.80   18.5
## 59   8 28.87   16.9
## 60   8 29.93   15.4
## 61   8 30.99   14.0
## 62   8 32.05   12.8
## 63   8 33.11   11.6
## 64   8 34.16   10.6
## 65   8 35.21    9.7
## 66   8 36.25    8.8
## 67   8 37.28    8.0
## 68   8 38.30    7.3
## 69   8 39.32    6.7
## 70   8 40.32    6.1
## 71   8 41.30    5.5
## 72   8 42.27    5.0
## 73   8 43.23    4.6
## 74   8 44.16    4.2
## 75   8 45.06    3.8
## 76   8 45.95    3.5
## 77   8 46.80    3.2
## 78   8 47.63    2.9
## 79   8 48.42    2.6
## 80   8 49.18    2.4
## 81   8 49.91    2.2
## 82   8 50.60    2.0
## 83   8 51.25    1.8
## 84   8 51.87    1.6
## 85   8 52.45    1.5
## 86   8 52.98    1.4
## 87   8 53.49    1.2
## 88   8 53.95    1.1
## 89   8 54.37    1.0
## 90   8 54.77    0.9
## 91   8 55.12    0.9
## 92   8 55.45    0.8
## 93   8 55.74    0.7
## 94   8 56.01    0.7
## 95   8 56.25    0.6
## 96   8 56.47    0.5
## 97   8 56.66    0.5
## 98   8 56.83    0.4
## 99   8 56.98    0.4
## 100  8 57.12    0.4

# Associated with each value of λ is a vector of ridge regression coefficients,
# stored in a matrix that can be accessed by coef(). In this case, it is a 
# 5X100 matrix, with 5 rows (one for each predictor, plus an intercept) and
# 100 columns (one for each value of lambda).

ridge_model$lambda[20]

## [1] 635.7929

coef(ridge_model) [ , 20]

##  (Intercept)          V.2          V.3          V.4          V.5          V.6 
##  9.534412955 -0.023183111 -0.021753041 -0.020171515 -0.020727795 -0.037175004 
##          V.7          V.8            Y 
##  0.009042002 -0.027449376 -0.017414847

# Plotting the model object using plot function, shows coef change as a func of lambda

plot(ridge_model, xvar = "lambda", label = TRUE)# plot the coefficients vs lambda

# The x-axis is on a log scale (Log Lambda), so the smaller lambda 
# values are on the right, and the larger values are on the left. 

# The y-axis shows the values of the coefficients, and each line 
# corresponds to a different predictor variable.

# In this example, when lambda log is 8, the coefficients are essentially zero.
# When we relax lambda the coefficients grow away from zero in a smooth way.
# The sum of squares of the coefficients are getting bigger and bigger until 
# we reach a point where Lambda is effectively zero & the coefficients
# are regularized & so these would be the coefficients  that you get from an 
# ordinary least squares fit of these variables. 

### Perform k-fold cross-validation to find optimal lambda value ###

# Next, we’ll identify the lambda value that produces the 
# lowest test mean squared error (MSE) by using k-fold 
# cross-validation a technique that splits the data into 
# several subsets and uses some for training and some for testing.

# glmnet has the function cv.glmnet() that automatically 
# performs k-fold cross validation. A fold is a subset of the 
# data used for testing, while the rest is used for training. 
# The default value is 10, meaning the data is split into 10 subsets, 
# and each subset is used as a test set once. 

# Model 1 - Ridge Regression cross validation
cv_ridge_model <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5) 
# Performs 5 fold cross validation on ridge model

# Model 2 - Lasso Regression cross validation
cv_lasso_model <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5) 
# Performs 5 fold cross validation on lasso model

# Model 3 - Ridge Regression with variable 3 excluded 
cv_ridge_model_excl_V3 <- cv.glmnet(x_train_excl_V3, y_train, alpha = 1, nfolds = 5) 

# Model 4 - Ridge Regression with lambda.min
# ridge_model_lambda_min <- glmnet(x_train, y_train, alpha=0, lambda = cv_ridge_model$lambda.min)
cv_ridge_model_lambda_min <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5) 

# Model 5 - Lasso Regression with lambda.min
cv_lasso_model_lambda_min <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5) 

summary(cv_ridge_model)

##            Length Class  Mode     
## lambda     79     -none- numeric  
## cvm        79     -none- numeric  
## cvsd       79     -none- numeric  
## cvup       79     -none- numeric  
## cvlo       79     -none- numeric  
## nzero      79     -none- numeric  
## call        5     -none- call     
## name        1     -none- character
## glmnet.fit 12     elnet  list     
## lambda.min  1     -none- numeric  
## lambda.1se  1     -none- numeric  
## index       2     -none- numeric

summary(ridge_model_lambda_min)

##           Length Class     Mode   
## a0        100    -none-    numeric
## beta      800    dgCMatrix S4     
## df        100    -none-    numeric
## dim         2    -none-    numeric
## lambda    100    -none-    numeric
## dev.ratio 100    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        6    -none-    call   
## nobs        1    -none-    numeric

# The cvm element is the mean cross-validated error for each value of lambda. 

# The cvsd element is the standard deviation of the cross-validated error for 
# each lambda value. 

# The cvup and cvlo elements are the upper and lower confidence bounds 
# for the cross-validated error for each lambda value. 

# The nzero element is the number of non-zero coefficients for each value of lambda. 

# The lambda.min element is the value of lambda that gives the minimum
# cross-validated error. 

# The lambda.1se element is the largest value of lambda, giving a 
# cross-validated error within one standard error of the minimum.

## plotting the cv_model object using the plot function ##

# Shows the cross-validated error changes as a function of lambda. 

# The x-axis is on a log scale, so the smaller lambda values are on 
# the right, and the larger values are on the left. 

# The y-axis shows the values of the cross-validated error, and the 
# error bars show the confidence bounds. 

# The vertical dotted lines indicate the values of lambda that give 
# the minimum cross-validated error and the largest error within 
# one standard error of the minimum.

plot(cv_ridge_model) # plot the cross-validated error vs lambda

# This is a plot of the cross validated MSE and from the right hand side
# it dips downs. In the beginning the MSE is very high and the coefficients 
# are restricted to be too small and then it starts to level off. 
# This indicates that the full model is doing a good job.

# There are two vertical lines:
# The first one indicates the min. MSE.
# The second indicates the one standard error of the min. MSE.
# This is a more restricted model that can do as well as the min. MSE and
# we can decide to use this value instead of the min. MSE.

plot(cv_lasso_model)  # Model 2 Lasso Regression
plot(cv_ridge_model_excl_V3)  # Model 3 Ridge Regression with V3 removed

plot(cv_ridge_model_lambda_min) # Model 4 Ridge regression with lambda min.
plot(cv_lasso_model_lambda_min) # Model 5 Lasso regression with lambda min.

### This will show the results of the best results from each model. ###

cv_ridge_model

## 
## Call:  cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure     SE Nonzero
## min  0.109    39   30.14 12.149       6
## 1se  3.731     1   40.42  2.702       0

cv_lasso_model

## 
## Call:  cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.5414    96   22.95 2.874       8
## 1se 2.8891    78   25.69 1.568       8

cv_ridge_model_excl_V3

## 
## Call:  cv.glmnet(x = x_train_excl_V3, y = y_train, nfolds = 5, alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure     SE Nonzero
## min  0.082    42   29.87 12.215       5
## 1se  3.731     1   40.32  2.319       0

cv_ridge_model_lambda_min

## 
## Call:  cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.6521    94   20.59 2.921       8
## 1se 2.8891    78   23.27 1.801       8

cv_lasso_model_lambda_min

## 
## Call:  cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.7157    93   20.92 2.064       8
## 1se 2.6324    79   22.97 2.140       8

# This will pick the coefficient corresponding to the best model.  
coef(cv_ridge_model)

## 9 x 1 sparse Matrix of class "dgCMatrix"
##                   s1
## (Intercept) 9.534413
## V.2         .       
## V.3         .       
## V.4         .       
## V.5         .       
## V.6         .       
## V.7         .       
## V.8         .       
## Y           .

coef(cv_lasso_model_lambda_min)

## 9 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept)  9.53441296
## V.2         -0.64990363
## V.3         -0.45393667
## V.4          0.38171489
## V.5         -0.09739081
## V.6         -2.44145999
## V.7          0.73682873
## V.8         -1.73367451
## Y            0.23855216

coef(ridge_model_lambda_min)

## 9 x 100 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 100 column names 's0', 's1', 's2' ... ]]

##                                                                              
## (Intercept)  9.534413e+00  9.534412955  9.534412955  9.534412955  9.534412955
## V.2         -2.393481e-36 -0.004486371 -0.004920155 -0.005395506 -0.005916320
## V.3         -2.247898e-36 -0.004212768 -0.004620024 -0.005066286 -0.005555210
## V.4         -2.108171e-36 -0.003942568 -0.004322814 -0.004739295 -0.005195372
## V.5         -2.153008e-36 -0.004031027 -0.004420300 -0.004846769 -0.005313906
## V.6         -3.761467e-36 -0.007077590 -0.007764819 -0.008518479 -0.009344929
## V.7          9.063929e-37  0.001708537  0.001874760  0.002057118  0.002257168
## V.8         -2.809527e-36 -0.005274764 -0.005785703 -0.006345780 -0.006959650
## Y           -1.820541e-36 -0.003404471 -0.003732803 -0.004092417 -0.004486216
##                                                                             
## (Intercept)  9.534412955  9.534412955  9.534412955  9.534412955  9.534412955
## V.2         -0.006486852 -0.007111736 -0.007796017 -0.008545178 -0.009365178
## V.3         -0.006090786 -0.006677356 -0.007319649 -0.008022802 -0.008792396
## V.4         -0.005694704 -0.006241263 -0.006839362 -0.007493675 -0.008209263
## V.5         -0.005825493 -0.006385643 -0.006998828 -0.007669901 -0.008404129
## V.6         -0.010251129 -0.011244686 -0.012333916 -0.013527907 -0.014836585
## V.7          0.002476618  0.002717337  0.002981375  0.003270973  0.003588586
## V.8         -0.007632393 -0.008369554 -0.009177177 -0.010061850 -0.011030748
## Y           -0.004917358 -0.005389271 -0.005905678 -0.006470612 -0.007088442
##                                                                             
## (Intercept)  9.534412955  9.534412955  9.534412955  9.534412955  9.534412955
## V.2         -0.010262485 -0.011244108 -0.012317641 -0.013491291 -0.014774154
## V.3         -0.009634488 -0.010555641 -0.011562960 -0.012664123 -0.013867545
## V.4         -0.008991600 -0.009846590 -0.010780600 -0.011800475 -0.012913644
## V.5         -0.009207216 -0.010085332 -0.011045144 -0.012093843 -0.013239196
## V.6         -0.016270785 -0.017842331 -0.019564118 -0.021450201 -0.023515890
## V.7          0.003936903  0.004318862  0.004737681  0.005196876  0.005700291
## V.8         -0.012091676 -0.013253125 -0.014524315 -0.015915255 -0.017436792
## Y           -0.007763889 -0.008502049 -0.009308416 -0.010188894 -0.011149821
##                                                                             
## (Intercept)  9.534412955  9.534412955  9.534412955  9.534412955  9.534412955
## V.2         -0.016175392 -0.017705466 -0.019375419 -0.021197085 -0.023183111
## V.3         -0.015181942 -0.016617015 -0.018183073 -0.019891153 -0.021753041
## V.4         -0.014127832 -0.015451502 -0.016893594 -0.018463591 -0.020171515
## V.5         -0.014489477 -0.015853622 -0.017341170 -0.018962290 -0.020727795
## V.6         -0.025777842 -0.028254182 -0.030964602 -0.033930480 -0.037175004
## V.7          0.006252128  0.006856976  0.007519843  0.008246198  0.009042002
## V.8         -0.019100678 -0.020919617 -0.022907328 -0.025078607 -0.027449376
## Y           -0.012197983 -0.013340624 -0.014585452 -0.015940646 -0.017414847
##                                                                         
## (Intercept)  9.534412955  9.53441296  9.53441296  9.53441296  9.53441296
## V.2         -0.025346979 -0.02770301 -0.03026638 -0.03305309 -0.03607995
## V.3         -0.023781289 -0.02598922 -0.02839093 -0.03100126 -0.03383578
## V.4         -0.022027909 -0.02404381 -0.02623069 -0.02860042 -0.03116518
## V.5         -0.022649143 -0.02473842 -0.02700833 -0.02947216 -0.03214369
## V.6         -0.040723297 -0.04460254 -0.04884212 -0.05347373 -0.05853156
## V.7          0.009913755  0.01086854  0.01191406  0.01305869  0.01431156
## V.8         -0.030036745 -0.03285906 -0.03593595 -0.03928837 -0.04293862
## Y           -0.019017145 -0.02075705 -0.02264446 -0.02468961 -0.02690299
##                                                                        
## (Intercept)  9.53441296  9.53441296  9.53441296  9.53441296  9.53441296
## V.2         -0.03936453 -0.04292511 -0.04678057 -0.05095028 -0.05545396
## V.3         -0.03691074 -0.04024299 -0.04384991 -0.04774924 -0.05195902
## V.4         -0.03393730 -0.03692921 -0.04015318 -0.04362117 -0.04734455
## V.5         -0.03503719 -0.03816725 -0.04154869 -0.04519640 -0.04912516
## V.6         -0.06405234 -0.07007558 -0.07664357 -0.08380160 -0.09159798
## V.7          0.01568255  0.01718239  0.01882272  0.02061613  0.02257622
## V.8         -0.04691037 -0.05122870 -0.05592001 -0.06101209 -0.06653402
## Y           -0.02929525 -0.03187709 -0.03465912 -0.03765165 -0.04086453
##                                                                        
## (Intercept)  9.53441296  9.53441296  9.53441296  9.53441296  9.53441296
## V.2         -0.06031149 -0.06554269 -0.07116707 -0.07720352 -0.08366998
## V.3         -0.05649729 -0.06138199 -0.06663060 -0.07225989 -0.07828557
## V.4         -0.05133379 -0.05559815 -0.06014523 -0.06498056 -0.07010704
## V.5         -0.05334937 -0.05788282 -0.06273838 -0.06792759 -0.07346031
## V.6         -0.10008418 -0.10931484 -0.11934789 -0.13024448 -0.14206904
## V.7          0.02471767  0.02705634  0.02960925  0.03239472  0.03543238
## V.8         -0.07251609 -0.07898975 -0.08598747 -0.09354260 -0.10168921
## Y           -0.04430683 -0.04798660 -0.05191052 -0.05608349 -0.06050824
##                                                                        
## (Intercept)  9.53441296  9.53441296  9.53441296  9.53441296  9.53441296
## V.2         -0.09058303 -0.09795753 -0.10580610 -0.11413871 -0.12296215
## V.3         -0.08472190 -0.09158129 -0.09887382 -0.10660675 -0.11478409
## V.4         -0.07552440 -0.08122859 -0.08721106 -0.09345813 -0.09995020
## V.5         -0.07934426 -0.08558451 -0.09218298 -0.09913788 -0.10644316
## V.6         -0.15488919 -0.16877567 -0.18380224 -0.20004546 -0.21758453
## V.7          0.03874324  0.04234971  0.04627569  0.05054653  0.05518910
## V.8         -0.11046189 -0.11989558 -0.13002534 -0.14088610 -0.15251254
## Y           -0.06518483 -0.07011018 -0.07527747 -0.08067562 -0.08628871
##                                                                      
## (Intercept)  9.53441296  9.53441296  9.5344130  9.5344130  9.53441296
## V.2         -0.13227956 -0.14208210 -0.1523779 -0.1631504 -0.17438393
## V.3         -0.12340603 -0.13246791 -0.1419623 -0.1518748 -0.16218611
## V.4         -0.10666104 -0.11355965 -0.1206005 -0.1277351 -0.13490409
## V.5         -0.11408794 -0.12205785 -0.1303277 -0.1388703 -0.14765062
## V.6         -0.23650100 -0.25687887 -0.2788028 -0.3023596 -0.32763670
## V.7          0.06023174  0.06570427  0.0716379  0.0780652  0.08501994
## V.8         -0.16493883 -0.17819879 -0.1923248 -0.2073492 -0.22330347
## Y           -0.09209535 -0.09806811 -0.1041732 -0.1103698 -0.11660958
##                                                                               
## (Intercept)  9.53441296  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130
## V.2         -0.18605747 -0.1981447 -0.2106138 -0.2234284 -0.2365472 -0.2499253
## V.3         -0.17287130 -0.1838998 -0.1952351 -0.2068356 -0.2186541 -0.2306390
## V.4         -0.14203856 -0.1490596 -0.1558784 -0.1623963 -0.1685047 -0.1740862
## V.5         -0.15662649 -0.1657487 -0.1749608 -0.1841996 -0.1933951 -0.2024712
## V.6         -0.35472183 -0.3837024 -0.4146649 -0.4476942 -0.4828727 -0.5202796
## V.7          0.09253698  0.1006520  0.1094014  0.1188218  0.1289499  0.1398216
## V.8         -0.24021875 -0.2581260 -0.2770565 -0.2970427 -0.3181186 -0.3403209
## Y           -0.12283688 -0.1289882 -0.1349922 -0.1407704 -0.1462368 -0.1512991
##                                                                              
## (Intercept)  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130
## V.2         -0.2635148 -0.2772657 -0.2911274 -0.3050498 -0.3189320 -0.3328210
## V.3         -0.2427347 -0.2548826 -0.2670216 -0.2790901 -0.2910208 -0.3027676
## V.4         -0.1790148 -0.1831572 -0.1863735 -0.1885191 -0.1894716 -0.1890416
## V.5         -0.2113464 -0.2199342 -0.2281449 -0.2358856 -0.2430815 -0.2496057
## V.6         -0.5599898 -0.6020726 -0.6465910 -0.6936002 -0.7431493 -0.7952662
## V.7          0.1514720  0.1639348  0.1772408  0.1914184  0.2064925  0.2224810
## V.8         -0.3636899 -0.3882706 -0.4141138 -0.4412774 -0.4698270 -0.4998346
## Y           -0.1558590 -0.1598134 -0.1630553 -0.1654751 -0.1669572 -0.1673964
##                                                                              
## (Intercept)  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130
## V.2         -0.3466365 -0.3603456 -0.3739235 -0.3873545 -0.4006333 -0.4137655
## V.3         -0.3142665 -0.3254643 -0.3363127 -0.3467690 -0.3567968 -0.3663663
## V.4         -0.1870961 -0.1834870 -0.1780692 -0.1707029 -0.1612545 -0.1495991
## V.5         -0.2553782 -0.2603067 -0.2643021 -0.2672788 -0.2691567 -0.2698617
## V.6         -0.8499761 -0.9072867 -0.9671897 -1.0296593 -1.0946503 -1.1620971
## V.7          0.2393981  0.2572507  0.2760378  0.2957496  0.3163667  0.3378590
## V.8         -0.5313845 -0.5645696 -0.5994922 -0.6362637 -0.6750038 -0.7158391
## Y           -0.1666799 -0.1647000 -0.1613527 -0.1565393 -0.1501671 -0.1421506
##                                                                     
## (Intercept)  9.5344130  9.5344130  9.5344130  9.53441296  9.53441296
## V.2         -0.4267683 -0.4396706 -0.4525130 -0.46515209 -0.47802420
## V.3         -0.3754548 -0.3840462 -0.3921307 -0.39976113 -0.40686594
## V.4         -0.1356219 -0.1192198 -0.1003035 -0.07900570 -0.05490795
## V.5         -0.2693271 -0.2674946 -0.2643161 -0.25979861 -0.25379989
## V.6         -1.2319124 -1.3039863 -1.3781865 -1.45433852 -1.53228489
## V.7          0.3601854  0.3832931  0.4071174  0.43159029  0.45660992
## V.8         -0.7589009 -0.8043227 -0.8522372 -0.90268017 -0.95590919
## Y           -0.1324124 -0.1208837 -0.1075050 -0.09220825 -0.07500598
##                                                                         
## (Intercept)  9.53441296  9.534412955  9.53441296  9.53441296  9.53441296
## V.2         -0.49103400 -0.504266795 -0.51781519 -0.53177746 -0.54597130
## V.3         -0.41347740 -0.419603158 -0.42525107 -0.43042774 -0.43526475
## V.4         -0.02812692  0.001360286  0.03355605  0.06844075  0.10562324
## V.5         -0.24635677 -0.237464774 -0.22713515 -0.21539616 -0.20221605
## V.6         -1.61181966 -1.692724251 -1.77476297 -1.85768544 -1.94116165
## V.7          0.48208493  0.507908201  0.53396495  0.56013454  0.58631677
## V.8         -1.01197189 -1.070955804 -1.13292598 -1.19791987 -1.26570401
## Y           -0.05585134 -0.034732582 -0.01164990  0.01338446  0.04026687
##                                                                     
## (Intercept)  9.53441296  9.53441296  9.5344130  9.5344130  9.5344130
## V.2         -0.56106309 -0.57689741 -0.5935826 -0.6109031 -0.6295970
## V.3         -0.43956643 -0.44340199 -0.4467556 -0.4497545 -0.4521201
## V.4          0.14568559  0.18825022  0.2332089  0.2800363  0.3293327
## V.5         -0.18770175 -0.17192673 -0.1549870 -0.1367780 -0.1177105
## V.6         -2.02502318 -2.10895348 -2.1926749 -2.2758209 -2.3582699
## V.7          0.61234467  0.63811224  0.6634983  0.6884284  0.7127252
## V.8         -1.33664886 -1.41051353 -1.4871869 -1.5661971 -1.6479041
## Y            0.06903048  0.09959555  0.1318920  0.1656202  0.2009766
##                                                                        
## (Intercept)  9.53441296  9.53441296  9.53441296  9.53441296  9.53441296
## V.2         -0.64945693 -0.67057461 -0.69271188 -0.71660207 -0.74198472
## V.3         -0.45390464 -0.45505128 -0.45569169 -0.45540443 -0.45425438
## V.4          0.38057730  0.43358015  0.48773155  0.54361722  0.60061751
## V.5         -0.09783206 -0.07730442 -0.05581313 -0.03431348 -0.01267964
## V.6         -2.43969791 -2.51985375 -2.59838243 -2.67527269 -2.75022863
## V.7          0.73632270  0.75913476  0.78115431  0.80220490  0.82229278
## V.8         -1.73182226 -1.81766993 -1.90473212 -1.99339323 -2.08293888
## Y            0.23773598  0.27577467  0.31447829  0.35446979  0.39527557
##                                                                        
## (Intercept)  9.534412955  9.53441296  9.5344130  9.53441296  9.53441296
## V.2         -0.768918646 -0.79719756 -0.8274124 -0.85930001 -0.89271958
## V.3         -0.452149150 -0.44922484 -0.4449498 -0.43943058 -0.43276655
## V.4          0.658496218  0.71668421  0.7756400  0.83477436  0.89363069
## V.5          0.008893561  0.03104782  0.0521298  0.07260134  0.09323983
## V.6         -2.823066495 -2.89349543 -2.9616184 -3.02720241 -3.09004138
## V.7          0.841385259  0.85956092  0.8766345  0.89269249  0.90786024
## V.8         -2.172972053 -2.26265845 -2.3524109 -2.44142476 -2.52892234
## Y            0.436737408  0.47786064  0.5199410  0.56216753  0.60341169
##                                                                              
## (Intercept)  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130
## V.2         -0.9280584 -0.9650992 -1.0038216 -1.0441929 -1.0861673 -1.1296858
## V.3         -0.4244804 -0.4146838 -0.4033056 -0.3902934 -0.3756110 -0.3592398
## V.4          0.9525019  1.0109182  1.0686861  1.1256337  1.1816075  1.2364730
## V.5          0.1121343  0.1299273  0.1464824  0.1616841  0.1754356  0.1876579
## V.6         -3.1502778 -3.2077559 -3.2624448 -3.3143338 -3.3634315 -3.4097638
## V.7          0.9219546  0.9351017  0.9473381  0.9587047  0.9692456  0.9790062
## V.8         -2.6152790 -2.6997742 -2.7820826 -2.8619147 -2.9390197 -3.0131870
## Y            0.6453000  0.6868824  0.7280300  0.7686231  0.8085560  0.8477384
##                                                                              
## (Intercept)  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130  9.5344130
## V.2         -1.1746751 -1.2210482 -1.2677078 -1.3175130 -1.3673822 -1.4181346
## V.3         -0.3411802 -0.3214521 -0.3003004 -0.2771879 -0.2527608 -0.2269537
## V.4          1.2901145  1.3424346  1.3927191  1.4428350  1.4907447  1.5371207
## V.5          0.1982901  0.2072885  0.2131191  0.2200832  0.2242677  0.2266450
## V.6         -3.4533731 -3.4943158 -3.5327196 -3.5684981 -3.6018869 -3.6329502
## V.7          0.9880327  0.9963712  1.0038955  1.0111328  1.0176932  1.0237112
## V.8         -3.0842473 -3.1520718 -3.2167169 -3.2777472 -3.3354250 -3.3897642
## Y            0.8860959  0.9235706  0.9617581  0.9959394  1.0303721  1.0640065

#### Let's store the validation results! They will be useful to compare against the test set results.
# Model 1 Ridge model MSE & RMSE
cross_validation_ridge_MSE <- min(cv_ridge_model$cvm) 
cross_validation_ridge_RMSE <- sqrt(cross_validation_ridge_MSE)  

# Model 2 Lasso model MSE & RMSE
cross_validation_lasso_MSE <- min(cv_lasso_model$cvm)  
cross_validation_lasso_RMSE <- sqrt(cross_validation_lasso_MSE)  

# Model 3 Ridge model excluding Var. 3 MSE & RMSE
cross_validation_ridge_excl_V3_MSE <- min(cv_ridge_model_excl_V3$cvm) 
cross_validation_ridge_excl_V3_RMSE <- sqrt(cross_validation_ridge_excl_V3_MSE)    

# Model 4 Ridge model with lambda min. MSE & RMSE  
cross_validation_ridge_lambda_min <- min(cv_ridge_model_lambda_min$cvm) 
cross_validation_ridge_lambda_min_RMSE <- sqrt(cross_validation_ridge_lambda_min)    # The square root of the MSE of the ridge regression model excl V3

# Model 5 Lasso model with lambda min.
cross_validation_lasso_lambda_min <- min(cv_lasso_model_lambda_min$cvm) 
cross_validation_lasso_lambda_min_RMSE <- sqrt(cross_validation_lasso_lambda_min)    # The square root of the MSE of the ridge regression model excl V3

# Create a data frame to store the MSE and RMSE values
model_comparison <- data.frame(
  Model = c("Ridge", "Lasso", "Ridge Excl. V3", "Ridge Lambda Min", "Lasso Lambda Min"),
  MSE = c(cross_validation_ridge_MSE, cross_validation_lasso_MSE, cross_validation_ridge_excl_V3_MSE, cross_validation_ridge_lambda_min, cross_validation_lasso_lambda_min),
  RMSE = c(cross_validation_ridge_RMSE, cross_validation_lasso_RMSE, cross_validation_ridge_excl_V3_RMSE, cross_validation_ridge_lambda_min_RMSE, cross_validation_lasso_lambda_min_RMSE)
)

# Print the data frame
print(model_comparison)

##              Model      MSE     RMSE
## 1            Ridge 30.13858 5.489862
## 2            Lasso 22.94890 4.790501
## 3   Ridge Excl. V3 29.86710 5.465080
## 4 Ridge Lambda Min 20.59304 4.537955
## 5 Lasso Lambda Min 20.91619 4.573422

#  min() can be used to get the smallest MSE from the evaluated lambda values.
#  The square root of the MSE of the ridge regression model

### Find optimal lambda value that minimizes test MSE ###

# From the above results we can see the Ridge Lamda Min has produced the lowest MSE. 
#         Model         MSE               RMSE
# 1            Ridge  30.13858         5.489862
# 2            Lasso  22.94890         4.790501
# 3   Ridge Excl. V3  29.86710         5.465080
# 4 Ridge Lambda Min  20.59304         4.537955
# 5 Lasso Lambda Min  20.91619         4.573422

# I've selected the optimal value of lambda, it's time for me 
# to assess the performance of the ridge regression model on the test set.

# I'll utilize the predict function to generate predictions of the response variable f
# or the test set, employing the ridge regression model with the chosen lambda value. 

# Next, I'll compare these predicted values with the actual values and compute 
# various metrics to gauge the accuracy of the predictions, 
# including Mean Squared Error (MSE), 
# Root Mean Squared Error (RMSE), and the 
# coefficient of determination (R-squared).
# Using the predict function, 

# I'll generate the predicted values for the test set using prediction func;

### To determine the optimal lambda value that minimizes cross-validated error, element.
# Lower values of lambda indicate stronger regularization, 
# while higher values indicate weaker regularization
cv_ridge_model_lambda_min$lambda.min

## [1] 0.6520751

####################  Final Evaluation ####################

test <- na.omit(test)   # Remove NA value from test set
anyNA(test)  #  If there are any missing values in the test set they will need to be handled.

## [1] FALSE

# The test set predictor matrix has already been created earlier.
x_test <- as.matrix(test[ , -1])   # predictor matrix for the test set.
y_test <- test[ , 1]
x_test <- scale(x_test)


is.infinite(x_test)

##       V.2   V.3   V.4   V.5   V.6   V.7   V.8     Y
## 5   FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 10  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 15  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 18  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 21  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 22  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 24  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 26  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 32  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 33  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 39  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 40  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 46  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 47  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 49  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 52  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 58  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 60  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 64  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 65  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 66  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 68  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 69  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 73  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 74  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 84  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 92  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 97  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 99  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 101 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 105 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 113 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 114 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 115 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 116 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 119 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 120 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 121 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 123 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 125 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 126 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 128 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 132 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 137 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 140 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 147 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 152 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 157 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 158 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 159 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 160 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 162 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 170 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 174 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 179 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 183 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 186 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 187 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 188 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 190 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 194 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 202 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 203 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 207 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 214 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 216 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 220 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 223 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 228 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 230 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 236 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 237 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 245 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 246 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 248 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 249 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 253 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 258 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 259 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 260 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 266 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 268 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 270 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 272 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 279 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 280 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 284 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 285 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 288 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 291 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 292 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 293 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 294 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 296 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 299 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 300 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 302 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 303 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 305 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 307 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 309 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 316 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 318 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 323 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 324 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 325 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 329 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 331 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 334 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 336 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 340 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 348 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 351 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 352 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 357 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 358 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 365 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 366 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 367 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 368 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

is.infinite(y_test)

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE

head(x_test)

##            V.2         V.3       V.4       V.5       V.6         V.7       V.8
## 5  -0.16150584 -0.12469449 1.1920213 2.5705410 1.0061656 -0.50491007 3.8283033
## 10  0.06911469  0.05724249 0.8962365 1.2679695 1.5202055 -0.95745910 2.3723086
## 15  0.72638319  0.63944082 3.2069717 2.3844594 0.5006565  3.11548214 4.1403022
## 18  0.81286589  0.85776520 1.0939840 0.3375613 2.3910401  4.47312923 0.1883167
## 21  1.12996911  0.67582822 2.3102485 0.9888471 1.0886295 -0.05236104 1.6443113
## 22  1.32599656  0.80318411 3.1467440 1.3610104 1.1029971 -0.05236104 2.8923067
##           Y
## 5  2.460813
## 10 1.055977
## 15 3.196680
## 18 1.256668
## 21 1.055977
## 22 1.524256

head(y_test)

## [1] 1 1 1 1 1 1

# Check if the test set is scaled similarly to the training set
summary(x_test)

##       V.2               V.3               V.4               V.5         
##  Min.   :-0.8880   Min.   :-0.7069   Min.   :-0.8885   Min.   :-1.3372  
##  1st Qu.:-0.5824   1st Qu.:-0.4522   1st Qu.:-0.6888   1st Qu.:-0.7789  
##  Median :-0.2941   Median :-0.2339   Median :-0.3424   Median :-0.2207  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.2305   3rd Qu.: 0.1118   3rd Qu.: 0.3140   3rd Qu.: 0.6167  
##  Max.   : 7.3567   Max.   : 8.2808   Max.   : 4.8023   Max.   : 3.5940  
##       V.6                V.7                V.8                Y          
##  Min.   :-1.57311   Min.   :-1.71171   Min.   :-1.0181   Min.   :-1.3523  
##  1st Qu.:-0.78200   1st Qu.:-0.50491   1st Qu.:-0.6749   1st Qu.:-0.8171  
##  Median :-0.08592   Median :-0.05236   Median :-0.3317   Median :-0.2820  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.56332   3rd Qu.: 0.40019   3rd Qu.: 0.2923   3rd Qu.: 0.7215  
##  Max.   : 4.56528   Max.   : 4.47313   Max.   : 4.1403   Max.   : 3.1967

# Make test set predictions.
predictions <- predict(ridge_model_lambda_min, s = ridge_model_lambda_min$lambda, newx = x_test)

summary(predictions)    # Verify the prediction process

##        s1              s2              s3              s4       
##  Min.   :9.534   Min.   :9.465   Min.   :9.458   Min.   :9.451  
##  1st Qu.:9.534   1st Qu.:9.521   1st Qu.:9.519   1st Qu.:9.518  
##  Median :9.534   Median :9.540   Median :9.541   Median :9.542  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.534   3rd Qu.:9.549   3rd Qu.:9.551   3rd Qu.:9.552  
##  Max.   :9.534   Max.   :9.566   Max.   :9.569   Max.   :9.573  
##        s5              s6              s7              s8       
##  Min.   :9.443   Min.   :9.434   Min.   :9.424   Min.   :9.414  
##  1st Qu.:9.516   1st Qu.:9.515   1st Qu.:9.513   1st Qu.:9.511  
##  Median :9.542   Median :9.543   Median :9.544   Median :9.545  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.554   3rd Qu.:9.556   3rd Qu.:9.558   3rd Qu.:9.560  
##  Max.   :9.576   Max.   :9.580   Max.   :9.585   Max.   :9.590  
##        s9             s10             s11             s12       
##  Min.   :9.402   Min.   :9.390   Min.   :9.376   Min.   :9.360  
##  1st Qu.:9.508   1st Qu.:9.506   1st Qu.:9.503   1st Qu.:9.500  
##  Median :9.546   Median :9.547   Median :9.548   Median :9.549  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.563   3rd Qu.:9.565   3rd Qu.:9.568   3rd Qu.:9.572  
##  Max.   :9.595   Max.   :9.601   Max.   :9.607   Max.   :9.614  
##       s13             s14             s15             s16       
##  Min.   :9.344   Min.   :9.326   Min.   :9.306   Min.   :9.284  
##  1st Qu.:9.497   1st Qu.:9.493   1st Qu.:9.489   1st Qu.:9.485  
##  Median :9.551   Median :9.552   Median :9.554   Median :9.556  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.575   3rd Qu.:9.579   3rd Qu.:9.583   3rd Qu.:9.588  
##  Max.   :9.622   Max.   :9.630   Max.   :9.639   Max.   :9.649  
##       s17             s18             s19             s20       
##  Min.   :9.261   Min.   :9.235   Min.   :9.206   Min.   :9.176  
##  1st Qu.:9.480   1st Qu.:9.475   1st Qu.:9.470   1st Qu.:9.464  
##  Median :9.558   Median :9.560   Median :9.563   Median :9.565  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.593   3rd Qu.:9.598   3rd Qu.:9.604   3rd Qu.:9.611  
##  Max.   :9.660   Max.   :9.672   Max.   :9.685   Max.   :9.699  
##       s21             s22             s23             s24       
##  Min.   :9.142   Min.   :9.106   Min.   :9.066   Min.   :9.023  
##  1st Qu.:9.457   1st Qu.:9.450   1st Qu.:9.442   1st Qu.:9.433  
##  Median :9.568   Median :9.571   Median :9.575   Median :9.578  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.618   3rd Qu.:9.626   3rd Qu.:9.634   3rd Qu.:9.643  
##  Max.   :9.715   Max.   :9.731   Max.   :9.750   Max.   :9.770  
##       s25             s26             s27             s28       
##  Min.   :8.976   Min.   :8.925   Min.   :8.870   Min.   :8.810  
##  1st Qu.:9.424   1st Qu.:9.414   1st Qu.:9.403   1st Qu.:9.391  
##  Median :9.582   Median :9.587   Median :9.591   Median :9.596  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   :9.534  
##  3rd Qu.:9.653   3rd Qu.:9.664   3rd Qu.:9.676   3rd Qu.:9.689  
##  Max.   :9.791   Max.   :9.815   Max.   :9.840   Max.   :9.868  
##       s29             s30             s31             s32        
##  Min.   :8.746   Min.   :8.676   Min.   :8.601   Min.   : 8.520  
##  1st Qu.:9.378   1st Qu.:9.364   1st Qu.:9.349   1st Qu.: 9.333  
##  Median :9.602   Median :9.608   Median :9.614   Median : 9.621  
##  Mean   :9.534   Mean   :9.534   Mean   :9.534   Mean   : 9.534  
##  3rd Qu.:9.703   3rd Qu.:9.718   3rd Qu.:9.734   3rd Qu.: 9.752  
##  Max.   :9.898   Max.   :9.930   Max.   :9.965   Max.   :10.003  
##       s33              s34              s35              s36        
##  Min.   : 8.433   Min.   : 8.339   Min.   : 8.239   Min.   : 8.132  
##  1st Qu.: 9.315   1st Qu.: 9.297   1st Qu.: 9.277   1st Qu.: 9.257  
##  Median : 9.628   Median : 9.636   Median : 9.644   Median : 9.653  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.: 9.771   3rd Qu.: 9.791   3rd Qu.: 9.813   3rd Qu.: 9.837  
##  Max.   :10.043   Max.   :10.087   Max.   :10.133   Max.   :10.184  
##       s37              s38              s39              s40        
##  Min.   : 8.017   Min.   : 7.895   Min.   : 7.766   Min.   : 7.629  
##  1st Qu.: 9.236   1st Qu.: 9.213   1st Qu.: 9.190   1st Qu.: 9.165  
##  Median : 9.662   Median : 9.672   Median : 9.682   Median : 9.693  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.: 9.862   3rd Qu.: 9.889   3rd Qu.: 9.918   3rd Qu.: 9.948  
##  Max.   :10.237   Max.   :10.294   Max.   :10.355   Max.   :10.420  
##       s41              s42              s43              s44        
##  Min.   : 7.484   Min.   : 7.332   Min.   : 7.172   Min.   : 7.005  
##  1st Qu.: 9.140   1st Qu.: 9.113   1st Qu.: 9.086   1st Qu.: 9.058  
##  Median : 9.704   Median : 9.716   Median : 9.728   Median : 9.741  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.: 9.981   3rd Qu.:10.015   3rd Qu.:10.051   3rd Qu.:10.090  
##  Max.   :10.488   Max.   :10.560   Max.   :10.637   Max.   :10.717  
##       s45              s46              s47              s48        
##  Min.   : 6.830   Min.   : 6.648   Min.   : 6.460   Min.   : 6.266  
##  1st Qu.: 9.030   1st Qu.: 9.002   1st Qu.: 8.973   1st Qu.: 8.945  
##  Median : 9.754   Median : 9.772   Median : 9.794   Median : 9.818  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:10.127   3rd Qu.:10.163   3rd Qu.:10.200   3rd Qu.:10.237  
##  Max.   :10.801   Max.   :10.889   Max.   :10.980   Max.   :11.075  
##       s49              s50              s51              s52        
##  Min.   : 6.067   Min.   : 5.863   Min.   : 5.654   Min.   : 5.443  
##  1st Qu.: 8.917   1st Qu.: 8.890   1st Qu.: 8.864   1st Qu.: 8.839  
##  Median : 9.837   Median : 9.849   Median : 9.876   Median : 9.905  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:10.277   3rd Qu.:10.322   3rd Qu.:10.380   3rd Qu.:10.440  
##  Max.   :11.173   Max.   :11.274   Max.   :11.378   Max.   :11.484  
##       s53              s54              s55              s56        
##  Min.   : 5.230   Min.   : 5.015   Min.   : 4.800   Min.   : 4.585  
##  1st Qu.: 8.816   1st Qu.: 8.796   1st Qu.: 8.777   1st Qu.: 8.746  
##  Median : 9.936   Median : 9.945   Median : 9.947   Median : 9.959  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:10.490   3rd Qu.:10.533   3rd Qu.:10.569   3rd Qu.:10.620  
##  Max.   :11.593   Max.   :11.704   Max.   :11.816   Max.   :11.930  
##       s57              s58              s59              s60        
##  Min.   : 4.372   Min.   : 4.161   Min.   : 3.954   Min.   : 3.751  
##  1st Qu.: 8.740   1st Qu.: 8.722   1st Qu.: 8.663   1st Qu.: 8.600  
##  Median : 9.959   Median : 9.956   Median : 9.950   Median :10.003  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:10.662   3rd Qu.:10.720   3rd Qu.:10.778   3rd Qu.:10.831  
##  Max.   :12.044   Max.   :12.159   Max.   :12.274   Max.   :12.389  
##       s61              s62              s63              s64        
##  Min.   : 3.553   Min.   : 3.361   Min.   : 3.176   Min.   : 2.998  
##  1st Qu.: 8.534   1st Qu.: 8.465   1st Qu.: 8.505   1st Qu.: 8.457  
##  Median :10.062   Median :10.090   Median :10.121   Median :10.182  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:10.883   3rd Qu.:10.919   3rd Qu.:10.981   3rd Qu.:11.010  
##  Max.   :12.504   Max.   :12.618   Max.   :12.731   Max.   :12.843  
##       s65              s66              s67              s68        
##  Min.   : 2.828   Min.   : 2.666   Min.   : 2.512   Min.   : 2.367  
##  1st Qu.: 8.370   1st Qu.: 8.278   1st Qu.: 8.267   1st Qu.: 8.253  
##  Median :10.234   Median :10.245   Median :10.257   Median :10.288  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:11.023   3rd Qu.:11.033   3rd Qu.:11.081   3rd Qu.:11.137  
##  Max.   :12.953   Max.   :13.061   Max.   :13.168   Max.   :13.273  
##       s69              s70              s71              s72        
##  Min.   : 2.232   Min.   : 2.106   Min.   : 1.862   Min.   : 1.540  
##  1st Qu.: 8.177   1st Qu.: 8.123   1st Qu.: 8.021   1st Qu.: 7.959  
##  Median :10.290   Median :10.309   Median :10.305   Median :10.291  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:11.193   3rd Qu.:11.249   3rd Qu.:11.304   3rd Qu.:11.360  
##  Max.   :13.375   Max.   :13.475   Max.   :13.573   Max.   :13.668  
##       s73              s74               s75               s76         
##  Min.   : 1.220   Min.   : 0.9016   Min.   : 0.5864   Min.   : 0.2756  
##  1st Qu.: 7.918   1st Qu.: 7.8949   1st Qu.: 7.8684   1st Qu.: 7.8385  
##  Median :10.280   Median :10.3186   Median :10.3191   Median :10.4190  
##  Mean   : 9.534   Mean   : 9.5344   Mean   : 9.5344   Mean   : 9.5344  
##  3rd Qu.:11.415   3rd Qu.:11.4685   3rd Qu.:11.4936   3rd Qu.:11.5648  
##  Max.   :13.760   Max.   :13.8497   Max.   :13.9366   Max.   :14.0207  
##       s77                s78               s79               s80         
##  Min.   :-0.02964   Min.   :-0.3286   Min.   :-0.6201   Min.   :-0.9036  
##  1st Qu.: 7.80511   1st Qu.: 7.6979   1st Qu.: 7.5713   1st Qu.: 7.4817  
##  Median :10.50587   Median :10.5098   Median :10.5138   Median :10.4939  
##  Mean   : 9.53441   Mean   : 9.5344   Mean   : 9.5344   Mean   : 9.5344  
##  3rd Qu.:11.59945   3rd Qu.:11.6884   3rd Qu.:11.7710   3rd Qu.:11.8164  
##  Max.   :14.10164   Max.   :14.1799   Max.   :14.2553   Max.   :14.3277  
##       s81              s82              s83              s84        
##  Min.   :-1.178   Min.   :-1.443   Min.   :-1.698   Min.   :-1.942  
##  1st Qu.: 7.470   1st Qu.: 7.461   1st Qu.: 7.454   1st Qu.: 7.424  
##  Median :10.469   Median :10.440   Median :10.426   Median :10.402  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:11.860   3rd Qu.:11.904   3rd Qu.:12.034   3rd Qu.:12.122  
##  Max.   :14.397   Max.   :14.464   Max.   :14.527   Max.   :14.588  
##       s85              s86              s87              s88        
##  Min.   :-2.175   Min.   :-2.398   Min.   :-2.610   Min.   :-2.809  
##  1st Qu.: 7.331   1st Qu.: 7.240   1st Qu.: 7.152   1st Qu.: 7.066  
##  Median :10.377   Median :10.353   Median :10.355   Median :10.338  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:12.298   3rd Qu.:12.397   3rd Qu.:12.417   3rd Qu.:12.433  
##  Max.   :14.646   Max.   :14.701   Max.   :14.754   Max.   :14.805  
##       s89              s90              s91              s92        
##  Min.   :-2.999   Min.   :-3.177   Min.   :-3.344   Min.   :-3.501  
##  1st Qu.: 6.984   1st Qu.: 6.904   1st Qu.: 6.901   1st Qu.: 6.817  
##  Median :10.322   Median :10.313   Median :10.356   Median :10.396  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:12.473   3rd Qu.:12.581   3rd Qu.:12.619   3rd Qu.:12.623  
##  Max.   :14.853   Max.   :14.899   Max.   :14.943   Max.   :14.983  
##       s93              s94              s95              s96        
##  Min.   :-3.648   Min.   :-3.784   Min.   :-3.912   Min.   :-4.030  
##  1st Qu.: 6.732   1st Qu.: 6.684   1st Qu.: 6.656   1st Qu.: 6.631  
##  Median :10.431   Median :10.405   Median :10.397   Median :10.384  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:12.626   3rd Qu.:12.651   3rd Qu.:12.702   3rd Qu.:12.757  
##  Max.   :15.021   Max.   :15.057   Max.   :15.091   Max.   :15.122  
##       s97              s98              s99              s100       
##  Min.   :-4.140   Min.   :-4.240   Min.   :-4.333   Min.   :-4.419  
##  1st Qu.: 6.608   1st Qu.: 6.585   1st Qu.: 6.565   1st Qu.: 6.547  
##  Median :10.360   Median :10.337   Median :10.315   Median :10.293  
##  Mean   : 9.534   Mean   : 9.534   Mean   : 9.534   Mean   : 9.534  
##  3rd Qu.:12.809   3rd Qu.:12.858   3rd Qu.:12.904   3rd Qu.:12.947  
##  Max.   :15.152   Max.   :15.179   Max.   :15.313   Max.   :15.461

# Finally, let's evaluate how well (or poorly) we have done on the test set.

test_MSE <- MSE(predictions, y_test)
test_RMSE <- RMSE(predictions, y_test)

test_MSE

## [1] 34.62584

test_RMSE

## [1] 5.884372

# Extracting the best lambda by indexing the glmnet lambda component
# then index it by order of RMSE, order puts them in ascending order
# smallest value and this will pick out the best lambda.

lam.best <- ridge_model_lambda_min$lambda[order(test_MSE)[1]]

lam.best <- min(ridge_model_lambda_min$lambda)
lam.best

## [1] 0.3723852

coef(ridge_model_lambda_min, s = lam.best)

## 9 x 1 sparse Matrix of class "dgCMatrix"
##                     s1
## (Intercept)  9.5344130
## V.2         -1.4181346
## V.3         -0.2269537
## V.4          1.5371207
## V.5          0.2266450
## V.6         -3.6329502
## V.7          1.0237112
## V.8         -3.3897642
## Y            1.0640065

############## Plot of test set predictions vs actual values ######################

# Pad predictions vector with zeros if its length is less than y_test
if (length(predictions) < length(y_test)) {
  extra_zeros <- rep(0, length(y_test) - length(predictions))
  predictions <- c(predictions, extra_zeros)
} else if (length(predictions) > length(y_test)) {
  predictions <- predictions[1:length(y_test)]
}

# Check if the lengths are the same
length(predictions)

## [1] 121

length(y_test)

## [1] 121

# If they are not the same, you can adjust one of the vectors to match the length of the other.
# For example, you can trim or pad one of the vectors to match the length of the other.
# But in this case its same

# Ensuring both vectors have the same length
min_length <- min(length(predictions), length(y_test))
predictions <- predictions[1:min_length]
y_test <- y_test[1:min_length]

# Now plot the data
plot(x = predictions, y = y_test, frame = FALSE, pch = 19, 
     col = "red", xlab = "Predicted Values", ylab = "Actual Values")

######Dual line chart for predicted vs. actual values########
test_instances <- seq_along(y_test)

plot(x = test_instances, y = y_test, frame = FALSE, pch = 19, type = "l",
     col = "red", xlab = "Test Instance", ylab = "Valence")

lines(x = test_instances, y = predictions, pch = 18, col = "blue", type = "l", lty = 2)  

# Adding legend
legend("topleft", legend=c("Actual", "Predicted"), col=c("red", "blue"), lty = 1:2, cex=0.8)

###### verify if the model predictions fall within  +/- 500,000 Iranian Rial##############

# Calculate the absolute difference between predictions and actual values
abs_diff <- abs(predictions - y_test)

# Check if the absolute difference is within the specified threshold
within_threshold <- abs_diff <= 500000

# Count the number of predictions within the threshold
num_within_threshold <- sum(within_threshold)

# Calculate the percentage of predictions within the threshold
percentage_within_threshold <- (num_within_threshold / length(y_test)) * 100

print(percentage_within_threshold)

## [1] 100

# Print the results
cat("Number of predictions within +/- 500,000 Iranian Rial threshold:", num_within_threshold, "\n")

## Number of predictions within +/- 500,000 Iranian Rial threshold: 121

cat("Percentage of predictions within +/- 500,000 Iranian Rial threshold:", percentage_within_threshold, "%\n")

## Percentage of predictions within +/- 500,000 Iranian Rial threshold: 100 %

#####ploting the visual for it
# Convert the logical vector to a factor with labels "Within Threshold" and "Outside Threshold"
within_threshold_factor <- factor(within_threshold, levels = c(FALSE, TRUE), labels = c("Outside Threshold", "Within Threshold"))

# Create a bar plot to visualize the proportion of predictions within the threshold
bar_colors <- c("red", "green")
bar_names <- c("Outside Threshold", "Within Threshold")
barplot(table(within_threshold_factor), col = bar_colors, main = "Predictions Within +/- 500,000 Iranian Rial threshold", ylab = "Frequency")

# Add a legend to the plot
legend("topleft", legend = bar_names, fill = bar_colors, cex = 0.8)

#########Plotting another scatter plot####################

# Calculate the mean or median of the actual values
actual_mean <- mean(y_test)
actual_median <- median(y_test)

# Calculate the threshold values
threshold_upper <- actual_mean + 500000
threshold_lower <- actual_mean - 500000

# Create a vector to indicate whether each prediction is within the threshold
within_threshold <- ifelse(predictions >= threshold_lower & predictions <= threshold_upper, TRUE, FALSE)

# Define colors for points based on whether they are within the threshold
point_colors <- ifelse(within_threshold, "blue", "red")

# Create the scatter plot
plot(predictions, y_test, col = point_colors, pch = 19, 
     xlab = "Predicted Values", ylab = "Actual Values", 
     main = "Scatter Plot with Threshold Highlighted")

# Add horizontal lines for the threshold range
abline(h = c(threshold_upper, threshold_lower), col = "green", lty = 2)

# Add legend
legend("topright", legend = c("Within Threshold", "Outside Threshold"), 
       col = c("blue", "red"), lwd = 3, cex = 0.5, text.width = 0.9)

ICC-cost-analysis.R

akumar

2024-05-24