# Libraries
library(glmnet) # This library contains the functions for training regularised linear regression models.
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(DescTools) # Descriptive statistics library.
library(ggplot2)
data <- read.csv("data.csv")
set.seed(2) # Setting random seed for random sample reproducibility
# Gathering training indices: 67% for training/cross-val, the rest for test.
train_indices <- sample(1:nrow(data), nrow(data) * 0.67, replace = FALSE)
# Now we'll partition the data as per the test design/plan.
train <- data[train_indices,]
test <- data[-train_indices,]
################## Data Understanding Code #####################
Str(data) # Display the structure of the data
## 'data.frame': 369 obs. of 9 variables:
## 1 $ V.1: int 1 1 1 1 1 1 1 1 1 1 ...
## 2 $ V.2: num 685 750 1150 1450 1460 1480 1590 1810 1830 1860 ...
## 3 $ V.3: num 202 200 380 370 380 380 350 492 410 480 ...
## 4 $ V.4: num 13.7 90 575 406 627.8 ...
## 5 $ V.5: int 20 120 500 280 430 150 420 640 270 290 ...
## 6 $ V.6: num 460 846 591 702 733 ...
## 7 $ V.7: int 12 18 18 15 15 15 18 33 18 12 ...
## 8 $ V.8: int 140 1100 5300 3800 4700 1700 4900 5700 3700 3300 ...
## 9 $ Y : int 30 150 600 370 590 190 620 900 300 380 ...
# View(data)
summary(data) # Summary statistics of the data
## V.1 V.2 V.3 V.4
## Min. : 1.00 Min. : 200 Min. : 60.0 Min. : 3.7
## 1st Qu.: 4.00 1st Qu.: 715 1st Qu.: 190.0 1st Qu.: 67.2
## Median : 8.00 Median : 1195 Median : 300.0 Median : 163.8
## Mean : 9.75 Mean : 1648 Mean : 407.7 Mean : 302.2
## 3rd Qu.:17.00 3rd Qu.: 2035 3rd Qu.: 480.0 3rd Qu.: 354.6
## Max. :20.00 Max. :15670 Max. :5000.0 Max. :7208.2
## NA's :1 NA's :1 NA's :1 NA's :1
## V.5 V.6 V.7 V.8
## Min. : 10.0 Min. : 193.1 Min. : 5.00 Min. : 40.0
## 1st Qu.: 80.0 1st Qu.: 391.7 1st Qu.:15.00 1st Qu.: 437.5
## Median :140.0 Median : 518.1 Median :18.00 Median : 795.0
## Mean :161.6 Mean : 552.8 Mean :18.47 Mean :1081.9
## 3rd Qu.:230.0 3rd Qu.: 660.7 3rd Qu.:21.00 3rd Qu.:1300.0
## Max. :640.0 Max. :3436.9 Max. :48.00 Max. :5700.0
## NA's :1 NA's :1 NA's :1 NA's :1
## Y
## Min. : 20.0
## 1st Qu.:107.5
## Median :190.0
## Mean :228.8
## 3rd Qu.:330.0
## Max. :900.0
## NA's :1
dim(data) # Dimensions of the data (rows and columns)
## [1] 369 9
nrow(data) # Number of rows in the data
## [1] 369
ncol(data) # Number of columns in the data
## [1] 9
## DescTools::
# We've used Desc() to gather descriptive statistics and visualisations of data before.
Desc(data)
## ------------------------------------------------------------------------------
## Describe data (data.frame):
##
## data frame: 369 obs. of 9 variables
## 368 complete cases (99.7%)
##
## Nr ColName Class NAs Levels
## 1 V.1 integer 1 (0.3%)
## 2 V.2 numeric 1 (0.3%)
## 3 V.3 numeric 1 (0.3%)
## 4 V.4 numeric 1 (0.3%)
## 5 V.5 integer 1 (0.3%)
## 6 V.6 numeric 1 (0.3%)
## 7 V.7 integer 1 (0.3%)
## 8 V.8 integer 1 (0.3%)
## 9 Y integer 1 (0.3%)
##
##
## ------------------------------------------------------------------------------
## 1 - V.1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 20 0 9.75 9.08
## 99.7% 0.3% 0.0% 10.42
##
## .05 .10 .25 median .75 .90 .95
## 1.00 2.00 4.00 8.00 17.00 19.00 20.00
##
## range sd vcoef mad IQR skew kurt
## 19.00 6.55 0.67 8.90 13.00 0.24 -1.45
##
## lowest : 1 (29), 2 (25), 3 (29), 4 (33), 5 (20)
## highest: 16 (7), 17 (21), 18 (20), 19 (20), 20 (32)
##
## heap(?): remarkable frequency (9.0%) for the mode(s) (= 4)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 2 - V.2 (numeric)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 213 0 1'648.35 1'483.95
## 99.7% 0.3% 0.0% 1'812.76
##
## .05 .10 .25 median .75 .90 .95
## 370.00 450.00 715.00 1'195.00 2'035.00 3'166.00 4'295.00
##
## range sd vcoef mad IQR skew kurt
## 15'470.00 1'603.86 0.97 896.97 1'320.00 4.10 27.08
##
## lowest : 200.0 (2), 220.0, 230.0, 270.0, 280.0 (3)
## highest: 6'740.0, 6'930.0, 8'800.0, 14'500.0, 15'670.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 3 - V.3 (numeric)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 101 0 407.73 363.79
## 99.7% 0.3% 0.0% 451.67
##
## .05 .10 .25 median .75 .90 .95
## 100.00 120.00 190.00 300.00 480.00 723.00 936.50
##
## range sd vcoef mad IQR skew kurt
## 4'940.00 428.64 1.05 207.56 290.00 5.55 45.62
##
## lowest : 60.0 (2), 70.0, 80.0 (5), 90.0 (4), 100.0 (8)
## highest: 1'780.0, 2'500.0, 2'960.0, 3'440.0, 5'000.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 4 - V.4 (numeric)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 333 0 302.210 250.610
## 99.7% 0.3% 0.0% 353.810
##
## .05 .10 .25 median .75 .90 .95
## 20.140 32.220 67.200 163.800 354.650 675.920 1'034.385
##
## range sd vcoef mad IQR skew kurt
## 7'204.500 503.371 1.666 174.280 287.450 7.870 96.099
##
## lowest : 3.7, 4.5, 6.0, 8.0, 8.6
## highest: 1'798.0, 1'800.0, 1'854.400, 2'668.0, 7'208.200
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 5 - V.5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 50 0 161.58 150.10
## 99.7% 0.3% 0.0% 173.05
##
## .05 .10 .25 median .75 .90 .95
## 30.00 40.00 80.00 140.00 230.00 300.00 370.00
##
## range sd vcoef mad IQR skew kurt
## 630.00 111.91 0.69 103.78 150.00 1.16 1.57
##
## lowest : 10 (5), 20 (9), 30 (15), 40 (17), 50 (10)
## highest: 500, 530, 540, 580 (2), 640
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 6 - V.6 (numeric)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 310 0 552.8204 524.6332
## 99.7% 0.3% 0.0% 581.0075
##
## .05 .10 .25 median .75 .90 .95
## 273.7285 306.4060 391.6775 518.0950 660.7250 793.5190 877.6435
##
## range sd vcoef mad IQR skew kurt
## 3'243.8500 274.9744 0.4974 203.3979 269.0475 4.6295 40.4373
##
## lowest : 193.08 (2), 196.07 (2), 205.31, 206.95, 216.22
## highest: 1'362.9200, 1'446.6700, 1'495.0500, 2'648.0400, 3'436.9300
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 7 - V.7 (integer)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 18 0 18.47 17.88
## 99.7% 0.3% 0.0% 19.06
##
## .05 .10 .25 median .75 .90 .95
## 12.00 12.00 15.00 18.00 21.00 24.00 30.00
##
## range sd vcoef mad IQR skew kurt
## 43.00 5.74 0.31 4.45 6.00 1.41 4.39
##
## lowest : 5, 6 (3), 7, 8, 9 (11)
## highest: 36, 39 (3), 42, 45, 48
##
## heap(?): remarkable frequency (30.2%) for the mode(s) (= 18)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 8 - V.8 (integer)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 121 0 1'081.88 979.79
## 99.7% 0.3% 0.0% 1'183.96
##
## .05 .10 .25 median .75 .90 .95
## 170.00 217.00 437.50 795.00 1'300.00 2'300.00 3'300.00
##
## range sd vcoef mad IQR skew kurt
## 5'660.00 995.88 0.92 600.45 862.50 2.07 4.71
##
## lowest : 40, 80 (2), 90, 100 (2), 110 (2)
## highest: 4'800, 4'900 (2), 5'000, 5'300, 5'700
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 9 - Y (integer)
##
## length n NAs unique 0s mean meanCI'
## 369 368 1 61 0 228.79 212.75
## 99.7% 0.3% 0.0% 244.83
##
## .05 .10 .25 median .75 .90 .95
## 50.00 67.00 107.50 190.00 330.00 430.00 486.50
##
## range sd vcoef mad IQR skew kurt
## 880.00 156.43 0.68 148.26 222.50 1.10 1.36
##
## lowest : 20 (3), 30 (8), 40 (4), 50 (13), 60 (9)
## highest: 620, 630, 650 (3), 700 (3), 900 (2)
##
## ' 95%-CI (classic)

# Calculate the mean for each variable
mean_values <- sapply(data, mean, na.rm = TRUE)
print(mean_values)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7
## 9.75000 1648.35462 407.72962 302.20984 161.57609 552.82035 18.47011
## V.8 Y
## 1081.87500 228.79076
# Calculate the median for each variable
median_values <- sapply(data, median, na.rm = TRUE)
print(median_values)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8
## 8.000 1195.000 300.000 163.800 140.000 518.095 18.000 795.000
## Y
## 190.000
# Function to calculate mode
calculate_mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Calculate the mode for each variable
mode_values <- sapply(data, calculate_mode)
print(mode_values)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 4.00 1540.00 250.00 154.00 100.00 499.25 18.00 1100.00 110.00
# Calculate the range for each variable
range_values <- sapply(data, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
print(range_values)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8
## 19.00 15470.00 4940.00 7204.50 630.00 3243.85 43.00 5660.00
## Y
## 880.00
# Calculate the variance for each variable
variance_values <- sapply(data, var, na.rm = TRUE)
print(variance_values)
## V.1 V.2 V.3 V.4 V.5 V.6
## 4.290736e+01 2.572359e+06 1.837289e+05 2.533826e+05 1.252394e+04 7.561089e+04
## V.7 V.8 Y
## 3.299093e+01 9.917673e+05 2.447163e+04
# Calculate the standard deviation for each variable
std_dev_values <- sapply(data, sd, na.rm = TRUE)
print(std_dev_values)
## V.1 V.2 V.3 V.4 V.5 V.6
## 6.550371 1603.857419 428.636121 503.371240 111.910409 274.974351
## V.7 V.8 Y
## 5.743773 995.875153 156.434096
# Count missing values for each variable
missing_values <- colSums(is.na(data))
print(missing_values)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 1 1 1 1 1 1 1 1 1
# Identify rows with missing values
rows_with_missing <- which(rowSums(is.na(data)) > 0)
print(rows_with_missing)
## [1] 369
# View the row with missing values
print(data[369, ])
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 369 NA NA NA NA NA NA NA NA NA
# Remove the row with missing values
data <- data[-369, ]
### Correlation###
# Exploring associations using Pearson's correlation statistic
# is important if we might wish to explore the data using
# linear regression. cor(data) below facilitates calculations
# of correlation matrices based on input matrices or data
# frames.
train.corr <- cor(train)
print(train.corr)
## V.1 V.2 V.3 V.4 V.5 V.6
## V.1 1.0000000 -0.3668054 -0.3444946 -0.32308120 -0.32995247 -0.5764518
## V.2 -0.3668054 1.0000000 0.9549185 0.82890755 0.29772735 0.3161668
## V.3 -0.3444946 0.9549185 1.0000000 0.76838678 0.27227474 0.2889082
## V.4 -0.3230812 0.8289075 0.7683868 1.00000000 0.58715391 0.3649887
## V.5 -0.3299525 0.2977273 0.2722747 0.58715391 1.00000000 0.3252373
## V.6 -0.5764518 0.3161668 0.2889082 0.36498870 0.32523730 1.0000000
## V.7 0.1389064 -0.1197643 -0.1608562 -0.09320812 0.04491195 0.1173662
## V.8 -0.4305652 0.3031388 0.2758476 0.49795034 0.79720694 0.1349534
## Y -0.2790013 0.2781865 0.2509129 0.56824306 0.97575557 0.3006600
## V.7 V.8 Y
## V.1 0.138906413 -0.430565227 -0.2790013
## V.2 -0.119764316 0.303138792 0.2781865
## V.3 -0.160856247 0.275847617 0.2509129
## V.4 -0.093208115 0.497950339 0.5682431
## V.5 0.044911951 0.797206940 0.9757556
## V.6 0.117366208 0.134953431 0.3006600
## V.7 1.000000000 -0.001755917 0.1551566
## V.8 -0.001755917 1.000000000 0.7802423
## Y 0.155156620 0.780242301 1.0000000
ggcorrplot::ggcorrplot(train.corr) # Correlation heat map

# Faceted scatter plots of Total Floor Area (V.2) and Preliminary Estimated Construction Cost (V.4) by Project Locality (V.1)
ggplot(data, aes(x = V.2, y = V.4)) +
geom_point() +
facet_wrap(~V.1) +
labs(title = "Scatter Plots of V.2 vs V.4 by Project Locality", x = "Total Floor Area (m^2)", y = "Preliminary Estimated Construction Cost (IRR)")

# Correlation matrix
correlation_matrix <- cor(data[, c("V.1","V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")], use = "complete.obs")
print(correlation_matrix)
## V.1 V.2 V.3 V.4 V.5 V.6
## V.1 1.00000000 -0.2679524 -0.1905237 -0.31618372 -0.30916504 -0.6137182
## V.2 -0.26795239 1.0000000 0.9504365 0.71839832 0.20041915 0.2307232
## V.3 -0.19052374 0.9504365 1.0000000 0.58660991 0.13807650 0.1635075
## V.4 -0.31618372 0.7183983 0.5866099 1.00000000 0.59254692 0.3541253
## V.5 -0.30916504 0.2004191 0.1380765 0.59254692 1.00000000 0.3094863
## V.6 -0.61371820 0.2307232 0.1635075 0.35412534 0.30948629 1.0000000
## V.7 0.01812362 -0.1216557 -0.1782655 -0.04741677 0.06221763 0.1411306
## V.8 -0.42444770 0.2459270 0.1791519 0.53392526 0.81448707 0.1656995
## Y -0.29125587 0.1995680 0.1359838 0.58009348 0.97307794 0.2975429
## V.7 V.8 Y
## V.1 0.01812362 -0.42444770 -0.2912559
## V.2 -0.12165574 0.24592705 0.1995680
## V.3 -0.17826546 0.17915194 0.1359838
## V.4 -0.04741677 0.53392526 0.5800935
## V.5 0.06221763 0.81448707 0.9730779
## V.6 0.14113060 0.16569948 0.2975429
## V.7 1.00000000 0.02892702 0.1837235
## V.8 0.02892702 1.00000000 0.8004003
## Y 0.18372346 0.80040025 1.0000000
# Identify potential multicollinearity (absolute correlation > 0.7)
col_to_check <- which(abs(correlation_matrix) > 0.7 & correlation_matrix != 1, arr.ind = TRUE)
if (any(col_to_check)) {
print(paste0("High correlation detected between: ", rownames(correlation_matrix)[col_to_check[, 1]], " and ",
rownames(correlation_matrix)[col_to_check[, 2]]))
# Further investigation needed to decide if variable removal or other techniques are necessary
} else {
print("No high correlations (> 0.7) detected among relevant variables.")
}
## [1] "High correlation detected between: V.3 and V.2"
## [2] "High correlation detected between: V.4 and V.2"
## [3] "High correlation detected between: V.2 and V.3"
## [4] "High correlation detected between: V.2 and V.4"
## [5] "High correlation detected between: V.8 and V.5"
## [6] "High correlation detected between: Y and V.5"
## [7] "High correlation detected between: V.5 and V.8"
## [8] "High correlation detected between: Y and V.8"
## [9] "High correlation detected between: V.5 and Y"
## [10] "High correlation detected between: V.8 and Y"
#################### Data Preparation Code ######################
library(tidyverse) # Streamlined data manipulation, visualization and analysis
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Building, training, evaluating and tuning models
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
##
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
head(train, 4)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 341 20 810 250 32.4 40 274.06 15 570 50
## 198 9 520 120 62.4 120 667.98 27 360 170
## 262 15 430 110 73.1 170 409.62 21 650 210
## 273 16 810 210 48.6 60 432.44 18 260 80
tail(train, 4)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 31 2 450 120 58.5 130 913.31 21 690 160
## 330 19 1530 390 15.3 10 193.08 15 120 40
## 19 1 3420 880 547.2 160 890.64 12 2000 180
## 30 2 370 120 40.7 110 585.22 15 1200 140
class(train) # show the data type
## [1] "data.frame"
str(train) # Checking the structure/format of the data.
## 'data.frame': 247 obs. of 9 variables:
## $ V.1: int 20 9 15 16 20 10 18 8 3 5 ...
## $ V.2: num 810 520 430 810 1060 450 450 450 3280 2630 ...
## $ V.3: num 250 120 110 210 330 120 120 120 720 580 ...
## $ V.4: num 32.4 62.4 73.1 48.6 21.2 ...
## $ V.5: int 40 120 170 60 20 120 180 230 140 120 ...
## $ V.6: num 274 668 410 432 205 ...
## $ V.7: int 15 27 21 18 12 24 21 21 18 18 ...
## $ V.8: int 570 360 650 260 210 500 920 1700 1200 850 ...
## $ Y : int 50 170 210 80 30 210 270 390 150 160 ...
summary(train)
## V.1 V.2 V.3 V.4
## Min. : 1.000 Min. : 200 Min. : 60.0 Min. : 3.7
## 1st Qu.: 4.000 1st Qu.: 705 1st Qu.: 190.0 1st Qu.: 67.6
## Median : 8.000 Median : 1150 Median : 290.0 Median : 160.5
## Mean : 9.534 Mean : 1603 Mean : 387.7 Mean : 317.2
## 3rd Qu.:15.500 3rd Qu.: 1905 3rd Qu.: 445.0 3rd Qu.: 352.4
## Max. :20.000 Max. :15670 Max. :3440.0 Max. :7208.2
## V.5 V.6 V.7 V.8 Y
## Min. : 10.0 Min. : 193.1 Min. : 5.00 Min. : 80 Min. : 20
## 1st Qu.: 80.0 1st Qu.: 396.5 1st Qu.:15.00 1st Qu.: 465 1st Qu.:110
## Median :140.0 Median : 523.6 Median :18.00 Median : 810 Median :190
## Mean :165.4 Mean : 563.2 Mean :18.53 Mean :1113 Mean :232
## 3rd Qu.:230.0 3rd Qu.: 665.7 3rd Qu.:21.00 3rd Qu.:1350 3rd Qu.:330
## Max. :640.0 Max. :3436.9 Max. :42.00 Max. :5700 Max. :900
train <- mutate_at(train, vars(V.1, V.5, V.7, V.8, Y), as.numeric) # convert total to numeric variable
str(train) # let`s check the data structure again
## 'data.frame': 247 obs. of 9 variables:
## $ V.1: num 20 9 15 16 20 10 18 8 3 5 ...
## $ V.2: num 810 520 430 810 1060 450 450 450 3280 2630 ...
## $ V.3: num 250 120 110 210 330 120 120 120 720 580 ...
## $ V.4: num 32.4 62.4 73.1 48.6 21.2 ...
## $ V.5: num 40 120 170 60 20 120 180 230 140 120 ...
## $ V.6: num 274 668 410 432 205 ...
## $ V.7: num 15 27 21 18 12 24 21 21 18 18 ...
## $ V.8: num 570 360 650 260 210 500 920 1700 1200 850 ...
## $ Y : num 50 170 210 80 30 210 270 390 150 160 ...
is.na(train) # classic way to check NA`s
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 341 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 198 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 262 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 273 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 349 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 204 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 297 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 178 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 75 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 131 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 306 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 311 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 63 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 136 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 231 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 289 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 54 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 112 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 171 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 38 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 110 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 144 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 45 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 238 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 208 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 134 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 339 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 9 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 130 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 244 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 129 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 304 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 363 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 301 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 354 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 274 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 8 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 164 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 37 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 226 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 149 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 205 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 242 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 44 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 276 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 156 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 342 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 106 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 175 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 359 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 182 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 224 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 271 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 13 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 189 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 96 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 166 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 265 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 53 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 143 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 36 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 17 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 241 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 80 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 127 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 267 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 79 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 78 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 364 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 153 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 192 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 23 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 28 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 196 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 185 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 221 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 102 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 151 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 71 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 275 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 212 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 239 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 269 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 133 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 154 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 163 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 91 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 225 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 191 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 142 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 25 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 361 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 197 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 350 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 295 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 172 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 356 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 94 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 34 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 72 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 77 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 81 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 176 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 355 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 29 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 51 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 200 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 263 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 82 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 287 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 109 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 362 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 12 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 222 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 314 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 150 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 345 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 353 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 278 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 298 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 76 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 195 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 310 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 59 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 229 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 320 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 146 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 326 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 313 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 118 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 282 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 181 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 210 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 180 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 107 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 14 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 98 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 213 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 56 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 254 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 247 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 124 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 281 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 139 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 62 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 35 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 86 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 217 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 335 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 88 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 70 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 89 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 95 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 252 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 103 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 43 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 16 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 344 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 333 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 257 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 227 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 11 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 50 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 108 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 155 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 209 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 42 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 135 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 319 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 332 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 87 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 308 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 193 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 100 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 138 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 67 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 177 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 218 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 169 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 145 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 234 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 141 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 184 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 283 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 327 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 256 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 233 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 343 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 148 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 117 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 55 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 315 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 346 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 255 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 250 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 41 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 48 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 173 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 61 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 111 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 90 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 165 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 85 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 243 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 57 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 6 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 161 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 199 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 261 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 20 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 328 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 104 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 290 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 321 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 264 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 235 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 360 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 122 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 277 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 338 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 168 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 83 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 317 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 215 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 337 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 206 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 240 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 232 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 93 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 347 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 286 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 167 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 201 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 219 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 7 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 211 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 251 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 312 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 322 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 31 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 330 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 19 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 30 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
sum(is.na(train)) # counting NA`s
## [1] 0
apply(is.na(train),2, which) # which indexes of NA`s (df only)
## integer(0)
which(complete.cases(train)) # identify observed complete values
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## [235] 235 236 237 238 239 240 241 242 243 244 245 246 247
train <- na.omit(train)
clean.vector <- na.omit(list(train)) # clean/remove a vector NA`s
clean.df <- na.omit(train) # clean/remove a dataframe NA`s
apply(is.na(clean.df),2, which) # make sure if there are missing values
## integer(0)
any(is.na(clean.vector))
## [1] FALSE
any(is.na(clean.df))
## [1] FALSE
train %>% pull() %>% head() # extract column values of `state` as a vector
## [1] 50 170 210 80 30 210
print(train)
## V.1 V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 341 20 810.0 250.0 32.40 40 274.06 15 570 50
## 198 9 520.0 120.0 62.40 120 667.98 27 360 170
## 262 15 430.0 110.0 73.10 170 409.62 21 650 210
## 273 16 810.0 210.0 48.60 60 432.44 18 260 80
## 349 20 1060.0 330.0 21.20 20 205.31 12 210 30
## 204 10 450.0 120.0 54.00 120 490.57 24 500 210
## 297 18 450.0 120.0 81.00 180 324.97 21 920 270
## 178 8 450.0 120.0 103.50 230 505.25 21 1700 390
## 75 3 3280.0 720.0 459.20 140 805.03 18 1200 150
## 131 5 2630.0 580.0 315.60 120 599.10 18 850 160
## 306 18 1120.0 250.0 246.40 220 339.07 21 1200 350
## 311 18 1380.0 360.0 82.80 60 245.28 15 430 90
## 63 3 1820.0 400.0 655.20 360 474.74 15 3400 450
## 136 5 6740.0 1300.0 674.00 100 603.07 18 780 130
## 231 13 730.0 230.0 14.60 20 397.43 15 150 30
## 289 17 1410.0 310.0 239.70 170 338.79 18 960 270
## 54 2 5540.0 850.0 1551.20 280 957.42 27 1200 440
## 112 4 2120.0 470.0 508.80 240 817.74 18 1100 350
## 171 7 2110.0 470.0 189.90 90 620.86 15 490 110
## 38 2 900.0 300.0 351.00 390 755.51 18 2600 600
## 110 4 2100.0 380.0 819.00 390 601.08 12 2800 510
## 144 6 1020.0 270.0 142.80 140 829.60 18 840 180
## 45 2 1710.0 440.0 171.00 100 798.68 15 630 120
## 238 14 310.0 80.0 31.00 100 487.86 27 770 170
## 208 11 220.0 70.0 30.80 140 478.71 21 770 210
## 134 5 3650.0 940.0 1168.00 320 802.83 15 1500 420
## 339 20 580.0 150.0 104.40 180 348.70 21 1400 300
## 9 1 1830.0 410.0 494.10 270 650.57 18 3700 300
## 130 5 2500.0 640.0 300.00 120 599.10 12 810 140
## 244 14 580.0 150.0 75.40 130 442.94 21 1100 190
## 3 1 1150.0 380.0 575.00 500 590.68 18 5300 600
## 129 5 2370.0 730.0 71.10 30 400.73 9 210 90
## 304 18 950.0 250.0 133.00 140 337.33 18 610 160
## 363 20 2350.0 610.0 423.00 180 324.97 12 860 220
## 301 18 610.0 160.0 73.20 120 302.12 21 440 160
## 354 20 1370.0 360.0 191.80 140 271.21 15 1200 220
## 274 16 940.0 250.0 244.40 260 400.72 15 1400 370
## 8 1 1810.0 492.0 1158.40 640 843.98 33 5700 900
## 164 7 1060.0 240.0 190.80 180 784.87 21 1100 250
## 37 2 820.0 250.0 303.40 370 928.27 21 2400 580
## 226 13 520.0 140.0 78.00 150 466.12 21 1200 220
## 149 6 1450.0 320.0 377.00 260 443.35 21 3100 410
## 205 10 490.0 160.0 14.70 30 467.30 15 100 50
## 242 14 470.0 120.0 32.90 70 493.59 21 380 90
## 44 2 1540.0 590.0 215.60 140 698.95 6 820 160
## 276 17 370.0 80.0 51.80 140 337.33 42 470 270
## 156 7 350.0 143.0 122.50 350 442.61 9 1800 420
## 342 20 810.0 250.0 32.40 40 275.94 12 390 50
## 106 4 1900.0 550.0 437.00 230 1362.92 21 550 320
## 175 7 3270.0 510.0 327.00 100 579.84 27 230 90
## 359 20 1840.0 420.0 220.80 120 263.61 15 1500 170
## 182 8 550.0 140.0 44.00 80 578.67 21 530 100
## 224 12 2390.0 530.0 239.00 100 499.25 18 400 140
## 271 16 450.0 120.0 4.50 10 193.08 21 180 50
## 13 1 2620.0 670.0 1231.40 470 724.38 15 3900 650
## 189 8 1050.0 270.0 115.50 110 612.31 18 840 140
## 96 4 1500.0 330.0 540.00 360 867.43 21 1300 440
## 166 7 1380.0 310.0 193.20 140 744.83 21 700 200
## 265 15 750.0 200.0 45.00 60 423.07 18 360 80
## 53 2 4330.0 1110.0 86.60 20 421.26 12 170 30
## 143 6 920.0 210.0 119.60 130 649.02 24 1300 200
## 36 2 760.0 200.0 106.40 140 616.47 18 940 190
## 17 1 3040.0 670.0 881.60 290 561.79 15 4900 440
## 241 14 370.0 100.0 74.00 200 481.91 24 650 280
## 80 3 4840.0 930.0 1452.00 300 541.61 21 3300 430
## 127 5 2330.0 600.0 256.30 110 792.82 12 390 120
## 267 15 1120.0 290.0 358.40 320 421.99 18 1600 430
## 79 3 4600.0 1000.0 2668.00 580 3436.93 33 1300 900
## 78 3 4090.0 900.0 1104.30 270 655.93 15 2200 370
## 364 20 2500.0 510.0 325.00 130 251.84 18 1100 200
## 153 6 2880.0 740.0 662.40 230 579.07 15 2300 300
## 192 8 1510.0 390.0 241.60 160 654.09 15 1100 210
## 23 1 4080.0 790.0 530.40 130 759.17 21 1800 190
## 28 1 6930.0 1780.0 485.10 70 718.57 15 590 90
## 1 1 685.0 202.0 13.70 20 459.54 12 140 30
## 196 8 1580.0 410.0 47.40 30 585.11 15 210 70
## 185 8 670.0 180.0 154.10 230 577.03 18 1700 350
## 221 12 1000.0 260.0 20.00 20 436.43 15 80 30
## 102 4 1660.0 430.0 282.20 170 728.63 15 1200 240
## 151 6 1820.0 400.0 546.00 300 541.61 21 2600 430
## 71 3 2530.0 560.0 759.00 300 722.86 18 1800 330
## 275 16 6700.0 2500.0 603.00 90 279.67 27 580 150
## 212 12 230.0 60.0 18.40 80 460.02 30 350 110
## 239 14 330.0 90.0 23.10 70 414.80 27 320 100
## 269 16 200.0 100.0 46.00 230 445.56 9 820 300
## 133 5 3180.0 700.0 1049.40 330 732.16 15 1600 430
## 154 6 3050.0 670.0 1006.50 330 508.61 15 4300 460
## 163 7 940.0 250.0 94.00 100 723.33 18 550 120
## 91 4 1200.0 1100.0 300.00 250 451.35 21 980 370
## 225 13 430.0 110.0 81.70 190 476.68 24 820 310
## 191 8 1300.0 290.0 247.00 190 735.11 21 1100 300
## 142 6 850.0 220.0 68.00 80 548.12 15 870 100
## 25 1 4880.0 1070.0 1854.40 380 614.00 15 4100 520
## 361 20 2100.0 600.0 210.00 100 288.37 30 640 190
## 197 9 500.0 130.0 145.00 290 523.56 21 920 420
## 350 20 1070.0 280.0 160.50 150 290.58 18 1500 240
## 295 17 2170.0 480.0 195.30 90 439.07 18 340 130
## 172 7 2190.0 420.0 459.90 210 652.57 21 1000 300
## 356 20 1520.0 350.0 212.80 140 310.61 39 1200 300
## 94 4 1450.0 336.0 507.50 350 442.61 6 1800 400
## 34 2 690.0 160.0 89.70 130 770.35 24 990 190
## 72 3 3100.0 640.0 1798.00 580 647.79 21 4600 700
## 77 3 4070.0 1040.0 895.40 220 634.42 12 1800 290
## 81 3 5110.0 980.0 1430.80 280 505.51 21 2800 410
## 176 8 270.0 90.0 24.30 90 693.15 21 480 100
## 355 20 1390.0 360.0 41.70 30 206.95 12 350 40
## 29 1 15670.0 3440.0 7208.20 460 1010.50 5 3000 630
## 51 2 3600.0 980.0 612.00 170 2648.04 21 250 145
## 200 9 1680.0 370.0 285.60 170 490.23 18 800 260
## 263 15 460.0 120.0 96.60 210 461.32 21 1300 360
## 82 3 5200.0 800.0 208.00 40 958.39 39 120 130
## 287 17 1020.0 230.0 71.40 70 408.78 21 290 100
## 109 4 2070.0 400.0 269.10 130 723.64 21 730 170
## 362 20 2310.0 600.0 531.30 230 303.30 12 1400 280
## 12 1 2500.0 640.0 1050.00 420 647.32 36 4800 700
## 222 12 1370.0 300.0 109.60 80 476.68 21 640 100
## 314 18 1700.0 290.0 323.00 190 417.38 27 820 340
## 150 6 1800.0 400.0 378.00 210 659.84 18 2000 300
## 345 20 870.0 230.0 208.80 240 308.05 15 1300 320
## 353 20 1350.0 350.0 108.00 80 251.37 27 830 150
## 278 17 470.0 130.0 126.90 270 416.13 21 750 430
## 298 18 480.0 130.0 62.40 130 403.97 21 420 190
## 76 3 4000.0 880.0 400.00 100 750.70 15 540 110
## 195 8 1580.0 363.0 284.40 180 615.49 24 770 250
## 310 18 1380.0 360.0 289.80 210 339.32 15 690 300
## 59 3 1380.0 360.0 110.40 80 735.87 15 630 100
## 229 13 670.0 180.0 46.90 70 539.11 18 290 80
## 320 19 510.0 130.0 20.40 40 300.28 21 150 50
## 146 6 1070.0 210.0 42.80 40 772.31 27 430 110
## 326 19 940.0 240.0 141.00 150 377.65 18 430 220
## 313 18 1540.0 340.0 338.80 220 339.07 18 660 330
## 118 5 990.0 260.0 297.00 300 581.16 18 1700 470
## 282 17 540.0 140.0 113.40 210 406.81 21 610 350
## 181 8 530.0 140.0 148.40 280 542.42 21 1800 470
## 210 11 870.0 230.0 174.00 200 481.91 18 1300 220
## 180 8 510.0 160.0 15.30 30 654.64 15 150 50
## 107 4 1900.0 420.0 190.00 100 575.02 18 720 110
## 2 1 750.0 200.0 90.00 120 846.15 18 1100 150
## 14 1 2750.0 610.0 357.50 130 916.66 18 1300 160
## 98 4 1560.0 400.0 171.60 110 878.55 15 420 130
## 213 12 280.0 80.0 22.40 80 444.03 27 650 140
## 56 3 820.0 260.0 49.20 60 644.47 12 630 90
## 254 14 940.0 250.0 56.40 60 462.80 15 190 70
## 247 14 720.0 190.0 165.60 230 354.48 18 1200 350
## 124 5 2140.0 470.0 235.40 110 663.38 18 660 140
## 281 17 530.0 140.0 31.80 60 350.38 21 450 90
## 139 6 540.0 140.0 70.20 130 759.17 21 1400 180
## 62 3 1710.0 440.0 119.70 70 525.49 15 880 80
## 35 2 730.0 190.0 226.30 310 893.95 18 1800 450
## 86 4 720.0 190.0 72.00 100 798.68 18 490 120
## 217 12 810.0 210.0 194.40 240 532.48 18 900 330
## 335 19 2620.0 580.0 786.00 300 395.62 18 1000 400
## 88 4 1052.0 250.0 168.32 160 948.12 18 770 200
## 70 3 2260.0 440.0 158.20 70 569.44 21 620 80
## 89 4 1130.0 260.0 339.00 300 511.56 9 1400 390
## 95 4 1450.0 336.0 536.50 370 371.34 12 2300 390
## 252 14 850.0 220.0 59.50 70 422.15 15 520 80
## 103 4 1800.0 500.0 954.00 530 626.12 21 1900 650
## 43 2 1540.0 400.0 154.00 100 771.33 15 700 110
## 16 1 3030.0 930.0 515.10 170 1007.38 9 1500 190
## 344 20 840.0 190.0 67.20 80 273.55 24 650 120
## 333 19 1820.0 470.0 382.20 210 379.13 15 720 300
## 257 14 1110.0 250.0 88.80 80 390.29 21 900 120
## 227 13 550.0 140.0 77.00 140 439.90 21 1000 210
## 11 1 2110.0 540.0 189.90 90 732.14 15 690 110
## 50 2 3160.0 700.0 537.20 170 694.97 18 1100 300
## 108 4 2050.0 700.0 492.00 240 527.22 30 2100 450
## 155 6 3260.0 720.0 489.00 150 642.91 15 1500 220
## 209 11 600.0 160.0 12.00 20 392.14 18 110 60
## 42 2 1350.0 350.0 270.00 200 576.74 15 1700 270
## 135 5 5280.0 1160.0 1584.00 300 659.02 15 1500 420
## 319 19 410.0 110.0 20.50 50 301.54 24 400 70
## 332 19 1650.0 430.0 297.00 180 433.72 15 490 200
## 87 4 930.0 240.0 93.00 100 770.16 15 460 110
## 308 18 1210.0 270.0 193.60 160 318.86 21 680 260
## 193 8 1540.0 340.0 154.00 100 592.57 21 710 140
## 100 4 1650.0 432.0 264.00 160 851.23 15 550 200
## 138 6 360.0 100.0 86.40 240 409.25 24 3300 390
## 67 3 2120.0 470.0 572.40 270 650.57 18 2900 300
## 177 8 400.0 110.0 40.00 100 499.25 21 880 160
## 218 12 960.0 250.0 67.20 70 417.09 15 350 80
## 169 7 1980.0 440.0 495.00 250 549.19 18 1700 360
## 145 6 1040.0 270.0 93.60 90 718.81 15 810 110
## 234 13 960.0 220.0 67.20 70 405.89 21 140 60
## 141 6 840.0 220.0 100.80 120 690.02 18 770 130
## 184 8 640.0 250.0 96.00 150 654.06 6 1100 170
## 283 17 630.0 170.0 163.80 260 342.87 18 1100 350
## 327 19 1000.0 260.0 160.00 160 309.95 15 860 250
## 256 14 1080.0 240.0 205.20 190 343.02 18 1000 270
## 233 13 930.0 240.0 46.50 50 513.26 18 220 80
## 343 20 830.0 260.0 41.50 50 287.51 12 600 60
## 148 6 1330.0 300.0 172.90 130 649.02 21 1100 190
## 117 5 870.0 230.0 69.60 80 650.79 18 320 100
## 55 3 580.0 150.0 17.40 30 631.89 18 240 70
## 315 18 1910.0 420.0 305.60 160 388.70 18 810 210
## 346 20 890.0 230.0 35.60 40 237.03 18 350 50
## 255 14 1070.0 280.0 74.90 70 539.11 18 260 80
## 250 14 800.0 210.0 56.00 70 372.41 18 440 100
## 41 2 1260.0 330.0 189.00 150 875.96 15 790 210
## 48 2 2710.0 600.0 867.20 320 637.72 18 2300 490
## 173 7 2240.0 430.0 134.40 60 434.00 24 390 90
## 61 3 1610.0 410.0 177.10 110 585.22 15 1700 140
## 111 4 2100.0 450.0 546.00 260 816.95 15 600 330
## 90 4 1190.0 310.0 130.90 110 642.37 15 870 150
## 165 7 1310.0 290.0 104.80 80 578.67 21 480 100
## 4 1 1450.0 370.0 406.00 280 702.48 15 3800 370
## 85 4 440.0 120.0 13.20 30 779.57 24 160 60
## 243 14 520.0 140.0 109.20 210 323.66 18 1000 320
## 57 3 1090.0 280.0 152.60 140 777.06 15 1500 180
## 6 1 1480.0 380.0 222.00 150 731.79 15 1700 190
## 161 7 900.0 150.0 63.00 70 751.88 18 210 100
## 199 9 970.0 250.0 97.00 100 583.97 18 360 140
## 261 15 400.0 110.0 40.00 100 428.61 24 740 160
## 20 1 3560.0 690.0 925.60 260 885.89 21 2500 360
## 328 19 1150.0 260.0 161.00 140 307.54 21 990 240
## 104 4 1800.0 560.0 36.00 20 459.54 12 110 30
## 290 17 1510.0 330.0 166.10 110 341.82 18 640 140
## 321 19 520.0 140.0 57.20 110 341.82 21 420 160
## 264 15 620.0 160.0 136.40 220 375.15 18 1100 330
## 235 13 982.5 202.5 353.70 360 554.84 15 1900 500
## 360 20 1900.0 430.0 285.00 150 364.41 21 640 220
## 122 5 1810.0 400.0 488.70 270 538.07 18 2000 420
## 277 17 420.0 110.0 79.80 190 307.00 24 750 310
## 338 20 560.0 220.0 22.40 40 319.47 9 350 50
## 168 7 1540.0 400.0 154.00 100 555.04 15 870 140
## 83 3 6000.0 1320.0 1800.00 300 752.65 15 2800 400
## 317 19 280.0 90.0 11.20 40 325.39 18 180 50
## 215 12 540.0 140.0 75.60 140 478.71 18 900 200
## 337 20 510.0 160.0 30.60 60 245.28 27 790 110
## 206 10 660.0 170.0 112.20 170 490.23 18 680 260
## 240 14 370.0 120.0 3.70 10 196.07 15 180 20
## 232 13 740.0 190.0 37.00 50 361.67 18 340 70
## 93 4 1430.0 320.0 42.90 30 477.90 18 170 30
## 347 20 900.0 250.0 45.00 50 278.32 21 350 70
## 286 17 970.0 250.0 9.70 10 196.07 15 100 20
## 167 7 1450.0 320.0 203.00 140 600.05 21 840 250
## 201 10 280.0 90.0 44.80 160 388.70 18 710 210
## 219 12 980.0 250.0 107.80 110 471.47 15 780 160
## 7 1 1590.0 350.0 667.80 420 678.64 18 4900 620
## 211 11 2650.0 590.0 238.50 90 449.32 15 810 110
## 251 14 840.0 215.0 218.40 260 1495.05 30 440 350
## 312 18 1510.0 330.0 392.60 260 400.72 18 960 390
## 322 19 540.0 140.0 37.80 70 372.41 21 420 100
## 31 2 450.0 120.0 58.50 130 913.31 21 690 160
## 330 19 1530.0 390.0 15.30 10 193.08 15 120 40
## 19 1 3420.0 880.0 547.20 160 890.64 12 2000 180
## 30 2 370.0 120.0 40.70 110 585.22 15 1200 140
#Histogram of a numerical variable
hist(train$V.2,
main = "Histogram of V.2 (Total Floor Area)",
xlab = "Total Floor Area (m^2)", ylab = "Frequency",
col = "skyblue", border = "black")
abline(v = mean(train$V.2),
col = "red",
lwd = 2) # Add a vertical line for the mean
legend("topright",
legend = paste("Mean:",
round(mean(train$V.2), 2)),
col = "red",
lwd = 2) # Add a legend for the mean

#Boxplot of a numerical variable to identify outliers
boxplot(train$V.4, train$V.2,
main = "Boxplot of Preliminary Estimated Construction Cost (IRR)",
xlab = "Variable V.4",
ylab = "Construction Cost (IRR)",
col = "skyblue", # Customize colors
border = "black",
notch = TRUE, # Add a notch
pch = 19, # Adjust outlier symbol
horizontal = FALSE, # Horizontal orientation
grid = TRUE # Add grid lines
)

boxplot(train$V.2,
main = "Boxplot of Preliminary Estimated Construction Cost (IRR)",
xlab = "Variable V.4",
ylab = "Construction Cost (IRR)",
col = "skyblue", # Customize colors
border = "black",
notch = TRUE, # Add a notch
pch = 19, # Adjust outlier symbol
horizontal = FALSE, # Horizontal orientation
grid = TRUE # Add grid lines
)

# scatter plot
plot(train$V.2, train$V.4,
main = "Scatter Plot: Total Floor Area vs Preliminary Estimated Construction Cost",
xlab = "Total Floor Area (m^2)",
ylab = "Preliminary Estimated Construction Cost (IRR)",
col = "blue",
pch = 19)

#bar plot
barplot(table(train$V.1),
main = "Bar Plot of Project Locality",
xlab = "Project Locality",
ylab = "Frequency",
col = "skyblue")

# Time series plot of Actual Construction Costs (Y) over time (V.7)
plot(train$V.7, train$Y,
type = "l",
main = "Line Plot: Duration of Construction vs Actual Construction Costs",
xlab = "Duration of Construction (Months)",
ylab = "Actual Construction Costs (IRR)",
col = "red")

# heatmap
heatmap(cor(train[, c("V.1","V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")]),
main = "Correlation Heatmap",
xlab = "Variables",
ylab = "Variables")

# Example density plot
plot(density(train$V.8),
main = "Density Plot of Price per Unit Area",
xlab = "Price per Unit Area (IRR)",
col = "blue")

pairs(train[, c("V.2", "V.3", "V.4", "V.5", "V.6", "V.7", "V.8", "Y")])

heatmap(cor(train[, c("V.2", "V.3", "V.4", "Y")]),
main = "Correlation Heatmap (Subset of Variables)",
xlab = "Variables",
ylab = "Variables")

#### Handling of the outliers########
#by examining each variable by boxplot, I have found outliers in 2 variables i.e. V.2 and V.4
# Boxplot before removing outliers
par(mfrow=c(1, 2))
boxplot(data$V.2, main="Boxplot of V.2 (Before)")
boxplot(data$V.4, main="Boxplot of V.4 (Before)")

# Identify outliers for V.2 and V.4 variables
outliers_V2 <- boxplot.stats(data$V.2)$out
outliers_V4 <- boxplot.stats(data$V.4)$out
# Remove outliers from the dataset
cleaned_data <- data[!(data$V.2 %in% outliers_V2 | data$V.4 %in% outliers_V4), ]
# Boxplot after removing outliers
par(mfrow=c(1, 2))
boxplot(cleaned_data$V.2, main="Boxplot of V.2 (After)")
boxplot(cleaned_data$V.4, main="Boxplot of V.4 (After)")

# Summary of removed outliers
cat("Outliers removed from V.2:", outliers_V2, "\n")
## Outliers removed from V.2: 4040 4080 4800 4880 5020 5030 6930 15670 4230 4330 5540 4070 4090 4600 4840 5110 5200 6000 5280 6740 6700 8800 5500 14500
cat("Outliers removed from V.4:", outliers_V4, "\n")
## Outliers removed from V.4: 1158.4 1050 1231.4 1230 881.6 925.6 962 1212 1854.4 1706.8 7208.2 867.2 1551.2 810 813.2 1798 895.4 1104.3 2668 1452 1430.8 1800 954 819 804 1049.4 1168 1584 1006.5 1100
# Remove rows with NA values
cleaned_data <- na.omit(cleaned_data)
# Summary of cleaned dataset
cat("Summary of cleaned dataset:\n")
## Summary of cleaned dataset:
summary(cleaned_data)
## V.1 V.2 V.3 V.4
## Min. : 1.00 Min. : 200 Min. : 60.0 Min. : 3.7
## 1st Qu.: 4.00 1st Qu.: 670 1st Qu.: 170.0 1st Qu.: 60.8
## Median :10.00 Median :1060 Median : 270.0 Median :136.4
## Mean :10.47 Mean :1265 Mean : 316.2 Mean :195.1
## 3rd Qu.:17.00 3rd Qu.:1705 3rd Qu.: 415.0 3rd Qu.:277.1
## Max. :20.00 Max. :4000 Max. :1100.0 Max. :786.0
## V.5 V.6 V.7 V.8
## Min. : 10.0 Min. : 193.1 Min. : 6.00 Min. : 40.0
## 1st Qu.: 80.0 1st Qu.: 378.4 1st Qu.:15.00 1st Qu.: 420.0
## Median :130.0 Median : 490.2 Median :18.00 Median : 750.0
## Mean :146.3 Mean : 532.6 Mean :18.34 Mean : 925.9
## 3rd Qu.:210.0 3rd Qu.: 650.6 3rd Qu.:21.00 3rd Qu.:1200.0
## Max. :500.0 Max. :2648.0 Max. :48.00 Max. :5300.0
## Y
## Min. : 20.0
## 1st Qu.:100.0
## Median :180.0
## Mean :206.6
## 3rd Qu.:300.0
## Max. :620.0
### IQR Method - to identify & handle outliers ###
# The IQR is used to identify and deal with outliers. It is a measure of
# the spread of the data values. It is a reliable measure of
# dispersion because it is not affected by extreme values of outliers.
#
# In the IQR method a range is defined by using first and third quartile and a
# multiplier which is usually set as 1.5. All the values below the lower
# limit and above the upper limit are considered as outliers.
# Calculate IQR for each variable
Q1 <- apply(train, 2, quantile, probs = 0.25)
Q3 <- apply(train, 2, quantile, probs = 0.75)
IQR <- Q3 - Q1
outliers <- apply(train, 2, function(x) x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x < (Q1 - 1.5 * IQR): longer object length is not a multiple of
## shorter object length
## Warning in x > (Q3 + 1.5 * IQR): longer object length is not a multiple of
## shorter object length
cleaned_data <- train
cleaned_data[outliers] <- NA
cleaned_data <- na.omit(cleaned_data)
summary(cleaned_data)
## V.1 V.2 V.3 V.4
## Min. : 1.00 Min. : 230 Min. : 60.0 Min. : 3.70
## 1st Qu.: 7.00 1st Qu.: 470 1st Qu.: 127.5 1st Qu.: 32.77
## Median :13.00 Median : 820 Median : 225.0 Median : 75.95
## Mean :11.97 Mean : 958 Mean : 251.6 Mean :151.42
## 3rd Qu.:18.00 3rd Qu.:1238 3rd Qu.: 305.0 3rd Qu.:204.45
## Max. :20.00 Max. :2710 Max. :1100.0 Max. :867.20
## V.5 V.6 V.7 V.8
## Min. : 10.0 Min. :193.1 Min. : 9.00 Min. : 80.0
## 1st Qu.: 70.0 1st Qu.:341.2 1st Qu.:15.00 1st Qu.: 387.5
## Median :120.0 Median :447.7 Median :18.00 Median : 670.0
## Mean :138.3 Mean :459.6 Mean :18.75 Mean : 800.0
## 3rd Qu.:210.0 3rd Qu.:543.8 3rd Qu.:21.00 3rd Qu.: 992.5
## Max. :390.0 Max. :798.7 Max. :30.00 Max. :2600.0
## Y
## Min. : 20.0
## 1st Qu.: 97.5
## Median :160.0
## Mean :198.4
## 3rd Qu.:300.0
## Max. :600.0
################## Modelling Code ###################
# Create Predictor matrix & Response vector for both train & test set
x_train <- as.matrix(train[ , -1]) # predictor matrix for the training set.
y_train <- train[ , 1] # response vector for the training set.
x_test <- as.matrix(test[ , -1]) # predictor matrix for the test set.
y_test <- test[ , 1] # response vector for the test set.
# Excluding variables V.1 and V.3 from the model and adding predictor matrix
# in the training & test set
x_train_excl_V3 <- as.matrix(x_train[ , -3])
# View(x_train_excl_V3) # checking the correct variable has been removed
x_test_excl_V3 <- x_test[ , -3]
# View(x_test_excl_V3) # checking the correct variable has been removed
# a matrix is a two-dimensionsal collection of elements of the same data type
# (numeric, character, or logical) arranged into a fixed number of rows and columns.
## Standardizing the predictor variables ##
# Standardizing the predictor variables to have mean
# zero and unit variance. This is important for ridge regression
# because it ensures that the penalty term is applied equally to
# all the coefficients. Using scale function to do this.
x_train <- scale(x_train) # standardize the training predictors
x_test <- scale(x_test) # standardize the test predictors
x_train_excl_V3 <- scale(x_train_excl_V3)
##### Build Training Model - Cross Validation using Ridge Regression ####
# To perform ridge regression, I’ll use functions from the glmnet package.
# I’ll use the glmnet() function to fit the ridge regression model
# and specify alpha=0 to select Ridge Regression
# Setting alpha equal to 1 is equivalent to using Lasso Regression and
# setting alpha to some value between 0 and 1 is equivalent to using an elastic net.
# I’ll use the default values of alpha and lambda and
# let the function choose the optimal values for us.
# Alpha = 0 corresponds to ridge regression,
# Alpha = 1 corresponds to lasso regression and
# 0 < alpha < 1 corresponds to elastic net regression,
# a combination of ridge and lasso.
# Ideally, producing multiple models and based on the lowest mean squared
# error and decide on the final model for implementation.
## Fitting the ridge regression model ##
# Model 1 for ridge regression
ridge_model <- glmnet(x_train, y_train, alpha = 0, standardize = FALSE)
# Model 2 for Lasso regression
lasso_model <- glmnet(x_train, y_train, alpha = 1, standardize = FALSE)
# Model 3 removing variable 3
ridge_model_excl_V3 <- glmnet(x_train_excl_V3, y_train, alpha = 0, standardize = FALSE)
# (The predictor value/ input features will need to be amended to exclude V3 in this example)
# Model 4 - Ridge Regression with lambda_min
ridge_model_lambda_min <- glmnet(x_train, y_train, alpha=0, lambda = ridge_model$lambda.min, standardize = FALSE)
# Model 5 Lasso regression with lambda_min
lasso_model_lambda.min <- glmnet(x_train, y_train, alpha = 1, lambda = lasso_model$lambda.min, standardize = FALSE)
# Note that by default, the glmnet() function standardizes the
# variables so that they are on the same scale. To turn off this default setting,
# use the argument standardize=FALSE.
# View summary model which will show the length, class, mode, and dimensions of the elements.
summary(ridge_model)
## Length Class Mode
## a0 100 -none- numeric
## beta 800 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
summary(ridge_model_lambda_min)
## Length Class Mode
## a0 100 -none- numeric
## beta 800 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
summary(lasso_model_lambda.min)
## Length Class Mode
## a0 79 -none- numeric
## beta 632 dgCMatrix S4
## df 79 -none- numeric
## dim 2 -none- numeric
## lambda 79 -none- numeric
## dev.ratio 79 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
# The corresponding values of lambda, beta, df, dev. ratio, and a0.
# The beta element is a sparse matrix, which means that it only stores
# the non-zero values of the coefficients.
# The df element is the degrees of freedom, the number of non-zero coefficients.
# The dev.ratio element is the fraction of deviance the model explains.
# The a0 element is the intercept term.
# Associated with each value of λ is a vector of ridge regression coefficients,
# stored in a matrix that can be accessed by coef()
# coef(model)
# To check the coefficients using the dim()
dim(coef(ridge_model))
## [1] 9 100
ridge_model
##
## Call: glmnet(x = x_train, y = y_train, alpha = 0, standardize = FALSE)
##
## Df %Dev Lambda
## 1 8 0.00 3724.0
## 2 8 0.41 3393.0
## 3 8 0.45 3092.0
## 4 8 0.49 2817.0
## 5 8 0.54 2567.0
## 6 8 0.59 2339.0
## 7 8 0.64 2131.0
## 8 8 0.71 1942.0
## 9 8 0.77 1769.0
## 10 8 0.85 1612.0
## 11 8 0.93 1469.0
## 12 8 1.02 1338.0
## 13 8 1.11 1219.0
## 14 8 1.22 1111.0
## 15 8 1.33 1012.0
## 16 8 1.46 922.4
## 17 8 1.60 840.5
## 18 8 1.74 765.8
## 19 8 1.91 697.8
## 20 8 2.08 635.8
## 21 8 2.28 579.3
## 22 8 2.48 527.8
## 23 8 2.71 481.0
## 24 8 2.96 438.2
## 25 8 3.22 399.3
## 26 8 3.51 363.8
## 27 8 3.82 331.5
## 28 8 4.16 302.1
## 29 8 4.52 275.2
## 30 8 4.91 250.8
## 31 8 5.32 228.5
## 32 8 5.77 208.2
## 33 8 6.25 189.7
## 34 8 6.76 172.8
## 35 8 7.30 157.5
## 36 8 7.88 143.5
## 37 8 8.50 130.8
## 38 8 9.14 119.1
## 39 8 9.83 108.6
## 40 8 10.55 98.9
## 41 8 11.30 90.1
## 42 8 12.09 82.1
## 43 8 12.91 74.8
## 44 8 13.76 68.2
## 45 8 14.64 62.1
## 46 8 15.55 56.6
## 47 8 16.49 51.6
## 48 8 17.45 47.0
## 49 8 18.43 42.8
## 50 8 19.43 39.0
## 51 8 20.44 35.5
## 52 8 21.47 32.4
## 53 8 22.51 29.5
## 54 8 23.56 26.9
## 55 8 24.61 24.5
## 56 8 25.67 22.3
## 57 8 26.74 20.3
## 58 8 27.80 18.5
## 59 8 28.87 16.9
## 60 8 29.93 15.4
## 61 8 30.99 14.0
## 62 8 32.05 12.8
## 63 8 33.11 11.6
## 64 8 34.16 10.6
## 65 8 35.21 9.7
## 66 8 36.25 8.8
## 67 8 37.28 8.0
## 68 8 38.30 7.3
## 69 8 39.32 6.7
## 70 8 40.32 6.1
## 71 8 41.30 5.5
## 72 8 42.27 5.0
## 73 8 43.23 4.6
## 74 8 44.16 4.2
## 75 8 45.06 3.8
## 76 8 45.95 3.5
## 77 8 46.80 3.2
## 78 8 47.63 2.9
## 79 8 48.42 2.6
## 80 8 49.18 2.4
## 81 8 49.91 2.2
## 82 8 50.60 2.0
## 83 8 51.25 1.8
## 84 8 51.87 1.6
## 85 8 52.45 1.5
## 86 8 52.98 1.4
## 87 8 53.49 1.2
## 88 8 53.95 1.1
## 89 8 54.37 1.0
## 90 8 54.77 0.9
## 91 8 55.12 0.9
## 92 8 55.45 0.8
## 93 8 55.74 0.7
## 94 8 56.01 0.7
## 95 8 56.25 0.6
## 96 8 56.47 0.5
## 97 8 56.66 0.5
## 98 8 56.83 0.4
## 99 8 56.98 0.4
## 100 8 57.12 0.4
# Associated with each value of λ is a vector of ridge regression coefficients,
# stored in a matrix that can be accessed by coef(). In this case, it is a
# 5X100 matrix, with 5 rows (one for each predictor, plus an intercept) and
# 100 columns (one for each value of lambda).
ridge_model$lambda[20]
## [1] 635.7929
coef(ridge_model) [ , 20]
## (Intercept) V.2 V.3 V.4 V.5 V.6
## 9.534412955 -0.023183111 -0.021753041 -0.020171515 -0.020727795 -0.037175004
## V.7 V.8 Y
## 0.009042002 -0.027449376 -0.017414847
# Plotting the model object using plot function, shows coef change as a func of lambda
plot(ridge_model, xvar = "lambda", label = TRUE)# plot the coefficients vs lambda
# The x-axis is on a log scale (Log Lambda), so the smaller lambda
# values are on the right, and the larger values are on the left.
# The y-axis shows the values of the coefficients, and each line
# corresponds to a different predictor variable.
# In this example, when lambda log is 8, the coefficients are essentially zero.
# When we relax lambda the coefficients grow away from zero in a smooth way.
# The sum of squares of the coefficients are getting bigger and bigger until
# we reach a point where Lambda is effectively zero & the coefficients
# are regularized & so these would be the coefficients that you get from an
# ordinary least squares fit of these variables.
### Perform k-fold cross-validation to find optimal lambda value ###
# Next, we’ll identify the lambda value that produces the
# lowest test mean squared error (MSE) by using k-fold
# cross-validation a technique that splits the data into
# several subsets and uses some for training and some for testing.
# glmnet has the function cv.glmnet() that automatically
# performs k-fold cross validation. A fold is a subset of the
# data used for testing, while the rest is used for training.
# The default value is 10, meaning the data is split into 10 subsets,
# and each subset is used as a test set once.
# Model 1 - Ridge Regression cross validation
cv_ridge_model <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5)
# Performs 5 fold cross validation on ridge model
# Model 2 - Lasso Regression cross validation
cv_lasso_model <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5)
# Performs 5 fold cross validation on lasso model
# Model 3 - Ridge Regression with variable 3 excluded
cv_ridge_model_excl_V3 <- cv.glmnet(x_train_excl_V3, y_train, alpha = 1, nfolds = 5)
# Model 4 - Ridge Regression with lambda.min
# ridge_model_lambda_min <- glmnet(x_train, y_train, alpha=0, lambda = cv_ridge_model$lambda.min)
cv_ridge_model_lambda_min <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5)
# Model 5 - Lasso Regression with lambda.min
cv_lasso_model_lambda_min <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5)
summary(cv_ridge_model)
## Length Class Mode
## lambda 79 -none- numeric
## cvm 79 -none- numeric
## cvsd 79 -none- numeric
## cvup 79 -none- numeric
## cvlo 79 -none- numeric
## nzero 79 -none- numeric
## call 5 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## index 2 -none- numeric
summary(ridge_model_lambda_min)
## Length Class Mode
## a0 100 -none- numeric
## beta 800 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
# The cvm element is the mean cross-validated error for each value of lambda.
# The cvsd element is the standard deviation of the cross-validated error for
# each lambda value.
# The cvup and cvlo elements are the upper and lower confidence bounds
# for the cross-validated error for each lambda value.
# The nzero element is the number of non-zero coefficients for each value of lambda.
# The lambda.min element is the value of lambda that gives the minimum
# cross-validated error.
# The lambda.1se element is the largest value of lambda, giving a
# cross-validated error within one standard error of the minimum.
## plotting the cv_model object using the plot function ##
# Shows the cross-validated error changes as a function of lambda.
# The x-axis is on a log scale, so the smaller lambda values are on
# the right, and the larger values are on the left.
# The y-axis shows the values of the cross-validated error, and the
# error bars show the confidence bounds.
# The vertical dotted lines indicate the values of lambda that give
# the minimum cross-validated error and the largest error within
# one standard error of the minimum.
plot(cv_ridge_model) # plot the cross-validated error vs lambda

# This is a plot of the cross validated MSE and from the right hand side
# it dips downs. In the beginning the MSE is very high and the coefficients
# are restricted to be too small and then it starts to level off.
# This indicates that the full model is doing a good job.
# There are two vertical lines:
# The first one indicates the min. MSE.
# The second indicates the one standard error of the min. MSE.
# This is a more restricted model that can do as well as the min. MSE and
# we can decide to use this value instead of the min. MSE.
plot(cv_lasso_model) # Model 2 Lasso Regression
plot(cv_ridge_model_excl_V3) # Model 3 Ridge Regression with V3 removed

plot(cv_ridge_model_lambda_min) # Model 4 Ridge regression with lambda min.
plot(cv_lasso_model_lambda_min) # Model 5 Lasso regression with lambda min.

### This will show the results of the best results from each model. ###
cv_ridge_model
##
## Call: cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.109 39 30.14 12.149 6
## 1se 3.731 1 40.42 2.702 0
cv_lasso_model
##
## Call: cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.5414 96 22.95 2.874 8
## 1se 2.8891 78 25.69 1.568 8
cv_ridge_model_excl_V3
##
## Call: cv.glmnet(x = x_train_excl_V3, y = y_train, nfolds = 5, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.082 42 29.87 12.215 5
## 1se 3.731 1 40.32 2.319 0
cv_ridge_model_lambda_min
##
## Call: cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.6521 94 20.59 2.921 8
## 1se 2.8891 78 23.27 1.801 8
cv_lasso_model_lambda_min
##
## Call: cv.glmnet(x = x_train, y = y_train, nfolds = 5, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.7157 93 20.92 2.064 8
## 1se 2.6324 79 22.97 2.140 8
# This will pick the coefficient corresponding to the best model.
coef(cv_ridge_model)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 9.534413
## V.2 .
## V.3 .
## V.4 .
## V.5 .
## V.6 .
## V.7 .
## V.8 .
## Y .
coef(cv_lasso_model_lambda_min)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 9.53441296
## V.2 -0.64990363
## V.3 -0.45393667
## V.4 0.38171489
## V.5 -0.09739081
## V.6 -2.44145999
## V.7 0.73682873
## V.8 -1.73367451
## Y 0.23855216
coef(ridge_model_lambda_min)
## 9 x 100 sparse Matrix of class "dgCMatrix"
## [[ suppressing 100 column names 's0', 's1', 's2' ... ]]
##
## (Intercept) 9.534413e+00 9.534412955 9.534412955 9.534412955 9.534412955
## V.2 -2.393481e-36 -0.004486371 -0.004920155 -0.005395506 -0.005916320
## V.3 -2.247898e-36 -0.004212768 -0.004620024 -0.005066286 -0.005555210
## V.4 -2.108171e-36 -0.003942568 -0.004322814 -0.004739295 -0.005195372
## V.5 -2.153008e-36 -0.004031027 -0.004420300 -0.004846769 -0.005313906
## V.6 -3.761467e-36 -0.007077590 -0.007764819 -0.008518479 -0.009344929
## V.7 9.063929e-37 0.001708537 0.001874760 0.002057118 0.002257168
## V.8 -2.809527e-36 -0.005274764 -0.005785703 -0.006345780 -0.006959650
## Y -1.820541e-36 -0.003404471 -0.003732803 -0.004092417 -0.004486216
##
## (Intercept) 9.534412955 9.534412955 9.534412955 9.534412955 9.534412955
## V.2 -0.006486852 -0.007111736 -0.007796017 -0.008545178 -0.009365178
## V.3 -0.006090786 -0.006677356 -0.007319649 -0.008022802 -0.008792396
## V.4 -0.005694704 -0.006241263 -0.006839362 -0.007493675 -0.008209263
## V.5 -0.005825493 -0.006385643 -0.006998828 -0.007669901 -0.008404129
## V.6 -0.010251129 -0.011244686 -0.012333916 -0.013527907 -0.014836585
## V.7 0.002476618 0.002717337 0.002981375 0.003270973 0.003588586
## V.8 -0.007632393 -0.008369554 -0.009177177 -0.010061850 -0.011030748
## Y -0.004917358 -0.005389271 -0.005905678 -0.006470612 -0.007088442
##
## (Intercept) 9.534412955 9.534412955 9.534412955 9.534412955 9.534412955
## V.2 -0.010262485 -0.011244108 -0.012317641 -0.013491291 -0.014774154
## V.3 -0.009634488 -0.010555641 -0.011562960 -0.012664123 -0.013867545
## V.4 -0.008991600 -0.009846590 -0.010780600 -0.011800475 -0.012913644
## V.5 -0.009207216 -0.010085332 -0.011045144 -0.012093843 -0.013239196
## V.6 -0.016270785 -0.017842331 -0.019564118 -0.021450201 -0.023515890
## V.7 0.003936903 0.004318862 0.004737681 0.005196876 0.005700291
## V.8 -0.012091676 -0.013253125 -0.014524315 -0.015915255 -0.017436792
## Y -0.007763889 -0.008502049 -0.009308416 -0.010188894 -0.011149821
##
## (Intercept) 9.534412955 9.534412955 9.534412955 9.534412955 9.534412955
## V.2 -0.016175392 -0.017705466 -0.019375419 -0.021197085 -0.023183111
## V.3 -0.015181942 -0.016617015 -0.018183073 -0.019891153 -0.021753041
## V.4 -0.014127832 -0.015451502 -0.016893594 -0.018463591 -0.020171515
## V.5 -0.014489477 -0.015853622 -0.017341170 -0.018962290 -0.020727795
## V.6 -0.025777842 -0.028254182 -0.030964602 -0.033930480 -0.037175004
## V.7 0.006252128 0.006856976 0.007519843 0.008246198 0.009042002
## V.8 -0.019100678 -0.020919617 -0.022907328 -0.025078607 -0.027449376
## Y -0.012197983 -0.013340624 -0.014585452 -0.015940646 -0.017414847
##
## (Intercept) 9.534412955 9.53441296 9.53441296 9.53441296 9.53441296
## V.2 -0.025346979 -0.02770301 -0.03026638 -0.03305309 -0.03607995
## V.3 -0.023781289 -0.02598922 -0.02839093 -0.03100126 -0.03383578
## V.4 -0.022027909 -0.02404381 -0.02623069 -0.02860042 -0.03116518
## V.5 -0.022649143 -0.02473842 -0.02700833 -0.02947216 -0.03214369
## V.6 -0.040723297 -0.04460254 -0.04884212 -0.05347373 -0.05853156
## V.7 0.009913755 0.01086854 0.01191406 0.01305869 0.01431156
## V.8 -0.030036745 -0.03285906 -0.03593595 -0.03928837 -0.04293862
## Y -0.019017145 -0.02075705 -0.02264446 -0.02468961 -0.02690299
##
## (Intercept) 9.53441296 9.53441296 9.53441296 9.53441296 9.53441296
## V.2 -0.03936453 -0.04292511 -0.04678057 -0.05095028 -0.05545396
## V.3 -0.03691074 -0.04024299 -0.04384991 -0.04774924 -0.05195902
## V.4 -0.03393730 -0.03692921 -0.04015318 -0.04362117 -0.04734455
## V.5 -0.03503719 -0.03816725 -0.04154869 -0.04519640 -0.04912516
## V.6 -0.06405234 -0.07007558 -0.07664357 -0.08380160 -0.09159798
## V.7 0.01568255 0.01718239 0.01882272 0.02061613 0.02257622
## V.8 -0.04691037 -0.05122870 -0.05592001 -0.06101209 -0.06653402
## Y -0.02929525 -0.03187709 -0.03465912 -0.03765165 -0.04086453
##
## (Intercept) 9.53441296 9.53441296 9.53441296 9.53441296 9.53441296
## V.2 -0.06031149 -0.06554269 -0.07116707 -0.07720352 -0.08366998
## V.3 -0.05649729 -0.06138199 -0.06663060 -0.07225989 -0.07828557
## V.4 -0.05133379 -0.05559815 -0.06014523 -0.06498056 -0.07010704
## V.5 -0.05334937 -0.05788282 -0.06273838 -0.06792759 -0.07346031
## V.6 -0.10008418 -0.10931484 -0.11934789 -0.13024448 -0.14206904
## V.7 0.02471767 0.02705634 0.02960925 0.03239472 0.03543238
## V.8 -0.07251609 -0.07898975 -0.08598747 -0.09354260 -0.10168921
## Y -0.04430683 -0.04798660 -0.05191052 -0.05608349 -0.06050824
##
## (Intercept) 9.53441296 9.53441296 9.53441296 9.53441296 9.53441296
## V.2 -0.09058303 -0.09795753 -0.10580610 -0.11413871 -0.12296215
## V.3 -0.08472190 -0.09158129 -0.09887382 -0.10660675 -0.11478409
## V.4 -0.07552440 -0.08122859 -0.08721106 -0.09345813 -0.09995020
## V.5 -0.07934426 -0.08558451 -0.09218298 -0.09913788 -0.10644316
## V.6 -0.15488919 -0.16877567 -0.18380224 -0.20004546 -0.21758453
## V.7 0.03874324 0.04234971 0.04627569 0.05054653 0.05518910
## V.8 -0.11046189 -0.11989558 -0.13002534 -0.14088610 -0.15251254
## Y -0.06518483 -0.07011018 -0.07527747 -0.08067562 -0.08628871
##
## (Intercept) 9.53441296 9.53441296 9.5344130 9.5344130 9.53441296
## V.2 -0.13227956 -0.14208210 -0.1523779 -0.1631504 -0.17438393
## V.3 -0.12340603 -0.13246791 -0.1419623 -0.1518748 -0.16218611
## V.4 -0.10666104 -0.11355965 -0.1206005 -0.1277351 -0.13490409
## V.5 -0.11408794 -0.12205785 -0.1303277 -0.1388703 -0.14765062
## V.6 -0.23650100 -0.25687887 -0.2788028 -0.3023596 -0.32763670
## V.7 0.06023174 0.06570427 0.0716379 0.0780652 0.08501994
## V.8 -0.16493883 -0.17819879 -0.1923248 -0.2073492 -0.22330347
## Y -0.09209535 -0.09806811 -0.1041732 -0.1103698 -0.11660958
##
## (Intercept) 9.53441296 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130
## V.2 -0.18605747 -0.1981447 -0.2106138 -0.2234284 -0.2365472 -0.2499253
## V.3 -0.17287130 -0.1838998 -0.1952351 -0.2068356 -0.2186541 -0.2306390
## V.4 -0.14203856 -0.1490596 -0.1558784 -0.1623963 -0.1685047 -0.1740862
## V.5 -0.15662649 -0.1657487 -0.1749608 -0.1841996 -0.1933951 -0.2024712
## V.6 -0.35472183 -0.3837024 -0.4146649 -0.4476942 -0.4828727 -0.5202796
## V.7 0.09253698 0.1006520 0.1094014 0.1188218 0.1289499 0.1398216
## V.8 -0.24021875 -0.2581260 -0.2770565 -0.2970427 -0.3181186 -0.3403209
## Y -0.12283688 -0.1289882 -0.1349922 -0.1407704 -0.1462368 -0.1512991
##
## (Intercept) 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130
## V.2 -0.2635148 -0.2772657 -0.2911274 -0.3050498 -0.3189320 -0.3328210
## V.3 -0.2427347 -0.2548826 -0.2670216 -0.2790901 -0.2910208 -0.3027676
## V.4 -0.1790148 -0.1831572 -0.1863735 -0.1885191 -0.1894716 -0.1890416
## V.5 -0.2113464 -0.2199342 -0.2281449 -0.2358856 -0.2430815 -0.2496057
## V.6 -0.5599898 -0.6020726 -0.6465910 -0.6936002 -0.7431493 -0.7952662
## V.7 0.1514720 0.1639348 0.1772408 0.1914184 0.2064925 0.2224810
## V.8 -0.3636899 -0.3882706 -0.4141138 -0.4412774 -0.4698270 -0.4998346
## Y -0.1558590 -0.1598134 -0.1630553 -0.1654751 -0.1669572 -0.1673964
##
## (Intercept) 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130
## V.2 -0.3466365 -0.3603456 -0.3739235 -0.3873545 -0.4006333 -0.4137655
## V.3 -0.3142665 -0.3254643 -0.3363127 -0.3467690 -0.3567968 -0.3663663
## V.4 -0.1870961 -0.1834870 -0.1780692 -0.1707029 -0.1612545 -0.1495991
## V.5 -0.2553782 -0.2603067 -0.2643021 -0.2672788 -0.2691567 -0.2698617
## V.6 -0.8499761 -0.9072867 -0.9671897 -1.0296593 -1.0946503 -1.1620971
## V.7 0.2393981 0.2572507 0.2760378 0.2957496 0.3163667 0.3378590
## V.8 -0.5313845 -0.5645696 -0.5994922 -0.6362637 -0.6750038 -0.7158391
## Y -0.1666799 -0.1647000 -0.1613527 -0.1565393 -0.1501671 -0.1421506
##
## (Intercept) 9.5344130 9.5344130 9.5344130 9.53441296 9.53441296
## V.2 -0.4267683 -0.4396706 -0.4525130 -0.46515209 -0.47802420
## V.3 -0.3754548 -0.3840462 -0.3921307 -0.39976113 -0.40686594
## V.4 -0.1356219 -0.1192198 -0.1003035 -0.07900570 -0.05490795
## V.5 -0.2693271 -0.2674946 -0.2643161 -0.25979861 -0.25379989
## V.6 -1.2319124 -1.3039863 -1.3781865 -1.45433852 -1.53228489
## V.7 0.3601854 0.3832931 0.4071174 0.43159029 0.45660992
## V.8 -0.7589009 -0.8043227 -0.8522372 -0.90268017 -0.95590919
## Y -0.1324124 -0.1208837 -0.1075050 -0.09220825 -0.07500598
##
## (Intercept) 9.53441296 9.534412955 9.53441296 9.53441296 9.53441296
## V.2 -0.49103400 -0.504266795 -0.51781519 -0.53177746 -0.54597130
## V.3 -0.41347740 -0.419603158 -0.42525107 -0.43042774 -0.43526475
## V.4 -0.02812692 0.001360286 0.03355605 0.06844075 0.10562324
## V.5 -0.24635677 -0.237464774 -0.22713515 -0.21539616 -0.20221605
## V.6 -1.61181966 -1.692724251 -1.77476297 -1.85768544 -1.94116165
## V.7 0.48208493 0.507908201 0.53396495 0.56013454 0.58631677
## V.8 -1.01197189 -1.070955804 -1.13292598 -1.19791987 -1.26570401
## Y -0.05585134 -0.034732582 -0.01164990 0.01338446 0.04026687
##
## (Intercept) 9.53441296 9.53441296 9.5344130 9.5344130 9.5344130
## V.2 -0.56106309 -0.57689741 -0.5935826 -0.6109031 -0.6295970
## V.3 -0.43956643 -0.44340199 -0.4467556 -0.4497545 -0.4521201
## V.4 0.14568559 0.18825022 0.2332089 0.2800363 0.3293327
## V.5 -0.18770175 -0.17192673 -0.1549870 -0.1367780 -0.1177105
## V.6 -2.02502318 -2.10895348 -2.1926749 -2.2758209 -2.3582699
## V.7 0.61234467 0.63811224 0.6634983 0.6884284 0.7127252
## V.8 -1.33664886 -1.41051353 -1.4871869 -1.5661971 -1.6479041
## Y 0.06903048 0.09959555 0.1318920 0.1656202 0.2009766
##
## (Intercept) 9.53441296 9.53441296 9.53441296 9.53441296 9.53441296
## V.2 -0.64945693 -0.67057461 -0.69271188 -0.71660207 -0.74198472
## V.3 -0.45390464 -0.45505128 -0.45569169 -0.45540443 -0.45425438
## V.4 0.38057730 0.43358015 0.48773155 0.54361722 0.60061751
## V.5 -0.09783206 -0.07730442 -0.05581313 -0.03431348 -0.01267964
## V.6 -2.43969791 -2.51985375 -2.59838243 -2.67527269 -2.75022863
## V.7 0.73632270 0.75913476 0.78115431 0.80220490 0.82229278
## V.8 -1.73182226 -1.81766993 -1.90473212 -1.99339323 -2.08293888
## Y 0.23773598 0.27577467 0.31447829 0.35446979 0.39527557
##
## (Intercept) 9.534412955 9.53441296 9.5344130 9.53441296 9.53441296
## V.2 -0.768918646 -0.79719756 -0.8274124 -0.85930001 -0.89271958
## V.3 -0.452149150 -0.44922484 -0.4449498 -0.43943058 -0.43276655
## V.4 0.658496218 0.71668421 0.7756400 0.83477436 0.89363069
## V.5 0.008893561 0.03104782 0.0521298 0.07260134 0.09323983
## V.6 -2.823066495 -2.89349543 -2.9616184 -3.02720241 -3.09004138
## V.7 0.841385259 0.85956092 0.8766345 0.89269249 0.90786024
## V.8 -2.172972053 -2.26265845 -2.3524109 -2.44142476 -2.52892234
## Y 0.436737408 0.47786064 0.5199410 0.56216753 0.60341169
##
## (Intercept) 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130
## V.2 -0.9280584 -0.9650992 -1.0038216 -1.0441929 -1.0861673 -1.1296858
## V.3 -0.4244804 -0.4146838 -0.4033056 -0.3902934 -0.3756110 -0.3592398
## V.4 0.9525019 1.0109182 1.0686861 1.1256337 1.1816075 1.2364730
## V.5 0.1121343 0.1299273 0.1464824 0.1616841 0.1754356 0.1876579
## V.6 -3.1502778 -3.2077559 -3.2624448 -3.3143338 -3.3634315 -3.4097638
## V.7 0.9219546 0.9351017 0.9473381 0.9587047 0.9692456 0.9790062
## V.8 -2.6152790 -2.6997742 -2.7820826 -2.8619147 -2.9390197 -3.0131870
## Y 0.6453000 0.6868824 0.7280300 0.7686231 0.8085560 0.8477384
##
## (Intercept) 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130 9.5344130
## V.2 -1.1746751 -1.2210482 -1.2677078 -1.3175130 -1.3673822 -1.4181346
## V.3 -0.3411802 -0.3214521 -0.3003004 -0.2771879 -0.2527608 -0.2269537
## V.4 1.2901145 1.3424346 1.3927191 1.4428350 1.4907447 1.5371207
## V.5 0.1982901 0.2072885 0.2131191 0.2200832 0.2242677 0.2266450
## V.6 -3.4533731 -3.4943158 -3.5327196 -3.5684981 -3.6018869 -3.6329502
## V.7 0.9880327 0.9963712 1.0038955 1.0111328 1.0176932 1.0237112
## V.8 -3.0842473 -3.1520718 -3.2167169 -3.2777472 -3.3354250 -3.3897642
## Y 0.8860959 0.9235706 0.9617581 0.9959394 1.0303721 1.0640065
#### Let's store the validation results! They will be useful to compare against the test set results.
# Model 1 Ridge model MSE & RMSE
cross_validation_ridge_MSE <- min(cv_ridge_model$cvm)
cross_validation_ridge_RMSE <- sqrt(cross_validation_ridge_MSE)
# Model 2 Lasso model MSE & RMSE
cross_validation_lasso_MSE <- min(cv_lasso_model$cvm)
cross_validation_lasso_RMSE <- sqrt(cross_validation_lasso_MSE)
# Model 3 Ridge model excluding Var. 3 MSE & RMSE
cross_validation_ridge_excl_V3_MSE <- min(cv_ridge_model_excl_V3$cvm)
cross_validation_ridge_excl_V3_RMSE <- sqrt(cross_validation_ridge_excl_V3_MSE)
# Model 4 Ridge model with lambda min. MSE & RMSE
cross_validation_ridge_lambda_min <- min(cv_ridge_model_lambda_min$cvm)
cross_validation_ridge_lambda_min_RMSE <- sqrt(cross_validation_ridge_lambda_min) # The square root of the MSE of the ridge regression model excl V3
# Model 5 Lasso model with lambda min.
cross_validation_lasso_lambda_min <- min(cv_lasso_model_lambda_min$cvm)
cross_validation_lasso_lambda_min_RMSE <- sqrt(cross_validation_lasso_lambda_min) # The square root of the MSE of the ridge regression model excl V3
# Create a data frame to store the MSE and RMSE values
model_comparison <- data.frame(
Model = c("Ridge", "Lasso", "Ridge Excl. V3", "Ridge Lambda Min", "Lasso Lambda Min"),
MSE = c(cross_validation_ridge_MSE, cross_validation_lasso_MSE, cross_validation_ridge_excl_V3_MSE, cross_validation_ridge_lambda_min, cross_validation_lasso_lambda_min),
RMSE = c(cross_validation_ridge_RMSE, cross_validation_lasso_RMSE, cross_validation_ridge_excl_V3_RMSE, cross_validation_ridge_lambda_min_RMSE, cross_validation_lasso_lambda_min_RMSE)
)
# Print the data frame
print(model_comparison)
## Model MSE RMSE
## 1 Ridge 30.13858 5.489862
## 2 Lasso 22.94890 4.790501
## 3 Ridge Excl. V3 29.86710 5.465080
## 4 Ridge Lambda Min 20.59304 4.537955
## 5 Lasso Lambda Min 20.91619 4.573422
# min() can be used to get the smallest MSE from the evaluated lambda values.
# The square root of the MSE of the ridge regression model
### Find optimal lambda value that minimizes test MSE ###
# From the above results we can see the Ridge Lamda Min has produced the lowest MSE.
# Model MSE RMSE
# 1 Ridge 30.13858 5.489862
# 2 Lasso 22.94890 4.790501
# 3 Ridge Excl. V3 29.86710 5.465080
# 4 Ridge Lambda Min 20.59304 4.537955
# 5 Lasso Lambda Min 20.91619 4.573422
# I've selected the optimal value of lambda, it's time for me
# to assess the performance of the ridge regression model on the test set.
# I'll utilize the predict function to generate predictions of the response variable f
# or the test set, employing the ridge regression model with the chosen lambda value.
# Next, I'll compare these predicted values with the actual values and compute
# various metrics to gauge the accuracy of the predictions,
# including Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), and the
# coefficient of determination (R-squared).
# Using the predict function,
# I'll generate the predicted values for the test set using prediction func;
### To determine the optimal lambda value that minimizes cross-validated error, element.
# Lower values of lambda indicate stronger regularization,
# while higher values indicate weaker regularization
cv_ridge_model_lambda_min$lambda.min
## [1] 0.6520751
#################### Final Evaluation ####################
test <- na.omit(test) # Remove NA value from test set
anyNA(test) # If there are any missing values in the test set they will need to be handled.
## [1] FALSE
# The test set predictor matrix has already been created earlier.
x_test <- as.matrix(test[ , -1]) # predictor matrix for the test set.
y_test <- test[ , 1]
x_test <- scale(x_test)
is.infinite(x_test)
## V.2 V.3 V.4 V.5 V.6 V.7 V.8 Y
## 5 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 10 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 15 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 18 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 21 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 22 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 24 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 26 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 32 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 33 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 39 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 40 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 46 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 47 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 49 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 52 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 58 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 60 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 64 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 65 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 66 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 68 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 69 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 73 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 74 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 84 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 92 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 97 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 99 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 101 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 105 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 113 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 114 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 115 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 116 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 119 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 120 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 121 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 123 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 125 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 126 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 128 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 132 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 137 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 140 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 147 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 152 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 157 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 158 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 159 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 160 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 162 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 170 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 174 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 179 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 183 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 186 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 187 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 188 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 190 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 194 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 202 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 203 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 207 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 214 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 216 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 220 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 223 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 228 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 230 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 236 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 237 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 245 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 246 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 248 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 249 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 253 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 258 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 259 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 260 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 266 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 268 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 270 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 272 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 279 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 280 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 284 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 285 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 288 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 291 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 292 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 293 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 294 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 296 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 299 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 300 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 302 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 303 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 305 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 307 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 309 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 316 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 318 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 323 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 324 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 325 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 329 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 331 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 334 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 336 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 340 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 348 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 351 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 352 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 357 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 358 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 365 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 366 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 367 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 368 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
is.infinite(y_test)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE
head(x_test)
## V.2 V.3 V.4 V.5 V.6 V.7 V.8
## 5 -0.16150584 -0.12469449 1.1920213 2.5705410 1.0061656 -0.50491007 3.8283033
## 10 0.06911469 0.05724249 0.8962365 1.2679695 1.5202055 -0.95745910 2.3723086
## 15 0.72638319 0.63944082 3.2069717 2.3844594 0.5006565 3.11548214 4.1403022
## 18 0.81286589 0.85776520 1.0939840 0.3375613 2.3910401 4.47312923 0.1883167
## 21 1.12996911 0.67582822 2.3102485 0.9888471 1.0886295 -0.05236104 1.6443113
## 22 1.32599656 0.80318411 3.1467440 1.3610104 1.1029971 -0.05236104 2.8923067
## Y
## 5 2.460813
## 10 1.055977
## 15 3.196680
## 18 1.256668
## 21 1.055977
## 22 1.524256
head(y_test)
## [1] 1 1 1 1 1 1
# Check if the test set is scaled similarly to the training set
summary(x_test)
## V.2 V.3 V.4 V.5
## Min. :-0.8880 Min. :-0.7069 Min. :-0.8885 Min. :-1.3372
## 1st Qu.:-0.5824 1st Qu.:-0.4522 1st Qu.:-0.6888 1st Qu.:-0.7789
## Median :-0.2941 Median :-0.2339 Median :-0.3424 Median :-0.2207
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2305 3rd Qu.: 0.1118 3rd Qu.: 0.3140 3rd Qu.: 0.6167
## Max. : 7.3567 Max. : 8.2808 Max. : 4.8023 Max. : 3.5940
## V.6 V.7 V.8 Y
## Min. :-1.57311 Min. :-1.71171 Min. :-1.0181 Min. :-1.3523
## 1st Qu.:-0.78200 1st Qu.:-0.50491 1st Qu.:-0.6749 1st Qu.:-0.8171
## Median :-0.08592 Median :-0.05236 Median :-0.3317 Median :-0.2820
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.56332 3rd Qu.: 0.40019 3rd Qu.: 0.2923 3rd Qu.: 0.7215
## Max. : 4.56528 Max. : 4.47313 Max. : 4.1403 Max. : 3.1967
# Make test set predictions.
predictions <- predict(ridge_model_lambda_min, s = ridge_model_lambda_min$lambda, newx = x_test)
summary(predictions) # Verify the prediction process
## s1 s2 s3 s4
## Min. :9.534 Min. :9.465 Min. :9.458 Min. :9.451
## 1st Qu.:9.534 1st Qu.:9.521 1st Qu.:9.519 1st Qu.:9.518
## Median :9.534 Median :9.540 Median :9.541 Median :9.542
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.534 3rd Qu.:9.549 3rd Qu.:9.551 3rd Qu.:9.552
## Max. :9.534 Max. :9.566 Max. :9.569 Max. :9.573
## s5 s6 s7 s8
## Min. :9.443 Min. :9.434 Min. :9.424 Min. :9.414
## 1st Qu.:9.516 1st Qu.:9.515 1st Qu.:9.513 1st Qu.:9.511
## Median :9.542 Median :9.543 Median :9.544 Median :9.545
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.554 3rd Qu.:9.556 3rd Qu.:9.558 3rd Qu.:9.560
## Max. :9.576 Max. :9.580 Max. :9.585 Max. :9.590
## s9 s10 s11 s12
## Min. :9.402 Min. :9.390 Min. :9.376 Min. :9.360
## 1st Qu.:9.508 1st Qu.:9.506 1st Qu.:9.503 1st Qu.:9.500
## Median :9.546 Median :9.547 Median :9.548 Median :9.549
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.563 3rd Qu.:9.565 3rd Qu.:9.568 3rd Qu.:9.572
## Max. :9.595 Max. :9.601 Max. :9.607 Max. :9.614
## s13 s14 s15 s16
## Min. :9.344 Min. :9.326 Min. :9.306 Min. :9.284
## 1st Qu.:9.497 1st Qu.:9.493 1st Qu.:9.489 1st Qu.:9.485
## Median :9.551 Median :9.552 Median :9.554 Median :9.556
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.575 3rd Qu.:9.579 3rd Qu.:9.583 3rd Qu.:9.588
## Max. :9.622 Max. :9.630 Max. :9.639 Max. :9.649
## s17 s18 s19 s20
## Min. :9.261 Min. :9.235 Min. :9.206 Min. :9.176
## 1st Qu.:9.480 1st Qu.:9.475 1st Qu.:9.470 1st Qu.:9.464
## Median :9.558 Median :9.560 Median :9.563 Median :9.565
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.593 3rd Qu.:9.598 3rd Qu.:9.604 3rd Qu.:9.611
## Max. :9.660 Max. :9.672 Max. :9.685 Max. :9.699
## s21 s22 s23 s24
## Min. :9.142 Min. :9.106 Min. :9.066 Min. :9.023
## 1st Qu.:9.457 1st Qu.:9.450 1st Qu.:9.442 1st Qu.:9.433
## Median :9.568 Median :9.571 Median :9.575 Median :9.578
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.618 3rd Qu.:9.626 3rd Qu.:9.634 3rd Qu.:9.643
## Max. :9.715 Max. :9.731 Max. :9.750 Max. :9.770
## s25 s26 s27 s28
## Min. :8.976 Min. :8.925 Min. :8.870 Min. :8.810
## 1st Qu.:9.424 1st Qu.:9.414 1st Qu.:9.403 1st Qu.:9.391
## Median :9.582 Median :9.587 Median :9.591 Median :9.596
## Mean :9.534 Mean :9.534 Mean :9.534 Mean :9.534
## 3rd Qu.:9.653 3rd Qu.:9.664 3rd Qu.:9.676 3rd Qu.:9.689
## Max. :9.791 Max. :9.815 Max. :9.840 Max. :9.868
## s29 s30 s31 s32
## Min. :8.746 Min. :8.676 Min. :8.601 Min. : 8.520
## 1st Qu.:9.378 1st Qu.:9.364 1st Qu.:9.349 1st Qu.: 9.333
## Median :9.602 Median :9.608 Median :9.614 Median : 9.621
## Mean :9.534 Mean :9.534 Mean :9.534 Mean : 9.534
## 3rd Qu.:9.703 3rd Qu.:9.718 3rd Qu.:9.734 3rd Qu.: 9.752
## Max. :9.898 Max. :9.930 Max. :9.965 Max. :10.003
## s33 s34 s35 s36
## Min. : 8.433 Min. : 8.339 Min. : 8.239 Min. : 8.132
## 1st Qu.: 9.315 1st Qu.: 9.297 1st Qu.: 9.277 1st Qu.: 9.257
## Median : 9.628 Median : 9.636 Median : 9.644 Median : 9.653
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.: 9.771 3rd Qu.: 9.791 3rd Qu.: 9.813 3rd Qu.: 9.837
## Max. :10.043 Max. :10.087 Max. :10.133 Max. :10.184
## s37 s38 s39 s40
## Min. : 8.017 Min. : 7.895 Min. : 7.766 Min. : 7.629
## 1st Qu.: 9.236 1st Qu.: 9.213 1st Qu.: 9.190 1st Qu.: 9.165
## Median : 9.662 Median : 9.672 Median : 9.682 Median : 9.693
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.: 9.862 3rd Qu.: 9.889 3rd Qu.: 9.918 3rd Qu.: 9.948
## Max. :10.237 Max. :10.294 Max. :10.355 Max. :10.420
## s41 s42 s43 s44
## Min. : 7.484 Min. : 7.332 Min. : 7.172 Min. : 7.005
## 1st Qu.: 9.140 1st Qu.: 9.113 1st Qu.: 9.086 1st Qu.: 9.058
## Median : 9.704 Median : 9.716 Median : 9.728 Median : 9.741
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.: 9.981 3rd Qu.:10.015 3rd Qu.:10.051 3rd Qu.:10.090
## Max. :10.488 Max. :10.560 Max. :10.637 Max. :10.717
## s45 s46 s47 s48
## Min. : 6.830 Min. : 6.648 Min. : 6.460 Min. : 6.266
## 1st Qu.: 9.030 1st Qu.: 9.002 1st Qu.: 8.973 1st Qu.: 8.945
## Median : 9.754 Median : 9.772 Median : 9.794 Median : 9.818
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:10.127 3rd Qu.:10.163 3rd Qu.:10.200 3rd Qu.:10.237
## Max. :10.801 Max. :10.889 Max. :10.980 Max. :11.075
## s49 s50 s51 s52
## Min. : 6.067 Min. : 5.863 Min. : 5.654 Min. : 5.443
## 1st Qu.: 8.917 1st Qu.: 8.890 1st Qu.: 8.864 1st Qu.: 8.839
## Median : 9.837 Median : 9.849 Median : 9.876 Median : 9.905
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:10.277 3rd Qu.:10.322 3rd Qu.:10.380 3rd Qu.:10.440
## Max. :11.173 Max. :11.274 Max. :11.378 Max. :11.484
## s53 s54 s55 s56
## Min. : 5.230 Min. : 5.015 Min. : 4.800 Min. : 4.585
## 1st Qu.: 8.816 1st Qu.: 8.796 1st Qu.: 8.777 1st Qu.: 8.746
## Median : 9.936 Median : 9.945 Median : 9.947 Median : 9.959
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:10.490 3rd Qu.:10.533 3rd Qu.:10.569 3rd Qu.:10.620
## Max. :11.593 Max. :11.704 Max. :11.816 Max. :11.930
## s57 s58 s59 s60
## Min. : 4.372 Min. : 4.161 Min. : 3.954 Min. : 3.751
## 1st Qu.: 8.740 1st Qu.: 8.722 1st Qu.: 8.663 1st Qu.: 8.600
## Median : 9.959 Median : 9.956 Median : 9.950 Median :10.003
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:10.662 3rd Qu.:10.720 3rd Qu.:10.778 3rd Qu.:10.831
## Max. :12.044 Max. :12.159 Max. :12.274 Max. :12.389
## s61 s62 s63 s64
## Min. : 3.553 Min. : 3.361 Min. : 3.176 Min. : 2.998
## 1st Qu.: 8.534 1st Qu.: 8.465 1st Qu.: 8.505 1st Qu.: 8.457
## Median :10.062 Median :10.090 Median :10.121 Median :10.182
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:10.883 3rd Qu.:10.919 3rd Qu.:10.981 3rd Qu.:11.010
## Max. :12.504 Max. :12.618 Max. :12.731 Max. :12.843
## s65 s66 s67 s68
## Min. : 2.828 Min. : 2.666 Min. : 2.512 Min. : 2.367
## 1st Qu.: 8.370 1st Qu.: 8.278 1st Qu.: 8.267 1st Qu.: 8.253
## Median :10.234 Median :10.245 Median :10.257 Median :10.288
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:11.023 3rd Qu.:11.033 3rd Qu.:11.081 3rd Qu.:11.137
## Max. :12.953 Max. :13.061 Max. :13.168 Max. :13.273
## s69 s70 s71 s72
## Min. : 2.232 Min. : 2.106 Min. : 1.862 Min. : 1.540
## 1st Qu.: 8.177 1st Qu.: 8.123 1st Qu.: 8.021 1st Qu.: 7.959
## Median :10.290 Median :10.309 Median :10.305 Median :10.291
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:11.193 3rd Qu.:11.249 3rd Qu.:11.304 3rd Qu.:11.360
## Max. :13.375 Max. :13.475 Max. :13.573 Max. :13.668
## s73 s74 s75 s76
## Min. : 1.220 Min. : 0.9016 Min. : 0.5864 Min. : 0.2756
## 1st Qu.: 7.918 1st Qu.: 7.8949 1st Qu.: 7.8684 1st Qu.: 7.8385
## Median :10.280 Median :10.3186 Median :10.3191 Median :10.4190
## Mean : 9.534 Mean : 9.5344 Mean : 9.5344 Mean : 9.5344
## 3rd Qu.:11.415 3rd Qu.:11.4685 3rd Qu.:11.4936 3rd Qu.:11.5648
## Max. :13.760 Max. :13.8497 Max. :13.9366 Max. :14.0207
## s77 s78 s79 s80
## Min. :-0.02964 Min. :-0.3286 Min. :-0.6201 Min. :-0.9036
## 1st Qu.: 7.80511 1st Qu.: 7.6979 1st Qu.: 7.5713 1st Qu.: 7.4817
## Median :10.50587 Median :10.5098 Median :10.5138 Median :10.4939
## Mean : 9.53441 Mean : 9.5344 Mean : 9.5344 Mean : 9.5344
## 3rd Qu.:11.59945 3rd Qu.:11.6884 3rd Qu.:11.7710 3rd Qu.:11.8164
## Max. :14.10164 Max. :14.1799 Max. :14.2553 Max. :14.3277
## s81 s82 s83 s84
## Min. :-1.178 Min. :-1.443 Min. :-1.698 Min. :-1.942
## 1st Qu.: 7.470 1st Qu.: 7.461 1st Qu.: 7.454 1st Qu.: 7.424
## Median :10.469 Median :10.440 Median :10.426 Median :10.402
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:11.860 3rd Qu.:11.904 3rd Qu.:12.034 3rd Qu.:12.122
## Max. :14.397 Max. :14.464 Max. :14.527 Max. :14.588
## s85 s86 s87 s88
## Min. :-2.175 Min. :-2.398 Min. :-2.610 Min. :-2.809
## 1st Qu.: 7.331 1st Qu.: 7.240 1st Qu.: 7.152 1st Qu.: 7.066
## Median :10.377 Median :10.353 Median :10.355 Median :10.338
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:12.298 3rd Qu.:12.397 3rd Qu.:12.417 3rd Qu.:12.433
## Max. :14.646 Max. :14.701 Max. :14.754 Max. :14.805
## s89 s90 s91 s92
## Min. :-2.999 Min. :-3.177 Min. :-3.344 Min. :-3.501
## 1st Qu.: 6.984 1st Qu.: 6.904 1st Qu.: 6.901 1st Qu.: 6.817
## Median :10.322 Median :10.313 Median :10.356 Median :10.396
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:12.473 3rd Qu.:12.581 3rd Qu.:12.619 3rd Qu.:12.623
## Max. :14.853 Max. :14.899 Max. :14.943 Max. :14.983
## s93 s94 s95 s96
## Min. :-3.648 Min. :-3.784 Min. :-3.912 Min. :-4.030
## 1st Qu.: 6.732 1st Qu.: 6.684 1st Qu.: 6.656 1st Qu.: 6.631
## Median :10.431 Median :10.405 Median :10.397 Median :10.384
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:12.626 3rd Qu.:12.651 3rd Qu.:12.702 3rd Qu.:12.757
## Max. :15.021 Max. :15.057 Max. :15.091 Max. :15.122
## s97 s98 s99 s100
## Min. :-4.140 Min. :-4.240 Min. :-4.333 Min. :-4.419
## 1st Qu.: 6.608 1st Qu.: 6.585 1st Qu.: 6.565 1st Qu.: 6.547
## Median :10.360 Median :10.337 Median :10.315 Median :10.293
## Mean : 9.534 Mean : 9.534 Mean : 9.534 Mean : 9.534
## 3rd Qu.:12.809 3rd Qu.:12.858 3rd Qu.:12.904 3rd Qu.:12.947
## Max. :15.152 Max. :15.179 Max. :15.313 Max. :15.461
# Finally, let's evaluate how well (or poorly) we have done on the test set.
test_MSE <- MSE(predictions, y_test)
test_RMSE <- RMSE(predictions, y_test)
test_MSE
## [1] 34.62584
test_RMSE
## [1] 5.884372
# Extracting the best lambda by indexing the glmnet lambda component
# then index it by order of RMSE, order puts them in ascending order
# smallest value and this will pick out the best lambda.
lam.best <- ridge_model_lambda_min$lambda[order(test_MSE)[1]]
lam.best <- min(ridge_model_lambda_min$lambda)
lam.best
## [1] 0.3723852
coef(ridge_model_lambda_min, s = lam.best)
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 9.5344130
## V.2 -1.4181346
## V.3 -0.2269537
## V.4 1.5371207
## V.5 0.2266450
## V.6 -3.6329502
## V.7 1.0237112
## V.8 -3.3897642
## Y 1.0640065
############## Plot of test set predictions vs actual values ######################
# Pad predictions vector with zeros if its length is less than y_test
if (length(predictions) < length(y_test)) {
extra_zeros <- rep(0, length(y_test) - length(predictions))
predictions <- c(predictions, extra_zeros)
} else if (length(predictions) > length(y_test)) {
predictions <- predictions[1:length(y_test)]
}
# Check if the lengths are the same
length(predictions)
## [1] 121
length(y_test)
## [1] 121
# If they are not the same, you can adjust one of the vectors to match the length of the other.
# For example, you can trim or pad one of the vectors to match the length of the other.
# But in this case its same
# Ensuring both vectors have the same length
min_length <- min(length(predictions), length(y_test))
predictions <- predictions[1:min_length]
y_test <- y_test[1:min_length]
# Now plot the data
plot(x = predictions, y = y_test, frame = FALSE, pch = 19,
col = "red", xlab = "Predicted Values", ylab = "Actual Values")
######Dual line chart for predicted vs. actual values########
test_instances <- seq_along(y_test)
plot(x = test_instances, y = y_test, frame = FALSE, pch = 19, type = "l",
col = "red", xlab = "Test Instance", ylab = "Valence")
lines(x = test_instances, y = predictions, pch = 18, col = "blue", type = "l", lty = 2)
# Adding legend
legend("topleft", legend=c("Actual", "Predicted"), col=c("red", "blue"), lty = 1:2, cex=0.8)

###### verify if the model predictions fall within +/- 500,000 Iranian Rial##############
# Calculate the absolute difference between predictions and actual values
abs_diff <- abs(predictions - y_test)
# Check if the absolute difference is within the specified threshold
within_threshold <- abs_diff <= 500000
# Count the number of predictions within the threshold
num_within_threshold <- sum(within_threshold)
# Calculate the percentage of predictions within the threshold
percentage_within_threshold <- (num_within_threshold / length(y_test)) * 100
print(percentage_within_threshold)
## [1] 100
# Print the results
cat("Number of predictions within +/- 500,000 Iranian Rial threshold:", num_within_threshold, "\n")
## Number of predictions within +/- 500,000 Iranian Rial threshold: 121
cat("Percentage of predictions within +/- 500,000 Iranian Rial threshold:", percentage_within_threshold, "%\n")
## Percentage of predictions within +/- 500,000 Iranian Rial threshold: 100 %
#####ploting the visual for it
# Convert the logical vector to a factor with labels "Within Threshold" and "Outside Threshold"
within_threshold_factor <- factor(within_threshold, levels = c(FALSE, TRUE), labels = c("Outside Threshold", "Within Threshold"))
# Create a bar plot to visualize the proportion of predictions within the threshold
bar_colors <- c("red", "green")
bar_names <- c("Outside Threshold", "Within Threshold")
barplot(table(within_threshold_factor), col = bar_colors, main = "Predictions Within +/- 500,000 Iranian Rial threshold", ylab = "Frequency")
# Add a legend to the plot
legend("topleft", legend = bar_names, fill = bar_colors, cex = 0.8)
#########Plotting another scatter plot####################
# Calculate the mean or median of the actual values
actual_mean <- mean(y_test)
actual_median <- median(y_test)
# Calculate the threshold values
threshold_upper <- actual_mean + 500000
threshold_lower <- actual_mean - 500000
# Create a vector to indicate whether each prediction is within the threshold
within_threshold <- ifelse(predictions >= threshold_lower & predictions <= threshold_upper, TRUE, FALSE)
# Define colors for points based on whether they are within the threshold
point_colors <- ifelse(within_threshold, "blue", "red")
# Create the scatter plot
plot(predictions, y_test, col = point_colors, pch = 19,
xlab = "Predicted Values", ylab = "Actual Values",
main = "Scatter Plot with Threshold Highlighted")
# Add horizontal lines for the threshold range
abline(h = c(threshold_upper, threshold_lower), col = "green", lty = 2)
# Add legend
legend("topright", legend = c("Within Threshold", "Outside Threshold"),
col = c("blue", "red"), lwd = 3, cex = 0.5, text.width = 0.9)
