#set directory
setwd("C:/Users/sharl/Desktop/USF/Fall 2021/LIS 4805 - Predictive Analytics/Final Project")
#load libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(boot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(readr)
library(ISLR)
library(summarytools)
##
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
##
## view
dataIOL <- read_csv("IOL.csv")
## Rows: 256 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): eye, performedIolModel, targetRange, target, predictedSE, meanPost...
## dbl (5): performedIOL, performedToricIOL, netAstigmatism, meanAnteriorK, me...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#review data types
glimpse(dataIOL)
## Rows: 256
## Columns: 16
## $ eye <chr> "Left", "Right", "Right", "Right", "Left", "Right~
## $ performedIolModel <chr> "AcrySof IQ Toric (clear)", "Softec HD", NA, "Sof~
## $ performedIOL <dbl> 24.50, 24.50, NA, 21.50, 19.50, 20.50, 22.50, 25.~
## $ performedToricIOL <dbl> 1.50, NA, NA, NA, 1.50, NA, NA, NA, NA, NA, NA, 1~
## $ targetRange <chr> "Distance", "Distance", NA, "Distance", "Distance~
## $ target <chr> "0.00 D", "-0.25 D", NA, "-0.25 D", "0.00 D",~
## $ predictedSE <chr> "-0.42)", "-0.54)", NA, "0.03)", "-0.20)", "-0.39~
## $ netAstigmatism <dbl> 1.03, 1.46, NA, 0.66, 0.99, 0.30, 0.61, 0.42, 0.6~
## $ meanAnteriorK <dbl> 43.43, 43.68, NA, 45.16, 42.00, 44.07, 43.73, 43.~
## $ meanPosteiorK <chr> "56.72268908", "55.60131796", "#DIV/0!", "52.3255~
## $ meanPosteriorK <dbl> 5.95, 6.07, NA, 6.45, 5.60, 6.35, 6.35, 5.87, 6.4~
## $ axialLength <chr> "22.89 mm", "22.55 mm", NA, "22.85 mm", "24.81 mm~
## $ whiteToWhite <chr> "12.16 mm", "12.02 mm", NA, "13.52 mm", "12.49 mm~
## $ anteriorChamberDepth <chr> "2.98 mm", "2.43 mm", NA, "3.20 mm", "3.27 mm", "~
## $ lensThickness <chr> "4.91 mm", "5.61 mm", NA, "5.25 mm", "4.63 mm", "~
## $ postOpSE <chr> "-0.12)", "-0.44)", NA, "-0.24)", "-0.16)", "-0.0~
#clean data
#convert axial length to numeric
dataIOL <- dataIOL%>%
mutate(axialLength = as.numeric(str_remove(axialLength, "mm")))
#convert whiteToWhite to numeric
dataIOL <- dataIOL%>%
mutate(whiteToWhite = as.numeric(str_remove(whiteToWhite, "mm")))
#convert anteriorChamberDepth to numeric
dataIOL <- dataIOL%>%
mutate(anteriorChamberDepth = as.numeric(str_remove(anteriorChamberDepth, "mm")))
#convert lensThickness to numeric
dataIOL <- dataIOL%>%
mutate(lensThickness = as.numeric(str_remove(lensThickness, "mm")))
#convert centralCornealThickness to numeric
dataIOL <- dataIOL%>%
mutate(lensThickness = as.numeric(str_remove(lensThickness, "mm")))
#convert target to numeric
#convert centralCornealThickness to numeric
dataIOL <- dataIOL%>%
mutate(target = as.numeric(str_remove(target, "D")))
dataIOL$predictedSE <- as.numeric(gsub(")", "", dataIOL$predictedSE))
dataIOL$postOpSE <- as.numeric(gsub(")","", dataIOL$postOpSE))
dataIOL$meanPosteiorK = NULL
performedToricIOL = NULL
#convert to z-score
dataIOL$meanAnteriorK <- as.numeric(scale(dataIOL$meanAnteriorK))
dataIOL$meanPosteriorK <- as.numeric(scale(dataIOL$meanPosteriorK))
dataIOL$axialLength <- as.numeric(scale(dataIOL$axialLength))
dataIOL$anteriorChamberDepth <- as.numeric(scale(dataIOL$anteriorChamberDepth))
dataIOL$lensThickness <- as.numeric(scale(dataIOL$lensThickness))
dataIOL$postOpSE <- as.numeric(scale(dataIOL$postOpSE))
dataIOL$predictedSE <- as.numeric(scale(dataIOL$predictedSE))
dataIOL$performedIOL <- as.numeric(scale(dataIOL$performedIOL))
dataIOL$target <- as.numeric(scale(dataIOL$target))
dataIOL$whiteToWhite <- as.numeric(scale(dataIOL$whiteToWhite))
#save clean data
summary(dataIOL)
## eye performedIolModel performedIOL performedToricIOL
## Length:256 Length:256 Min. :-3.5508 Min. :1.500
## Class :character Class :character 1st Qu.:-0.4751 1st Qu.:1.500
## Mode :character Mode :character Median : 0.1334 Median :1.500
## Mean : 0.0000 Mean :2.047
## 3rd Qu.: 0.6598 3rd Qu.:2.250
## Max. : 2.5019 Max. :5.250
## NA's :8 NA's :156
## targetRange target predictedSE netAstigmatism
## Length:256 Min. :-8.8066 Min. :-8.1570 Min. :0.0700
## Class :character 1st Qu.: 0.2271 1st Qu.:-0.1271 1st Qu.:0.4675
## Mode :character Median : 0.2271 Median : 0.2151 Median :0.7950
## Mean : 0.0000 Mean : 0.0000 Mean :0.9418
## 3rd Qu.: 0.2271 3rd Qu.: 0.4945 3rd Qu.:1.2825
## Max. : 0.7585 Max. : 1.2872 Max. :3.5500
## NA's :8 NA's :8 NA's :8
## meanAnteriorK meanPosteriorK axialLength whiteToWhite
## Min. :-3.33536 Min. :-3.19041 Min. :-2.5212 Min. :-2.61881
## 1st Qu.:-0.63157 1st Qu.:-0.68115 1st Qu.:-0.7493 1st Qu.:-0.68568
## Median : 0.05592 Median : 0.05072 Median :-0.1114 Median :-0.07921
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.60745 3rd Qu.: 0.70417 3rd Qu.: 0.6310 3rd Qu.: 0.67888
## Max. : 2.47240 Max. : 2.53384 Max. : 3.3190 Max. : 6.15606
## NA's :8 NA's :21 NA's :8 NA's :11
## anteriorChamberDepth lensThickness postOpSE
## Min. :-2.5267 Min. :-2.53244 Min. :-4.3068
## 1st Qu.:-0.7515 1st Qu.:-0.60568 1st Qu.:-0.1852
## Median : 0.0965 Median :-0.02105 Median : 0.1696
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6732 3rd Qu.: 0.68051 3rd Qu.: 0.5927
## Max. : 2.9233 Max. : 2.61235 Max. : 1.3706
## NA's :9 NA's :28 NA's :69
dataIOL$eye = NULL
dataIOL$performedIolModel = NULL
dataIOL$targetRange = NULL
##Linear model fit
#split data into 80% training set and 20% test set
#train set size
set.seed(1234)
z <- round(.8 * dim(dataIOL)[1])
train <- dataIOL[1:z,]
test <- dataIOL[-(1:z),]
#linear regression
model1 <- lm(performedIOL ~ ., data = train, na.action = na.omit)
#10 fold CV because sample size is small
#create control parameters
ControlParameters <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5)
#Use train function for five fold cross validation
model1.cv <- train(performedIOL ~ ., data = train,
method = "lm",
trControl = ControlParameters, na.action = na.omit)
RMSE_train1 <- model1.cv
RMSE_train1
## Linear Regression
##
## 205 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 54, 54, 52, 55, 54, 55, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1455215 0.9854669 0.109799
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model1.cv)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16370 -0.07508 -0.01802 0.04776 0.46161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.082399 0.056638 1.455 0.152222
## performedToricIOL -0.010062 0.056788 -0.177 0.860103
## target 0.015935 0.060354 0.264 0.792890
## predictedSE -0.186260 0.051343 -3.628 0.000691 ***
## netAstigmatism 0.003069 0.069343 0.044 0.964885
## meanAnteriorK -0.735440 0.036971 -19.892 < 2e-16 ***
## meanPosteriorK 0.086383 0.031254 2.764 0.008077 **
## axialLength -1.307162 0.024461 -53.438 < 2e-16 ***
## whiteToWhite -0.039268 0.029837 -1.316 0.194398
## anteriorChamberDepth 0.144895 0.034279 4.227 0.000105 ***
## lensThickness 0.097979 0.031220 3.138 0.002903 **
## postOpSE -0.038483 0.024559 -1.567 0.123694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1296 on 48 degrees of freedom
## Multiple R-squared: 0.9895, Adjusted R-squared: 0.9871
## F-statistic: 412.5 on 11 and 48 DF, p-value: < 2.2e-16
#model 2 with significant p-value variables
model2 <- lm(performedIOL ~ predictedSE + meanAnteriorK + meanPosteriorK +
axialLength + anteriorChamberDepth + lensThickness, data = train,
na.action = na.omit)
model2.cv <- train(performedIOL ~ predictedSE + meanAnteriorK + meanPosteriorK +
axialLength + anteriorChamberDepth + lensThickness,
data = train, method = "lm", trControl = ControlParameters,
na.action = na.omit)
RMSE_train2 <- model2.cv
RMSE_train2
## Linear Regression
##
## 205 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 162, 160, 161, 160, 161, 161, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1621797 0.9742781 0.1164239
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model2.cv)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44239 -0.08684 -0.02169 0.05813 0.68371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.009457 0.012065 0.784 0.4342
## predictedSE -0.142080 0.011435 -12.425 < 2e-16 ***
## meanAnteriorK -0.736330 0.021346 -34.495 < 2e-16 ***
## meanPosteriorK 0.101160 0.018725 5.402 2.16e-07 ***
## axialLength -1.212192 0.017548 -69.080 < 2e-16 ***
## anteriorChamberDepth 0.104233 0.021421 4.866 2.56e-06 ***
## lensThickness 0.038983 0.018718 2.083 0.0388 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1607 on 172 degrees of freedom
## Multiple R-squared: 0.9748, Adjusted R-squared: 0.9739
## F-statistic: 1107 on 6 and 172 DF, p-value: < 2.2e-16
#Model 3
model3 <- lm(performedIOL ~ predictedSE + meanAnteriorK + axialLength +
anteriorChamberDepth + meanPosteriorK, data = train,
na.action = na.omit)
model3.cv <- train(performedIOL ~ predictedSE + meanAnteriorK + axialLength +
anteriorChamberDepth + meanPosteriorK, data = train, method = "lm",
trControl = ControlParameters, na.action = na.omit)
RMSE_train3 <- model3.cv
RMSE_train3
## Linear Regression
##
## 205 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 172, 172, 171, 172, 171, 171, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1673813 0.972818 0.1173973
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model3.cv)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45752 -0.08067 -0.01967 0.05734 0.71032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.002395 0.012050 0.199 0.843
## predictedSE -0.140545 0.011726 -11.986 < 2e-16 ***
## meanAnteriorK -0.722382 0.020669 -34.949 < 2e-16 ***
## axialLength -1.197340 0.016337 -73.288 < 2e-16 ***
## anteriorChamberDepth 0.064197 0.014435 4.447 1.50e-05 ***
## meanPosteriorK 0.087472 0.018457 4.739 4.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1659 on 185 degrees of freedom
## Multiple R-squared: 0.9737, Adjusted R-squared: 0.973
## F-statistic: 1371 on 5 and 185 DF, p-value: < 2.2e-16
#remove character variables
test$eye = NULL
test$performedIolModel = NULL
test$targetRange = NULL
# Use the model to make predictions on the test set
predictions1 <- model1 %>% predict(test)
predictions2 <- model2 %>% predict(test)
predictions3 <- model3 %>% predict(test)
#Examine R-squared, RMSE, and MAE of predictions1
data.frame(R_squared = R2(predictions1, test$performedIOL, na.rm = T),
RMSE = RMSE(predictions1, test$performedIOL, na.rm = T),
MAE = MAE(predictions1, test$performedIOL, na.rm = T))
## R_squared RMSE MAE
## 1 0.9976335 0.1320414 0.1136177
#Examine R-squared, RMSE, and MAE of predictions2
data.frame(R_squared = R2(predictions2, test$performedIOL, na.rm = T),
RMSE = RMSE(predictions2, test$performedIOL, na.rm = T),
MAE = MAE(predictions2, test$performedIOL, na.rm = T))
## R_squared RMSE MAE
## 1 0.9877374 0.114698 0.09434669
#Examine R-squared, RMSE, and MAE of predictions3
data.frame(R_squared = R2(predictions3, test$performedIOL, na.rm = T),
RMSE = RMSE(predictions3, test$performedIOL, na.rm = T),
MAE = MAE(predictions3, test$performedIOL, na.rm = T))
## R_squared RMSE MAE
## 1 0.9880131 0.1137352 0.09484213