#set directory
setwd("C:/Users/sharl/Desktop/USF/Fall 2021/LIS 4805 - Predictive Analytics/Final Project")

#load libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(boot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(readr)
library(ISLR)
library(summarytools)
## 
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
## 
##     view
dataIOL <- read_csv("IOL.csv")
## Rows: 256 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): eye, performedIolModel, targetRange, target, predictedSE, meanPost...
## dbl  (5): performedIOL, performedToricIOL, netAstigmatism, meanAnteriorK, me...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#review data types
glimpse(dataIOL)
## Rows: 256
## Columns: 16
## $ eye                  <chr> "Left", "Right", "Right", "Right", "Left", "Right~
## $ performedIolModel    <chr> "AcrySof IQ Toric (clear)", "Softec HD", NA, "Sof~
## $ performedIOL         <dbl> 24.50, 24.50, NA, 21.50, 19.50, 20.50, 22.50, 25.~
## $ performedToricIOL    <dbl> 1.50, NA, NA, NA, 1.50, NA, NA, NA, NA, NA, NA, 1~
## $ targetRange          <chr> "Distance", "Distance", NA, "Distance", "Distance~
## $ target               <chr> "0.00  D", "-0.25  D", NA, "-0.25  D", "0.00  D",~
## $ predictedSE          <chr> "-0.42)", "-0.54)", NA, "0.03)", "-0.20)", "-0.39~
## $ netAstigmatism       <dbl> 1.03, 1.46, NA, 0.66, 0.99, 0.30, 0.61, 0.42, 0.6~
## $ meanAnteriorK        <dbl> 43.43, 43.68, NA, 45.16, 42.00, 44.07, 43.73, 43.~
## $ meanPosteiorK        <chr> "56.72268908", "55.60131796", "#DIV/0!", "52.3255~
## $ meanPosteriorK       <dbl> 5.95, 6.07, NA, 6.45, 5.60, 6.35, 6.35, 5.87, 6.4~
## $ axialLength          <chr> "22.89 mm", "22.55 mm", NA, "22.85 mm", "24.81 mm~
## $ whiteToWhite         <chr> "12.16 mm", "12.02 mm", NA, "13.52 mm", "12.49 mm~
## $ anteriorChamberDepth <chr> "2.98 mm", "2.43 mm", NA, "3.20 mm", "3.27 mm", "~
## $ lensThickness        <chr> "4.91 mm", "5.61 mm", NA, "5.25 mm", "4.63 mm", "~
## $ postOpSE             <chr> "-0.12)", "-0.44)", NA, "-0.24)", "-0.16)", "-0.0~
#clean data
#convert axial length to numeric
dataIOL <- dataIOL%>%
  mutate(axialLength = as.numeric(str_remove(axialLength, "mm")))

#convert whiteToWhite to numeric         
dataIOL <- dataIOL%>%
  mutate(whiteToWhite = as.numeric(str_remove(whiteToWhite, "mm")))

#convert anteriorChamberDepth to numeric
dataIOL <- dataIOL%>%
  mutate(anteriorChamberDepth = as.numeric(str_remove(anteriorChamberDepth, "mm")))

#convert lensThickness to numeric
dataIOL <- dataIOL%>%
  mutate(lensThickness = as.numeric(str_remove(lensThickness, "mm")))

#convert centralCornealThickness to numeric
dataIOL <- dataIOL%>%
  mutate(lensThickness = as.numeric(str_remove(lensThickness, "mm")))

#convert target to numeric
#convert centralCornealThickness to numeric
dataIOL <- dataIOL%>%
  mutate(target = as.numeric(str_remove(target, "D")))

dataIOL$predictedSE <- as.numeric(gsub(")", "", dataIOL$predictedSE))
dataIOL$postOpSE <- as.numeric(gsub(")","", dataIOL$postOpSE))         

dataIOL$meanPosteiorK = NULL
performedToricIOL = NULL

#convert to z-score
dataIOL$meanAnteriorK <- as.numeric(scale(dataIOL$meanAnteriorK))
dataIOL$meanPosteriorK <- as.numeric(scale(dataIOL$meanPosteriorK))
dataIOL$axialLength <- as.numeric(scale(dataIOL$axialLength))
dataIOL$anteriorChamberDepth <- as.numeric(scale(dataIOL$anteriorChamberDepth))
dataIOL$lensThickness <- as.numeric(scale(dataIOL$lensThickness))
dataIOL$postOpSE <- as.numeric(scale(dataIOL$postOpSE))
dataIOL$predictedSE <- as.numeric(scale(dataIOL$predictedSE))
dataIOL$performedIOL <- as.numeric(scale(dataIOL$performedIOL))
dataIOL$target <- as.numeric(scale(dataIOL$target))
dataIOL$whiteToWhite <- as.numeric(scale(dataIOL$whiteToWhite))

#save clean data
summary(dataIOL)
##      eye            performedIolModel   performedIOL     performedToricIOL
##  Length:256         Length:256         Min.   :-3.5508   Min.   :1.500    
##  Class :character   Class :character   1st Qu.:-0.4751   1st Qu.:1.500    
##  Mode  :character   Mode  :character   Median : 0.1334   Median :1.500    
##                                        Mean   : 0.0000   Mean   :2.047    
##                                        3rd Qu.: 0.6598   3rd Qu.:2.250    
##                                        Max.   : 2.5019   Max.   :5.250    
##                                        NA's   :8         NA's   :156      
##  targetRange            target         predictedSE      netAstigmatism  
##  Length:256         Min.   :-8.8066   Min.   :-8.1570   Min.   :0.0700  
##  Class :character   1st Qu.: 0.2271   1st Qu.:-0.1271   1st Qu.:0.4675  
##  Mode  :character   Median : 0.2271   Median : 0.2151   Median :0.7950  
##                     Mean   : 0.0000   Mean   : 0.0000   Mean   :0.9418  
##                     3rd Qu.: 0.2271   3rd Qu.: 0.4945   3rd Qu.:1.2825  
##                     Max.   : 0.7585   Max.   : 1.2872   Max.   :3.5500  
##                     NA's   :8         NA's   :8         NA's   :8       
##  meanAnteriorK      meanPosteriorK      axialLength       whiteToWhite     
##  Min.   :-3.33536   Min.   :-3.19041   Min.   :-2.5212   Min.   :-2.61881  
##  1st Qu.:-0.63157   1st Qu.:-0.68115   1st Qu.:-0.7493   1st Qu.:-0.68568  
##  Median : 0.05592   Median : 0.05072   Median :-0.1114   Median :-0.07921  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.60745   3rd Qu.: 0.70417   3rd Qu.: 0.6310   3rd Qu.: 0.67888  
##  Max.   : 2.47240   Max.   : 2.53384   Max.   : 3.3190   Max.   : 6.15606  
##  NA's   :8          NA's   :21         NA's   :8         NA's   :11        
##  anteriorChamberDepth lensThickness         postOpSE      
##  Min.   :-2.5267      Min.   :-2.53244   Min.   :-4.3068  
##  1st Qu.:-0.7515      1st Qu.:-0.60568   1st Qu.:-0.1852  
##  Median : 0.0965      Median :-0.02105   Median : 0.1696  
##  Mean   : 0.0000      Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6732      3rd Qu.: 0.68051   3rd Qu.: 0.5927  
##  Max.   : 2.9233      Max.   : 2.61235   Max.   : 1.3706  
##  NA's   :9            NA's   :28         NA's   :69
dataIOL$eye = NULL
dataIOL$performedIolModel = NULL
dataIOL$targetRange = NULL


##Linear model fit
#split data into 80% training set and 20% test set
#train set size
set.seed(1234)
z <- round(.8 * dim(dataIOL)[1])
train <- dataIOL[1:z,]
test <- dataIOL[-(1:z),]

#linear regression
model1 <- lm(performedIOL ~ ., data = train, na.action = na.omit)

#10 fold CV because sample size is small
#create control parameters
ControlParameters <- trainControl(method = "repeatedcv",
                                  number = 10,
                                  repeats = 5)

#Use train function for five fold cross validation
model1.cv <- train(performedIOL ~ ., data = train,
                   method = "lm",
                   trControl = ControlParameters, na.action = na.omit)

RMSE_train1 <- model1.cv
RMSE_train1
## Linear Regression 
## 
## 205 samples
##  11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 54, 54, 52, 55, 54, 55, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE     
##   0.1455215  0.9854669  0.109799
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model1.cv)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16370 -0.07508 -0.01802  0.04776  0.46161 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.082399   0.056638   1.455 0.152222    
## performedToricIOL    -0.010062   0.056788  -0.177 0.860103    
## target                0.015935   0.060354   0.264 0.792890    
## predictedSE          -0.186260   0.051343  -3.628 0.000691 ***
## netAstigmatism        0.003069   0.069343   0.044 0.964885    
## meanAnteriorK        -0.735440   0.036971 -19.892  < 2e-16 ***
## meanPosteriorK        0.086383   0.031254   2.764 0.008077 ** 
## axialLength          -1.307162   0.024461 -53.438  < 2e-16 ***
## whiteToWhite         -0.039268   0.029837  -1.316 0.194398    
## anteriorChamberDepth  0.144895   0.034279   4.227 0.000105 ***
## lensThickness         0.097979   0.031220   3.138 0.002903 ** 
## postOpSE             -0.038483   0.024559  -1.567 0.123694    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1296 on 48 degrees of freedom
## Multiple R-squared:  0.9895, Adjusted R-squared:  0.9871 
## F-statistic: 412.5 on 11 and 48 DF,  p-value: < 2.2e-16
#model 2 with significant p-value variables
model2 <- lm(performedIOL ~ predictedSE + meanAnteriorK + meanPosteriorK +
               axialLength + anteriorChamberDepth + lensThickness, data = train,
             na.action = na.omit)
model2.cv <- train(performedIOL ~ predictedSE + meanAnteriorK + meanPosteriorK +
                     axialLength + anteriorChamberDepth + lensThickness, 
                   data = train, method = "lm", trControl = ControlParameters,
                   na.action = na.omit) 

RMSE_train2 <- model2.cv
RMSE_train2
## Linear Regression 
## 
## 205 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 162, 160, 161, 160, 161, 161, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1621797  0.9742781  0.1164239
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model2.cv)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44239 -0.08684 -0.02169  0.05813  0.68371 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.009457   0.012065   0.784   0.4342    
## predictedSE          -0.142080   0.011435 -12.425  < 2e-16 ***
## meanAnteriorK        -0.736330   0.021346 -34.495  < 2e-16 ***
## meanPosteriorK        0.101160   0.018725   5.402 2.16e-07 ***
## axialLength          -1.212192   0.017548 -69.080  < 2e-16 ***
## anteriorChamberDepth  0.104233   0.021421   4.866 2.56e-06 ***
## lensThickness         0.038983   0.018718   2.083   0.0388 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1607 on 172 degrees of freedom
## Multiple R-squared:  0.9748, Adjusted R-squared:  0.9739 
## F-statistic:  1107 on 6 and 172 DF,  p-value: < 2.2e-16
#Model 3
model3 <- lm(performedIOL ~ predictedSE + meanAnteriorK + axialLength +
               anteriorChamberDepth + meanPosteriorK, data = train, 
             na.action = na.omit)
model3.cv <-  train(performedIOL ~ predictedSE + meanAnteriorK + axialLength +
                      anteriorChamberDepth + meanPosteriorK, data = train, method = "lm", 
                    trControl = ControlParameters, na.action = na.omit)
RMSE_train3 <- model3.cv
RMSE_train3
## Linear Regression 
## 
## 205 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 172, 172, 171, 172, 171, 171, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.1673813  0.972818  0.1173973
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model3.cv)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45752 -0.08067 -0.01967  0.05734  0.71032 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.002395   0.012050   0.199    0.843    
## predictedSE          -0.140545   0.011726 -11.986  < 2e-16 ***
## meanAnteriorK        -0.722382   0.020669 -34.949  < 2e-16 ***
## axialLength          -1.197340   0.016337 -73.288  < 2e-16 ***
## anteriorChamberDepth  0.064197   0.014435   4.447 1.50e-05 ***
## meanPosteriorK        0.087472   0.018457   4.739 4.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1659 on 185 degrees of freedom
## Multiple R-squared:  0.9737, Adjusted R-squared:  0.973 
## F-statistic:  1371 on 5 and 185 DF,  p-value: < 2.2e-16
#remove character variables
test$eye = NULL
test$performedIolModel = NULL
test$targetRange = NULL

# Use the model to make predictions on the test set
predictions1 <- model1 %>% predict(test)
predictions2 <- model2 %>% predict(test)
predictions3 <- model3 %>% predict(test)

#Examine R-squared, RMSE, and MAE of predictions1
data.frame(R_squared = R2(predictions1, test$performedIOL, na.rm = T),
           RMSE = RMSE(predictions1, test$performedIOL, na.rm = T),
           MAE = MAE(predictions1, test$performedIOL, na.rm = T))
##   R_squared      RMSE       MAE
## 1 0.9976335 0.1320414 0.1136177
#Examine R-squared, RMSE, and MAE of predictions2
data.frame(R_squared = R2(predictions2, test$performedIOL, na.rm = T),
           RMSE = RMSE(predictions2, test$performedIOL, na.rm = T),
           MAE = MAE(predictions2, test$performedIOL, na.rm = T))
##   R_squared     RMSE        MAE
## 1 0.9877374 0.114698 0.09434669
#Examine R-squared, RMSE, and MAE of predictions3
data.frame(R_squared = R2(predictions3, test$performedIOL, na.rm = T),
           RMSE = RMSE(predictions3, test$performedIOL, na.rm = T),
           MAE = MAE(predictions3, test$performedIOL, na.rm = T))
##   R_squared      RMSE        MAE
## 1 0.9880131 0.1137352 0.09484213