rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>%
drop_na()
Use the appropriate functions to obtain descriptive information about the variables included in the dataset.
str(UPDRS_df)
## tibble [5,875 × 22] (S3: tbl_df/tbl/data.frame)
## $ subject# : num [1:5875] 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num [1:5875] 72 72 72 72 72 72 72 72 72 72 ...
## $ sex : num [1:5875] 0 0 0 0 0 0 0 0 0 0 ...
## $ test_time : num [1:5875] 5.64 12.67 19.68 25.65 33.64 ...
## $ motor_UPDRS : num [1:5875] 28.2 28.4 28.7 28.9 29.2 ...
## $ total_UPDRS : num [1:5875] 34.4 34.9 35.4 35.8 36.4 ...
## $ Jitter(%) : num [1:5875] 0.00662 0.003 0.00481 0.00528 0.00335 0.00353 0.00422 0.00476 0.00432 0.00496 ...
## $ Jitter(Abs) : num [1:5875] 3.38e-05 1.68e-05 2.46e-05 2.66e-05 2.01e-05 ...
## $ Jitter:RAP : num [1:5875] 0.00401 0.00132 0.00205 0.00191 0.00093 0.00119 0.00212 0.00226 0.00156 0.00258 ...
## $ Jitter:PPQ5 : num [1:5875] 0.00317 0.0015 0.00208 0.00264 0.0013 0.00159 0.00221 0.00259 0.00207 0.00253 ...
## $ Jitter:DDP : num [1:5875] 0.01204 0.00395 0.00616 0.00573 0.00278 ...
## $ Shimmer : num [1:5875] 0.0256 0.0202 0.0168 0.0231 0.017 ...
## $ Shimmer(dB) : num [1:5875] 0.23 0.179 0.181 0.327 0.176 0.214 0.445 0.212 0.371 0.31 ...
## $ Shimmer:APQ3 : num [1:5875] 0.01438 0.00994 0.00734 0.01106 0.00679 ...
## $ Shimmer:APQ5 : num [1:5875] 0.01309 0.01072 0.00844 0.01265 0.00929 ...
## $ Shimmer:APQ11: num [1:5875] 0.0166 0.0169 0.0146 0.0196 0.0182 ...
## $ Shimmer:DDA : num [1:5875] 0.0431 0.0298 0.022 0.0332 0.0204 ...
## $ NHR : num [1:5875] 0.0143 0.0111 0.0202 0.0278 0.0116 ...
## $ HNR : num [1:5875] 21.6 27.2 23 24.4 26.1 ...
## $ RPDE : num [1:5875] 0.419 0.435 0.462 0.487 0.472 ...
## $ DFA : num [1:5875] 0.548 0.565 0.544 0.578 0.561 ...
## $ PPE : num [1:5875] 0.16 0.108 0.21 0.333 0.194 ...
summary(UPDRS_df)
## subject# age sex test_time
## Min. : 1.00 Min. :36.0 Min. :0.0000 Min. : -4.263
## 1st Qu.:10.00 1st Qu.:58.0 1st Qu.:0.0000 1st Qu.: 46.847
## Median :22.00 Median :65.0 Median :0.0000 Median : 91.523
## Mean :21.49 Mean :64.8 Mean :0.3178 Mean : 92.864
## 3rd Qu.:33.00 3rd Qu.:72.0 3rd Qu.:1.0000 3rd Qu.:138.445
## Max. :42.00 Max. :85.0 Max. :1.0000 Max. :215.490
## motor_UPDRS total_UPDRS Jitter(%) Jitter(Abs)
## Min. : 5.038 Min. : 7.00 Min. :0.000830 Min. :2.250e-06
## 1st Qu.:15.000 1st Qu.:21.37 1st Qu.:0.003580 1st Qu.:2.244e-05
## Median :20.871 Median :27.58 Median :0.004900 Median :3.453e-05
## Mean :21.296 Mean :29.02 Mean :0.006154 Mean :4.403e-05
## 3rd Qu.:27.596 3rd Qu.:36.40 3rd Qu.:0.006800 3rd Qu.:5.333e-05
## Max. :39.511 Max. :54.99 Max. :0.099990 Max. :4.456e-04
## Jitter:RAP Jitter:PPQ5 Jitter:DDP Shimmer
## Min. :0.000330 Min. :0.000430 Min. :0.000980 Min. :0.00306
## 1st Qu.:0.001580 1st Qu.:0.001820 1st Qu.:0.004730 1st Qu.:0.01912
## Median :0.002250 Median :0.002490 Median :0.006750 Median :0.02751
## Mean :0.002987 Mean :0.003277 Mean :0.008962 Mean :0.03404
## 3rd Qu.:0.003290 3rd Qu.:0.003460 3rd Qu.:0.009870 3rd Qu.:0.03975
## Max. :0.057540 Max. :0.069560 Max. :0.172630 Max. :0.26863
## Shimmer(dB) Shimmer:APQ3 Shimmer:APQ5 Shimmer:APQ11
## Min. :0.026 Min. :0.00161 Min. :0.00194 Min. :0.00249
## 1st Qu.:0.175 1st Qu.:0.00928 1st Qu.:0.01079 1st Qu.:0.01566
## Median :0.253 Median :0.01370 Median :0.01594 Median :0.02271
## Mean :0.311 Mean :0.01716 Mean :0.02014 Mean :0.02748
## 3rd Qu.:0.365 3rd Qu.:0.02057 3rd Qu.:0.02375 3rd Qu.:0.03272
## Max. :2.107 Max. :0.16267 Max. :0.16702 Max. :0.27546
## Shimmer:DDA NHR HNR RPDE
## Min. :0.00484 Min. :0.000286 Min. : 1.659 Min. :0.1510
## 1st Qu.:0.02783 1st Qu.:0.010955 1st Qu.:19.406 1st Qu.:0.4698
## Median :0.04111 Median :0.018448 Median :21.920 Median :0.5423
## Mean :0.05147 Mean :0.032120 Mean :21.680 Mean :0.5415
## 3rd Qu.:0.06173 3rd Qu.:0.031463 3rd Qu.:24.444 3rd Qu.:0.6140
## Max. :0.48802 Max. :0.748260 Max. :37.875 Max. :0.9661
## DFA PPE
## Min. :0.5140 Min. :0.02198
## 1st Qu.:0.5962 1st Qu.:0.15634
## Median :0.6436 Median :0.20550
## Mean :0.6532 Mean :0.21959
## 3rd Qu.:0.7113 3rd Qu.:0.26449
## Max. :0.8656 Max. :0.73173
Calculate the correlation between the different attributes (include the figure produced by R in your answer).
cor(UPDRS_df)
## subject# age sex test_time
## subject# 1.0000000000 -0.030863612 0.2868514199 -0.0008815743
## age -0.0308636122 1.000000000 -0.0416017291 0.0198838435
## sex 0.2868514199 -0.041601729 1.0000000000 -0.0098049838
## test_time -0.0008815743 0.019883844 -0.0098049838 1.0000000000
## motor_UPDRS 0.2529185298 0.273664760 -0.0312050144 0.0679182641
## total_UPDRS 0.2536427490 0.310289929 -0.0965588806 0.0752626604
## Jitter(%) 0.1354475184 0.023071181 0.0514216175 -0.0228370926
## Jitter(Abs) 0.0751561345 0.035691340 -0.1546453007 -0.0113648117
## Jitter:RAP 0.1203393232 0.010254988 0.0767182203 -0.0288878317
## Jitter:PPQ5 0.1364738360 0.013199367 0.0879947680 -0.0232899083
## Jitter:DDP 0.1203500584 0.010257836 0.0767031684 -0.0288759827
## Shimmer 0.1462017730 0.101553856 0.0587357861 -0.0338701798
## Shimmer(dB) 0.1428639729 0.111129664 0.0564805319 -0.0309624121
## Shimmer:APQ3 0.1129497993 0.098912301 0.0449371995 -0.0290196929
## Shimmer:APQ5 0.1382636007 0.089982893 0.0648192972 -0.0365044263
## Shimmer:APQ11 0.1733326282 0.135237944 0.0233598626 -0.0391096958
## Shimmer:DDA 0.1129486657 0.098913123 0.0449375945 -0.0290168593
## NHR 0.1687433623 0.007092699 0.1681695195 -0.0263570332
## HNR -0.2069286890 -0.104842069 -0.0001671123 0.0365448637
## RPDE 0.1473003405 0.090208319 -0.1592624409 -0.0388869742
## DFA 0.0974642595 -0.092870159 -0.1651134712 0.0192608786
## PPE 0.1575592025 0.120789753 -0.0999006846 -0.0005633701
## motor_UPDRS total_UPDRS Jitter(%) Jitter(Abs) Jitter:RAP
## subject# 0.25291853 0.25364275 0.13544752 0.07515613 0.12033932
## age 0.27366476 0.31028993 0.02307118 0.03569134 0.01025499
## sex -0.03120501 -0.09655888 0.05142162 -0.15464530 0.07671822
## test_time 0.06791826 0.07526266 -0.02283709 -0.01136481 -0.02888783
## motor_UPDRS 1.00000000 0.94723131 0.08481576 0.05090328 0.07268353
## total_UPDRS 0.94723131 1.00000000 0.07424667 0.06692673 0.06401542
## Jitter(%) 0.08481576 0.07424667 1.00000000 0.86557722 0.98418075
## Jitter(Abs) 0.05090328 0.06692673 0.86557722 1.00000000 0.84462628
## Jitter:RAP 0.07268353 0.06401542 0.98418075 0.84462628 1.00000000
## Jitter:PPQ5 0.07629087 0.06335178 0.96821443 0.79053765 0.94719593
## Jitter:DDP 0.07269792 0.06402746 0.98418354 0.84463035 0.99999962
## Shimmer 0.10234870 0.09214091 0.70979112 0.64904638 0.68172901
## Shimmer(dB) 0.11007600 0.09878973 0.71670399 0.65587068 0.68555054
## Shimmer:APQ3 0.08426056 0.07936272 0.66414874 0.62382984 0.65022614
## Shimmer:APQ5 0.09210517 0.08346725 0.69400164 0.62140081 0.65983121
## Shimmer:APQ11 0.13656029 0.12083750 0.64596519 0.58999842 0.60308168
## Shimmer:DDA 0.08426039 0.07936324 0.66414746 0.62382750 0.65022465
## NHR 0.07496727 0.06095164 0.82529366 0.69995990 0.79237273
## HNR -0.15702858 -0.16211683 -0.67518824 -0.70641805 -0.64147280
## RPDE 0.12860740 0.15689651 0.42712754 0.54709960 0.38289088
## DFA -0.11624248 -0.11347483 0.22654994 0.35226386 0.21488132
## PPE 0.16243297 0.15619488 0.72184881 0.78785284 0.67065210
## Jitter:PPQ5 Jitter:DDP Shimmer Shimmer(dB) Shimmer:APQ3
## subject# 0.13647384 0.12035006 0.14620177 0.14286397 0.11294980
## age 0.01319937 0.01025784 0.10155386 0.11112966 0.09891230
## sex 0.08799477 0.07670317 0.05873579 0.05648053 0.04493720
## test_time -0.02328991 -0.02887598 -0.03387018 -0.03096241 -0.02901969
## motor_UPDRS 0.07629087 0.07269792 0.10234870 0.11007600 0.08426056
## total_UPDRS 0.06335178 0.06402746 0.09214091 0.09878973 0.07936272
## Jitter(%) 0.96821443 0.98418354 0.70979112 0.71670399 0.66414874
## Jitter(Abs) 0.79053765 0.84463035 0.64904638 0.65587068 0.62382984
## Jitter:RAP 0.94719593 0.99999962 0.68172901 0.68555054 0.65022614
## Jitter:PPQ5 1.00000000 0.94720256 0.73274748 0.73459079 0.67671149
## Jitter:DDP 0.94720256 1.00000000 0.68173376 0.68555613 0.65022816
## Shimmer 0.73274748 0.68173376 1.00000000 0.99233407 0.97982804
## Shimmer(dB) 0.73459079 0.68555613 0.99233407 1.00000000 0.96801480
## Shimmer:APQ3 0.67671149 0.65022816 0.97982804 0.96801480 1.00000000
## Shimmer:APQ5 0.73402075 0.65983319 0.98490432 0.97637257 0.96272296
## Shimmer:APQ11 0.66841348 0.60309033 0.93545684 0.93633812 0.88569537
## Shimmer:DDA 0.67671017 0.65022667 0.97982731 0.96801427 0.99999998
## NHR 0.86486425 0.79237731 0.79515848 0.79807697 0.73273634
## HNR -0.66240886 -0.64148177 -0.80141600 -0.80249646 -0.78069689
## RPDE 0.38150298 0.38288580 0.46823455 0.47240859 0.43687810
## DFA 0.17535854 0.21489299 0.13253994 0.12611117 0.13073500
## PPE 0.66349144 0.67066035 0.61570856 0.63516268 0.57670395
## Shimmer:APQ5 Shimmer:APQ11 Shimmer:DDA NHR HNR
## subject# 0.13826360 0.17333263 0.11294867 0.168743362 -0.2069286890
## age 0.08998289 0.13523794 0.09891312 0.007092699 -0.1048420689
## sex 0.06481930 0.02335986 0.04493759 0.168169520 -0.0001671123
## test_time -0.03650443 -0.03910970 -0.02901686 -0.026357033 0.0365448637
## motor_UPDRS 0.09210517 0.13656029 0.08426039 0.074967270 -0.1570285788
## total_UPDRS 0.08346725 0.12083750 0.07936324 0.060951644 -0.1621168287
## Jitter(%) 0.69400164 0.64596519 0.66414746 0.825293655 -0.6751882442
## Jitter(Abs) 0.62140081 0.58999842 0.62382750 0.699959896 -0.7064180505
## Jitter:RAP 0.65983121 0.60308168 0.65022465 0.792372728 -0.6414728036
## Jitter:PPQ5 0.73402075 0.66841348 0.67671017 0.864864252 -0.6624088579
## Jitter:DDP 0.65983319 0.60309033 0.65022667 0.792377310 -0.6414817715
## Shimmer 0.98490432 0.93545684 0.97982731 0.795158485 -0.8014160019
## Shimmer(dB) 0.97637257 0.93633812 0.96801427 0.798076972 -0.8024964615
## Shimmer:APQ3 0.96272296 0.88569537 0.99999998 0.732736344 -0.7806968895
## Shimmer:APQ5 1.00000000 0.93893494 0.96272308 0.798173148 -0.7906382164
## Shimmer:APQ11 0.93893494 1.00000000 0.88569414 0.711546170 -0.7779743467
## Shimmer:DDA 0.96272308 0.88569414 1.00000000 0.732733983 -0.7806962950
## NHR 0.79817315 0.71154617 0.73273398 1.000000000 -0.6844118571
## HNR -0.79063822 -0.77797435 -0.78069630 -0.684411857 1.0000000000
## RPDE 0.45088990 0.48073856 0.43687244 0.416659644 -0.6590531523
## DFA 0.12803754 0.17964765 0.13073592 -0.022087779 -0.2905194517
## PPE 0.59367655 0.62341606 0.57670220 0.564654472 -0.7587222059
## RPDE DFA PPE
## subject# 0.14730034 0.09746426 0.1575592025
## age 0.09020832 -0.09287016 0.1207897526
## sex -0.15926244 -0.16511347 -0.0999006846
## test_time -0.03888697 0.01926088 -0.0005633701
## motor_UPDRS 0.12860740 -0.11624248 0.1624329732
## total_UPDRS 0.15689651 -0.11347483 0.1561948752
## Jitter(%) 0.42712754 0.22654994 0.7218488137
## Jitter(Abs) 0.54709960 0.35226386 0.7878528397
## Jitter:RAP 0.38289088 0.21488132 0.6706520982
## Jitter:PPQ5 0.38150298 0.17535854 0.6634914441
## Jitter:DDP 0.38288580 0.21489299 0.6706603464
## Shimmer 0.46823455 0.13253994 0.6157085590
## Shimmer(dB) 0.47240859 0.12611117 0.6351626782
## Shimmer:APQ3 0.43687810 0.13073500 0.5767039508
## Shimmer:APQ5 0.45088990 0.12803754 0.5936765462
## Shimmer:APQ11 0.48073856 0.17964765 0.6234160550
## Shimmer:DDA 0.43687244 0.13073592 0.5767021962
## NHR 0.41665964 -0.02208778 0.5646544721
## HNR -0.65905315 -0.29051945 -0.7587222059
## RPDE 1.00000000 0.19203007 0.5660648549
## DFA 0.19203007 1.00000000 0.3946496554
## PPE 0.56606485 0.39464966 1.0000000000
pairs(UPDRS_df)
Divide the input dataset into training and testing. a. Split the datasets using 80% for training and 20% for testing. b. How many examples will be used for training and how many for testing?
# Split data
index <- sample(1:nrow(UPDRS_df), size = nrow(UPDRS_df)*0.8)
# UPDRS_df <- UPDRS_df %>%
# rename(`Subject Number` = `subject#`) %>%
# mutate(`Subject Number` = as_factor(`Subject Number`))
train <- UPDRS_df[index,]
test <- UPDRS_df[-index,]
# Number of training and test examples
train_num <- nrow(train)
test_num <- nrow(test)
cat("Number of training examples:", train_num, "\n")
## Number of training examples: 4700
cat("Number of test examples:", test_num)
## Number of test examples: 1175
Build a multiple linear regression model containing all the input variables to predict the output variable. a. Which predictors have a significant impact in the prediction? b. How does the model perform? Provide the R2 and RSE.
model1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = train)
summary(model1)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.922538e+01 2.656686e+00 11.00069015 8.288693e-28
## `subject#` 1.847772e-01 9.310138e-03 19.84687741 3.139823e-84
## age 2.010258e-01 1.258357e-02 15.97526579 5.609241e-56
## sex -2.561165e+00 2.709921e-01 -9.45106950 5.157567e-21
## test_time 9.396041e-03 1.972730e-03 4.76296330 1.965387e-06
## `Jitter(%)` 9.073003e+01 1.751170e+02 0.51811082 6.044054e-01
## `Jitter(Abs)` -4.863330e+04 8.099617e+03 -6.00439538 2.065669e-09
## `Jitter:RAP` -3.122471e+04 3.870409e+04 -0.80675470 4.198488e-01
## `Jitter:PPQ5` -1.569159e+02 1.587158e+02 -0.98865988 3.228807e-01
## `Jitter:DDP` 1.056883e+04 1.290240e+04 0.81913704 4.127499e-01
## Shimmer 3.695318e+01 5.259725e+01 0.70256866 4.823595e-01
## `Shimmer(dB)` 1.787392e-01 3.994928e+00 0.04474155 9.643152e-01
## `Shimmer:APQ3` 3.138519e+04 3.874135e+04 0.81012119 4.179117e-01
## `Shimmer:APQ5` -5.594127e+01 4.545759e+01 -1.23062529 2.185249e-01
## `Shimmer:APQ11` 4.201019e+01 1.973457e+01 2.12876101 3.332613e-02
## `Shimmer:DDA` -1.049535e+04 1.291349e+04 -0.81274317 4.164067e-01
## NHR -1.645451e+01 5.228084e+00 -3.14733079 1.658038e-03
## HNR -3.701143e-01 5.725469e-02 -6.46434881 1.121156e-10
## RPDE -2.054135e+00 1.503850e+00 -1.36591738 1.720305e-01
## DFA -2.672225e+01 1.928972e+00 -13.85309979 8.441359e-43
## PPE 1.468105e+01 2.394727e+00 6.13057443 9.476295e-10
train1_predictions <- predict(model1, newdata = train)
train1_R2 <- summary(lm(motor_UPDRS ~ train1_predictions, data = train))$r.squared
train1_RSE <- sqrt(mean((train$motor_UPDRS - train1_predictions)^2))
cat("Jitter(%), Shimmer, sex, Shimmer:APQ5 have a significant impact in the prediction", "\n")
## Jitter(%), Shimmer, sex, Shimmer:APQ5 have a significant impact in the prediction
cat("Train R2:", train1_R2, "\n")
## Train R2: 0.2177958
cat("Train RSE:", train1_RSE, "\n")
## Train RSE: 7.181451
test1_predictions <- predict(model1, newdata = test)
test1_R2 <- summary(lm(motor_UPDRS ~ test1_predictions, data = test))$r.squared
test1_RSE <- sqrt(mean((test$motor_UPDRS - test1_predictions)^2))
cat("Test R2:", test1_R2, "\n")
## Test R2: 0.2158165
cat("Test RSE:", test1_RSE, "\n")
## Test RSE: 7.228161
Jitter(%)
, Shimmer
, sex
,
Shimmer:APQ5
have a significant impact in the
prediction
Sample | Model 1 |
---|---|
Training | \[0.2178\] |
Testing | \[0.2158\] |
Training RSE | \[7.1815\] |
Testing RSE | \[7.2282\] |
Build a multiple linear regression model to predict output variable including an interaction term. a. Provide the formula for the model. b. Does this model improve the performance of the previous model?
model2 <- lm(motor_UPDRS ~ . - total_UPDRS + age:sex, data = train)
summary(model2)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.992660e+01 2.766031e+00 10.81933033 5.800552e-27
## `subject#` 1.849034e-01 9.311339e-03 19.85787559 2.569940e-84
## age 1.904135e-01 1.714942e-02 11.10320513 2.723733e-28
## sex -4.040456e+00 1.646557e+00 -2.45388111 1.416857e-02
## test_time 9.371742e-03 1.972946e-03 4.75012510 2.093747e-06
## `Jitter(%)` 9.984950e+01 1.754062e+02 0.56924723 5.692157e-01
## `Jitter(Abs)` -4.848659e+04 8.101366e+03 -5.98498991 2.325571e-09
## `Jitter:RAP` -3.091775e+04 3.870627e+04 -0.79877901 4.244591e-01
## `Jitter:PPQ5` -1.562729e+02 1.587202e+02 -0.98458107 3.248808e-01
## `Jitter:DDP` 1.045983e+04 1.290319e+04 0.81063955 4.176139e-01
## Shimmer 3.641327e+01 5.260154e+01 0.69224708 4.888165e-01
## `Shimmer(dB)` 3.067683e-01 3.997473e+00 0.07674055 9.388332e-01
## `Shimmer:APQ3` 3.219004e+04 3.875213e+04 0.83066501 4.062053e-01
## `Shimmer:APQ5` -5.531128e+01 4.546368e+01 -1.21660368 2.238164e-01
## `Shimmer:APQ11` 4.275968e+01 1.975208e+01 2.16481912 3.045195e-02
## `Shimmer:DDA` -1.076381e+04 1.291709e+04 -0.83329988 4.047182e-01
## NHR -1.664105e+01 5.232189e+00 -3.18051304 1.479750e-03
## HNR -3.651917e-01 5.751024e-02 -6.35002937 2.356986e-10
## RPDE -2.185039e+00 1.510729e+00 -1.44634720 1.481468e-01
## DFA -2.688298e+01 1.937063e+00 -13.87822011 6.033120e-43
## PPE 1.474520e+01 2.395806e+00 6.15458979 8.156212e-10
## age:sex 2.285125e-02 2.508824e-02 0.91083519 3.624291e-01
train2_predictions <- predict(model2, newdata = train)
train2_R2 <- summary(lm(motor_UPDRS ~ train2_predictions, data = train))$r.squared
train2_RSE <- sqrt(mean((train$motor_UPDRS - train2_predictions)^2))
cat("Model including an interaction, Train R2:", train2_R2, "\n")
## Model including an interaction, Train R2: 0.2179345
cat("Model including an interaction, Train RSE:", train2_RSE, "\n")
## Model including an interaction, Train RSE: 7.180815
test2_predictions <- predict(model2, newdata = test)
test2_R2 <- summary(lm(motor_UPDRS ~ test2_predictions, data = test))$r.squared
test2_RSE <- sqrt(mean((test$motor_UPDRS - test2_predictions)^2))
cat("Model including an interaction, Test R2:", test2_R2, "\n")
## Model including an interaction, Test R2: 0.216465
cat("Model including an interaction, Test RSE:", test2_RSE, "\n")
## Model including an interaction, Test RSE: 7.225201
Sample | Model 1 | Model 2 |
---|---|---|
Training R² | \[0.2178\] | \[0.2179\] |
Testing R² | \[0.2158\] | \[0.2165\] |
Training RSE | \[7.1815\] | \[7.1808\] |
Testing RSE | \[7.2282\] | \[7.2252\] |
For the training set, the new model including an interaction improve the performance of the previous model; for the test set, the new model’s performance is better, too.
Build a regression model which includes non-linear transformations of predictors. a. Provide the formula for the model. b. Does this model improve the performance of the models obtained in (4) and/or (5)?
model3 <- lm(motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3), data = train)
train3_predictions <- predict(model3, newdata = train)
## Warning in predict.lm(model3, newdata = train): prediction from rank-deficient
## fit; attr(*, "non-estim") has doubtful cases
train3_R2 <- summary(lm(motor_UPDRS ~ train3_predictions, data = train))$r.squared
train3_RSE <- sqrt(mean((train$motor_UPDRS - train3_predictions)^2))
cat("Model with Non-linear Transformation, Train R2:", train3_R2, "\n")
## Model with Non-linear Transformation, Train R2: 0.2335892
cat("Model with Non-linear Transformation, Train RSE:", train3_RSE, "\n")
## Model with Non-linear Transformation, Train RSE: 7.108582
test3_predictions <- predict(model3, newdata = test)
## Warning in predict.lm(model3, newdata = test): prediction from rank-deficient
## fit; attr(*, "non-estim") has doubtful cases
test3_R2 <- summary(lm(motor_UPDRS ~ test3_predictions, data = test))$r.squared
test3_RSE <- sqrt(mean((test$motor_UPDRS - test3_predictions)^2))
cat("Model with Non-linear Transformation, Test R2:", test3_R2, "\n")
## Model with Non-linear Transformation, Test R2: 0.2298964
cat("Model with Non-linear Transformation, Test RSE:", test3_RSE, "\n")
## Model with Non-linear Transformation, Test RSE: 7.162677
Sample | Model 1 | Model 2 | Model 3 |
---|---|---|---|
Training R² | \[0.2178\] | \[0.2179\] | \[0.2336\] |
Testing R² | \[0.2158\] | \[0.2165\] | \[0.2299\] |
Training RSE | \[7.1815\] | \[7.1808\] | \[7.1086\] |
Testing RSE | \[7.2282\] | \[7.2252\] | \[7.1627\] |
This model improves the performance of the models obtained in (4) and (5).
Provide diagnostic plots for all the models built and comment on whether the models are appropriate based on what these plots show.
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS + age:sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4679 242394
## 2 4678 242351 1 42.98 0.8296 0.3624
anova(model1, model3)
## Analysis of Variance Table
##
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS + poly(age, 2) + poly(age,
## 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4679 242394
## 2 4677 237500 2 4894.1 48.189 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
##
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS + age:sex
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS +
## `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` +
## `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` +
## `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR +
## HNR + RPDE + DFA + PPE) - total_UPDRS + poly(age, 2) + poly(age,
## 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4678 242351
## 2 4677 237500 1 4851.2 95.532 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(model1)
par(mfrow = c(2, 2))
plot(model2)
par(mfrow = c(2, 2))
plot(model3)
According to the Residuals vs Fitted and other three types of subplots of all three models, the last model is the best and appropriate model.