Pre_loading

rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
UPDRS_df <- read_csv('./Datasets/Regression/parkinsons_updrs.data', show_col_types = FALSE) %>% 
  drop_na()

1

Use the appropriate functions to obtain descriptive information about the variables included in the dataset.

str(UPDRS_df)
## tibble [5,875 × 22] (S3: tbl_df/tbl/data.frame)
##  $ subject#     : num [1:5875] 1 1 1 1 1 1 1 1 1 1 ...
##  $ age          : num [1:5875] 72 72 72 72 72 72 72 72 72 72 ...
##  $ sex          : num [1:5875] 0 0 0 0 0 0 0 0 0 0 ...
##  $ test_time    : num [1:5875] 5.64 12.67 19.68 25.65 33.64 ...
##  $ motor_UPDRS  : num [1:5875] 28.2 28.4 28.7 28.9 29.2 ...
##  $ total_UPDRS  : num [1:5875] 34.4 34.9 35.4 35.8 36.4 ...
##  $ Jitter(%)    : num [1:5875] 0.00662 0.003 0.00481 0.00528 0.00335 0.00353 0.00422 0.00476 0.00432 0.00496 ...
##  $ Jitter(Abs)  : num [1:5875] 3.38e-05 1.68e-05 2.46e-05 2.66e-05 2.01e-05 ...
##  $ Jitter:RAP   : num [1:5875] 0.00401 0.00132 0.00205 0.00191 0.00093 0.00119 0.00212 0.00226 0.00156 0.00258 ...
##  $ Jitter:PPQ5  : num [1:5875] 0.00317 0.0015 0.00208 0.00264 0.0013 0.00159 0.00221 0.00259 0.00207 0.00253 ...
##  $ Jitter:DDP   : num [1:5875] 0.01204 0.00395 0.00616 0.00573 0.00278 ...
##  $ Shimmer      : num [1:5875] 0.0256 0.0202 0.0168 0.0231 0.017 ...
##  $ Shimmer(dB)  : num [1:5875] 0.23 0.179 0.181 0.327 0.176 0.214 0.445 0.212 0.371 0.31 ...
##  $ Shimmer:APQ3 : num [1:5875] 0.01438 0.00994 0.00734 0.01106 0.00679 ...
##  $ Shimmer:APQ5 : num [1:5875] 0.01309 0.01072 0.00844 0.01265 0.00929 ...
##  $ Shimmer:APQ11: num [1:5875] 0.0166 0.0169 0.0146 0.0196 0.0182 ...
##  $ Shimmer:DDA  : num [1:5875] 0.0431 0.0298 0.022 0.0332 0.0204 ...
##  $ NHR          : num [1:5875] 0.0143 0.0111 0.0202 0.0278 0.0116 ...
##  $ HNR          : num [1:5875] 21.6 27.2 23 24.4 26.1 ...
##  $ RPDE         : num [1:5875] 0.419 0.435 0.462 0.487 0.472 ...
##  $ DFA          : num [1:5875] 0.548 0.565 0.544 0.578 0.561 ...
##  $ PPE          : num [1:5875] 0.16 0.108 0.21 0.333 0.194 ...
summary(UPDRS_df)
##     subject#          age            sex           test_time      
##  Min.   : 1.00   Min.   :36.0   Min.   :0.0000   Min.   : -4.263  
##  1st Qu.:10.00   1st Qu.:58.0   1st Qu.:0.0000   1st Qu.: 46.847  
##  Median :22.00   Median :65.0   Median :0.0000   Median : 91.523  
##  Mean   :21.49   Mean   :64.8   Mean   :0.3178   Mean   : 92.864  
##  3rd Qu.:33.00   3rd Qu.:72.0   3rd Qu.:1.0000   3rd Qu.:138.445  
##  Max.   :42.00   Max.   :85.0   Max.   :1.0000   Max.   :215.490  
##   motor_UPDRS      total_UPDRS      Jitter(%)         Jitter(Abs)       
##  Min.   : 5.038   Min.   : 7.00   Min.   :0.000830   Min.   :2.250e-06  
##  1st Qu.:15.000   1st Qu.:21.37   1st Qu.:0.003580   1st Qu.:2.244e-05  
##  Median :20.871   Median :27.58   Median :0.004900   Median :3.453e-05  
##  Mean   :21.296   Mean   :29.02   Mean   :0.006154   Mean   :4.403e-05  
##  3rd Qu.:27.596   3rd Qu.:36.40   3rd Qu.:0.006800   3rd Qu.:5.333e-05  
##  Max.   :39.511   Max.   :54.99   Max.   :0.099990   Max.   :4.456e-04  
##    Jitter:RAP        Jitter:PPQ5         Jitter:DDP          Shimmer       
##  Min.   :0.000330   Min.   :0.000430   Min.   :0.000980   Min.   :0.00306  
##  1st Qu.:0.001580   1st Qu.:0.001820   1st Qu.:0.004730   1st Qu.:0.01912  
##  Median :0.002250   Median :0.002490   Median :0.006750   Median :0.02751  
##  Mean   :0.002987   Mean   :0.003277   Mean   :0.008962   Mean   :0.03404  
##  3rd Qu.:0.003290   3rd Qu.:0.003460   3rd Qu.:0.009870   3rd Qu.:0.03975  
##  Max.   :0.057540   Max.   :0.069560   Max.   :0.172630   Max.   :0.26863  
##   Shimmer(dB)     Shimmer:APQ3      Shimmer:APQ5     Shimmer:APQ11    
##  Min.   :0.026   Min.   :0.00161   Min.   :0.00194   Min.   :0.00249  
##  1st Qu.:0.175   1st Qu.:0.00928   1st Qu.:0.01079   1st Qu.:0.01566  
##  Median :0.253   Median :0.01370   Median :0.01594   Median :0.02271  
##  Mean   :0.311   Mean   :0.01716   Mean   :0.02014   Mean   :0.02748  
##  3rd Qu.:0.365   3rd Qu.:0.02057   3rd Qu.:0.02375   3rd Qu.:0.03272  
##  Max.   :2.107   Max.   :0.16267   Max.   :0.16702   Max.   :0.27546  
##   Shimmer:DDA           NHR                HNR              RPDE       
##  Min.   :0.00484   Min.   :0.000286   Min.   : 1.659   Min.   :0.1510  
##  1st Qu.:0.02783   1st Qu.:0.010955   1st Qu.:19.406   1st Qu.:0.4698  
##  Median :0.04111   Median :0.018448   Median :21.920   Median :0.5423  
##  Mean   :0.05147   Mean   :0.032120   Mean   :21.680   Mean   :0.5415  
##  3rd Qu.:0.06173   3rd Qu.:0.031463   3rd Qu.:24.444   3rd Qu.:0.6140  
##  Max.   :0.48802   Max.   :0.748260   Max.   :37.875   Max.   :0.9661  
##       DFA              PPE         
##  Min.   :0.5140   Min.   :0.02198  
##  1st Qu.:0.5962   1st Qu.:0.15634  
##  Median :0.6436   Median :0.20550  
##  Mean   :0.6532   Mean   :0.21959  
##  3rd Qu.:0.7113   3rd Qu.:0.26449  
##  Max.   :0.8656   Max.   :0.73173

2

Calculate the correlation between the different attributes (include the figure produced by R in your answer).

cor(UPDRS_df)
##                    subject#          age           sex     test_time
## subject#       1.0000000000 -0.030863612  0.2868514199 -0.0008815743
## age           -0.0308636122  1.000000000 -0.0416017291  0.0198838435
## sex            0.2868514199 -0.041601729  1.0000000000 -0.0098049838
## test_time     -0.0008815743  0.019883844 -0.0098049838  1.0000000000
## motor_UPDRS    0.2529185298  0.273664760 -0.0312050144  0.0679182641
## total_UPDRS    0.2536427490  0.310289929 -0.0965588806  0.0752626604
## Jitter(%)      0.1354475184  0.023071181  0.0514216175 -0.0228370926
## Jitter(Abs)    0.0751561345  0.035691340 -0.1546453007 -0.0113648117
## Jitter:RAP     0.1203393232  0.010254988  0.0767182203 -0.0288878317
## Jitter:PPQ5    0.1364738360  0.013199367  0.0879947680 -0.0232899083
## Jitter:DDP     0.1203500584  0.010257836  0.0767031684 -0.0288759827
## Shimmer        0.1462017730  0.101553856  0.0587357861 -0.0338701798
## Shimmer(dB)    0.1428639729  0.111129664  0.0564805319 -0.0309624121
## Shimmer:APQ3   0.1129497993  0.098912301  0.0449371995 -0.0290196929
## Shimmer:APQ5   0.1382636007  0.089982893  0.0648192972 -0.0365044263
## Shimmer:APQ11  0.1733326282  0.135237944  0.0233598626 -0.0391096958
## Shimmer:DDA    0.1129486657  0.098913123  0.0449375945 -0.0290168593
## NHR            0.1687433623  0.007092699  0.1681695195 -0.0263570332
## HNR           -0.2069286890 -0.104842069 -0.0001671123  0.0365448637
## RPDE           0.1473003405  0.090208319 -0.1592624409 -0.0388869742
## DFA            0.0974642595 -0.092870159 -0.1651134712  0.0192608786
## PPE            0.1575592025  0.120789753 -0.0999006846 -0.0005633701
##               motor_UPDRS total_UPDRS   Jitter(%) Jitter(Abs)  Jitter:RAP
## subject#       0.25291853  0.25364275  0.13544752  0.07515613  0.12033932
## age            0.27366476  0.31028993  0.02307118  0.03569134  0.01025499
## sex           -0.03120501 -0.09655888  0.05142162 -0.15464530  0.07671822
## test_time      0.06791826  0.07526266 -0.02283709 -0.01136481 -0.02888783
## motor_UPDRS    1.00000000  0.94723131  0.08481576  0.05090328  0.07268353
## total_UPDRS    0.94723131  1.00000000  0.07424667  0.06692673  0.06401542
## Jitter(%)      0.08481576  0.07424667  1.00000000  0.86557722  0.98418075
## Jitter(Abs)    0.05090328  0.06692673  0.86557722  1.00000000  0.84462628
## Jitter:RAP     0.07268353  0.06401542  0.98418075  0.84462628  1.00000000
## Jitter:PPQ5    0.07629087  0.06335178  0.96821443  0.79053765  0.94719593
## Jitter:DDP     0.07269792  0.06402746  0.98418354  0.84463035  0.99999962
## Shimmer        0.10234870  0.09214091  0.70979112  0.64904638  0.68172901
## Shimmer(dB)    0.11007600  0.09878973  0.71670399  0.65587068  0.68555054
## Shimmer:APQ3   0.08426056  0.07936272  0.66414874  0.62382984  0.65022614
## Shimmer:APQ5   0.09210517  0.08346725  0.69400164  0.62140081  0.65983121
## Shimmer:APQ11  0.13656029  0.12083750  0.64596519  0.58999842  0.60308168
## Shimmer:DDA    0.08426039  0.07936324  0.66414746  0.62382750  0.65022465
## NHR            0.07496727  0.06095164  0.82529366  0.69995990  0.79237273
## HNR           -0.15702858 -0.16211683 -0.67518824 -0.70641805 -0.64147280
## RPDE           0.12860740  0.15689651  0.42712754  0.54709960  0.38289088
## DFA           -0.11624248 -0.11347483  0.22654994  0.35226386  0.21488132
## PPE            0.16243297  0.15619488  0.72184881  0.78785284  0.67065210
##               Jitter:PPQ5  Jitter:DDP     Shimmer Shimmer(dB) Shimmer:APQ3
## subject#       0.13647384  0.12035006  0.14620177  0.14286397   0.11294980
## age            0.01319937  0.01025784  0.10155386  0.11112966   0.09891230
## sex            0.08799477  0.07670317  0.05873579  0.05648053   0.04493720
## test_time     -0.02328991 -0.02887598 -0.03387018 -0.03096241  -0.02901969
## motor_UPDRS    0.07629087  0.07269792  0.10234870  0.11007600   0.08426056
## total_UPDRS    0.06335178  0.06402746  0.09214091  0.09878973   0.07936272
## Jitter(%)      0.96821443  0.98418354  0.70979112  0.71670399   0.66414874
## Jitter(Abs)    0.79053765  0.84463035  0.64904638  0.65587068   0.62382984
## Jitter:RAP     0.94719593  0.99999962  0.68172901  0.68555054   0.65022614
## Jitter:PPQ5    1.00000000  0.94720256  0.73274748  0.73459079   0.67671149
## Jitter:DDP     0.94720256  1.00000000  0.68173376  0.68555613   0.65022816
## Shimmer        0.73274748  0.68173376  1.00000000  0.99233407   0.97982804
## Shimmer(dB)    0.73459079  0.68555613  0.99233407  1.00000000   0.96801480
## Shimmer:APQ3   0.67671149  0.65022816  0.97982804  0.96801480   1.00000000
## Shimmer:APQ5   0.73402075  0.65983319  0.98490432  0.97637257   0.96272296
## Shimmer:APQ11  0.66841348  0.60309033  0.93545684  0.93633812   0.88569537
## Shimmer:DDA    0.67671017  0.65022667  0.97982731  0.96801427   0.99999998
## NHR            0.86486425  0.79237731  0.79515848  0.79807697   0.73273634
## HNR           -0.66240886 -0.64148177 -0.80141600 -0.80249646  -0.78069689
## RPDE           0.38150298  0.38288580  0.46823455  0.47240859   0.43687810
## DFA            0.17535854  0.21489299  0.13253994  0.12611117   0.13073500
## PPE            0.66349144  0.67066035  0.61570856  0.63516268   0.57670395
##               Shimmer:APQ5 Shimmer:APQ11 Shimmer:DDA          NHR           HNR
## subject#        0.13826360    0.17333263  0.11294867  0.168743362 -0.2069286890
## age             0.08998289    0.13523794  0.09891312  0.007092699 -0.1048420689
## sex             0.06481930    0.02335986  0.04493759  0.168169520 -0.0001671123
## test_time      -0.03650443   -0.03910970 -0.02901686 -0.026357033  0.0365448637
## motor_UPDRS     0.09210517    0.13656029  0.08426039  0.074967270 -0.1570285788
## total_UPDRS     0.08346725    0.12083750  0.07936324  0.060951644 -0.1621168287
## Jitter(%)       0.69400164    0.64596519  0.66414746  0.825293655 -0.6751882442
## Jitter(Abs)     0.62140081    0.58999842  0.62382750  0.699959896 -0.7064180505
## Jitter:RAP      0.65983121    0.60308168  0.65022465  0.792372728 -0.6414728036
## Jitter:PPQ5     0.73402075    0.66841348  0.67671017  0.864864252 -0.6624088579
## Jitter:DDP      0.65983319    0.60309033  0.65022667  0.792377310 -0.6414817715
## Shimmer         0.98490432    0.93545684  0.97982731  0.795158485 -0.8014160019
## Shimmer(dB)     0.97637257    0.93633812  0.96801427  0.798076972 -0.8024964615
## Shimmer:APQ3    0.96272296    0.88569537  0.99999998  0.732736344 -0.7806968895
## Shimmer:APQ5    1.00000000    0.93893494  0.96272308  0.798173148 -0.7906382164
## Shimmer:APQ11   0.93893494    1.00000000  0.88569414  0.711546170 -0.7779743467
## Shimmer:DDA     0.96272308    0.88569414  1.00000000  0.732733983 -0.7806962950
## NHR             0.79817315    0.71154617  0.73273398  1.000000000 -0.6844118571
## HNR            -0.79063822   -0.77797435 -0.78069630 -0.684411857  1.0000000000
## RPDE            0.45088990    0.48073856  0.43687244  0.416659644 -0.6590531523
## DFA             0.12803754    0.17964765  0.13073592 -0.022087779 -0.2905194517
## PPE             0.59367655    0.62341606  0.57670220  0.564654472 -0.7587222059
##                      RPDE         DFA           PPE
## subject#       0.14730034  0.09746426  0.1575592025
## age            0.09020832 -0.09287016  0.1207897526
## sex           -0.15926244 -0.16511347 -0.0999006846
## test_time     -0.03888697  0.01926088 -0.0005633701
## motor_UPDRS    0.12860740 -0.11624248  0.1624329732
## total_UPDRS    0.15689651 -0.11347483  0.1561948752
## Jitter(%)      0.42712754  0.22654994  0.7218488137
## Jitter(Abs)    0.54709960  0.35226386  0.7878528397
## Jitter:RAP     0.38289088  0.21488132  0.6706520982
## Jitter:PPQ5    0.38150298  0.17535854  0.6634914441
## Jitter:DDP     0.38288580  0.21489299  0.6706603464
## Shimmer        0.46823455  0.13253994  0.6157085590
## Shimmer(dB)    0.47240859  0.12611117  0.6351626782
## Shimmer:APQ3   0.43687810  0.13073500  0.5767039508
## Shimmer:APQ5   0.45088990  0.12803754  0.5936765462
## Shimmer:APQ11  0.48073856  0.17964765  0.6234160550
## Shimmer:DDA    0.43687244  0.13073592  0.5767021962
## NHR            0.41665964 -0.02208778  0.5646544721
## HNR           -0.65905315 -0.29051945 -0.7587222059
## RPDE           1.00000000  0.19203007  0.5660648549
## DFA            0.19203007  1.00000000  0.3946496554
## PPE            0.56606485  0.39464966  1.0000000000
pairs(UPDRS_df)

3

Divide the input dataset into training and testing. a. Split the datasets using 80% for training and 20% for testing. b. How many examples will be used for training and how many for testing?

# Split data
index <- sample(1:nrow(UPDRS_df), size = nrow(UPDRS_df)*0.8)
# UPDRS_df <- UPDRS_df %>% 
#   rename(`Subject Number` = `subject#`) %>%
#   mutate(`Subject Number` = as_factor(`Subject Number`))
train <- UPDRS_df[index,]
test <- UPDRS_df[-index,]

# Number of training and test examples  
train_num <- nrow(train)
test_num <- nrow(test)

cat("Number of training examples:", train_num, "\n")
## Number of training examples: 4700
cat("Number of test examples:", test_num)
## Number of test examples: 1175

4

Build a multiple linear regression model containing all the input variables to predict the output variable. a. Which predictors have a significant impact in the prediction? b. How does the model perform? Provide the R2 and RSE.

model1 <- lm(motor_UPDRS ~ . - total_UPDRS, data = train)
summary(model1)$coefficients
##                      Estimate   Std. Error      t value     Pr(>|t|)
## (Intercept)      2.922538e+01 2.656686e+00  11.00069015 8.288693e-28
## `subject#`       1.847772e-01 9.310138e-03  19.84687741 3.139823e-84
## age              2.010258e-01 1.258357e-02  15.97526579 5.609241e-56
## sex             -2.561165e+00 2.709921e-01  -9.45106950 5.157567e-21
## test_time        9.396041e-03 1.972730e-03   4.76296330 1.965387e-06
## `Jitter(%)`      9.073003e+01 1.751170e+02   0.51811082 6.044054e-01
## `Jitter(Abs)`   -4.863330e+04 8.099617e+03  -6.00439538 2.065669e-09
## `Jitter:RAP`    -3.122471e+04 3.870409e+04  -0.80675470 4.198488e-01
## `Jitter:PPQ5`   -1.569159e+02 1.587158e+02  -0.98865988 3.228807e-01
## `Jitter:DDP`     1.056883e+04 1.290240e+04   0.81913704 4.127499e-01
## Shimmer          3.695318e+01 5.259725e+01   0.70256866 4.823595e-01
## `Shimmer(dB)`    1.787392e-01 3.994928e+00   0.04474155 9.643152e-01
## `Shimmer:APQ3`   3.138519e+04 3.874135e+04   0.81012119 4.179117e-01
## `Shimmer:APQ5`  -5.594127e+01 4.545759e+01  -1.23062529 2.185249e-01
## `Shimmer:APQ11`  4.201019e+01 1.973457e+01   2.12876101 3.332613e-02
## `Shimmer:DDA`   -1.049535e+04 1.291349e+04  -0.81274317 4.164067e-01
## NHR             -1.645451e+01 5.228084e+00  -3.14733079 1.658038e-03
## HNR             -3.701143e-01 5.725469e-02  -6.46434881 1.121156e-10
## RPDE            -2.054135e+00 1.503850e+00  -1.36591738 1.720305e-01
## DFA             -2.672225e+01 1.928972e+00 -13.85309979 8.441359e-43
## PPE              1.468105e+01 2.394727e+00   6.13057443 9.476295e-10
train1_predictions <- predict(model1, newdata = train)
train1_R2 <- summary(lm(motor_UPDRS ~ train1_predictions, data = train))$r.squared
train1_RSE <- sqrt(mean((train$motor_UPDRS - train1_predictions)^2))

cat("Jitter(%), Shimmer, sex, Shimmer:APQ5 have a significant impact in the prediction", "\n")
## Jitter(%), Shimmer, sex, Shimmer:APQ5 have a significant impact in the prediction
cat("Train R2:", train1_R2, "\n")
## Train R2: 0.2177958
cat("Train RSE:", train1_RSE, "\n")
## Train RSE: 7.181451
test1_predictions <- predict(model1, newdata = test)
test1_R2 <- summary(lm(motor_UPDRS ~ test1_predictions, data = test))$r.squared
test1_RSE <- sqrt(mean((test$motor_UPDRS - test1_predictions)^2))

cat("Test R2:", test1_R2, "\n")
## Test R2: 0.2158165
cat("Test RSE:", test1_RSE, "\n")
## Test RSE: 7.228161

Jitter(%), Shimmer, sex, Shimmer:APQ5 have a significant impact in the prediction

Model Performance

Sample Model 1
Training \[0.2178\]
Testing \[0.2158\]
Training RSE \[7.1815\]
Testing RSE \[7.2282\]

5

Build a multiple linear regression model to predict output variable including an interaction term. a. Provide the formula for the model. b. Does this model improve the performance of the previous model?

model2 <- lm(motor_UPDRS ~ . - total_UPDRS + age:sex, data = train)
summary(model2)$coefficients
##                      Estimate   Std. Error      t value     Pr(>|t|)
## (Intercept)      2.992660e+01 2.766031e+00  10.81933033 5.800552e-27
## `subject#`       1.849034e-01 9.311339e-03  19.85787559 2.569940e-84
## age              1.904135e-01 1.714942e-02  11.10320513 2.723733e-28
## sex             -4.040456e+00 1.646557e+00  -2.45388111 1.416857e-02
## test_time        9.371742e-03 1.972946e-03   4.75012510 2.093747e-06
## `Jitter(%)`      9.984950e+01 1.754062e+02   0.56924723 5.692157e-01
## `Jitter(Abs)`   -4.848659e+04 8.101366e+03  -5.98498991 2.325571e-09
## `Jitter:RAP`    -3.091775e+04 3.870627e+04  -0.79877901 4.244591e-01
## `Jitter:PPQ5`   -1.562729e+02 1.587202e+02  -0.98458107 3.248808e-01
## `Jitter:DDP`     1.045983e+04 1.290319e+04   0.81063955 4.176139e-01
## Shimmer          3.641327e+01 5.260154e+01   0.69224708 4.888165e-01
## `Shimmer(dB)`    3.067683e-01 3.997473e+00   0.07674055 9.388332e-01
## `Shimmer:APQ3`   3.219004e+04 3.875213e+04   0.83066501 4.062053e-01
## `Shimmer:APQ5`  -5.531128e+01 4.546368e+01  -1.21660368 2.238164e-01
## `Shimmer:APQ11`  4.275968e+01 1.975208e+01   2.16481912 3.045195e-02
## `Shimmer:DDA`   -1.076381e+04 1.291709e+04  -0.83329988 4.047182e-01
## NHR             -1.664105e+01 5.232189e+00  -3.18051304 1.479750e-03
## HNR             -3.651917e-01 5.751024e-02  -6.35002937 2.356986e-10
## RPDE            -2.185039e+00 1.510729e+00  -1.44634720 1.481468e-01
## DFA             -2.688298e+01 1.937063e+00 -13.87822011 6.033120e-43
## PPE              1.474520e+01 2.395806e+00   6.15458979 8.156212e-10
## age:sex          2.285125e-02 2.508824e-02   0.91083519 3.624291e-01
train2_predictions <- predict(model2, newdata = train)
train2_R2 <- summary(lm(motor_UPDRS ~ train2_predictions, data = train))$r.squared
train2_RSE <- sqrt(mean((train$motor_UPDRS - train2_predictions)^2))

cat("Model including an interaction, Train R2:", train2_R2, "\n")
## Model including an interaction, Train R2: 0.2179345
cat("Model including an interaction, Train RSE:", train2_RSE, "\n")
## Model including an interaction, Train RSE: 7.180815
test2_predictions <- predict(model2, newdata = test)
test2_R2 <- summary(lm(motor_UPDRS ~ test2_predictions, data = test))$r.squared
test2_RSE <- sqrt(mean((test$motor_UPDRS - test2_predictions)^2))

cat("Model including an interaction, Test R2:", test2_R2, "\n")
## Model including an interaction, Test R2: 0.216465
cat("Model including an interaction, Test RSE:", test2_RSE, "\n")
## Model including an interaction, Test RSE: 7.225201

Model Performance

Sample Model 1 Model 2
Training R² \[0.2178\] \[0.2179\]
Testing R² \[0.2158\] \[0.2165\]
Training RSE \[7.1815\] \[7.1808\]
Testing RSE \[7.2282\] \[7.2252\]

For the training set, the new model including an interaction improve the performance of the previous model; for the test set, the new model’s performance is better, too.

6

Build a regression model which includes non-linear transformations of predictors. a. Provide the formula for the model. b. Does this model improve the performance of the models obtained in (4) and/or (5)?

model3 <- lm(motor_UPDRS ~ . - total_UPDRS + poly(age, 2) + poly(age, 3), data = train)

train3_predictions <- predict(model3, newdata = train)
## Warning in predict.lm(model3, newdata = train): prediction from rank-deficient
## fit; attr(*, "non-estim") has doubtful cases
train3_R2 <- summary(lm(motor_UPDRS ~ train3_predictions, data = train))$r.squared
train3_RSE <- sqrt(mean((train$motor_UPDRS - train3_predictions)^2))

cat("Model with Non-linear Transformation, Train R2:", train3_R2, "\n")
## Model with Non-linear Transformation, Train R2: 0.2335892
cat("Model with Non-linear Transformation, Train RSE:", train3_RSE, "\n")
## Model with Non-linear Transformation, Train RSE: 7.108582
test3_predictions <- predict(model3, newdata = test)
## Warning in predict.lm(model3, newdata = test): prediction from rank-deficient
## fit; attr(*, "non-estim") has doubtful cases
test3_R2 <- summary(lm(motor_UPDRS ~ test3_predictions, data = test))$r.squared
test3_RSE <- sqrt(mean((test$motor_UPDRS - test3_predictions)^2))

cat("Model with Non-linear Transformation, Test R2:", test3_R2, "\n")
## Model with Non-linear Transformation, Test R2: 0.2298964
cat("Model with Non-linear Transformation, Test RSE:", test3_RSE, "\n")
## Model with Non-linear Transformation, Test RSE: 7.162677

Model Performance

Sample Model 1 Model 2 Model 3
Training R² \[0.2178\] \[0.2179\] \[0.2336\]
Testing R² \[0.2158\] \[0.2165\] \[0.2299\]
Training RSE \[7.1815\] \[7.1808\] \[7.1086\]
Testing RSE \[7.2282\] \[7.2252\] \[7.1627\]

This model improves the performance of the models obtained in (4) and (5).

7

Provide diagnostic plots for all the models built and comment on whether the models are appropriate based on what these plots show.

anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS + age:sex
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   4679 242394                           
## 2   4678 242351  1     42.98 0.8296 0.3624
anova(model1, model3)
## Analysis of Variance Table
## 
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS + poly(age, 2) + poly(age, 
##     3)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   4679 242394                                  
## 2   4677 237500  2    4894.1 48.189 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
## 
## Model 1: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS + age:sex
## Model 2: motor_UPDRS ~ (`subject#` + age + sex + test_time + total_UPDRS + 
##     `Jitter(%)` + `Jitter(Abs)` + `Jitter:RAP` + `Jitter:PPQ5` + 
##     `Jitter:DDP` + Shimmer + `Shimmer(dB)` + `Shimmer:APQ3` + 
##     `Shimmer:APQ5` + `Shimmer:APQ11` + `Shimmer:DDA` + NHR + 
##     HNR + RPDE + DFA + PPE) - total_UPDRS + poly(age, 2) + poly(age, 
##     3)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   4678 242351                                  
## 2   4677 237500  1    4851.2 95.532 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(model1)

par(mfrow = c(2, 2))
plot(model2)

par(mfrow = c(2, 2))
plot(model3)

According to the Residuals vs Fitted and other three types of subplots of all three models, the last model is the best and appropriate model.

The codes are also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment1