LAST NAME: RISER
FIRST NAME: SAMANTHA
M#: M05023930
# Set the working directory
setwd("~/OneNote Notebooks/7038-DA Methods/Homework 4")
# Load the required packages
library(DAAG)
library(ggplot2)
library(readr)
library(dplyr)
library(psych)
Continue with the PGA.csv data set. For this data set, fit a multiple linear regression to the data. Use the log of Average winnings as the response variable and use Age, Average Drive (Yards), Driving Accuracy (percent), Greens on Regulation (%), Average # of Putts, Save Percent, and # Events as covariates. Perform a 5-fold cross validation and obtain the PRESS statistic, MPSE, and prediction R-squared.
# Read the data into R
pga <- read.csv("PGA.csv", header = T, stringsAsFactors = FALSE)
# Fit the model with the log of the response variable
pga_fit_log <- lm(log(AverageWinnings) ~ Age + AverageDrive + DrivingAccuracy + GreensonRegulation + AverageNumofPutts + SavePercent + NumEvents, data = pga)
summary(pga_fit_log)
##
## Call:
## lm(formula = log(AverageWinnings) ~ Age + AverageDrive + DrivingAccuracy +
## GreensonRegulation + AverageNumofPutts + SavePercent + NumEvents,
## data = pga)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.77513 -0.41331 -0.01228 0.43022 1.73912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.831322 4.482002 7.548 1.85e-12 ***
## Age -0.008215 0.007609 -1.080 0.28169
## AverageDrive -0.006110 0.008314 -0.735 0.46335
## DrivingAccuracy -0.020495 0.012514 -1.638 0.10313
## GreensonRegulation 0.190382 0.019105 9.965 < 2e-16 ***
## AverageNumofPutts -18.457365 2.024330 -9.118 < 2e-16 ***
## SavePercent 0.027000 0.008609 3.136 0.00199 **
## NumEvents -0.039754 0.009440 -4.211 3.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6071 on 188 degrees of freedom
## Multiple R-squared: 0.6358, Adjusted R-squared: 0.6223
## F-statistic: 46.89 on 7 and 188 DF, p-value: < 2.2e-16
# Perform a 5-fold cross validation.
KCV <- cv.lm(data = pga, pga_fit_log, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 0.0 0.0 0.02 0.89355
## AverageDrive 1 2.6 2.6 6.93 0.00916 **
## DrivingAccuracy 1 3.4 3.4 9.30 0.00262 **
## GreensonRegulation 1 57.3 57.3 155.58 < 2e-16 ***
## AverageNumofPutts 1 46.7 46.7 126.74 < 2e-16 ***
## SavePercent 1 4.4 4.4 11.92 0.00069 ***
## NumEvents 1 6.5 6.5 17.74 3.9e-05 ***
## Residuals 188 69.3 0.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39
## Predicted 10.026 9.854 10.214 9.244 10.999 10.286 9.480 9.77
## cvpred 10.064 9.902 10.211 9.277 11.122 10.183 9.444 9.71
## log(AverageWinnings) 10.607 9.693 9.921 8.704 10.560 10.554 9.710 11.31
## CV residual 0.543 -0.209 -0.289 -0.573 -0.562 0.372 0.266 1.60
## 43 53 72 73 75 76 78
## Predicted 9.288 11.628 8.692 10.456 10.4188 10.142 9.995
## cvpred 9.336 11.627 8.710 10.456 10.4348 10.215 10.066
## log(AverageWinnings) 8.384 11.495 8.603 10.212 10.4462 10.291 10.207
## CV residual -0.952 -0.132 -0.107 -0.244 0.0114 0.075 0.141
## 81 92 111 114 116 119 120
## Predicted 9.910 11.070 8.71 11.232 10.912 10.249 11.2035
## cvpred 9.923 11.200 8.74 11.240 10.934 10.352 11.2533
## log(AverageWinnings) 9.176 10.509 6.94 11.086 10.419 9.830 11.2695
## CV residual -0.748 -0.691 -1.80 -0.154 -0.515 -0.522 0.0162
## 125 127 128 136 141 145 152
## Predicted 11.04 10.61 9.989 10.686 10.815 9.605 11.13416
## cvpred 11.11 10.58 10.057 10.721 10.775 9.736 11.11286
## log(AverageWinnings) 9.67 10.39 9.809 9.899 10.172 9.060 11.11000
## CV residual -1.44 -0.19 -0.248 -0.823 -0.603 -0.676 -0.00286
## 159 160 162 164 169 183 185
## Predicted 11.075 10.397 11.4116 10.3566 10.051 9.807 9.90
## cvpred 11.083 10.422 11.4386 10.4150 10.040 9.815 9.92
## log(AverageWinnings) 11.197 10.285 11.4985 10.4683 10.792 10.527 11.64
## CV residual 0.114 -0.138 0.0599 0.0533 0.752 0.712 1.72
## 189 195 196
## Predicted 10.480 10.7422 11.021
## cvpred 10.462 10.8009 11.096
## log(AverageWinnings) 10.770 10.8510 11.297
## CV residual 0.309 0.0502 0.201
##
## Sum of squares = 17.6 Mean square = 0.45 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 9.593 9.216 10.266 8.703 10.440 10.398 9.428 10.671
## cvpred 9.537 9.116 10.304 8.566 10.337 10.413 9.300 10.673
## log(AverageWinnings) 10.128 9.663 10.500 8.804 11.190 10.937 9.661 9.845
## CV residual 0.591 0.547 0.197 0.239 0.853 0.524 0.361 -0.828
## 50 51 52 54 55 57 58 63
## Predicted 7.394 9.19 10.111 10.6 10.051 10.4804 8.556 10.322
## cvpred 7.258 9.20 10.086 10.6 10.057 10.4271 8.459 10.317
## log(AverageWinnings) 6.745 8.71 9.822 11.8 9.742 10.3810 9.212 10.750
## CV residual -0.513 -0.49 -0.263 1.2 -0.316 -0.0461 0.752 0.434
## 80 84 87 88 95 97 99 103
## Predicted 10.296 9.896 9.353 10.738 10.52 10.893 9.86 10.812
## cvpred 10.260 9.819 9.263 10.722 10.55 10.830 9.86 10.892
## log(AverageWinnings) 10.136 8.898 9.488 11.244 9.31 10.685 10.14 10.570
## CV residual -0.124 -0.922 0.225 0.521 -1.24 -0.145 0.28 -0.322
## 106 109 115 126 131 134 135 137
## Predicted 11.424 10.983 9.0705 9.087 11.202 9.138 10.59 9.250
## cvpred 11.403 10.913 8.9457 8.960 11.235 9.057 10.59 9.239
## log(AverageWinnings) 10.937 11.349 8.9735 9.779 11.740 9.398 10.14 9.421
## CV residual -0.466 0.435 0.0278 0.819 0.505 0.341 -0.45 0.183
## 142 143 149 155 173 188 190 192
## Predicted 9.24 11.871 9.732 10.17 10.09 11.487 9.36 9.52
## cvpred 9.14 11.844 9.723 10.09 10.04 11.460 9.26 9.46
## log(AverageWinnings) 10.29 12.480 10.414 11.47 10.66 11.166 10.31 8.32
## CV residual 1.15 0.635 0.692 1.39 0.62 -0.294 1.06 -1.14
##
## Sum of squares = 17.1 Mean square = 0.43 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47
## Predicted 10.687 9.8898 10.0 10.38 10.437 11.189 10.00 11.081
## cvpred 10.582 9.9251 10.1 10.52 10.495 11.230 9.91 11.146
## log(AverageWinnings) 10.855 9.8720 10.5 9.64 10.203 10.797 10.95 11.741
## CV residual 0.273 -0.0531 0.4 -0.88 -0.293 -0.433 1.04 0.595
## 62 66 67 68 69 71 74
## Predicted 8.694 10.878 9.17977 11.474 9.873 10.92 10.9683
## cvpred 8.698 11.007 9.17277 11.403 9.868 11.01 10.9089
## log(AverageWinnings) 8.071 10.868 9.16660 10.770 9.665 9.81 10.9660
## CV residual -0.627 -0.139 -0.00617 -0.633 -0.203 -1.20 0.0571
## 77 79 89 90 93 101 104 105
## Predicted 8.368 10.041 10.771 11.056 10.086 10.181 11.05 10.856
## cvpred 8.216 9.859 10.822 11.070 10.092 10.129 11.31 10.854
## log(AverageWinnings) 8.700 10.250 10.491 11.363 10.717 10.645 10.28 11.023
## CV residual 0.484 0.391 -0.331 0.293 0.625 0.516 -1.03 0.169
## 108 113 117 118 123 124 132
## Predicted 9.537 9.545 9.259 11.68 10.711 10.445 10.167
## cvpred 9.606 9.503 9.281 11.88 10.707 10.595 10.261
## log(AverageWinnings) 9.215 10.446 9.053 10.72 11.448 10.373 9.964
## CV residual -0.392 0.943 -0.227 -1.16 0.741 -0.222 -0.297
## 140 157 158 163 166 178 179
## Predicted 10.217 10.411 9.331 11.656 9.860 11.112 10.421
## cvpred 10.194 10.332 9.228 11.667 9.850 11.154 10.278
## log(AverageWinnings) 10.530 10.784 9.816 12.100 10.636 10.452 10.753
## CV residual 0.335 0.452 0.588 0.433 0.786 -0.702 0.475
## 180 191
## Predicted 12.212 9.84
## cvpred 12.298 9.99
## log(AverageWinnings) 12.551 8.71
## CV residual 0.253 -1.27
##
## Sum of squares = 14.3 Mean square = 0.37 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29
## Predicted 10.806 10.737 10.763 10.120 10.31 10.562 10.331
## cvpred 10.831 10.755 10.784 10.125 10.22 10.567 10.350
## log(AverageWinnings) 10.984 10.641 11.049 10.612 9.91 10.165 11.301
## CV residual 0.153 -0.113 0.265 0.487 -0.31 -0.402 0.951
## 32 35 37 45 49 61 65 86
## Predicted 11.166 8.697 10.74 10.401 9.892 11.62 10.334 10.226
## cvpred 11.127 8.674 10.80 10.416 9.925 11.61 10.340 10.253
## log(AverageWinnings) 11.609 8.964 10.08 9.561 9.648 12.80 11.192 9.904
## CV residual 0.482 0.291 -0.72 -0.855 -0.277 1.19 0.852 -0.349
## 94 96 98 100 112 121 122 130
## Predicted 9.49 11.768 10.919 11.021 9.918 8.614 10.452 9.478
## cvpred 9.51 11.785 10.908 11.089 9.875 8.568 10.479 9.525
## log(AverageWinnings) 10.81 11.583 10.123 10.124 10.246 9.050 10.306 9.188
## CV residual 1.30 -0.203 -0.785 -0.965 0.371 0.482 -0.173 -0.337
## 133 144 146 147 151 156 161 170
## Predicted 10.4 12.212 9.530 10.381 10.037 9.692 9.524 10.116
## cvpred 10.4 12.246 9.615 10.344 10.088 9.693 9.583 10.154
## log(AverageWinnings) 10.9 12.400 9.016 9.576 9.515 8.910 9.270 9.886
## CV residual 0.5 0.154 -0.599 -0.768 -0.573 -0.784 -0.314 -0.268
## 171 172 174 175 177 182 186 193
## Predicted 10.297 10.55 10.00 10.051 10.72 10.236 10.044 11.057
## cvpred 10.363 10.50 10.03 9.946 10.73 10.250 10.049 11.039
## log(AverageWinnings) 9.406 11.28 9.03 9.701 11.68 11.104 10.461 10.682
## CV residual -0.957 0.78 -1.00 -0.245 0.95 0.854 0.412 -0.356
##
## Sum of squares = 16.1 Mean square = 0.41 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31
## Predicted 9.562 11.671 9.920 11.033 10.616 10.876 8.974
## cvpred 9.575 11.590 9.934 11.035 10.569 10.905 9.065
## log(AverageWinnings) 10.062 12.358 10.910 10.556 10.725 10.553 8.338
## CV residual 0.487 0.768 0.976 -0.479 0.156 -0.352 -0.727
## 34 41 42 44 48 56 59
## Predicted 10.028 9.901 9.569 10.575 9.860 10.097 10.109
## cvpred 10.126 9.965 9.535 10.614 9.929 10.101 10.104
## log(AverageWinnings) 9.952 9.489 9.330 10.049 9.189 9.665 10.523
## CV residual -0.174 -0.476 -0.205 -0.566 -0.741 -0.436 0.418
## 60 64 70 82 83 85 91
## Predicted 10.8849 10.843 9.139 10.318 10.250 11.284 11.0872
## cvpred 10.8872 10.738 9.167 10.403 10.296 11.137 11.0119
## log(AverageWinnings) 10.9548 11.377 9.018 9.882 9.337 11.408 11.0758
## CV residual 0.0676 0.638 -0.149 -0.521 -0.959 0.271 0.0639
## 102 107 110 129 138 139 148
## Predicted 9.98 10.716 10.382 10.41451 9.71 10.3 10.8457
## cvpred 10.03 10.683 10.454 10.35197 9.78 10.4 10.8757
## log(AverageWinnings) 11.12 11.369 9.957 10.35873 9.14 10.0 10.9719
## CV residual 1.10 0.686 -0.497 0.00676 -0.64 -0.4 0.0962
## 150 153 154 165 167 168 176
## Predicted 10.405 9.661 10.24 10.899 9.3132 11.439 11.840
## cvpred 10.445 9.729 10.30 10.864 9.3973 11.392 11.778
## log(AverageWinnings) 10.060 9.032 9.29 11.391 9.3360 11.715 11.976
## CV residual -0.385 -0.697 -1.02 0.528 -0.0613 0.323 0.198
## 181 184 187 194
## Predicted 10.796 10.031 9.2926 12.109
## cvpred 10.840 10.049 9.3569 12.052
## log(AverageWinnings) 10.660 10.154 9.3333 12.837
## CV residual -0.179 0.105 -0.0236 0.785
##
## Sum of squares = 11.2 Mean square = 0.29 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.389
# Obtain the PRESS statistic. (Classically, PRESS is the leave-one-out, K = n, cross-validation sum of squared prediction errors; here it is computed from the 5-fold CV predictions.)
(press <- sum((log(pga$AverageWinnings) - KCV$cvpred)^2))
## [1] 76.2
# Obtain the MPSE
n = length(pga$Name)
(mpse <- sum((log(pga$AverageWinnings) - KCV$cvpred)^2)/n)
## [1] 0.389
# Obtain the prediction R-squared.
(predrsq <- 1 - sum((log(pga$AverageWinnings) - KCV$cvpred)^2) / sum((log(pga$AverageWinnings) - mean(log(pga$AverageWinnings)))^2))
## [1] 0.599
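The same three quantities are needed again for the next two models, so as an aside, a small helper function (hypothetical, not part of the graded code) could wrap the arithmetic given a cv.lm result and the observed response:
# Compute PRESS, MPSE, and prediction R-squared from a cv.lm result and the observed response
cv_metrics <- function(kcv, y) {
  press   <- sum((y - kcv$cvpred)^2)              # sum of squared CV prediction errors
  mpse    <- press / length(y)                    # mean prediction squared error
  predrsq <- 1 - press / sum((y - mean(y))^2)     # prediction R-squared
  c(PRESS = press, MPSE = mpse, predRsq = predrsq)
}
cv_metrics(KCV, log(pga$AverageWinnings))         # should reproduce the values above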
Fit another multiple linear regression to the data. Use the log of Average winnings as the response variable and use Driving Accuracy (percent), Greens on Regulation (%), Average # of Putts, Save Percent, and # Events as covariates. Perform a 5-fold cross validation and obtain the PRESS statistic, MPSE, and prediction R-squared.
# Fit the model with the log of the response variable
pga_fit_log2 <- lm(log(AverageWinnings) ~ DrivingAccuracy + GreensonRegulation + AverageNumofPutts + SavePercent + NumEvents, data = pga)
summary(pga_fit_log2)
##
## Call:
## lm(formula = log(AverageWinnings) ~ DrivingAccuracy + GreensonRegulation +
## AverageNumofPutts + SavePercent + NumEvents, data = pga)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7202 -0.3915 -0.0202 0.4216 1.7450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.28145 3.99570 8.08 7.4e-14 ***
## DrivingAccuracy -0.01598 0.00869 -1.84 0.0674 .
## GreensonRegulation 0.18357 0.01634 11.23 < 2e-16 ***
## AverageNumofPutts -18.67042 2.00742 -9.30 < 2e-16 ***
## SavePercent 0.02716 0.00857 3.17 0.0018 **
## NumEvents -0.03885 0.00926 -4.20 4.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.606 on 190 degrees of freedom
## Multiple R-squared: 0.633, Adjusted R-squared: 0.624
## F-statistic: 65.6 on 5 and 190 DF, p-value: <2e-16
# Perform a 5-fold cross validation.
KCV2 <- cv.lm(data = pga, pga_fit_log2, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## DrivingAccuracy 1 0.3 0.3 0.83 0.36402
## GreensonRegulation 1 59.7 59.7 162.57 < 2e-16 ***
## AverageNumofPutts 1 49.4 49.4 134.60 < 2e-16 ***
## SavePercent 1 4.6 4.6 12.52 0.00051 ***
## NumEvents 1 6.5 6.5 17.60 4.2e-05 ***
## Residuals 190 69.8 0.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39
## Predicted 10.058 9.832 10.214 9.287 10.947 10.300 9.450 9.74
## cvpred 10.078 9.877 10.240 9.348 11.066 10.215 9.403 9.68
## log(AverageWinnings) 10.607 9.693 9.921 8.704 10.560 10.554 9.710 11.31
## CV residual 0.529 -0.184 -0.319 -0.644 -0.506 0.339 0.308 1.64
## 43 53 72 73 75 76 78 81
## Predicted 9.33 11.601 8.720 10.522 10.3 10.108 9.919 9.922
## cvpred 9.40 11.609 8.737 10.507 10.3 10.163 9.959 9.938
## log(AverageWinnings) 8.38 11.495 8.603 10.212 10.4 10.291 10.207 9.176
## CV residual -1.01 -0.114 -0.134 -0.295 0.1 0.128 0.248 -0.763
## 92 111 114 116 119 120 125 127
## Predicted 11.090 8.66 11.25 10.909 10.24 11.072 10.96 10.610
## cvpred 11.206 8.67 11.27 10.947 10.30 11.096 11.00 10.569
## log(AverageWinnings) 10.509 6.94 11.09 10.419 9.83 11.270 9.67 10.395
## CV residual -0.697 -1.73 -0.18 -0.528 -0.47 0.173 -1.33 -0.174
## 128 136 141 145 152 159 160
## Predicted 9.904 10.748 10.857 9.558 11.13354 11.1271 10.467
## cvpred 9.950 10.791 10.851 9.677 11.11514 11.1528 10.486
## log(AverageWinnings) 9.809 9.899 10.172 9.060 11.11000 11.1969 10.285
## CV residual -0.141 -0.892 -0.679 -0.617 -0.00514 0.0441 -0.201
## 162 164 169 183 185 189 195
## Predicted 11.4282 10.3292 10.011 9.838 9.89 10.558 10.77818
## cvpred 11.4451 10.3888 9.996 9.853 9.94 10.554 10.84727
## log(AverageWinnings) 11.4985 10.4683 10.792 10.527 11.64 10.770 10.85101
## CV residual 0.0534 0.0795 0.796 0.673 1.70 0.216 0.00374
## 196
## Predicted 10.97
## cvpred 11.02
## log(AverageWinnings) 11.30
## CV residual 0.28
##
## Sum of squares = 17.4 Mean square = 0.45 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 9.487 9.235 10.263 8.710 10.441 10.303 9.413 10.659
## cvpred 9.407 9.141 10.276 8.577 10.329 10.291 9.264 10.663
## log(AverageWinnings) 10.128 9.663 10.500 8.804 11.190 10.937 9.661 9.845
## CV residual 0.721 0.523 0.225 0.227 0.862 0.646 0.397 -0.818
## 50 51 52 54 55 57 58 63
## Predicted 7.293 9.147 10.145 10.71 10.03 10.604 8.52 10.330
## cvpred 7.120 9.147 10.111 10.70 10.03 10.585 8.41 10.330
## log(AverageWinnings) 6.745 8.714 9.822 11.76 9.74 10.381 9.21 10.750
## CV residual -0.374 -0.433 -0.289 1.06 -0.29 -0.204 0.80 0.421
## 80 84 87 88 95 97 99 103
## Predicted 10.376 9.96 9.440 10.733 10.53 10.918 9.841 10.865
## cvpred 10.352 9.89 9.376 10.713 10.55 10.857 9.860 10.968
## log(AverageWinnings) 10.136 8.90 9.488 11.244 9.31 10.685 10.144 10.570
## CV residual -0.216 -0.99 0.112 0.531 -1.23 -0.172 0.284 -0.397
## 106 109 115 126 131 134 135 137
## Predicted 11.316 11.067 9.09175 9.085 11.157 9.12 10.565 9.210
## cvpred 11.289 11.013 8.97550 8.950 11.183 9.03 10.569 9.190
## log(AverageWinnings) 10.937 11.349 8.97348 9.779 11.740 9.40 10.141 9.421
## CV residual -0.352 0.336 -0.00202 0.829 0.557 0.37 -0.427 0.232
## 142 143 149 155 173 188 190 192
## Predicted 9.27 11.870 9.671 10.11 10.154 11.535 9.45 9.52
## cvpred 9.16 11.861 9.651 10.02 10.091 11.517 9.34 9.45
## log(AverageWinnings) 10.29 12.480 10.414 11.47 10.658 11.166 10.31 8.32
## CV residual 1.13 0.619 0.763 1.45 0.568 -0.351 0.97 -1.12
##
## Sum of squares = 17.1 Mean square = 0.43 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30
## Predicted 10.665 9.860157 10.040 10.394 10.440 11.138 9.91
## cvpred 10.620 9.872372 10.025 10.522 10.478 11.235 9.86
## log(AverageWinnings) 10.855 9.872048 10.523 9.645 10.203 10.797 10.95
## CV residual 0.235 -0.000323 0.499 -0.878 -0.275 -0.438 1.09
## 47 62 66 67 68 69 71
## Predicted 11.088 8.73 10.8016 9.21643 11.488 9.931 10.91
## cvpred 11.176 8.69 10.8831 9.16978 11.575 9.953 11.08
## log(AverageWinnings) 11.741 8.07 10.8678 9.16660 10.770 9.665 9.81
## CV residual 0.565 -0.62 -0.0153 -0.00318 -0.805 -0.288 -1.26
## 74 77 79 89 90 93 101
## Predicted 11.020 8.348 10.061 10.808 11.031 10.061 10.172
## cvpred 11.077 8.253 9.993 10.746 11.005 10.101 10.262
## log(AverageWinnings) 10.966 8.700 10.250 10.491 11.363 10.717 10.645
## CV residual -0.111 0.448 0.258 -0.255 0.358 0.616 0.383
## 104 105 108 113 117 118 123 124
## Predicted 10.994 10.806 9.510 9.596 9.263 11.70 10.71 10.473
## cvpred 11.142 10.821 9.536 9.582 9.318 11.77 10.77 10.540
## log(AverageWinnings) 10.279 11.023 9.215 10.446 9.053 10.72 11.45 10.373
## CV residual -0.863 0.201 -0.321 0.864 -0.264 -1.05 0.68 -0.167
## 132 140 157 158 163 166 178
## Predicted 10.219 10.233 10.397 9.412 11.530 9.896 11.116
## cvpred 10.248 10.219 10.403 9.442 11.542 9.890 11.238
## log(AverageWinnings) 9.964 10.530 10.784 9.816 12.100 10.636 10.452
## CV residual -0.284 0.311 0.381 0.373 0.559 0.746 -0.786
## 179 180 191
## Predicted 10.412 12.193 9.83
## cvpred 10.335 12.352 9.90
## log(AverageWinnings) 10.753 12.551 8.71
## CV residual 0.418 0.199 -1.19
##
## Sum of squares = 13.4 Mean square = 0.34 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32
## Predicted 10.74 10.770 10.8 10.154 10.280 10.572 10.264 11.118
## cvpred 10.77 10.789 10.8 10.141 10.199 10.564 10.302 11.103
## log(AverageWinnings) 10.98 10.641 11.0 10.612 9.912 10.165 11.301 11.609
## CV residual 0.21 -0.147 0.2 0.472 -0.288 -0.399 0.999 0.506
## 35 37 45 49 61 65 86 94
## Predicted 8.699 10.727 10.429 9.897 11.63 10.400 10.268 9.58
## cvpred 8.681 10.776 10.458 9.957 11.62 10.401 10.268 9.59
## log(AverageWinnings) 8.964 10.079 9.561 9.648 12.80 11.192 9.904 10.81
## CV residual 0.283 -0.697 -0.897 -0.309 1.17 0.791 -0.364 1.22
## 96 98 100 112 121 122 130
## Predicted 11.845 10.911 11.023 9.789 8.637 10.552 9.591
## cvpred 11.835 10.933 11.076 9.764 8.608 10.554 9.594
## log(AverageWinnings) 11.583 10.123 10.124 10.246 9.050 10.306 9.188
## CV residual -0.252 -0.811 -0.952 0.482 0.442 -0.248 -0.407
## 133 144 146 147 151 156 161
## Predicted 10.452 12.213 9.565 10.328 10.066 9.743 9.618
## cvpred 10.447 12.245 9.625 10.289 10.122 9.756 9.673
## log(AverageWinnings) 10.879 12.400 9.016 9.576 9.515 8.910 9.270
## CV residual 0.432 0.155 -0.609 -0.714 -0.607 -0.847 -0.403
## 170 171 172 174 175 177 182 186
## Predicted 10.129 10.35 10.519 10.03 10.02 10.723 10.229 10.077
## cvpred 10.145 10.41 10.494 10.07 9.95 10.723 10.248 10.087
## log(AverageWinnings) 9.886 9.41 11.282 9.03 9.70 11.678 11.104 10.461
## CV residual -0.259 -1.00 0.788 -1.05 -0.25 0.955 0.856 0.374
## 193
## Predicted 11.01
## cvpred 10.99
## log(AverageWinnings) 10.68
## CV residual -0.31
##
## Sum of squares = 16.2 Mean square = 0.42 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 9.460 11.59 9.90 10.963 10.583 10.864 9.009 10.061
## cvpred 9.447 11.45 9.90 10.937 10.568 10.850 9.052 10.170
## log(AverageWinnings) 10.062 12.36 10.91 10.556 10.725 10.553 8.338 9.952
## CV residual 0.615 0.91 1.01 -0.381 0.158 -0.296 -0.714 -0.218
## 41 42 44 48 56 59 60
## Predicted 9.844 9.629 10.575 9.845 10.144 10.095 10.91650
## cvpred 9.875 9.634 10.590 9.918 10.128 10.094 10.94523
## log(AverageWinnings) 9.489 9.330 10.049 9.189 9.665 10.523 10.95478
## CV residual -0.386 -0.305 -0.541 -0.729 -0.463 0.429 0.00956
## 64 70 82 83 85 91 102
## Predicted 10.918 9.167 10.310 10.188 11.325 11.0926 9.97
## cvpred 10.839 9.228 10.363 10.201 11.269 11.0463 10.00
## log(AverageWinnings) 11.377 9.018 9.882 9.337 11.408 11.0758 11.12
## CV residual 0.538 -0.209 -0.481 -0.864 0.139 0.0295 1.12
## 107 110 129 138 139 148 150
## Predicted 10.666 10.400 10.488 9.651 10.306 10.831 10.417
## cvpred 10.652 10.468 10.480 9.747 10.352 10.848 10.457
## log(AverageWinnings) 11.369 9.957 10.359 9.144 10.026 10.972 10.060
## CV residual 0.717 -0.512 -0.122 -0.603 -0.325 0.124 -0.397
## 153 154 165 167 168 176 181
## Predicted 9.656 10.172 10.855 9.2913 11.464 11.799 10.7127
## cvpred 9.680 10.195 10.838 9.3925 11.436 11.739 10.7147
## log(AverageWinnings) 9.032 9.286 11.391 9.3360 11.715 11.976 10.6601
## CV residual -0.649 -0.909 0.553 -0.0565 0.278 0.237 -0.0545
## 184 187 194
## Predicted 9.984 9.3274 12.169
## cvpred 10.041 9.4121 12.145
## log(AverageWinnings) 10.154 9.3333 12.837
## CV residual 0.113 -0.0789 0.692
##
## Sum of squares = 10.7 Mean square = 0.27 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.382
# Obtain the PRESS statistic
(press2 <- sum((log(pga$AverageWinnings) - KCV2$cvpred)^2))
## [1] 74.9
# Obtain the MPSE
n = length(pga$Name)
(mpse2 <- sum((log(pga$AverageWinnings) - KCV2$cvpred)^2)/n)
## [1] 0.382
# Obtain the prediction R-squared.
(predrsq2 <- 1 - sum((log(pga$AverageWinnings) - KCV2$cvpred)^2) / sum((log(pga$AverageWinnings) - mean(log(pga$AverageWinnings)))^2))
## [1] 0.606
Fit a third multiple linear regression to the data. Use the log of Average winnings as the response variable and use Average # of Putts, Save Percent, and # Events as covariates. Perform a 5-fold cross validation and obtain the PRESS statistic, MPSE, and prediction R-squared.
# Fit the model with the log of the response variable
pga_fit_log3 <- lm(log(AverageWinnings) ~ AverageNumofPutts + SavePercent + NumEvents, data = pga)
summary(pga_fit_log3)
##
## Call:
## lm(formula = log(AverageWinnings) ~ AverageNumofPutts + SavePercent +
## NumEvents, data = pga)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5447 -0.5020 0.0232 0.5641 2.1771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.8740 4.7031 10.82 < 2e-16 ***
## AverageNumofPutts -22.4970 2.5334 -8.88 4.7e-16 ***
## SavePercent 0.0116 0.0108 1.07 0.28438
## NumEvents -0.0441 0.0119 -3.70 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.782 on 192 degrees of freedom
## Multiple R-squared: 0.383, Adjusted R-squared: 0.373
## F-statistic: 39.7 on 3 and 192 DF, p-value: <2e-16
# Perform a 5-fold cross validation.
KCV3 <- cv.lm(data = pga, pga_fit_log3, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## AverageNumofPutts 1 63.4 63.4 103.68 < 2e-16 ***
## SavePercent 1 1.1 1.1 1.83 0.17735
## NumEvents 1 8.4 8.4 13.67 0.00028 ***
## Residuals 192 117.4 0.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38
## Predicted 10.345 10.328 10.208 9.50 9.996 10.888 9.877
## cvpred 10.365 10.371 10.194 9.56 10.054 10.821 9.845
## log(AverageWinnings) 10.607 9.693 9.921 8.70 10.560 10.554 9.710
## CV residual 0.242 -0.678 -0.272 -0.86 0.506 -0.267 -0.135
## 39 43 53 72 73 75 76
## Predicted 10.385 9.99 10.914 9.59 10.534 9.914 10.123
## cvpred 10.347 10.05 10.860 9.65 10.447 9.872 10.187
## log(AverageWinnings) 11.312 8.38 11.495 8.60 10.212 10.446 10.291
## CV residual 0.965 -1.67 0.635 -1.05 -0.235 0.575 0.104
## 78 81 92 111 114 116 119 120
## Predicted 10.1520 10.27 9.763 9.48 10.975 10.304 9.687 10.26
## cvpred 10.1779 10.29 9.830 9.51 10.972 10.288 9.696 10.25
## log(AverageWinnings) 10.2074 9.18 10.509 6.94 11.086 10.419 9.830 11.27
## CV residual 0.0295 -1.12 0.679 -2.58 0.114 0.132 0.134 1.02
## 125 127 128 136 141 145 152
## Predicted 10.371 11.045 9.7995 10.175 11.20 9.711 10.75
## cvpred 10.389 11.008 9.8238 10.185 11.18 9.831 10.67
## log(AverageWinnings) 9.668 10.395 9.8085 9.899 10.17 9.060 11.11
## CV residual -0.721 -0.613 -0.0153 -0.286 -1.01 -0.771 0.44
## 159 160 162 164 169 183 185
## Predicted 10.527 10.397 11.152 9.896 9.852 10.081 10.29
## cvpred 10.530 10.393 11.166 9.915 9.810 10.077 10.33
## log(AverageWinnings) 11.197 10.285 11.498 10.468 10.792 10.527 11.64
## CV residual 0.667 -0.108 0.333 0.553 0.982 0.449 1.31
## 189 195 196
## Predicted 10.152 10.209 10.686
## cvpred 10.099 10.226 10.712
## log(AverageWinnings) 10.770 10.851 11.297
## CV residual 0.672 0.625 0.585
##
## Sum of squares = 24.4 Mean square = 0.63 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 9.786 8.89 10.908 8.938 9.78 10.731 9.439 10.829
## cvpred 9.763 8.90 10.904 8.949 9.77 10.723 9.443 10.828
## log(AverageWinnings) 10.128 9.66 10.500 8.804 11.19 10.937 9.661 9.845
## CV residual 0.365 0.76 -0.404 -0.144 1.42 0.213 0.218 -0.983
## 50 51 52 54 55 57 58 63
## Predicted 8.77 10.08 10.287 10.983 10.546 10.750 8.903 10.586
## cvpred 8.78 10.07 10.281 10.974 10.546 10.765 8.888 10.554
## log(AverageWinnings) 6.75 8.71 9.822 11.761 9.742 10.381 9.212 10.750
## CV residual -2.03 -1.36 -0.459 0.787 -0.805 -0.384 0.324 0.196
## 80 84 87 88 95 97 99 103
## Predicted 10.2372 9.9 9.195 10.321 10.37 10.058 10.447 11.251
## cvpred 10.2313 9.9 9.193 10.311 10.36 10.067 10.431 11.263
## log(AverageWinnings) 10.1358 8.9 9.488 11.244 9.31 10.685 10.144 10.570
## CV residual -0.0956 -1.0 0.295 0.932 -1.05 0.618 -0.286 -0.692
## 106 109 115 126 131 134 135 137
## Predicted 10.839 10.44 9.541 9.232 11.182 9.748 10.532 9.513
## cvpred 10.825 10.42 9.521 9.222 11.175 9.722 10.508 9.532
## log(AverageWinnings) 10.937 11.35 8.973 9.779 11.740 9.398 10.141 9.421
## CV residual 0.112 0.93 -0.548 0.557 0.565 -0.324 -0.367 -0.111
## 142 143 149 155 173 188 190 192
## Predicted 9.486 10.99 9.900 9.98 10.352 10.549 9.495 10.03
## cvpred 9.479 10.99 9.896 9.98 10.330 10.523 9.481 10.01
## log(AverageWinnings) 10.287 12.48 10.414 11.47 10.658 11.166 10.314 8.32
## CV residual 0.808 1.49 0.518 1.50 0.328 0.644 0.834 -1.69
##
## Sum of squares = 27.2 Mean square = 0.68 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30
## Predicted 10.28 10.358 10.049 10.450 10.0757 11.268 9.58
## cvpred 10.29 10.426 10.051 10.558 10.1266 11.415 9.57
## log(AverageWinnings) 10.86 9.872 10.523 9.645 10.2029 10.797 10.95
## CV residual 0.56 -0.554 0.472 -0.913 0.0763 -0.618 1.38
## 47 62 66 67 68 69 71 74
## Predicted 11.449 9.51 10.96 9.823 11.023 10.62 11.75 11.049
## cvpred 11.539 9.52 11.02 9.798 11.183 10.70 11.94 11.161
## log(AverageWinnings) 11.741 8.07 10.87 9.167 10.770 9.66 9.81 10.966
## CV residual 0.202 -1.45 -0.15 -0.632 -0.413 -1.04 -2.13 -0.195
## 77 79 89 90 93 101 104 105
## Predicted 9.182 9.81 10.196 10.548 10.536 10.741 11.5 10.603
## cvpred 9.097 9.78 10.214 10.634 10.605 10.854 11.6 10.687
## log(AverageWinnings) 8.700 10.25 10.491 11.363 10.717 10.645 10.3 11.023
## CV residual -0.397 0.47 0.277 0.729 0.112 -0.209 -1.3 0.335
## 108 113 117 118 123 124 132
## Predicted 9.951 9.836 10.16 11.494 11.124 10.975 9.9615
## cvpred 9.964 9.858 10.23 11.674 11.267 11.064 10.0073
## log(AverageWinnings) 9.215 10.446 9.05 10.723 11.448 10.373 9.9641
## CV residual -0.749 0.588 -1.18 -0.951 0.181 -0.691 -0.0432
## 140 157 158 163 166 178 179 180
## Predicted 10.35 10.241 9.536 10.37 10.41 11.263 10.076 11.874
## cvpred 10.38 10.306 9.514 10.38 10.46 11.359 10.109 12.048
## log(AverageWinnings) 10.53 10.784 9.816 12.10 10.64 10.452 10.753 12.551
## CV residual 0.15 0.478 0.302 1.72 0.18 -0.907 0.644 0.503
## 191
## Predicted 10.32
## cvpred 10.33
## log(AverageWinnings) 8.71
## CV residual -1.62
##
## Sum of squares = 26.4 Mean square = 0.68 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29
## Predicted 10.408 10.353 10.791 10.149 10.99 10.2594 9.70
## cvpred 10.433 10.359 10.851 10.127 10.95 10.2374 9.71
## log(AverageWinnings) 10.984 10.641 11.049 10.612 9.91 10.1648 11.30
## CV residual 0.551 0.282 0.198 0.486 -1.04 -0.0726 1.60
## 32 35 37 45 49 61 65
## Predicted 10.858 9.411 10.185 9.855 9.887 11.58 10.434
## cvpred 10.847 9.388 10.208 9.852 9.925 11.62 10.428
## log(AverageWinnings) 11.609 8.964 10.079 9.561 9.648 12.80 11.192
## CV residual 0.762 -0.423 -0.129 -0.291 -0.278 1.18 0.764
## 86 94 96 98 100 112 121 122
## Predicted 10.119 9.17 11.4870 10.7 9.936 10.109 9.609 10.717
## cvpred 10.100 9.15 11.5006 10.7 9.941 10.083 9.589 10.732
## log(AverageWinnings) 9.904 10.81 11.5829 10.1 10.124 10.246 9.050 10.306
## CV residual -0.196 1.66 0.0823 -0.6 0.183 0.163 -0.539 -0.426
## 130 133 144 146 147 151 156 161
## Predicted 10.22 11.616 11.59 9.486 10.504 10.065 10.01 10.182
## cvpred 10.25 11.679 11.64 9.514 10.471 10.113 10.02 10.245
## log(AverageWinnings) 9.19 10.879 12.40 9.016 9.576 9.515 8.91 9.270
## CV residual -1.07 -0.799 0.76 -0.498 -0.895 -0.598 -1.11 -0.975
## 170 171 172 174 175 177 182 186
## Predicted 9.606 10.158 10.415 9.953 10.99 10.685 10.08 10.4280
## cvpred 9.584 10.205 10.386 9.979 10.98 10.693 10.09 10.4438
## log(AverageWinnings) 9.886 9.406 11.282 9.026 9.70 11.678 11.10 10.4610
## CV residual 0.302 -0.799 0.896 -0.953 -1.28 0.986 1.01 0.0172
## 193
## Predicted 10.554
## cvpred 10.533
## log(AverageWinnings) 10.682
## CV residual 0.149
##
## Sum of squares = 23 Mean square = 0.59 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 10.52 11.33 10.024 11.165 10.876 10.6505 9.81 9.063
## cvpred 10.49 11.22 9.995 11.109 10.831 10.6083 9.82 9.135
## log(AverageWinnings) 10.06 12.36 10.910 10.556 10.725 10.5533 8.34 9.952
## CV residual -0.43 1.14 0.915 -0.553 -0.106 -0.0551 -1.48 0.817
## 41 42 44 48 56 59 60
## Predicted 9.5411 9.622 10.702 9.649 10.462 10.256 10.388
## cvpred 9.5409 9.597 10.686 9.671 10.427 10.227 10.377
## log(AverageWinnings) 9.4895 9.330 10.049 9.189 9.665 10.523 10.955
## CV residual -0.0514 -0.267 -0.637 -0.482 -0.762 0.296 0.578
## 64 70 82 83 85 91 102 107
## Predicted 10.650 9.430 10.08 9.981 10.963 10.848 9.70 10.529
## cvpred 10.579 9.453 10.09 9.966 10.892 10.793 9.70 10.496
## log(AverageWinnings) 11.377 9.018 9.88 9.337 11.408 11.076 11.12 11.369
## CV residual 0.798 -0.435 -0.21 -0.629 0.516 0.283 1.42 0.873
## 110 129 138 139 148 150 153
## Predicted 10.049 10.546 9.425 10.0130 9.82 10.317 9.808
## cvpred 10.061 10.522 9.457 10.0158 9.81 10.313 9.805
## log(AverageWinnings) 9.957 10.359 9.144 10.0264 10.97 10.060 9.032
## CV residual -0.104 -0.163 -0.313 0.0106 1.16 -0.252 -0.773
## 154 165 167 168 176 181 184
## Predicted 9.610 11.023 9.2322 10.836 11.530 10.6632 10.1651
## cvpred 9.607 10.976 9.2728 10.787 11.462 10.6186 10.1766
## log(AverageWinnings) 9.286 11.391 9.3360 11.715 11.976 10.6601 10.1541
## CV residual -0.321 0.415 0.0632 0.927 0.514 0.0416 -0.0225
## 187 194
## Predicted 9.56 10.7
## cvpred 9.60 10.6
## log(AverageWinnings) 9.33 12.8
## CV residual -0.27 2.2
##
## Sum of squares = 20.1 Mean square = 0.52 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.618
# Obtain the PRESS statistic
(press3 <- sum((log(pga$AverageWinnings) - KCV3$cvpred)^2))
## [1] 121
# Obtain the MPSE
n = length(pga$Name)
(mpse3 <- sum((log(pga$AverageWinnings) - KCV3$cvpred)^2)/n)
## [1] 0.618
# Obtain the prediction R-squared.
(predrsq3 <- 1 - sum((log(pga$AverageWinnings) - KCV3$cvpred)^2) / sum((log(pga$AverageWinnings) - mean(log(pga$AverageWinnings)))^2))
## [1] 0.364
Compare the prediction R-squared obtained from the previous three questions. Based on the comparison, which model is preferred in terms of model validation? Compare each prediction R-squared with its own traditional R-squared. Which one is higher and why?
# Model from Q1
as.data.frame(cbind(predrsq, rsq = summary(pga_fit_log)$r.squared))
## predrsq rsq
## 1 0.599 0.636
# Model from Q2
as.data.frame(cbind(predrsq2, rsq2 = summary(pga_fit_log2)$r.squared))
## predrsq2 rsq2
## 1 0.606 0.633
# Model from Q3
as.data.frame(cbind(predrsq3, rsq3 = summary(pga_fit_log3)$r.squared))
## predrsq3 rsq3
## 1 0.364 0.383
Based on the comparison, the model pga_fit_log2 (from question 2) is preferred in terms of model validation, because its prediction R-squared (0.606) is the highest of the three models.
Each prediction R-squared is lower than its corresponding traditional R-squared. The prediction R-squared is computed from cross-validated predictions, so each observation is predicted by a model fit without that observation; it “validates” how well the model generalizes to new data rather than how well it fits the data used to estimate it. It is therefore a more honest, and typically lower, measure than the traditional R-squared, which reuses the same data for fitting and evaluation.
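As a check on this reasoning, the two quantities differ only in which predictions enter the error sum of squares (a sketch using the objects already fit above for model 2):
# Traditional R-squared uses in-sample fitted values; prediction R-squared uses the
# cross-validated predictions, so each point is predicted by a model that never saw it.
sst <- sum((log(pga$AverageWinnings) - mean(log(pga$AverageWinnings)))^2)
1 - sum(residuals(pga_fit_log2)^2) / sst                    # matches summary(pga_fit_log2)$r.squared
1 - sum((log(pga$AverageWinnings) - KCV2$cvpred)^2) / sst   # matches predrsq2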
The Iris flower data set iris.csv is a multivariate data set introduced by the British statistician and biologist Ronald Fisher. It consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Please read the data set into R and visualize the data.
iris <- read_csv("iris.csv")
pairs.panels(iris)
Build a linear regression using Sepal.Length as response variable and Sepal.Width as covariate. Interpret the coefficient.
iris_fit <- lm(Sepal.Length ~ Sepal.Width, data = iris)
summary(iris_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.556 -0.633 -0.112 0.558 2.223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.526 0.479 13.63 <2e-16 ***
## Sepal.Width -0.223 0.155 -1.44 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.825 on 148 degrees of freedom
## Multiple R-squared: 0.0138, Adjusted R-squared: 0.00716
## F-statistic: 2.07 on 1 and 148 DF, p-value: 0.152
B1 = -0.223 | For every one-centimeter increase in sepal width, sepal length is estimated to decrease by 0.223 centimeters; note, however, that this coefficient is not statistically significant (p = 0.152).
Repeat the same analysis for only the species of setosa: Build a linear regression using Sepal.Length as response variable and Sepal.Width as covariate, and interpret the coefficient.
iris_setosa <- iris %>% filter(Species == "setosa")
iris_setosa_fit <- lm(Sepal.Length ~ Sepal.Width, data = iris_setosa)
summary(iris_setosa_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_setosa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5248 -0.1629 0.0217 0.1383 0.4443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6390 0.3100 8.51 3.7e-11 ***
## Sepal.Width 0.6905 0.0899 7.68 6.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.239 on 48 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.542
## F-statistic: 59 on 1 and 48 DF, p-value: 6.71e-10
B1 = 0.6905 | Within setosa, for every one-centimeter increase in sepal width, sepal length increases by an estimated 0.6905 centimeters.
Repeat the same analysis for only the species of versicolor.
iris_versicolor <- iris %>% filter(Species == "versicolor")
iris_versicolor_fit <- lm(Sepal.Length ~ Sepal.Width, data = iris_versicolor)
summary(iris_versicolor_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_versicolor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7350 -0.2856 -0.0754 0.4367 0.8380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.540 0.563 6.29 9.1e-08 ***
## Sepal.Width 0.865 0.202 4.28 8.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 48 degrees of freedom
## Multiple R-squared: 0.277, Adjusted R-squared: 0.262
## F-statistic: 18.4 on 1 and 48 DF, p-value: 8.77e-05
B1 = 0.865 | Within versicolor, for every one-centimeter increase in sepal width, sepal length increases by an estimated 0.865 centimeters.
Repeat the same analysis for only the species of virginica.
iris_virginica <- iris %>% filter(Species == "virginica")
iris_virginica_fit <- lm(Sepal.Length ~ Sepal.Width, data = iris_virginica)
summary(iris_virginica_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_virginica)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2607 -0.3692 -0.0361 0.1984 1.4492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.907 0.757 5.16 4.7e-06 ***
## Sepal.Width 0.902 0.253 3.56 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.571 on 48 degrees of freedom
## Multiple R-squared: 0.209, Adjusted R-squared: 0.193
## F-statistic: 12.7 on 1 and 48 DF, p-value: 0.000843
B1 = 0.902 | Within virginica, for every one-centimeter increase in sepal width, sepal length increases by an estimated 0.902 centimeters.
Compare the results from questions 3,4,5 with the results in question 2. Are these results consistent? Why?
Answer: The results are not consistent - this is a classic example of Simpson’s Paradox. In this data set there is a confounder: Species. Using all the data together, there appears to be a negative relationship between Sepal.Length and Sepal.Width. However, when the data are stratified by Species, there are clear, strong positive relationships between Sepal.Length and Sepal.Width within each species. The way the species clusters are positioned relative to one another makes the pooled relationship appear negative. See below for a visual representation of how this can happen.
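One way to draw that picture (a sketch using the ggplot2 package loaded above; the grey line is the pooled fit and the colored lines are the within-species fits):
# Pooled fit vs. within-species fits for Sepal.Length ~ Sepal.Width
ggplot(iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point(aes(color = Species)) +
  geom_smooth(method = "lm", se = FALSE, color = "grey40") +     # pooled: negative slope
  geom_smooth(aes(color = Species), method = "lm", se = FALSE)   # per species: positive slopes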
Build a simple linear regression using Sepal.Length as response variable and Petal.Length as covariate. If we run the regression for all three species combined what is the estimated slope?
spec_all <- lm(Sepal.Length ~ Petal.Length, data = iris)
summary(spec_all)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2468 -0.2966 -0.0152 0.2768 1.0027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3066 0.0784 54.9 <2e-16 ***
## Petal.Length 0.4089 0.0189 21.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.407 on 148 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.758
## F-statistic: 469 on 1 and 148 DF, p-value: <2e-16
B1 = 0.4089 | If we run the regression for all three species combined, the estimated slope is 0.4089.
If we run the regression for all three species separately what are the estimated slopes respectively?
# Let's use the subsets we created for each Species in questions 3-5 and run a linear regression for each.
set_fit <- lm(Sepal.Length ~ Petal.Length, data = iris_setosa)
summary(set_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_setosa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5724 -0.2067 -0.0308 0.1734 0.9361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.213 0.416 10.14 1.6e-13 ***
## Petal.Length 0.542 0.282 1.92 0.061 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.0714, Adjusted R-squared: 0.052
## F-statistic: 3.69 on 1 and 48 DF, p-value: 0.0607
ver_fit <- lm(Sepal.Length ~ Petal.Length, data = iris_versicolor)
summary(ver_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_versicolor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7348 -0.2027 -0.0206 0.2609 0.6996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.408 0.446 5.39 2.1e-06 ***
## Petal.Length 0.828 0.104 7.95 2.6e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.569, Adjusted R-squared: 0.56
## F-statistic: 63.3 on 1 and 48 DF, p-value: 2.59e-10
vir_fit <- lm(Sepal.Length ~ Petal.Length, data = iris_virginica)
summary(vir_fit)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_virginica)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7341 -0.2364 -0.0313 0.2377 0.7621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0597 0.4668 2.27 0.028 *
## Petal.Length 0.9957 0.0837 11.90 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.323 on 48 degrees of freedom
## Multiple R-squared: 0.747, Adjusted R-squared: 0.742
## F-statistic: 142 on 1 and 48 DF, p-value: 6.3e-16
If we run the regression for all three species separately, the estimated slopes are 0.542 for setosa, 0.828 for versicolor, and 0.9957 for virginica.
Are these results consistent? Why?
These results are consistent. Each stratified sample has a positive slope, and so does the pooled sample. Because the within-species slopes are positive and the species clusters are positioned so that the pooled fit is also positive, there is no reversal here. See below for a visual representation.
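The same kind of plot (again a sketch with ggplot2) shows why no reversal occurs for Petal.Length:
# Pooled fit vs. within-species fits for Sepal.Length ~ Petal.Length
ggplot(iris, aes(x = Petal.Length, y = Sepal.Length)) +
  geom_point(aes(color = Species)) +
  geom_smooth(method = "lm", se = FALSE, color = "grey40") +     # pooled: positive slope
  geom_smooth(aes(color = Species), method = "lm", se = FALSE)   # per species: positive slopes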