# Install DAAG 1.22 from its archived CRAN source tarball when it is not
# already available. requireNamespace() is used for the check because
# installed.packages() scans every installed package and its help page
# advises against using it to test for a single named package.
if (!requireNamespace("DAAG", quietly = TRUE)) {
  packageurl <- "https://cran.r-project.org/src/contrib/Archive/DAAG/DAAG_1.22.tar.gz"
  # latticeExtra is installed first from the default repository; a tarball
  # installed with repos = NULL does not pull in dependencies by itself
  # (presumably latticeExtra is one that DAAG needs -- TODO confirm).
  install.packages("latticeExtra")
  install.packages(packageurl, repos = NULL, type = "source")
}
library(DAAG)
## Loading required package: lattice
# Task: for the PGA.csv data set, fit a multiple linear regression to the
# data. Use the log of Average Winnings as the response variable and use Age,
# Average Drive (Yards), Driving Accuracy (percent), Greens on Regulation (%),
# Average # of Putts, Save Percent, and # Events as covariates. Perform a
# 5-fold cross-validation and obtain the PRESS statistic, MSPE, and
# prediction R-squared.

# Read the data; the first row of the file holds the column names.
PGA <- read.csv("C:/Users/bhuvi iyer/Downloads/PGA.csv", header = TRUE)
# Show the first rows to check the columns were read as expected.
head(PGA)
## Name Age AverageDrive DrivingAccuracy GreensonRegulation
## 1 Aaron Baddeley 23 288.0 53.1 58.2
## 2 Adam Scott 24 295.4 57.7 65.6
## 3 Alex Cejka 34 285.8 64.2 63.8
## 4 Andre Stolz 34 297.9 59.0 63.0
## 5 Arjun Atwal 31 289.4 60.5 62.5
## 6 Arron Oberholser 29 284.6 68.8 67.0
## AverageNumofPutts SavePercent MoneyRank NumEvents TotalWinnings
## 1 1.767 50.9 123 27 632878
## 2 1.757 59.3 7 16 3724984
## 3 1.795 50.7 54 24 1313484
## 4 1.787 47.7 101 20 808373
## 5 1.766 43.5 146 30 486053
## 6 1.780 50.9 52 23 1355433
## AverageWinnings
## 1 23440
## 2 232812
## 3 54729
## 4 40419
## 5 16202
## 6 58932
# Full model: multiple linear regression of AverageWinnings on the seven
# covariates named in the task statement at the top of this file.
# NOTE(review): the task statement asks for the log of average winnings as the
# response, but this formula uses untransformed AverageWinnings -- confirm
# which is intended before relying on the results recorded below.
# NOTE(review): "DrivingAccuracy+ +GreensonRegulation" contains a doubled "+";
# the recorded Call and coefficient table show the term is still fitted
# normally, so this looks like a harmless typo.
model_pga <- lm(AverageWinnings~Age+AverageDrive+DrivingAccuracy+ +GreensonRegulation +AverageNumofPutts+SavePercent + NumEvents, data = PGA)
# Coefficient table, residual standard error and R-squared of the full model.
summary(model_pga)
##
## Call:
## lm(formula = AverageWinnings ~ Age + AverageDrive + DrivingAccuracy +
## +GreensonRegulation + AverageNumofPutts + SavePercent + NumEvents,
## data = PGA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71690 -22176 -6735 17147 247928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 945579.88 305886.59 3.091 0.00230 **
## Age -587.13 519.32 -1.131 0.25968
## AverageDrive -94.76 567.42 -0.167 0.86755
## DrivingAccuracy -2360.57 854.02 -2.764 0.00628 **
## GreensonRegulation 8466.04 1303.87 6.493 7.30e-10 ***
## AverageNumofPutts -694226.49 138155.99 -5.025 1.17e-06 ***
## SavePercent 1395.67 587.54 2.375 0.01853 *
## NumEvents -3159.22 644.24 -4.904 2.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41430 on 188 degrees of freedom
## Multiple R-squared: 0.4527, Adjusted R-squared: 0.4323
## F-statistic: 22.21 on 7 and 188 DF, p-value: < 2.2e-16
# 5-fold cross-validation of the full model with a fixed seed; the result's
# cvpred column (used further down) holds the out-of-fold predictions.
KCV <- cv.lm(data = PGA, model_pga, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: AverageWinnings
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 1.71e+09 1.71e+09 0.99 0.32011
## AverageDrive 1 2.19e+10 2.19e+10 12.74 0.00045 ***
## DrivingAccuracy 1 5.19e+07 5.19e+07 0.03 0.86220
## GreensonRegulation 1 1.13e+11 1.13e+11 66.08 5.7e-14 ***
## AverageNumofPutts 1 7.57e+10 7.57e+10 44.09 3.3e-10 ***
## SavePercent 1 1.29e+10 1.29e+10 7.51 0.00673 **
## NumEvents 1 4.13e+10 4.13e+10 24.05 2.0e-06 ***
## Residuals 188 3.23e+11 1.72e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = PGA, model_pga, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39 43
## Predicted 54281 26199 41198 -690 71891 33982 -12370 40130 13636
## cvpred 56300 25301 42783 -2912 79496 29407 -15697 37234 12304
## AverageWinnings 40419 16202 20359 6026 38551 38345 16486 81787 4378
## CV residual -15881 -9099 -22424 8938 -40945 8938 32183 44553 -7926
## 53 72 73 75 76 78 81 92 111
## Predicted 108551 -23602 76158 37758 25550 34250 20946 66001 3186
## cvpred 115529 -29528 85012 42230 25240 36558 18906 74185 1293
## AverageWinnings 98230 5449 27230 34414 29452 27103 9660 36636 1031
## CV residual -17299 34977 -57782 -7816 4212 -9455 -9246 -37549 -262
## 114 116 119 120 125 127 128 136
## Predicted 74126 78632 46836 89611 80412 53702 34344 41938
## cvpred 75748 84344 55495 96514 85301 53404 37495 45154
## AverageWinnings 65268 33499 18582 78394 15806 32685 18188 19901
## CV residual -10480 -50845 -36913 -18120 -69495 -20719 -19307 -25253
## 141 145 152 159 160 162 164 169
## Predicted 78190 26320 91937 84791 49187 73553 51887 42194
## cvpred 77816 27160 98826 88195 52381 74598 56481 45302
## AverageWinnings 26149 8600 66836 72904 29286 98566 35182 48607
## CV residual -51667 -18560 -31990 -15291 -23095 23968 -21299 3305
## 183 185 189 195 196
## Predicted 17294 33462 41647 63191 60052
## cvpred 17593 31829 45797 68725 63387
## AverageWinnings 37292 113473 47589 51586 80590
## CV residual 19699 81644 1792 -17139 17203
##
## Sum of squares = 3.64e+10 Mean square = 9.34e+08 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46 50
## Predicted 27814 -11945 39971 -8586 61377 57696 26539 64420 -55958
## cvpred 20584 -19961 42354 -22535 58787 55666 17991 63950 -75176
## AverageWinnings 25041 15727 36318 6664 72422 56206 15694 18862 850
## CV residual 4457 35688 -6036 29199 13635 540 -2297 -45088 76026
## 51 52 54 55 57 58 63 80
## Predicted -12356 37163 68168 25565 70638 -26310 39691 34220
## cvpred -16754 37007 68885 24464 67524 -36524 41640 35715
## AverageWinnings 6090 18441 128129 17010 32240 10014 46638 25230
## CV residual 22844 -18566 59244 -7454 -35284 46538 4998 -10485
## 84 87 88 95 97 99 103 106
## Predicted 33780 -13120 67484 49893 82544 15504 60796 108133
## cvpred 30563 -17601 67476 51978 80344 13377 62421 107109
## AverageWinnings 7315 13203 76394 11076 43714 25447 38960 56217
## CV residual -23248 30804 8918 -40902 -36630 12070 -23461 -50892
## 109 115 126 131 134 135 137 142 143
## Predicted 83254 5834 -2172 89954 8174 55652 -11751 13857 125603
## cvpred 85654 -2755 -10141 91911 1100 55902 -19460 5671 127026
## AverageWinnings 84871 7891 17657 125524 12065 25371 12350 29356 262947
## CV residual -783 10646 27798 33613 10965 -30531 31810 23685 135921
## 149 155 173 188 190 192
## Predicted 11851 53206 40323 98936 6623 22907
## cvpred 7385 47061 41563 104162 3024 18391
## AverageWinnings 33335 96169 42544 70699 30166 4123
## CV residual 25950 49108 981 -33463 27142 -14268
##
## Sum of squares = 5.43e+10 Mean square = 1.36e+09 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47 62
## Predicted 58117 16655 33067 61430 59075 86277 33859 94327 -43969
## cvpred 55373 17573 36275 72911 64494 91941 30537 99574 -46034
## AverageWinnings 51794 19381 37175 15438 26980 48856 56783 125614 3200
## CV residual -3579 1808 900 -57473 -37514 -43085 26246 26040 49234
## 66 67 68 69 71 74 77 79
## Predicted 85201 -13466 104887 11756 80341 77758 -45212 33729
## cvpred 93434 -15529 107499 12439 88284 79059 -54860 27745
## AverageWinnings 52460 9572 47573 15756 18290 57871 6004 28294
## CV residual -40974 25101 -59926 3317 -69994 -21188 60864 549
## 89 90 93 101 104 105 108 113 117
## Predicted 55633 64575 32134 46527 97855 68785 15745 10685 210
## cvpred 56874 65146 34347 47941 110829 69733 19231 8745 743
## AverageWinnings 35987 86077 45105 41969 29121 61241 10044 34398 8548
## CV residual -20887 20931 10758 -5972 -81708 -8492 -9187 25653 7805
## 118 123 124 132 140 157 158 163 166
## Predicted 97075 52508 56942 41766 32866 46417 23867 132427 14363
## cvpred 105861 54265 62652 47891 33183 45816 23801 136100 13514
## AverageWinnings 45394 93751 31992 21250 37404 48253 18319 179956 41601
## CV residual -60467 39486 -30660 -26641 4221 2437 -5482 43856 28087
## 178 179 180 191
## Predicted 106316 31487 152278 41358
## cvpred 113125 25983 162687 49080
## AverageWinnings 34626 46768 282393 6074
## CV residual -78499 20785 119706 -43006
##
## Sum of squares = 6.63e+10 Mean square = 1.7e+09 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32 35
## Predicted 69056 52426 81848 31939 33284 50033 49954 73339 -29929
## cvpred 69087 51150 75895 32674 28310 50416 53022 70588 -27368
## AverageWinnings 58932 41833 62906 40638 20162 25973 80892 110068 7820
## CV residual -10155 -9317 -12989 7964 -8148 -24443 27870 39480 35188
## 37 45 49 61 65 86 94 96
## Predicted 66588 45903 27229 129503 20421 28022 2208 133109
## cvpred 68939 47958 31270 118322 22195 31876 5942 125029
## AverageWinnings 23833 14200 15484 361702 72542 20001 49577 107250
## CV residual -45106 -33758 -15786 243380 50347 -11875 43635 -17779
## 98 100 112 121 122 130 133 144
## Predicted 87354 78904 17301 -41084 54421 26604 53668 146852
## cvpred 80350 83140 18584 -40483 51318 19978 44318 139511
## AverageWinnings 24898 24936 28161 8519 29912 9776 53072 242848
## CV residual -55452 -58204 9577 49002 -21406 -10202 8754 103337
## 146 147 151 156 161 170 171 172
## Predicted 22072 34111 37346 17511 13387 42243 50701 45603
## cvpred 27850 33680 36960 16734 12644 46480 50939 44731
## AverageWinnings 8232 14412 13563 7404 10610 19651 12162 79413
## CV residual -19618 -19268 -23397 -9330 -2034 -26829 -38777 34682
## 174 175 177 182 186 193
## Predicted 35986 46171 70165 55993 18212 77473
## cvpred 36757 37024 67323 55258 18124 74583
## AverageWinnings 8315 16330 117969 66445 34926 43572
## CV residual -28442 -20694 50646 11187 16802 -31011
##
## Sum of squares = 1.02e+11 Mean square = 2.63e+09 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 31204 135123 35922 75580 60388 73359 5902 36616
## cvpred 39008 128487 37935 75783 61664 73286 18699 31302
## AverageWinnings 23440 232812 54729 38406 45481 38302 4178 20993
## CV residual -15568 104325 16794 -37377 -16183 -34984 -14521 -10309
## 41 42 44 48 56 59 60 64 70
## Predicted 35194 15529 57668 19871 49849 44427 62817 92520 576
## cvpred 36125 8166 54499 21826 52344 46581 58970 84473 2221
## AverageWinnings 13220 11269 23129 9787 15756 37158 57227 87257 8252
## CV residual -22905 3103 -31370 -12039 -36588 -9423 -1743 2784 6031
## 82 83 85 91 102 107 110 129 138
## Predicted 42457 49626 86806 88840 36540 71375 33417 57590 5238
## cvpred 44938 48760 72938 79915 38432 68840 32318 49002 7772
## AverageWinnings 19584 11349 90071 64589 67813 86574 21091 31531 9356
## CV residual -25354 -37411 17133 -15326 29381 17734 -11227 -17471 1584
## 139 148 150 153 154 165 167 168
## Predicted 43042 74524 42229 29158 53809 69712 -8063 92850
## cvpred 42019 67246 40458 31539 55879 68299 -5190 80998
## AverageWinnings 22616 58213 23399 8363 10782 88527 11339 122341
## CV residual -19403 -9033 -17059 -23176 -45097 20228 16529 41343
## 176 181 184 187 194
## Predicted 117117 57459 26657 5150 128112
## cvpred 106245 60852 26251 4841 105828
## AverageWinnings 158938 42623 25697 11308 376040
## CV residual 52693 -18229 -554 6467 270212
##
## Sum of squares = 1.04e+11 Mean square = 2.66e+09 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 1.85e+09
# Cross-validated error summaries for the full model, built from the
# out-of-fold predictions in KCV$cvpred.

# MSPE: the squared prediction errors are averaged over the observations
# (nrow(PGA)), not over the 5 folds, so the value agrees with the "Overall ms"
# printed by cv.lm() and with MSPE() defined later in this file.
sum((PGA$AverageWinnings - KCV$cvpred)^2) / nrow(PGA)
## [1] 1.85e+09
# PRESS: total squared prediction error over all held-out observations.
sum((PGA$AverageWinnings - KCV$cvpred)^2)
## [1] 3.63e+11
# Prediction R-squared: 1 - PRESS / total sum of squares. The "1 -" matters:
# the bare ratio (0.616) is the share of variation NOT predicted.
1 - sum((PGA$AverageWinnings - KCV$cvpred)^2) /
  sum((PGA$AverageWinnings - mean(PGA$AverageWinnings))^2)
## [1] 0.384
# In-sample R-squared of the same model, for comparison.
summary(model_pga)$r.squared
## [1] 0.453
# LOOCV
# NOTE(review): this call is labelled leave-one-out cross-validation but it
# passes m=5 with the same seed as the 5-fold run stored in KCV, and the
# output recorded below shows the same five folds. Leave-one-out would
# presumably need m = nrow(PGA) -- confirm against the DAAG cv.lm
# documentation; the recorded output would have to be regenerated as well.
LOOCV=cv.lm(data=PGA, model_pga, m=5,seed=123)
## Analysis of Variance Table
##
## Response: AverageWinnings
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 1.71e+09 1.71e+09 0.99 0.32011
## AverageDrive 1 2.19e+10 2.19e+10 12.74 0.00045 ***
## DrivingAccuracy 1 5.19e+07 5.19e+07 0.03 0.86220
## GreensonRegulation 1 1.13e+11 1.13e+11 66.08 5.7e-14 ***
## AverageNumofPutts 1 7.57e+10 7.57e+10 44.09 3.3e-10 ***
## SavePercent 1 1.29e+10 1.29e+10 7.51 0.00673 **
## NumEvents 1 4.13e+10 4.13e+10 24.05 2.0e-06 ***
## Residuals 188 3.23e+11 1.72e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = PGA, model_pga, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39 43
## Predicted 54281 26199 41198 -690 71891 33982 -12370 40130 13636
## cvpred 56300 25301 42783 -2912 79496 29407 -15697 37234 12304
## AverageWinnings 40419 16202 20359 6026 38551 38345 16486 81787 4378
## CV residual -15881 -9099 -22424 8938 -40945 8938 32183 44553 -7926
## 53 72 73 75 76 78 81 92 111
## Predicted 108551 -23602 76158 37758 25550 34250 20946 66001 3186
## cvpred 115529 -29528 85012 42230 25240 36558 18906 74185 1293
## AverageWinnings 98230 5449 27230 34414 29452 27103 9660 36636 1031
## CV residual -17299 34977 -57782 -7816 4212 -9455 -9246 -37549 -262
## 114 116 119 120 125 127 128 136
## Predicted 74126 78632 46836 89611 80412 53702 34344 41938
## cvpred 75748 84344 55495 96514 85301 53404 37495 45154
## AverageWinnings 65268 33499 18582 78394 15806 32685 18188 19901
## CV residual -10480 -50845 -36913 -18120 -69495 -20719 -19307 -25253
## 141 145 152 159 160 162 164 169
## Predicted 78190 26320 91937 84791 49187 73553 51887 42194
## cvpred 77816 27160 98826 88195 52381 74598 56481 45302
## AverageWinnings 26149 8600 66836 72904 29286 98566 35182 48607
## CV residual -51667 -18560 -31990 -15291 -23095 23968 -21299 3305
## 183 185 189 195 196
## Predicted 17294 33462 41647 63191 60052
## cvpred 17593 31829 45797 68725 63387
## AverageWinnings 37292 113473 47589 51586 80590
## CV residual 19699 81644 1792 -17139 17203
##
## Sum of squares = 3.64e+10 Mean square = 9.34e+08 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46 50
## Predicted 27814 -11945 39971 -8586 61377 57696 26539 64420 -55958
## cvpred 20584 -19961 42354 -22535 58787 55666 17991 63950 -75176
## AverageWinnings 25041 15727 36318 6664 72422 56206 15694 18862 850
## CV residual 4457 35688 -6036 29199 13635 540 -2297 -45088 76026
## 51 52 54 55 57 58 63 80
## Predicted -12356 37163 68168 25565 70638 -26310 39691 34220
## cvpred -16754 37007 68885 24464 67524 -36524 41640 35715
## AverageWinnings 6090 18441 128129 17010 32240 10014 46638 25230
## CV residual 22844 -18566 59244 -7454 -35284 46538 4998 -10485
## 84 87 88 95 97 99 103 106
## Predicted 33780 -13120 67484 49893 82544 15504 60796 108133
## cvpred 30563 -17601 67476 51978 80344 13377 62421 107109
## AverageWinnings 7315 13203 76394 11076 43714 25447 38960 56217
## CV residual -23248 30804 8918 -40902 -36630 12070 -23461 -50892
## 109 115 126 131 134 135 137 142 143
## Predicted 83254 5834 -2172 89954 8174 55652 -11751 13857 125603
## cvpred 85654 -2755 -10141 91911 1100 55902 -19460 5671 127026
## AverageWinnings 84871 7891 17657 125524 12065 25371 12350 29356 262947
## CV residual -783 10646 27798 33613 10965 -30531 31810 23685 135921
## 149 155 173 188 190 192
## Predicted 11851 53206 40323 98936 6623 22907
## cvpred 7385 47061 41563 104162 3024 18391
## AverageWinnings 33335 96169 42544 70699 30166 4123
## CV residual 25950 49108 981 -33463 27142 -14268
##
## Sum of squares = 5.43e+10 Mean square = 1.36e+09 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47 62
## Predicted 58117 16655 33067 61430 59075 86277 33859 94327 -43969
## cvpred 55373 17573 36275 72911 64494 91941 30537 99574 -46034
## AverageWinnings 51794 19381 37175 15438 26980 48856 56783 125614 3200
## CV residual -3579 1808 900 -57473 -37514 -43085 26246 26040 49234
## 66 67 68 69 71 74 77 79
## Predicted 85201 -13466 104887 11756 80341 77758 -45212 33729
## cvpred 93434 -15529 107499 12439 88284 79059 -54860 27745
## AverageWinnings 52460 9572 47573 15756 18290 57871 6004 28294
## CV residual -40974 25101 -59926 3317 -69994 -21188 60864 549
## 89 90 93 101 104 105 108 113 117
## Predicted 55633 64575 32134 46527 97855 68785 15745 10685 210
## cvpred 56874 65146 34347 47941 110829 69733 19231 8745 743
## AverageWinnings 35987 86077 45105 41969 29121 61241 10044 34398 8548
## CV residual -20887 20931 10758 -5972 -81708 -8492 -9187 25653 7805
## 118 123 124 132 140 157 158 163 166
## Predicted 97075 52508 56942 41766 32866 46417 23867 132427 14363
## cvpred 105861 54265 62652 47891 33183 45816 23801 136100 13514
## AverageWinnings 45394 93751 31992 21250 37404 48253 18319 179956 41601
## CV residual -60467 39486 -30660 -26641 4221 2437 -5482 43856 28087
## 178 179 180 191
## Predicted 106316 31487 152278 41358
## cvpred 113125 25983 162687 49080
## AverageWinnings 34626 46768 282393 6074
## CV residual -78499 20785 119706 -43006
##
## Sum of squares = 6.63e+10 Mean square = 1.7e+09 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32 35
## Predicted 69056 52426 81848 31939 33284 50033 49954 73339 -29929
## cvpred 69087 51150 75895 32674 28310 50416 53022 70588 -27368
## AverageWinnings 58932 41833 62906 40638 20162 25973 80892 110068 7820
## CV residual -10155 -9317 -12989 7964 -8148 -24443 27870 39480 35188
## 37 45 49 61 65 86 94 96
## Predicted 66588 45903 27229 129503 20421 28022 2208 133109
## cvpred 68939 47958 31270 118322 22195 31876 5942 125029
## AverageWinnings 23833 14200 15484 361702 72542 20001 49577 107250
## CV residual -45106 -33758 -15786 243380 50347 -11875 43635 -17779
## 98 100 112 121 122 130 133 144
## Predicted 87354 78904 17301 -41084 54421 26604 53668 146852
## cvpred 80350 83140 18584 -40483 51318 19978 44318 139511
## AverageWinnings 24898 24936 28161 8519 29912 9776 53072 242848
## CV residual -55452 -58204 9577 49002 -21406 -10202 8754 103337
## 146 147 151 156 161 170 171 172
## Predicted 22072 34111 37346 17511 13387 42243 50701 45603
## cvpred 27850 33680 36960 16734 12644 46480 50939 44731
## AverageWinnings 8232 14412 13563 7404 10610 19651 12162 79413
## CV residual -19618 -19268 -23397 -9330 -2034 -26829 -38777 34682
## 174 175 177 182 186 193
## Predicted 35986 46171 70165 55993 18212 77473
## cvpred 36757 37024 67323 55258 18124 74583
## AverageWinnings 8315 16330 117969 66445 34926 43572
## CV residual -28442 -20694 50646 11187 16802 -31011
##
## Sum of squares = 1.02e+11 Mean square = 2.63e+09 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 31204 135123 35922 75580 60388 73359 5902 36616
## cvpred 39008 128487 37935 75783 61664 73286 18699 31302
## AverageWinnings 23440 232812 54729 38406 45481 38302 4178 20993
## CV residual -15568 104325 16794 -37377 -16183 -34984 -14521 -10309
## 41 42 44 48 56 59 60 64 70
## Predicted 35194 15529 57668 19871 49849 44427 62817 92520 576
## cvpred 36125 8166 54499 21826 52344 46581 58970 84473 2221
## AverageWinnings 13220 11269 23129 9787 15756 37158 57227 87257 8252
## CV residual -22905 3103 -31370 -12039 -36588 -9423 -1743 2784 6031
## 82 83 85 91 102 107 110 129 138
## Predicted 42457 49626 86806 88840 36540 71375 33417 57590 5238
## cvpred 44938 48760 72938 79915 38432 68840 32318 49002 7772
## AverageWinnings 19584 11349 90071 64589 67813 86574 21091 31531 9356
## CV residual -25354 -37411 17133 -15326 29381 17734 -11227 -17471 1584
## 139 148 150 153 154 165 167 168
## Predicted 43042 74524 42229 29158 53809 69712 -8063 92850
## cvpred 42019 67246 40458 31539 55879 68299 -5190 80998
## AverageWinnings 22616 58213 23399 8363 10782 88527 11339 122341
## CV residual -19403 -9033 -17059 -23176 -45097 20228 16529 41343
## 176 181 184 187 194
## Predicted 117117 57459 26657 5150 128112
## cvpred 106245 60852 26251 4841 105828
## AverageWinnings 158938 42623 25697 11308 376040
## CV residual 52693 -18229 -554 6467 270212
##
## Sum of squares = 1.04e+11 Mean square = 2.66e+09 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 1.85e+09
# Error summaries for the LOOCV object. The predictions in LOOCV$cvpred are
# predictions of the response, AverageWinnings; comparing them with
# DrivingAccuracy (a predictor) gave a meaningless "R-squared" of -1.28e+08.
# The values recorded here are for LOOCV as it is fitted in this file
# (m = 5, seed = 123, i.e. the same folds as KCV).

# MSPE: average squared prediction error per observation (not per fold).
sum((PGA$AverageWinnings - LOOCV$cvpred)^2) / nrow(PGA)
## [1] 1.85e+09
# PRESS: total squared prediction error.
sum((PGA$AverageWinnings - LOOCV$cvpred)^2)
## [1] 3.63e+11
# Prediction R-squared: 1 - PRESS / total sum of squares of the response.
1 - sum((PGA$AverageWinnings - LOOCV$cvpred)^2) /
  sum((PGA$AverageWinnings - mean(PGA$AverageWinnings))^2)
## [1] 0.384
# In-sample R-squared of the same model, for comparison.
summary(model_pga)$r.squared
## [1] 0.453
# Computing PRESS, MSPE, and prediction R-squared using functions
### PRESS (predictive residual sum of squares): the lower, the better
#' @title PRESS
#' @author Thomas Hopper
#' @description Returns the PRESS statistic (predictive residual sum of
#'   squares), computed from the fitted model's residuals and hat values.
#'   Useful for evaluating the predictive power of regression models.
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return A single numeric value: the sum of squared predictive residuals.
PRESS <- function(linear.model) {
  # Hat (leverage) value of every observation in the fit.
  leverage <- lm.influence(linear.model)$hat
  # Predictive residual: ordinary residual scaled up by 1 / (1 - leverage).
  predictive_resid <- residuals(linear.model) / (1 - leverage)
  sum(predictive_resid^2)
}
### MSPE (mean square prediction error): the lower, the better
#' @title MSPE
#' @author Yichen Qin
#' @description Returns the MSPE statistic (mean square prediction error):
#'   the PRESS statistic divided by the number of residuals in the fit.
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return A single numeric value, PRESS / sample size.
MSPE <- function(linear.model) {
  # Sample size is taken as the number of residuals of the fitted model.
  n_obs <- length(residuals(linear.model))
  PRESS(linear.model) / n_obs
}
### Prediction R-squared
#' @title Predictive R-squared
#' @author Thomas Hopper
#' @description Returns the prediction R-squared, 1 - PRESS / TSS, where TSS
#'   is the sum of the 'Sum Sq' column of the model's anova() table.
#'   Requires the function PRESS(), which returns the PRESS statistic.
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return A single numeric value, the prediction R-squared.
pred_r_squared <- function(linear.model) {
  # Adding up every row of the anova table's "Sum Sq" column (terms plus
  # residuals) gives the total sum of squares.
  total_ss <- sum(anova(linear.model)[["Sum Sq"]])
  1 - PRESS(linear.model) / total_ss
}
# Hat-value based statistics for the full model, using the functions defined
# above (no refitting on folds).
# Mean square prediction error: PRESS divided by the number of residuals.
MSPE(model_pga)
## [1] 1.84e+09
# Predictive residual sum of squares.
PRESS(model_pga)
## [1] 3.62e+11
# Prediction R-squared: 1 - PRESS / total sum of squares.
pred_r_squared(model_pga)
## [1] 0.387
# In-sample R-squared, for comparison with the prediction R-squared.
summary(model_pga)$r.squared
## [1] 0.453
# Reduced model: same response as model_pga, with Age and AverageDrive
# removed from the covariates.
# NOTE(review): the response is untransformed AverageWinnings, while the task
# statement at the top of this file asks for its log -- confirm.
# NOTE(review): "DrivingAccuracy+ +GreensonRegulation" has a doubled "+"; the
# recorded coefficient table shows the term is still fitted normally.
Model_Pga2 <- lm(AverageWinnings~DrivingAccuracy+ +GreensonRegulation +AverageNumofPutts+SavePercent + NumEvents, data = PGA)
# Coefficient table, residual standard error and R-squared of the reduced model.
summary(Model_Pga2)
##
## Call:
## lm(formula = AverageWinnings ~ DrivingAccuracy + +GreensonRegulation +
## AverageNumofPutts + SavePercent + NumEvents, data = PGA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69302 -23747 -6472 17736 243585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 916419 272720 3.36 0.00094 ***
## DrivingAccuracy -2430 593 -4.10 6.1e-05 ***
## GreensonRegulation 8391 1115 7.52 2.1e-12 ***
## AverageNumofPutts -701063 137013 -5.12 7.6e-07 ***
## SavePercent 1380 585 2.36 0.01932 *
## NumEvents -3042 632 -4.81 3.0e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41400 on 190 degrees of freedom
## Multiple R-squared: 0.449, Adjusted R-squared: 0.434
## F-statistic: 30.9 on 5 and 190 DF, p-value: <2e-16
# 5-fold cross-validation
# 5-fold cross-validation of the reduced model Model_Pga2, with the same seed
# as the run for the full model.
KCV2 <- cv.lm(data = PGA, Model_Pga2, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: AverageWinnings
## Df Sum Sq Mean Sq F value Pr(>F)
## DrivingAccuracy 1 9.71e+09 9.71e+09 5.68 0.018 *
## GreensonRegulation 1 1.22e+11 1.22e+11 71.40 7.5e-15 ***
## AverageNumofPutts 1 8.00e+10 8.00e+10 46.74 1.1e-10 ***
## SavePercent 1 1.32e+10 1.32e+10 7.71 0.006 **
## NumEvents 1 3.96e+10 3.96e+10 23.16 3.0e-06 ***
## Residuals 190 3.25e+11 1.71e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = PGA, Model_Pga2, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39 43
## Predicted 53873 24541 44140 4747 69825 36760 -14291 39282 16957
## cvpred 54605 23485 47674 4275 77581 33399 -18125 36898 16787
## AverageWinnings 40419 16202 20359 6026 38551 38345 16486 81787 4378
## CV residual -14186 -7283 -27315 1751 -39030 4946 34611 44889 -12409
## 53 72 73 75 76 78 81 92 111
## Predicted 108536 -21992 76166 33334 22759 27344 22114 66931 -1051
## cvpred 116400 -28035 83257 37119 21505 27896 20319 74295 -3751
## AverageWinnings 98230 5449 27230 34414 29452 27103 9660 36636 1031
## CV residual -18170 33484 -56027 -2705 7947 -793 -10659 -37659 4782
## 114 116 119 120 125 127 128 136
## Predicted 76053 80616 42048 82180 74679 51625 28589 46276
## cvpred 77990 87553 47721 88422 78210 49983 30784 49783
## AverageWinnings 65268 33499 18582 78394 15806 32685 18188 19901
## CV residual -12722 -54054 -29139 -10028 -62404 -17298 -12596 -29882
## 141 145 152 159 160 162 164 169
## Predicted 83115 23245 91975 90046 51658 73888 51285 40585
## cvpred 84771 23622 99045 94647 54010 74303 56222 43829
## AverageWinnings 26149 8600 66836 72904 29286 98566 35182 48607
## CV residual -58622 -15022 -32209 -21743 -24724 24263 -21040 4778
## 183 185 189 195 196
## Predicted 19332 35628 46879 66151 55148
## cvpred 19907 35682 51523 72213 56955
## AverageWinnings 37292 113473 47589 51586 80590
## CV residual 17385 77791 -3934 -20627 23635
##
## Sum of squares = 3.56e+10 Mean square = 9.13e+08 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46 50
## Predicted 21334 -11114 43613 -10294 63589 53853 27995 63424 -61866
## cvpred 16022 -19294 43261 -23215 59618 52365 17952 63468 -79926
## AverageWinnings 25041 15727 36318 6664 72422 56206 15694 18862 850
## CV residual 9019 35021 -6943 29879 12804 3841 -2258 -44606 80776
## 51 52 54 55 57 58 63 80
## Predicted -14256 41375 72022 25140 75037 -28326 39322 39623
## cvpred -18543 38921 72365 23938 71484 -38301 41649 39202
## AverageWinnings 6090 18441 128129 17010 32240 10014 46638 25230
## CV residual 24633 -20480 55764 -6928 -39244 48315 4989 -13972
## 84 87 88 95 97 99 103 106
## Predicted 38634 -9670 67926 53059 85171 11673 61665 101025
## cvpred 33371 -14638 67512 53014 81841 11780 63792 102928
## AverageWinnings 7315 13203 76394 11076 43714 25447 38960 56217
## CV residual -26056 27841 8882 -41938 -38127 13667 -24832 -46711
## 109 115 126 131 134 135 137 142 143
## Predicted 88321 4504 -1804 87938 5902 52468 -13751 14571 124491
## cvpred 89133 -3007 -10218 90446 -454 54353 -21058 6114 127120
## AverageWinnings 84871 7891 17657 125524 12065 25371 12350 29356 262947
## CV residual -4262 10898 27875 35078 12519 -28982 33408 23242 135827
## 149 155 173 188 190 192
## Predicted 8361 49231 46029 102598 12505 22684
## cvpred 4952 44621 44453 106503 6437 17851
## AverageWinnings 33335 96169 42544 70699 30166 4123
## CV residual 28383 51548 -1909 -35804 23729 -13728
##
## Sum of squares = 5.57e+10 Mean square = 1.39e+09 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47 62
## Predicted 55041 16070 37499 63453 60093 82057 27729 93785 -40463
## cvpred 54004 16428 37283 73892 64640 89159 25837 99431 -43542
## AverageWinnings 51794 19381 37175 15438 26980 48856 56783 125614 3200
## CV residual -2210 2953 -108 -58454 -37660 -40303 30946 26183 46742
## 66 67 68 69 71 74 77 79
## Predicted 82411 -10035 101035 14586.9 77656 77107 -47910 31315
## cvpred 89192 -13215 107818 15834.2 87464 81404 -55483 28564
## AverageWinnings 52460 9572 47573 15756.0 18290 57871 6004 28294
## CV residual -36732 22787 -60245 -78.2 -69174 -23533 61487 -270
## 89 90 93 101 104 105 108 113 117
## Predicted 61105 64613 30022 42006 97938 65577 15904 12764 -242
## cvpred 58870 63976 33192 47413 107585 67049 18131 11567 1329
## AverageWinnings 35987 86077 45105 41969 29121 61241 10044 34398 8548
## CV residual -22883 22101 11913 -5444 -78464 -5808 -8087 22831 7219
## 118 123 124 132 140 157 158 163 166
## Predicted 102113 50919 60961 46887 33696 43391 24363 125070 16472
## cvpred 106851 54332 64103 50760 34120 45138 27787 128775 15662
## AverageWinnings 45394 93751 31992 21250 37404 48253 18319 179956 41601
## CV residual -61457 39419 -32111 -29510 3284 3115 -9468 51181 25939
## 178 179 180 191
## Predicted 103928 29294 148859 43500
## cvpred 112664 25693 160865 48742
## AverageWinnings 34626 46768 282393 6074
## CV residual -78038 21075 121528 -42668
##
## Sum of squares = 6.67e+10 Mean square = 1.71e+09 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32 35
## Predicted 63819 54250 86642 32883 31955 49459 46448 72194 -29157
## cvpred 64119 53662 82097 33353 26868 49552 49512 69621 -26383
## AverageWinnings 58932 41833 62906 40638 20162 25973 80892 110068 7820
## CV residual -5187 -11829 -19191 7285 -6706 -23579 31380 40447 34203
## 37 45 49 61 65 86 94 96
## Predicted 63893 50134 30840 129588 25387 29020 8758 136454
## cvpred 66172 52475 35467 119015 27502 32325 12745 128127
## AverageWinnings 23833 14200 15484 361702 72542 20001 49577 107250
## CV residual -42339 -38275 -19983 242687 45040 -12324 36832 -20877
## 98 100 112 121 122 130 133 144
## Predicted 89237 77631 8329 -37499 59342 28279 57680 145732
## cvpred 83464 81380 9389 -36112 56554 22727 49866 138744
## AverageWinnings 24898 24936 28161 8519 29912 9776 53072 242848
## CV residual -58566 -56444 18772 44631 -26642 -12951 3206 104104
## 146 147 151 156 161 170 171 172
## Predicted 21931 29537 38562 22581 18848 41536 53011 46004
## cvpred 27509 28754 39261 22659 19515 45023 54029 45110
## AverageWinnings 8232 14412 13563 7404 10610 19651 12162 79413
## CV residual -19277 -14342 -25698 -15255 -8905 -25372 -41867 34303
## 174 175 177 182 186 193
## Predicted 39325 47808 69011 55812 20720 73291
## cvpred 40769 39220 66169 55199 21379 70263
## AverageWinnings 8315 16330 117969 66445 34926 43572
## CV residual -32454 -22890 51800 11246 13547 -26691
##
## Sum of squares = 1.03e+11 Mean square = 2.63e+09 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 25081 128057 34314 70896 60639 70485 5108 38372
## cvpred 29095 116434 35031 68092 62829 67751 15888 34517
## AverageWinnings 23440 232812 54729 38406 45481 38302 4178 20993
## CV residual -5655 116378 19698 -29686 -17348 -29449 -11710 -13524
## 41 42 44 48 56 59 60 64 70
## Predicted 30666 20824 56408 19317 51215 44058 65723 97594 3743
## cvpred 28543 16880 52020 21135 53592 45992 64047 92501 7690
## AverageWinnings 13220 11269 23129 9787 15756 37158 57227 87257 8252
## CV residual -15323 -5611 -28891 -11348 -37836 -8834 -6820 -5244 562
## 82 83 85 91 102 107 110 129 138
## Predicted 40301 44855 93844 90715 35373 69989 34102 64166 3630
## cvpred 40855 40842 85859 83532 36310 67398 33263 60221 6246
## AverageWinnings 19584 11349 90071 64589 67813 86574 21091 31531 9356
## CV residual -21271 -29493 4212 -18943 31503 19176 -12172 -28690 3110
## 139 148 150 153 154 165 167 168
## Predicted 39243 72928 42852 26631 48329 68716 -8157 95131
## cvpred 35518 64717 41380 26479 46737 67342 -4830 85028
## AverageWinnings 22616 58213 23399 8363 10782 88527 11339 122341
## CV residual -12902 -6504 -17981 -18116 -35955 21185 16169 37313
## 176 181 184 187 194
## Predicted 115260 51378 26561 7883 132455
## cvpred 103735 50652 27337 9499 113545
## AverageWinnings 158938 42623 25697 11308 376040
## CV residual 55203 -8029 -1640 1809 262495
##
## Sum of squares = 9.95e+10 Mean square = 2.55e+09 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 1.84e+09
# MSPE, PRESS, and prediction R-squared
# Cross-validated error summaries for Model_Pga2, built from the out-of-fold
# predictions in KCV2$cvpred.

# MSPE: the squared prediction errors are averaged over the observations
# (nrow(PGA)), not over the 5 folds, so the value agrees with the "Overall ms"
# printed by cv.lm().
sum((PGA$AverageWinnings - KCV2$cvpred)^2) / nrow(PGA)
## [1] 1.84e+09
# PRESS: total squared prediction error over all held-out observations.
sum((PGA$AverageWinnings - KCV2$cvpred)^2)
## [1] 3.6e+11
# Prediction R-squared: 1 - PRESS / total sum of squares of the response.
1 - sum((PGA$AverageWinnings - KCV2$cvpred)^2) /
  sum((PGA$AverageWinnings - mean(PGA$AverageWinnings))^2)
## [1] 0.389
# In-sample R-squared of the same model, for comparison.
summary(Model_Pga2)$r.squared
## [1] 0.449
# Smaller model: AverageWinnings regressed on AverageNumofPutts, SavePercent
# and NumEvents only.
# NOTE(review): the response is untransformed AverageWinnings, while the task
# statement at the top of this file asks for its log -- confirm.
Model_Pga3 <- lm(AverageWinnings~ AverageNumofPutts+ SavePercent + NumEvents, data = PGA)
# Coefficient table, residual standard error and R-squared of the smaller model.
summary(Model_Pga3)
##
## Call:
## lm(formula = AverageWinnings ~ AverageNumofPutts + SavePercent +
## NumEvents, data = PGA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94583 -23983 -5520 11815 318377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1735961 283752 6.12 5.2e-09 ***
## AverageNumofPutts -915366 152848 -5.99 1.0e-08 ***
## SavePercent 521 654 0.80 0.43
## NumEvents -3328 720 -4.62 6.9e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47200 on 192 degrees of freedom
## Multiple R-squared: 0.275, Adjusted R-squared: 0.264
## F-statistic: 24.3 on 3 and 192 DF, p-value: 2.2e-13
5-fold cross-validation
# 5-fold cross-validation of Model_Pga3 (fixed seed for reproducible folds)
KCV3 <- cv.lm(data = PGA, Model_Pga3, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: AverageWinnings
## Df Sum Sq Mean Sq F value Pr(>F)
## AverageNumofPutts 1 1.12e+11 1.12e+11 50.26 2.5e-11 ***
## SavePercent 1 2.92e+09 2.92e+09 1.31 0.25
## NumEvents 1 4.76e+10 4.76e+10 21.38 6.9e-06 ***
## Residuals 192 4.27e+11 2.23e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = PGA, Model_Pga3, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39 43
## Predicted 58485 42245 36154 13158 28701 76436 24359 66598 26840
## cvpred 60387 43734 37222 14041 29914 78364 25064 68488 28016
## AverageWinnings 40419 16202 20359 6026 38551 38345 16486 81787 4378
## CV residual -19968 -27532 -16863 -8015 8637 -40019 -8578 13299 -23638
## 53 72 73 75 76 78 81 92 111
## Predicted 75885 21414 51386 24402 38423 35204 44480 25326 21840
## cvpred 77844 22541 52517 25047 39942 36439 45976 26543 22865
## AverageWinnings 98230 5449 27230 34414 29452 27103 9660 36636 1031
## CV residual 20386 -17092 -25287 9367 -10490 -9336 -36316 10093 -21834
## 114 116 119 120 125 127 128 136
## Predicted 78045 49317 16420 53838 61105 79546 23968 36245
## cvpred 80292 50772 17105 55446 63074 81665 24937 37432
## AverageWinnings 65268 33499 18582 78394 15806 32685 18188 19901
## CV residual -15024 -17273 1477 22948 -47268 -48980 -6749 -17531
## 141 145 152 159 160 162 164 169 183
## Predicted 89030 22910 66319 70580 49963 85138 29439 32655 29404
## cvpred 91471 24303 67895 72743 51464 87636 30536 33584 30339
## AverageWinnings 26149 8600 66836 72904 29286 98566 35182 48607 37292
## CV residual -65322 -15703 -1059 161 -22178 10930 4646 15023 6953
## 185 189 195 196
## Predicted 45477 34128 39104 56912
## cvpred 47057 34968 40411 58690
## AverageWinnings 113473 47589 51586 80590
## CV residual 66416 12621 11175 21900
##
## Sum of squares = 2.42e+10 Mean square = 6.19e+08 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 35483 -18850 69355 -4711 24810 69755 11019 69260
## cvpred 33301 -24396 70955 -11642 22696 70161 6869 70052
## AverageWinnings 25041 15727 36318 6664 72422 56206 15694 18862
## CV residual -8260 40123 -34637 18306 49726 -13955 8825 -51190
## 50 51 52 54 55 57 58 63
## Predicted -13075 36888 47040 78483 46965 74033 -5071 57229
## cvpred -19730 35601 46334 80013 47871 72493 -10340 59583
## AverageWinnings 850 6090 18441 128129 17010 32240 10014 46638
## CV residual 20580 -29511 -27893 48116 -30861 -40253 20354 -12945
## 80 84 87 88 95 97 99 103
## Predicted 35759 31240 -9832 57591 61179 47192.6 48757 95981
## cvpred 35856 28690 -13251 56273 59835 43617.6 49847 96253
## AverageWinnings 25230 7315 13203 76394 11076 43714.0 25447 38960
## CV residual -10626 -21375 26454 20121 -48759 96.4 -24400 -57293
## 106 109 115 126 131 134 135 137 142
## Predicted 75599 57582 16277 -5394 89737 30793 61326 12802 18898
## cvpred 76715 57847 13982 -8484 91475 29010 62271 7942 15002
## AverageWinnings 56217 84871 7891 17657 125524 12065 25371 12350 29356
## CV residual -20498 27024 -6091 26141 34049 -16945 -36900 4408 14354
## 143 149 155 173 188 190 192
## Predicted 81989 28172 34567 46310 68124 11422 42476
## cvpred 82435 26261 32806 47290 68613 8907 41293
## AverageWinnings 262947 33335 96169 42544 70699 30166 4123
## CV residual 180512 7074 63363 -4746 2086 21259 -37170
##
## Sum of squares = 6.75e+10 Mean square = 1.69e+09 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47 62
## Predicted 40206 43872 37187 41642 30892 84001 12103 109459 54.9
## cvpred 41345 47617 37401 47586 33750 91770 11507 113849 1113.3
## AverageWinnings 51794 19381 37175 15438 26980 48856 56783 125614 3200.0
## CV residual 10449 -28236 -226 -32148 -6770 -42914 45276 11765 2086.7
## 66 67 68 69 71 74 77 79
## Predicted 77064 21690 77481 49906 112873 79629 260 21170
## cvpred 80277 20635 85765 54386 122808 85456 -4069 19806
## AverageWinnings 52460 9572 47573 15756 18290 57871 6004 28294
## CV residual -27817 -11063 -38192 -38630 -104518 -27585 10073 8488
## 89 90 93 101 104 105 108 113 117
## Predicted 40081 53160 47845 64228 108735 61603 27144 27331 43778
## cvpred 41137 57798 51759 70149 114670 65964 28000 28505 47666
## AverageWinnings 35987 86077 45105 41969 29121 61241 10044 34398 8548
## CV residual -5150 28279 -6654 -28180 -85549 -4723 -17956 5893 -39118
## 118 123 124 132 140 157 158 163 166
## Predicted 102722 76669 88983 23165 38409 34492 11821 62602 47156
## cvpred 111907 84265 93307 25840 40468 38174 10835 62929 49994
## AverageWinnings 45394 93751 31992 21250 37404 48253 18319 179956 41601
## CV residual -66513 9486 -61315 -4590 -3064 10079 7484 117027 -8393
## 178 179 180 191
## Predicted 98978 24475 122500 49729
## cvpred 103744 26618 131331 50132
## AverageWinnings 34626 46768 282393 6074
## CV residual -69118 20150 151062 -44058
##
## Sum of squares = 8.48e+10 Mean square = 2.17e+09 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32 35
## Predicted 56575 53855 84276 33699 66665 39646 20047 68739 454
## cvpred 56030 51049 80842 34194 63262 39048 22183 65674 4101
## AverageWinnings 58932 41833 62906 40638 20162 25973 80892 110068 7820
## CV residual 2902 -9216 -17936 6444 -43100 -13075 58709 44394 3719
## 37 45 49 61 65 86 94 96
## Predicted 44554 26194 31244 114921 48605 29772 -6647 102210
## cvpred 45714 27381 35786 107225 48834 32138 -4292 96723
## AverageWinnings 23833 14200 15484 361702 72542 20001 49577 107250
## CV residual -21881 -13181 -20302 254477 23708 -12137 53869 10527
## 98 100 112 121 122 130 133 144 146
## Predicted 76701 29712 30579 11456 67707 57443 118380 115663 11679
## cvpred 71676 31677 31390 13485 65816 53225 113122 109542 17512
## AverageWinnings 24898 24936 28161 8519 29912 9776 53072 242848 8232
## CV residual -46778 -6741 -3229 -4966 -35904 -43449 -60050 133306 -9280
## 147 151 156 161 170 171 172 174
## Predicted 46713 48507 37144 55062 8528 50923 44527 36437
## cvpred 45636 48381 37497 55811 11892 51270 43496 37789
## AverageWinnings 14412 13563 7404 10610 19651 12162 79413 8315
## CV residual -31224 -34818 -30093 -45201 7759 -39108 35917 -29474
## 175 177 182 186 193
## Predicted 74210 64826 38571 54415 55971
## cvpred 69961 62893 39055 54152 52444
## AverageWinnings 16330 117969 66445 34926 43572
## CV residual -53631 55076 27390 -19226 -8872
##
## Sum of squares = 1.2e+11 Mean square = 3.09e+09 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 55165 105295 39413 81390 75368 64814 29908 -14292
## cvpred 51385 98369 37666 74840 72906 61810 33564 -9498
## AverageWinnings 23440 232812 54729 38406 45481 38302 4178 20993
## CV residual -27945 134443 17063 -36434 -27425 -23508 -29386 30491
## 41 42 44 48 56 59 60 64 70
## Predicted 15127 17496 53268 14628 60024 50155 50748 76975 7283
## cvpred 14685 12992 47643 15238 59167 49333 50299 75623 8536
## AverageWinnings 13220 11269 23129 9787 15756 37158 57227 87257 8252
## CV residual -1465 -1723 -24514 -5451 -43411 -12175 6928 11634 -284
## 82 83 85 91 102 107 110 129 138
## Predicted 33807 31617 79421 72991 24337 59746 27998 54458 2502
## cvpred 33607 28857 73439 68413 25859 57887 26138 51030 2960
## AverageWinnings 19584 11349 90071 64589 67813 86574 21091 31531 9356
## CV residual -14023 -17508 16632 -3824 41954 28687 -5047 -19499 6396
## 139 148 150 153 154 165 167 168 176
## Predicted 26731 27825 40501 24459 23722 76934 -6817 68024 94821
## cvpred 23656 27594 37471 23141 25799 72237 -6165 62265 86351
## AverageWinnings 22616 58213 23399 8363 10782 88527 11339 122341 158938
## CV residual -1040 30619 -14072 -14778 -15017 16290 17504 60076 72587
## 181 184 187 194
## Predicted 62488 31182 3222 57663
## cvpred 57751 28760 3473 53160
## AverageWinnings 42623 25697 11308 376040
## CV residual -15128 -3063 7835 322880
##
## Sum of squares = 1.46e+11 Mean square = 3.74e+09 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 2.26e+09
# Squared cross-validation prediction errors for Model_Pga3
cv_sq_err_3 <- (PGA$AverageWinnings - KCV3$cvpred)^2
# MSPE = PRESS / n: average over the observations, not over the 5 folds
# (agrees with the overall `ms` printed by cv.lm)
mean(cv_sq_err_3)
## [1] 2.26e+09
# PRESS statistic: sum of squared cross-validation prediction errors
sum(cv_sq_err_3)
## [1] 4.43e+11
# Prediction R-squared = 1 - PRESS / total sum of squares
1 - sum(cv_sq_err_3) / sum((PGA$AverageWinnings - mean(PGA$AverageWinnings))^2)
## [1] 0.249
# Traditional (in-sample) R-squared, for comparison
summary(Model_Pga3)$r.squared
## [1] 0.275
The prediction R-squared is highest for the model in question 2, whose covariates are Driving Accuracy (percent), Greens on Regulation (%), Average # of Putts, Save Percent, and # Events.
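The gap between the traditional and the prediction R-squared should ideally be small. For the model in question 2 it is 0.449 - 0.389 = 0.060; Model_Pga3 has a smaller gap (0.275 - 0.249 = 0.026), but its prediction R-squared is much lower.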
From the above inference we can conclude that the model in question 2 is the best-fitting of the models considered.
iris.csv is a multivariate data set introduced by the British statistician and biologist Ronald Fisher. It consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Please read the data set into R and visualize the data.
Iris_Df <- read.csv("C:/Users/bhuvi iyer/Downloads/iris.csv",h=T)
# Print the whole data frame for inspection
Iris_Df
## X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1 5.1 3.5 1.4 0.2 setosa
## 2 2 4.9 3.0 1.4 0.2 setosa
## 3 3 4.7 3.2 1.3 0.2 setosa
## 4 4 4.6 3.1 1.5 0.2 setosa
## 5 5 5.0 3.6 1.4 0.2 setosa
## 6 6 5.4 3.9 1.7 0.4 setosa
## 7 7 4.6 3.4 1.4 0.3 setosa
## 8 8 5.0 3.4 1.5 0.2 setosa
## 9 9 4.4 2.9 1.4 0.2 setosa
## 10 10 4.9 3.1 1.5 0.1 setosa
## 11 11 5.4 3.7 1.5 0.2 setosa
## 12 12 4.8 3.4 1.6 0.2 setosa
## 13 13 4.8 3.0 1.4 0.1 setosa
## 14 14 4.3 3.0 1.1 0.1 setosa
## 15 15 5.8 4.0 1.2 0.2 setosa
## 16 16 5.7 4.4 1.5 0.4 setosa
## 17 17 5.4 3.9 1.3 0.4 setosa
## 18 18 5.1 3.5 1.4 0.3 setosa
## 19 19 5.7 3.8 1.7 0.3 setosa
## 20 20 5.1 3.8 1.5 0.3 setosa
## 21 21 5.4 3.4 1.7 0.2 setosa
## 22 22 5.1 3.7 1.5 0.4 setosa
## 23 23 4.6 3.6 1.0 0.2 setosa
## 24 24 5.1 3.3 1.7 0.5 setosa
## 25 25 4.8 3.4 1.9 0.2 setosa
## 26 26 5.0 3.0 1.6 0.2 setosa
## 27 27 5.0 3.4 1.6 0.4 setosa
## 28 28 5.2 3.5 1.5 0.2 setosa
## 29 29 5.2 3.4 1.4 0.2 setosa
## 30 30 4.7 3.2 1.6 0.2 setosa
## 31 31 4.8 3.1 1.6 0.2 setosa
## 32 32 5.4 3.4 1.5 0.4 setosa
## 33 33 5.2 4.1 1.5 0.1 setosa
## 34 34 5.5 4.2 1.4 0.2 setosa
## 35 35 4.9 3.1 1.5 0.2 setosa
## 36 36 5.0 3.2 1.2 0.2 setosa
## 37 37 5.5 3.5 1.3 0.2 setosa
## 38 38 4.9 3.6 1.4 0.1 setosa
## 39 39 4.4 3.0 1.3 0.2 setosa
## 40 40 5.1 3.4 1.5 0.2 setosa
## 41 41 5.0 3.5 1.3 0.3 setosa
## 42 42 4.5 2.3 1.3 0.3 setosa
## 43 43 4.4 3.2 1.3 0.2 setosa
## 44 44 5.0 3.5 1.6 0.6 setosa
## 45 45 5.1 3.8 1.9 0.4 setosa
## 46 46 4.8 3.0 1.4 0.3 setosa
## 47 47 5.1 3.8 1.6 0.2 setosa
## 48 48 4.6 3.2 1.4 0.2 setosa
## 49 49 5.3 3.7 1.5 0.2 setosa
## 50 50 5.0 3.3 1.4 0.2 setosa
## 51 51 7.0 3.2 4.7 1.4 versicolor
## 52 52 6.4 3.2 4.5 1.5 versicolor
## 53 53 6.9 3.1 4.9 1.5 versicolor
## 54 54 5.5 2.3 4.0 1.3 versicolor
## 55 55 6.5 2.8 4.6 1.5 versicolor
## 56 56 5.7 2.8 4.5 1.3 versicolor
## 57 57 6.3 3.3 4.7 1.6 versicolor
## 58 58 4.9 2.4 3.3 1.0 versicolor
## 59 59 6.6 2.9 4.6 1.3 versicolor
## 60 60 5.2 2.7 3.9 1.4 versicolor
## 61 61 5.0 2.0 3.5 1.0 versicolor
## 62 62 5.9 3.0 4.2 1.5 versicolor
## 63 63 6.0 2.2 4.0 1.0 versicolor
## 64 64 6.1 2.9 4.7 1.4 versicolor
## 65 65 5.6 2.9 3.6 1.3 versicolor
## 66 66 6.7 3.1 4.4 1.4 versicolor
## 67 67 5.6 3.0 4.5 1.5 versicolor
## 68 68 5.8 2.7 4.1 1.0 versicolor
## 69 69 6.2 2.2 4.5 1.5 versicolor
## 70 70 5.6 2.5 3.9 1.1 versicolor
## 71 71 5.9 3.2 4.8 1.8 versicolor
## 72 72 6.1 2.8 4.0 1.3 versicolor
## 73 73 6.3 2.5 4.9 1.5 versicolor
## 74 74 6.1 2.8 4.7 1.2 versicolor
## 75 75 6.4 2.9 4.3 1.3 versicolor
## 76 76 6.6 3.0 4.4 1.4 versicolor
## 77 77 6.8 2.8 4.8 1.4 versicolor
## 78 78 6.7 3.0 5.0 1.7 versicolor
## 79 79 6.0 2.9 4.5 1.5 versicolor
## 80 80 5.7 2.6 3.5 1.0 versicolor
## 81 81 5.5 2.4 3.8 1.1 versicolor
## 82 82 5.5 2.4 3.7 1.0 versicolor
## 83 83 5.8 2.7 3.9 1.2 versicolor
## 84 84 6.0 2.7 5.1 1.6 versicolor
## 85 85 5.4 3.0 4.5 1.5 versicolor
## 86 86 6.0 3.4 4.5 1.6 versicolor
## 87 87 6.7 3.1 4.7 1.5 versicolor
## 88 88 6.3 2.3 4.4 1.3 versicolor
## 89 89 5.6 3.0 4.1 1.3 versicolor
## 90 90 5.5 2.5 4.0 1.3 versicolor
## 91 91 5.5 2.6 4.4 1.2 versicolor
## 92 92 6.1 3.0 4.6 1.4 versicolor
## 93 93 5.8 2.6 4.0 1.2 versicolor
## 94 94 5.0 2.3 3.3 1.0 versicolor
## 95 95 5.6 2.7 4.2 1.3 versicolor
## 96 96 5.7 3.0 4.2 1.2 versicolor
## 97 97 5.7 2.9 4.2 1.3 versicolor
## 98 98 6.2 2.9 4.3 1.3 versicolor
## 99 99 5.1 2.5 3.0 1.1 versicolor
## 100 100 5.7 2.8 4.1 1.3 versicolor
## 101 101 6.3 3.3 6.0 2.5 virginica
## 102 102 5.8 2.7 5.1 1.9 virginica
## 103 103 7.1 3.0 5.9 2.1 virginica
## 104 104 6.3 2.9 5.6 1.8 virginica
## 105 105 6.5 3.0 5.8 2.2 virginica
## 106 106 7.6 3.0 6.6 2.1 virginica
## 107 107 4.9 2.5 4.5 1.7 virginica
## 108 108 7.3 2.9 6.3 1.8 virginica
## 109 109 6.7 2.5 5.8 1.8 virginica
## 110 110 7.2 3.6 6.1 2.5 virginica
## 111 111 6.5 3.2 5.1 2.0 virginica
## 112 112 6.4 2.7 5.3 1.9 virginica
## 113 113 6.8 3.0 5.5 2.1 virginica
## 114 114 5.7 2.5 5.0 2.0 virginica
## 115 115 5.8 2.8 5.1 2.4 virginica
## 116 116 6.4 3.2 5.3 2.3 virginica
## 117 117 6.5 3.0 5.5 1.8 virginica
## 118 118 7.7 3.8 6.7 2.2 virginica
## 119 119 7.7 2.6 6.9 2.3 virginica
## 120 120 6.0 2.2 5.0 1.5 virginica
## 121 121 6.9 3.2 5.7 2.3 virginica
## 122 122 5.6 2.8 4.9 2.0 virginica
## 123 123 7.7 2.8 6.7 2.0 virginica
## 124 124 6.3 2.7 4.9 1.8 virginica
## 125 125 6.7 3.3 5.7 2.1 virginica
## 126 126 7.2 3.2 6.0 1.8 virginica
## 127 127 6.2 2.8 4.8 1.8 virginica
## 128 128 6.1 3.0 4.9 1.8 virginica
## 129 129 6.4 2.8 5.6 2.1 virginica
## 130 130 7.2 3.0 5.8 1.6 virginica
## 131 131 7.4 2.8 6.1 1.9 virginica
## 132 132 7.9 3.8 6.4 2.0 virginica
## 133 133 6.4 2.8 5.6 2.2 virginica
## 134 134 6.3 2.8 5.1 1.5 virginica
## 135 135 6.1 2.6 5.6 1.4 virginica
## 136 136 7.7 3.0 6.1 2.3 virginica
## 137 137 6.3 3.4 5.6 2.4 virginica
## 138 138 6.4 3.1 5.5 1.8 virginica
## 139 139 6.0 3.0 4.8 1.8 virginica
## 140 140 6.9 3.1 5.4 2.1 virginica
## 141 141 6.7 3.1 5.6 2.4 virginica
## 142 142 6.9 3.1 5.1 2.3 virginica
## 143 143 5.8 2.7 5.1 1.9 virginica
## 144 144 6.8 3.2 5.9 2.3 virginica
## 145 145 6.7 3.3 5.7 2.5 virginica
## 146 146 6.7 3.0 5.2 2.3 virginica
## 147 147 6.3 2.5 5.0 1.9 virginica
## 148 148 6.5 3.0 5.2 2.0 virginica
## 149 149 6.2 3.4 5.4 2.3 virginica
## 150 150 5.9 3.0 5.1 1.8 virginica
# Pooled simple regression of sepal length on sepal width (all species together)
model_Iris <- lm(Sepal.Length ~ Sepal.Width, data = Iris_Df)
summary(model_Iris)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = Iris_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.556 -0.633 -0.112 0.558 2.223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.526 0.479 13.63 <2e-16 ***
## Sepal.Width -0.223 0.155 -1.44 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.825 on 148 degrees of freedom
## Multiple R-squared: 0.0138, Adjusted R-squared: 0.00716
## F-statistic: 2.07 on 1 and 148 DF, p-value: 0.152
Filtering data for only setosa
# Keep only the setosa rows; original row names are retained
Setosa_Df <- Iris_Df[Iris_Df[["Species"]] == "setosa", , drop = FALSE]
head(Setosa_Df)
## X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1 5.1 3.5 1.4 0.2 setosa
## 2 2 4.9 3.0 1.4 0.2 setosa
## 3 3 4.7 3.2 1.3 0.2 setosa
## 4 4 4.6 3.1 1.5 0.2 setosa
## 5 5 5.0 3.6 1.4 0.2 setosa
## 6 6 5.4 3.9 1.7 0.4 setosa
Model for the filtered dataframe
# Sepal length regressed on sepal width, setosa observations only
model_Iris_Setosa <- lm(Sepal.Length ~ Sepal.Width, data = Setosa_Df)
summary(model_Iris_Setosa)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = Setosa_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5248 -0.1629 0.0217 0.1383 0.4443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6390 0.3100 8.51 3.7e-11 ***
## Sepal.Width 0.6905 0.0899 7.68 6.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.239 on 48 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.542
## F-statistic: 59 on 1 and 48 DF, p-value: 6.71e-10
Filtering data for only versicolor.
# Keep only the versicolor rows; original row names are retained
Versicolor_Df <- Iris_Df[Iris_Df[["Species"]] == "versicolor", , drop = FALSE]
head(Versicolor_Df)
## X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 51 51 7.0 3.2 4.7 1.4 versicolor
## 52 52 6.4 3.2 4.5 1.5 versicolor
## 53 53 6.9 3.1 4.9 1.5 versicolor
## 54 54 5.5 2.3 4.0 1.3 versicolor
## 55 55 6.5 2.8 4.6 1.5 versicolor
## 56 56 5.7 2.8 4.5 1.3 versicolor
Model for the filtered dataframe
# Sepal length regressed on sepal width, versicolor observations only
model_Iris_versicolor <- lm(Sepal.Length ~ Sepal.Width, data = Versicolor_Df)
summary(model_Iris_versicolor)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = Versicolor_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7350 -0.2856 -0.0754 0.4367 0.8380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.540 0.563 6.29 9.1e-08 ***
## Sepal.Width 0.865 0.202 4.28 8.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 48 degrees of freedom
## Multiple R-squared: 0.277, Adjusted R-squared: 0.262
## F-statistic: 18.4 on 1 and 48 DF, p-value: 8.77e-05
# Keep only the virginica rows; original row names are retained
Virginica_df <- Iris_Df[Iris_Df[["Species"]] == "virginica", , drop = FALSE]
head(Virginica_df)
## X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 101 101 6.3 3.3 6.0 2.5 virginica
## 102 102 5.8 2.7 5.1 1.9 virginica
## 103 103 7.1 3.0 5.9 2.1 virginica
## 104 104 6.3 2.9 5.6 1.8 virginica
## 105 105 6.5 3.0 5.8 2.2 virginica
## 106 106 7.6 3.0 6.6 2.1 virginica
# Sepal length regressed on sepal width, virginica observations only
model_Iris_virginica <- lm(Sepal.Length ~ Sepal.Width, data = Virginica_df)
summary(model_Iris_virginica)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = Virginica_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2607 -0.3692 -0.0361 0.1984 1.4492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.907 0.757 5.16 4.7e-06 ***
## Sepal.Width 0.902 0.253 3.56 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.571 on 48 degrees of freedom
## Multiple R-squared: 0.209, Adjusted R-squared: 0.193
## F-statistic: 12.7 on 1 and 48 DF, p-value: 0.000843
In question 2, sepal length and sepal width do not appear to be related when all species are pooled (p-value = 0.152). When each species is modelled separately, however, the regression gives a different result: the slope is positive and significant within every species. The per-species models therefore fit better and have appropriately small p-values.
# Species as a factor: read.csv() returns character columns under R >= 4.0,
# and as.numeric()/levels() on a character vector would give NA/NULL.
# factor() is a no-op if Species already is a factor.
species <- factor(Iris_Df$Species)
# One colour and one point symbol per species level
species_cols <- c("#00AFBB", "#E7B800", "#FC4E07")
species_pch <- c(16, 17, 18)
# Per-observation colour and symbol, looked up by species code
colors <- species_cols[as.integer(species)]
shapes <- species_pch[as.integer(species)]
Model_Iris_2 <- lm(Sepal.Length ~ Petal.Length, data=Iris_Df) # obtain least square estimate
plot(Iris_Df$Petal.Length, Iris_Df$Sepal.Length, pch = shapes, col = colors, frame.plot = FALSE)
abline(Model_Iris_2)
# Legend reuses the palettes so it always matches the plotted points
legend("topright", legend = levels(species),
       col = species_cols,
       pch = species_pch)
summary(Model_Iris_2)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = Iris_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2468 -0.2966 -0.0152 0.2768 1.0027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3066 0.0784 54.9 <2e-16 ***
## Petal.Length 0.4089 0.0189 21.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.407 on 148 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.758
## F-statistic: 469 on 1 and 148 DF, p-value: <2e-16
Analysis for setosa
# Sepal length regressed on petal length, setosa observations only
model_Iris_Setosa_2 <- lm(Sepal.Length ~ Petal.Length, data = Setosa_Df)
# Scatter plot of the setosa data with the fitted line overlaid
plot(Setosa_Df$Petal.Length,Setosa_Df$Sepal.Length,pch=20)
abline(model_Iris_Setosa_2)
summary(model_Iris_Setosa_2)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = Setosa_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5724 -0.2067 -0.0308 0.1734 0.9361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.213 0.416 10.14 1.6e-13 ***
## Petal.Length 0.542 0.282 1.92 0.061 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.0714, Adjusted R-squared: 0.052
## F-statistic: 3.69 on 1 and 48 DF, p-value: 0.0607
Analysis for Versicolor
# Sepal length regressed on petal length, versicolor observations only
model_Iris_versicolor_2 <- lm(Sepal.Length ~ Petal.Length, data = Versicolor_Df)
# Scatter plot of the versicolor data with the fitted line overlaid
plot(Versicolor_Df$Petal.Length,Versicolor_Df$Sepal.Length,pch=20)
abline(model_Iris_versicolor_2)
summary(model_Iris_versicolor_2)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = Versicolor_Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7348 -0.2027 -0.0206 0.2609 0.6996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.408 0.446 5.39 2.1e-06 ***
## Petal.Length 0.828 0.104 7.95 2.6e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.569, Adjusted R-squared: 0.56
## F-statistic: 63.3 on 1 and 48 DF, p-value: 2.59e-10
Analysis for Virginica
# Sepal length regressed on petal length, virginica observations only
model_Iris_virginica_2 <- lm(Sepal.Length ~ Petal.Length, data = Virginica_df)
# Scatter plot of the virginica data with the fitted line overlaid
plot(Virginica_df$Petal.Length,Virginica_df$Sepal.Length,pch=20)
abline(model_Iris_virginica_2)
summary(model_Iris_virginica_2)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = Virginica_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7341 -0.2364 -0.0313 0.2377 0.7621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0597 0.4668 2.27 0.028 *
## Petal.Length 0.9957 0.0837 11.90 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.323 on 48 degrees of freedom
## Multiple R-squared: 0.747, Adjusted R-squared: 0.742
## F-statistic: 142 on 1 and 48 DF, p-value: 6.3e-16
The slope for the data combining all three species is 0.4089. The slope for the setosa species is 0.542, for versicolor it is 0.828, and for virginica it is 0.9957.
The slope is positive in every case, which shows that all the fitted lines run uphill.