library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.1
library("psych")
## Warning: package 'psych' was built under R version 3.6.2
library("caTools")
## Warning: package 'caTools' was built under R version 3.6.2
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.1
## Warning: package 'stringr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('asbio')
## Warning: package 'asbio' was built under R version 3.6.2
## Loading required package: tcltk
##
## Attaching package: 'asbio'
## The following object is masked from 'package:psych':
##
## skew
## The following object is masked from 'package:DAAG':
##
## press
PGA.csv data set: fit a multiple linear regression to this data set. Use the log of Average Winnings as the response variable and use Age, Average Drive (Yards), Driving Accuracy (percent), Greens on Regulation (%), Average # of Putts, Save Percent, and # Events as covariates. Perform a 5-fold cross-validation and obtain the PRESS statistic, MSPE, and prediction R-squared.

setwd("C:/Users/Sreejith Nair/Google Drive/UC/Course/Data Analytics Methods BANA 7038/Week 3")
# Show the directory that PGA.csv is read from
getwd()
## [1] "C:/Users/Sreejith Nair/Google Drive/UC/Course/Data Analytics Methods BANA 7038/Week 3"
# Load the PGA data
pga <- read.csv("PGA.csv")
# Model A: log of average winnings regressed on all seven covariates
ModelA <- lm(
  formula = log(AverageWinnings) ~ Age + AverageDrive + DrivingAccuracy +
    GreensonRegulation + AverageNumofPutts + SavePercent + NumEvents,
  data = pga
)
# 5-fold cross-validation of Model A with a fixed seed
KCV <- cv.lm(data = pga, ModelA, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 0.0 0.0 0.02 0.89355
## AverageDrive 1 2.6 2.6 6.93 0.00916 **
## DrivingAccuracy 1 3.4 3.4 9.30 0.00262 **
## GreensonRegulation 1 57.3 57.3 155.58 < 2e-16 ***
## AverageNumofPutts 1 46.7 46.7 126.74 < 2e-16 ***
## SavePercent 1 4.4 4.4 11.92 0.00069 ***
## NumEvents 1 6.5 6.5 17.74 3.9e-05 ***
## Residuals 188 69.3 0.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = pga, ModelA, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39
## Predicted 10.026 9.854 10.214 9.244 10.999 10.286 9.480 9.77
## cvpred 10.064 9.902 10.211 9.277 11.122 10.183 9.444 9.71
## log(AverageWinnings) 10.607 9.693 9.921 8.704 10.560 10.554 9.710 11.31
## CV residual 0.543 -0.209 -0.289 -0.573 -0.562 0.372 0.266 1.60
## 43 53 72 73 75 76 78 81
## Predicted 9.288 11.628 8.692 10.456 10.4188 10.142 9.995 9.910
## cvpred 9.336 11.627 8.710 10.456 10.4348 10.215 10.066 9.923
## log(AverageWinnings) 8.384 11.495 8.603 10.212 10.4462 10.291 10.207 9.176
## CV residual -0.952 -0.132 -0.107 -0.244 0.0114 0.075 0.141 -0.748
## 92 111 114 116 119 120 125 127
## Predicted 11.070 8.71 11.232 10.912 10.249 11.2035 11.04 10.61
## cvpred 11.200 8.74 11.240 10.934 10.352 11.2533 11.11 10.58
## log(AverageWinnings) 10.509 6.94 11.086 10.419 9.830 11.2695 9.67 10.39
## CV residual -0.691 -1.80 -0.154 -0.515 -0.522 0.0162 -1.44 -0.19
## 128 136 141 145 152 159 160 162
## Predicted 9.989 10.686 10.815 9.605 11.13416 11.075 10.397 11.4116
## cvpred 10.057 10.721 10.775 9.736 11.11286 11.083 10.422 11.4386
## log(AverageWinnings) 9.809 9.899 10.172 9.060 11.11000 11.197 10.285 11.4985
## CV residual -0.248 -0.823 -0.603 -0.676 -0.00286 0.114 -0.138 0.0599
## 164 169 183 185 189 195 196
## Predicted 10.3566 10.051 9.807 9.90 10.480 10.7422 11.021
## cvpred 10.4150 10.040 9.815 9.92 10.462 10.8009 11.096
## log(AverageWinnings) 10.4683 10.792 10.527 11.64 10.770 10.8510 11.297
## CV residual 0.0533 0.752 0.712 1.72 0.309 0.0502 0.201
##
## Sum of squares = 17.6 Mean square = 0.45 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 9.593 9.216 10.266 8.703 10.440 10.398 9.428 10.671
## cvpred 9.537 9.116 10.304 8.566 10.337 10.413 9.300 10.673
## log(AverageWinnings) 10.128 9.663 10.500 8.804 11.190 10.937 9.661 9.845
## CV residual 0.591 0.547 0.197 0.239 0.853 0.524 0.361 -0.828
## 50 51 52 54 55 57 58 63
## Predicted 7.394 9.19 10.111 10.6 10.051 10.4804 8.556 10.322
## cvpred 7.258 9.20 10.086 10.6 10.057 10.4271 8.459 10.317
## log(AverageWinnings) 6.745 8.71 9.822 11.8 9.742 10.3810 9.212 10.750
## CV residual -0.513 -0.49 -0.263 1.2 -0.316 -0.0461 0.752 0.434
## 80 84 87 88 95 97 99 103
## Predicted 10.296 9.896 9.353 10.738 10.52 10.893 9.86 10.812
## cvpred 10.260 9.819 9.263 10.722 10.55 10.830 9.86 10.892
## log(AverageWinnings) 10.136 8.898 9.488 11.244 9.31 10.685 10.14 10.570
## CV residual -0.124 -0.922 0.225 0.521 -1.24 -0.145 0.28 -0.322
## 106 109 115 126 131 134 135 137 142
## Predicted 11.424 10.983 9.0705 9.087 11.202 9.138 10.59 9.250 9.24
## cvpred 11.403 10.913 8.9457 8.960 11.235 9.057 10.59 9.239 9.14
## log(AverageWinnings) 10.937 11.349 8.9735 9.779 11.740 9.398 10.14 9.421 10.29
## CV residual -0.466 0.435 0.0278 0.819 0.505 0.341 -0.45 0.183 1.15
## 143 149 155 173 188 190 192
## Predicted 11.871 9.732 10.17 10.09 11.487 9.36 9.52
## cvpred 11.844 9.723 10.09 10.04 11.460 9.26 9.46
## log(AverageWinnings) 12.480 10.414 11.47 10.66 11.166 10.31 8.32
## CV residual 0.635 0.692 1.39 0.62 -0.294 1.06 -1.14
##
## Sum of squares = 17.1 Mean square = 0.43 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47
## Predicted 10.687 9.8898 10.0 10.38 10.437 11.189 10.00 11.081
## cvpred 10.582 9.9251 10.1 10.52 10.495 11.230 9.91 11.146
## log(AverageWinnings) 10.855 9.8720 10.5 9.64 10.203 10.797 10.95 11.741
## CV residual 0.273 -0.0531 0.4 -0.88 -0.293 -0.433 1.04 0.595
## 62 66 67 68 69 71 74 77
## Predicted 8.694 10.878 9.17977 11.474 9.873 10.92 10.9683 8.368
## cvpred 8.698 11.007 9.17277 11.403 9.868 11.01 10.9089 8.216
## log(AverageWinnings) 8.071 10.868 9.16660 10.770 9.665 9.81 10.9660 8.700
## CV residual -0.627 -0.139 -0.00617 -0.633 -0.203 -1.20 0.0571 0.484
## 79 89 90 93 101 104 105 108
## Predicted 10.041 10.771 11.056 10.086 10.181 11.05 10.856 9.537
## cvpred 9.859 10.822 11.070 10.092 10.129 11.31 10.854 9.606
## log(AverageWinnings) 10.250 10.491 11.363 10.717 10.645 10.28 11.023 9.215
## CV residual 0.391 -0.331 0.293 0.625 0.516 -1.03 0.169 -0.392
## 113 117 118 123 124 132 140 157
## Predicted 9.545 9.259 11.68 10.711 10.445 10.167 10.217 10.411
## cvpred 9.503 9.281 11.88 10.707 10.595 10.261 10.194 10.332
## log(AverageWinnings) 10.446 9.053 10.72 11.448 10.373 9.964 10.530 10.784
## CV residual 0.943 -0.227 -1.16 0.741 -0.222 -0.297 0.335 0.452
## 158 163 166 178 179 180 191
## Predicted 9.331 11.656 9.860 11.112 10.421 12.212 9.84
## cvpred 9.228 11.667 9.850 11.154 10.278 12.298 9.99
## log(AverageWinnings) 9.816 12.100 10.636 10.452 10.753 12.551 8.71
## CV residual 0.588 0.433 0.786 -0.702 0.475 0.253 -1.27
##
## Sum of squares = 14.3 Mean square = 0.37 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32
## Predicted 10.806 10.737 10.763 10.120 10.31 10.562 10.331 11.166
## cvpred 10.831 10.755 10.784 10.125 10.22 10.567 10.350 11.127
## log(AverageWinnings) 10.984 10.641 11.049 10.612 9.91 10.165 11.301 11.609
## CV residual 0.153 -0.113 0.265 0.487 -0.31 -0.402 0.951 0.482
## 35 37 45 49 61 65 86 94 96
## Predicted 8.697 10.74 10.401 9.892 11.62 10.334 10.226 9.49 11.768
## cvpred 8.674 10.80 10.416 9.925 11.61 10.340 10.253 9.51 11.785
## log(AverageWinnings) 8.964 10.08 9.561 9.648 12.80 11.192 9.904 10.81 11.583
## CV residual 0.291 -0.72 -0.855 -0.277 1.19 0.852 -0.349 1.30 -0.203
## 98 100 112 121 122 130 133 144
## Predicted 10.919 11.021 9.918 8.614 10.452 9.478 10.4 12.212
## cvpred 10.908 11.089 9.875 8.568 10.479 9.525 10.4 12.246
## log(AverageWinnings) 10.123 10.124 10.246 9.050 10.306 9.188 10.9 12.400
## CV residual -0.785 -0.965 0.371 0.482 -0.173 -0.337 0.5 0.154
## 146 147 151 156 161 170 171 172
## Predicted 9.530 10.381 10.037 9.692 9.524 10.116 10.297 10.55
## cvpred 9.615 10.344 10.088 9.693 9.583 10.154 10.363 10.50
## log(AverageWinnings) 9.016 9.576 9.515 8.910 9.270 9.886 9.406 11.28
## CV residual -0.599 -0.768 -0.573 -0.784 -0.314 -0.268 -0.957 0.78
## 174 175 177 182 186 193
## Predicted 10.00 10.051 10.72 10.236 10.044 11.057
## cvpred 10.03 9.946 10.73 10.250 10.049 11.039
## log(AverageWinnings) 9.03 9.701 11.68 11.104 10.461 10.682
## CV residual -1.00 -0.245 0.95 0.854 0.412 -0.356
##
## Sum of squares = 16.1 Mean square = 0.41 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 9.562 11.671 9.920 11.033 10.616 10.876 8.974 10.028
## cvpred 9.575 11.590 9.934 11.035 10.569 10.905 9.065 10.126
## log(AverageWinnings) 10.062 12.358 10.910 10.556 10.725 10.553 8.338 9.952
## CV residual 0.487 0.768 0.976 -0.479 0.156 -0.352 -0.727 -0.174
## 41 42 44 48 56 59 60 64
## Predicted 9.901 9.569 10.575 9.860 10.097 10.109 10.8849 10.843
## cvpred 9.965 9.535 10.614 9.929 10.101 10.104 10.8872 10.738
## log(AverageWinnings) 9.489 9.330 10.049 9.189 9.665 10.523 10.9548 11.377
## CV residual -0.476 -0.205 -0.566 -0.741 -0.436 0.418 0.0676 0.638
## 70 82 83 85 91 102 107 110
## Predicted 9.139 10.318 10.250 11.284 11.0872 9.98 10.716 10.382
## cvpred 9.167 10.403 10.296 11.137 11.0119 10.03 10.683 10.454
## log(AverageWinnings) 9.018 9.882 9.337 11.408 11.0758 11.12 11.369 9.957
## CV residual -0.149 -0.521 -0.959 0.271 0.0639 1.10 0.686 -0.497
## 129 138 139 148 150 153 154 165
## Predicted 10.41451 9.71 10.3 10.8457 10.405 9.661 10.24 10.899
## cvpred 10.35197 9.78 10.4 10.8757 10.445 9.729 10.30 10.864
## log(AverageWinnings) 10.35873 9.14 10.0 10.9719 10.060 9.032 9.29 11.391
## CV residual 0.00676 -0.64 -0.4 0.0962 -0.385 -0.697 -1.02 0.528
## 167 168 176 181 184 187 194
## Predicted 9.3132 11.439 11.840 10.796 10.031 9.2926 12.109
## cvpred 9.3973 11.392 11.778 10.840 10.049 9.3569 12.052
## log(AverageWinnings) 9.3360 11.715 11.976 10.660 10.154 9.3333 12.837
## CV residual -0.0613 0.323 0.198 -0.179 0.105 -0.0236 0.785
##
## Sum of squares = 11.2 Mean square = 0.29 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.389
### Computes the PRESS (predictive residual sum of squares); the lower, the better
#' @title PRESS
#' @author Thomas Hopper
#' @description Returns the PRESS statistic (predictive residual sum of squares).
#' Useful for evaluating predictive power of regression models.
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return A single number: the sum of the squared predictive residuals.
PRESS <- function(linear.model) {
  # Hat values of the fitted model, one per observation
  hat_values <- lm.influence(linear.model)$hat
  # Predictive residual = ordinary residual / (1 - hat value)
  predictive_residuals <- residuals(linear.model) / (1 - hat_values)
  # PRESS is the sum of the squared predictive residuals
  sum(predictive_residuals^2)
}
### Computes the MSPE (mean square prediction error); the lower, the better
#' @title MSPE
#' @author Yichen Qin
#' @description Returns the MSPE statistic (mean square prediction error).
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return PRESS of the model divided by its number of residuals.
MSPE <- function(linear.model) {
  # MSPE = PRESS / sample size
  press_value <- PRESS(linear.model)
  sample_size <- length(residuals(linear.model))
  press_value / sample_size
}
#' @title Predictive R-squared
#' @description Returns 1 - PRESS / TSS, where TSS is the sum of every
#' 'Sum Sq' entry of the model's ANOVA table.
#' @param linear.model A linear regression model (class 'lm'). Required.
#' @return A single number: the predictive R-squared.
pred_r_squared <- function(linear.model) {
  # The ANOVA table holds one sum of squares per term plus the residual row
  sum_sq_column <- anova(linear.model)[["Sum Sq"]]
  # Adding all of them up gives the total sum of squares
  total_ss <- sum(sum_sq_column)
  # Predictive R^2
  1 - PRESS(linear.model) / total_ss
}
# Start the comparison table with the prediction diagnostics of Model A
ModelList <- data.frame(
  Model = 1,
  PRESS = PRESS(ModelA),
  MSPE = MSPE(ModelA),
  PredRsq = pred_r_squared(ModelA),
  Rsq = summary(ModelA)$r.squared
)
# Print the table
ModelList
## Model PRESS MSPE PredRsq Rsq
## 1 1 75.3 0.384 0.604 0.636
PRESS = 75.3, MSPE = 0.384, predicted R-squared = 0.604, original R-squared = 0.636.
# Model B: Model A without Age and AverageDrive
ModelB <- lm(
  formula = log(AverageWinnings) ~ DrivingAccuracy + GreensonRegulation +
    AverageNumofPutts + SavePercent + NumEvents,
  data = pga
)
# 5-fold cross-validation of Model B with the same seed as Model A
KCV <- cv.lm(data = pga, ModelB, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## DrivingAccuracy 1 0.3 0.3 0.83 0.36402
## GreensonRegulation 1 59.7 59.7 162.57 < 2e-16 ***
## AverageNumofPutts 1 49.4 49.4 134.60 < 2e-16 ***
## SavePercent 1 4.6 4.6 12.52 0.00051 ***
## NumEvents 1 6.5 6.5 17.60 4.2e-05 ***
## Residuals 190 69.8 0.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = pga, ModelB, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39
## Predicted 10.058 9.832 10.214 9.287 10.947 10.300 9.450 9.74
## cvpred 10.078 9.877 10.240 9.348 11.066 10.215 9.403 9.68
## log(AverageWinnings) 10.607 9.693 9.921 8.704 10.560 10.554 9.710 11.31
## CV residual 0.529 -0.184 -0.319 -0.644 -0.506 0.339 0.308 1.64
## 43 53 72 73 75 76 78 81
## Predicted 9.33 11.601 8.720 10.522 10.3 10.108 9.919 9.922
## cvpred 9.40 11.609 8.737 10.507 10.3 10.163 9.959 9.938
## log(AverageWinnings) 8.38 11.495 8.603 10.212 10.4 10.291 10.207 9.176
## CV residual -1.01 -0.114 -0.134 -0.295 0.1 0.128 0.248 -0.763
## 92 111 114 116 119 120 125 127 128
## Predicted 11.090 8.66 11.25 10.909 10.24 11.072 10.96 10.610 9.904
## cvpred 11.206 8.67 11.27 10.947 10.30 11.096 11.00 10.569 9.950
## log(AverageWinnings) 10.509 6.94 11.09 10.419 9.83 11.270 9.67 10.395 9.809
## CV residual -0.697 -1.73 -0.18 -0.528 -0.47 0.173 -1.33 -0.174 -0.141
## 136 141 145 152 159 160 162
## Predicted 10.748 10.857 9.558 11.13354 11.1271 10.467 11.4282
## cvpred 10.791 10.851 9.677 11.11514 11.1528 10.486 11.4451
## log(AverageWinnings) 9.899 10.172 9.060 11.11000 11.1969 10.285 11.4985
## CV residual -0.892 -0.679 -0.617 -0.00514 0.0441 -0.201 0.0534
## 164 169 183 185 189 195 196
## Predicted 10.3292 10.011 9.838 9.89 10.558 10.77818 10.97
## cvpred 10.3888 9.996 9.853 9.94 10.554 10.84727 11.02
## log(AverageWinnings) 10.4683 10.792 10.527 11.64 10.770 10.85101 11.30
## CV residual 0.0795 0.796 0.673 1.70 0.216 0.00374 0.28
##
## Sum of squares = 17.4 Mean square = 0.45 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46
## Predicted 9.487 9.235 10.263 8.710 10.441 10.303 9.413 10.659
## cvpred 9.407 9.141 10.276 8.577 10.329 10.291 9.264 10.663
## log(AverageWinnings) 10.128 9.663 10.500 8.804 11.190 10.937 9.661 9.845
## CV residual 0.721 0.523 0.225 0.227 0.862 0.646 0.397 -0.818
## 50 51 52 54 55 57 58 63 80
## Predicted 7.293 9.147 10.145 10.71 10.03 10.604 8.52 10.330 10.376
## cvpred 7.120 9.147 10.111 10.70 10.03 10.585 8.41 10.330 10.352
## log(AverageWinnings) 6.745 8.714 9.822 11.76 9.74 10.381 9.21 10.750 10.136
## CV residual -0.374 -0.433 -0.289 1.06 -0.29 -0.204 0.80 0.421 -0.216
## 84 87 88 95 97 99 103 106
## Predicted 9.96 9.440 10.733 10.53 10.918 9.841 10.865 11.316
## cvpred 9.89 9.376 10.713 10.55 10.857 9.860 10.968 11.289
## log(AverageWinnings) 8.90 9.488 11.244 9.31 10.685 10.144 10.570 10.937
## CV residual -0.99 0.112 0.531 -1.23 -0.172 0.284 -0.397 -0.352
## 109 115 126 131 134 135 137 142
## Predicted 11.067 9.09175 9.085 11.157 9.12 10.565 9.210 9.27
## cvpred 11.013 8.97550 8.950 11.183 9.03 10.569 9.190 9.16
## log(AverageWinnings) 11.349 8.97348 9.779 11.740 9.40 10.141 9.421 10.29
## CV residual 0.336 -0.00202 0.829 0.557 0.37 -0.427 0.232 1.13
## 143 149 155 173 188 190 192
## Predicted 11.870 9.671 10.11 10.154 11.535 9.45 9.52
## cvpred 11.861 9.651 10.02 10.091 11.517 9.34 9.45
## log(AverageWinnings) 12.480 10.414 11.47 10.658 11.166 10.31 8.32
## CV residual 0.619 0.763 1.45 0.568 -0.351 0.97 -1.12
##
## Sum of squares = 17.1 Mean square = 0.43 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47
## Predicted 10.665 9.860157 10.040 10.394 10.440 11.138 9.91 11.088
## cvpred 10.620 9.872372 10.025 10.522 10.478 11.235 9.86 11.176
## log(AverageWinnings) 10.855 9.872048 10.523 9.645 10.203 10.797 10.95 11.741
## CV residual 0.235 -0.000323 0.499 -0.878 -0.275 -0.438 1.09 0.565
## 62 66 67 68 69 71 74 77
## Predicted 8.73 10.8016 9.21643 11.488 9.931 10.91 11.020 8.348
## cvpred 8.69 10.8831 9.16978 11.575 9.953 11.08 11.077 8.253
## log(AverageWinnings) 8.07 10.8678 9.16660 10.770 9.665 9.81 10.966 8.700
## CV residual -0.62 -0.0153 -0.00318 -0.805 -0.288 -1.26 -0.111 0.448
## 79 89 90 93 101 104 105 108
## Predicted 10.061 10.808 11.031 10.061 10.172 10.994 10.806 9.510
## cvpred 9.993 10.746 11.005 10.101 10.262 11.142 10.821 9.536
## log(AverageWinnings) 10.250 10.491 11.363 10.717 10.645 10.279 11.023 9.215
## CV residual 0.258 -0.255 0.358 0.616 0.383 -0.863 0.201 -0.321
## 113 117 118 123 124 132 140 157
## Predicted 9.596 9.263 11.70 10.71 10.473 10.219 10.233 10.397
## cvpred 9.582 9.318 11.77 10.77 10.540 10.248 10.219 10.403
## log(AverageWinnings) 10.446 9.053 10.72 11.45 10.373 9.964 10.530 10.784
## CV residual 0.864 -0.264 -1.05 0.68 -0.167 -0.284 0.311 0.381
## 158 163 166 178 179 180 191
## Predicted 9.412 11.530 9.896 11.116 10.412 12.193 9.83
## cvpred 9.442 11.542 9.890 11.238 10.335 12.352 9.90
## log(AverageWinnings) 9.816 12.100 10.636 10.452 10.753 12.551 8.71
## CV residual 0.373 0.559 0.746 -0.786 0.418 0.199 -1.19
##
## Sum of squares = 13.4 Mean square = 0.34 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32 35
## Predicted 10.74 10.770 10.8 10.154 10.280 10.572 10.264 11.118 8.699
## cvpred 10.77 10.789 10.8 10.141 10.199 10.564 10.302 11.103 8.681
## log(AverageWinnings) 10.98 10.641 11.0 10.612 9.912 10.165 11.301 11.609 8.964
## CV residual 0.21 -0.147 0.2 0.472 -0.288 -0.399 0.999 0.506 0.283
## 37 45 49 61 65 86 94 96
## Predicted 10.727 10.429 9.897 11.63 10.400 10.268 9.58 11.845
## cvpred 10.776 10.458 9.957 11.62 10.401 10.268 9.59 11.835
## log(AverageWinnings) 10.079 9.561 9.648 12.80 11.192 9.904 10.81 11.583
## CV residual -0.697 -0.897 -0.309 1.17 0.791 -0.364 1.22 -0.252
## 98 100 112 121 122 130 133 144
## Predicted 10.911 11.023 9.789 8.637 10.552 9.591 10.452 12.213
## cvpred 10.933 11.076 9.764 8.608 10.554 9.594 10.447 12.245
## log(AverageWinnings) 10.123 10.124 10.246 9.050 10.306 9.188 10.879 12.400
## CV residual -0.811 -0.952 0.482 0.442 -0.248 -0.407 0.432 0.155
## 146 147 151 156 161 170 171 172
## Predicted 9.565 10.328 10.066 9.743 9.618 10.129 10.35 10.519
## cvpred 9.625 10.289 10.122 9.756 9.673 10.145 10.41 10.494
## log(AverageWinnings) 9.016 9.576 9.515 8.910 9.270 9.886 9.41 11.282
## CV residual -0.609 -0.714 -0.607 -0.847 -0.403 -0.259 -1.00 0.788
## 174 175 177 182 186 193
## Predicted 10.03 10.02 10.723 10.229 10.077 11.01
## cvpred 10.07 9.95 10.723 10.248 10.087 10.99
## log(AverageWinnings) 9.03 9.70 11.678 11.104 10.461 10.68
## CV residual -1.05 -0.25 0.955 0.856 0.374 -0.31
##
## Sum of squares = 16.2 Mean square = 0.42 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 9.460 11.59 9.90 10.963 10.583 10.864 9.009 10.061
## cvpred 9.447 11.45 9.90 10.937 10.568 10.850 9.052 10.170
## log(AverageWinnings) 10.062 12.36 10.91 10.556 10.725 10.553 8.338 9.952
## CV residual 0.615 0.91 1.01 -0.381 0.158 -0.296 -0.714 -0.218
## 41 42 44 48 56 59 60 64
## Predicted 9.844 9.629 10.575 9.845 10.144 10.095 10.91650 10.918
## cvpred 9.875 9.634 10.590 9.918 10.128 10.094 10.94523 10.839
## log(AverageWinnings) 9.489 9.330 10.049 9.189 9.665 10.523 10.95478 11.377
## CV residual -0.386 -0.305 -0.541 -0.729 -0.463 0.429 0.00956 0.538
## 70 82 83 85 91 102 107 110
## Predicted 9.167 10.310 10.188 11.325 11.0926 9.97 10.666 10.400
## cvpred 9.228 10.363 10.201 11.269 11.0463 10.00 10.652 10.468
## log(AverageWinnings) 9.018 9.882 9.337 11.408 11.0758 11.12 11.369 9.957
## CV residual -0.209 -0.481 -0.864 0.139 0.0295 1.12 0.717 -0.512
## 129 138 139 148 150 153 154 165
## Predicted 10.488 9.651 10.306 10.831 10.417 9.656 10.172 10.855
## cvpred 10.480 9.747 10.352 10.848 10.457 9.680 10.195 10.838
## log(AverageWinnings) 10.359 9.144 10.026 10.972 10.060 9.032 9.286 11.391
## CV residual -0.122 -0.603 -0.325 0.124 -0.397 -0.649 -0.909 0.553
## 167 168 176 181 184 187 194
## Predicted 9.2913 11.464 11.799 10.7127 9.984 9.3274 12.169
## cvpred 9.3925 11.436 11.739 10.7147 10.041 9.4121 12.145
## log(AverageWinnings) 9.3360 11.715 11.976 10.6601 10.154 9.3333 12.837
## CV residual -0.0565 0.278 0.237 -0.0545 0.113 -0.0789 0.692
##
## Sum of squares = 10.7 Mean square = 0.27 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.382
# Append the prediction diagnostics of Model B as row 2 of the table
ModelList <- rbind.data.frame(
  ModelList,
  list(
    2,
    PRESS(ModelB),
    MSPE(ModelB),
    pred_r_squared(ModelB),
    summary(ModelB)$r.squared
  )
)
# Print the updated table
ModelList
## Model PRESS MSPE PredRsq Rsq
## 1 1 75.3 0.384 0.604 0.636
## 2 2 74.3 0.379 0.610 0.633
# Model C: only AverageNumofPutts, SavePercent and NumEvents as covariates
ModelC <- lm(
  formula = log(AverageWinnings) ~ AverageNumofPutts + SavePercent + NumEvents,
  data = pga
)
# 5-fold cross-validation of Model C with the same seed as the other models
KCV <- cv.lm(data = pga, ModelC, m = 5, seed = 123)
## Analysis of Variance Table
##
## Response: log(AverageWinnings)
## Df Sum Sq Mean Sq F value Pr(>F)
## AverageNumofPutts 1 63.4 63.4 103.68 < 2e-16 ***
## SavePercent 1 1.1 1.1 1.83 0.17735
## NumEvents 1 8.4 8.4 13.67 0.00028 ***
## Residuals 192 117.4 0.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in cv.lm(data = pga, ModelC, m = 5, seed = 123):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 39
## 4 5 11 13 25 36 38 39
## Predicted 10.345 10.328 10.208 9.50 9.996 10.888 9.877 10.385
## cvpred 10.365 10.371 10.194 9.56 10.054 10.821 9.845 10.347
## log(AverageWinnings) 10.607 9.693 9.921 8.70 10.560 10.554 9.710 11.312
## CV residual 0.242 -0.678 -0.272 -0.86 0.506 -0.267 -0.135 0.965
## 43 53 72 73 75 76 78 81
## Predicted 9.99 10.914 9.59 10.534 9.914 10.123 10.1520 10.27
## cvpred 10.05 10.860 9.65 10.447 9.872 10.187 10.1779 10.29
## log(AverageWinnings) 8.38 11.495 8.60 10.212 10.446 10.291 10.2074 9.18
## CV residual -1.67 0.635 -1.05 -0.235 0.575 0.104 0.0295 -1.12
## 92 111 114 116 119 120 125 127
## Predicted 9.763 9.48 10.975 10.304 9.687 10.26 10.371 11.045
## cvpred 9.830 9.51 10.972 10.288 9.696 10.25 10.389 11.008
## log(AverageWinnings) 10.509 6.94 11.086 10.419 9.830 11.27 9.668 10.395
## CV residual 0.679 -2.58 0.114 0.132 0.134 1.02 -0.721 -0.613
## 128 136 141 145 152 159 160 162
## Predicted 9.7995 10.175 11.20 9.711 10.75 10.527 10.397 11.152
## cvpred 9.8238 10.185 11.18 9.831 10.67 10.530 10.393 11.166
## log(AverageWinnings) 9.8085 9.899 10.17 9.060 11.11 11.197 10.285 11.498
## CV residual -0.0153 -0.286 -1.01 -0.771 0.44 0.667 -0.108 0.333
## 164 169 183 185 189 195 196
## Predicted 9.896 9.852 10.081 10.29 10.152 10.209 10.686
## cvpred 9.915 9.810 10.077 10.33 10.099 10.226 10.712
## log(AverageWinnings) 10.468 10.792 10.527 11.64 10.770 10.851 11.297
## CV residual 0.553 0.982 0.449 1.31 0.672 0.625 0.585
##
## Sum of squares = 24.4 Mean square = 0.63 n = 39
##
## fold 2
## Observations in test set: 40
## 9 12 18 24 28 33 40 46 50
## Predicted 9.786 8.89 10.908 8.938 9.78 10.731 9.439 10.829 8.77
## cvpred 9.763 8.90 10.904 8.949 9.77 10.723 9.443 10.828 8.78
## log(AverageWinnings) 10.128 9.66 10.500 8.804 11.19 10.937 9.661 9.845 6.75
## CV residual 0.365 0.76 -0.404 -0.144 1.42 0.213 0.218 -0.983 -2.03
## 51 52 54 55 57 58 63 80
## Predicted 10.08 10.287 10.983 10.546 10.750 8.903 10.586 10.2372
## cvpred 10.07 10.281 10.974 10.546 10.765 8.888 10.554 10.2313
## log(AverageWinnings) 8.71 9.822 11.761 9.742 10.381 9.212 10.750 10.1358
## CV residual -1.36 -0.459 0.787 -0.805 -0.384 0.324 0.196 -0.0956
## 84 87 88 95 97 99 103 106 109
## Predicted 9.9 9.195 10.321 10.37 10.058 10.447 11.251 10.839 10.44
## cvpred 9.9 9.193 10.311 10.36 10.067 10.431 11.263 10.825 10.42
## log(AverageWinnings) 8.9 9.488 11.244 9.31 10.685 10.144 10.570 10.937 11.35
## CV residual -1.0 0.295 0.932 -1.05 0.618 -0.286 -0.692 0.112 0.93
## 115 126 131 134 135 137 142 143
## Predicted 9.541 9.232 11.182 9.748 10.532 9.513 9.486 10.99
## cvpred 9.521 9.222 11.175 9.722 10.508 9.532 9.479 10.99
## log(AverageWinnings) 8.973 9.779 11.740 9.398 10.141 9.421 10.287 12.48
## CV residual -0.548 0.557 0.565 -0.324 -0.367 -0.111 0.808 1.49
## 149 155 173 188 190 192
## Predicted 9.900 9.98 10.352 10.549 9.495 10.03
## cvpred 9.896 9.98 10.330 10.523 9.481 10.01
## log(AverageWinnings) 10.414 11.47 10.658 11.166 10.314 8.32
## CV residual 0.518 1.50 0.328 0.644 0.834 -1.69
##
## Sum of squares = 27.2 Mean square = 0.68 n = 40
##
## fold 3
## Observations in test set: 39
## 14 15 17 19 21 27 30 47
## Predicted 10.28 10.358 10.049 10.450 10.0757 11.268 9.58 11.449
## cvpred 10.29 10.426 10.051 10.558 10.1266 11.415 9.57 11.539
## log(AverageWinnings) 10.86 9.872 10.523 9.645 10.2029 10.797 10.95 11.741
## CV residual 0.56 -0.554 0.472 -0.913 0.0763 -0.618 1.38 0.202
## 62 66 67 68 69 71 74 77 79
## Predicted 9.51 10.96 9.823 11.023 10.62 11.75 11.049 9.182 9.81
## cvpred 9.52 11.02 9.798 11.183 10.70 11.94 11.161 9.097 9.78
## log(AverageWinnings) 8.07 10.87 9.167 10.770 9.66 9.81 10.966 8.700 10.25
## CV residual -1.45 -0.15 -0.632 -0.413 -1.04 -2.13 -0.195 -0.397 0.47
## 89 90 93 101 104 105 108 113
## Predicted 10.196 10.548 10.536 10.741 11.5 10.603 9.951 9.836
## cvpred 10.214 10.634 10.605 10.854 11.6 10.687 9.964 9.858
## log(AverageWinnings) 10.491 11.363 10.717 10.645 10.3 11.023 9.215 10.446
## CV residual 0.277 0.729 0.112 -0.209 -1.3 0.335 -0.749 0.588
## 117 118 123 124 132 140 157 158
## Predicted 10.16 11.494 11.124 10.975 9.9615 10.35 10.241 9.536
## cvpred 10.23 11.674 11.267 11.064 10.0073 10.38 10.306 9.514
## log(AverageWinnings) 9.05 10.723 11.448 10.373 9.9641 10.53 10.784 9.816
## CV residual -1.18 -0.951 0.181 -0.691 -0.0432 0.15 0.478 0.302
## 163 166 178 179 180 191
## Predicted 10.37 10.41 11.263 10.076 11.874 10.32
## cvpred 10.38 10.46 11.359 10.109 12.048 10.33
## log(AverageWinnings) 12.10 10.64 10.452 10.753 12.551 8.71
## CV residual 1.72 0.18 -0.907 0.644 0.503 -1.62
##
## Sum of squares = 26.4 Mean square = 0.68 n = 39
##
## fold 4
## Observations in test set: 39
## 6 7 10 20 23 26 29 32
## Predicted 10.408 10.353 10.791 10.149 10.99 10.2594 9.70 10.858
## cvpred 10.433 10.359 10.851 10.127 10.95 10.2374 9.71 10.847
## log(AverageWinnings) 10.984 10.641 11.049 10.612 9.91 10.1648 11.30 11.609
## CV residual 0.551 0.282 0.198 0.486 -1.04 -0.0726 1.60 0.762
## 35 37 45 49 61 65 86 94
## Predicted 9.411 10.185 9.855 9.887 11.58 10.434 10.119 9.17
## cvpred 9.388 10.208 9.852 9.925 11.62 10.428 10.100 9.15
## log(AverageWinnings) 8.964 10.079 9.561 9.648 12.80 11.192 9.904 10.81
## CV residual -0.423 -0.129 -0.291 -0.278 1.18 0.764 -0.196 1.66
## 96 98 100 112 121 122 130 133
## Predicted 11.4870 10.7 9.936 10.109 9.609 10.717 10.22 11.616
## cvpred 11.5006 10.7 9.941 10.083 9.589 10.732 10.25 11.679
## log(AverageWinnings) 11.5829 10.1 10.124 10.246 9.050 10.306 9.19 10.879
## CV residual 0.0823 -0.6 0.183 0.163 -0.539 -0.426 -1.07 -0.799
## 144 146 147 151 156 161 170 171
## Predicted 11.59 9.486 10.504 10.065 10.01 10.182 9.606 10.158
## cvpred 11.64 9.514 10.471 10.113 10.02 10.245 9.584 10.205
## log(AverageWinnings) 12.40 9.016 9.576 9.515 8.91 9.270 9.886 9.406
## CV residual 0.76 -0.498 -0.895 -0.598 -1.11 -0.975 0.302 -0.799
## 172 174 175 177 182 186 193
## Predicted 10.415 9.953 10.99 10.685 10.08 10.4280 10.554
## cvpred 10.386 9.979 10.98 10.693 10.09 10.4438 10.533
## log(AverageWinnings) 11.282 9.026 9.70 11.678 11.10 10.4610 10.682
## CV residual 0.896 -0.953 -1.28 0.986 1.01 0.0172 0.149
##
## Sum of squares = 23 Mean square = 0.59 n = 39
##
## fold 5
## Observations in test set: 39
## 1 2 3 8 16 22 31 34
## Predicted 10.52 11.33 10.024 11.165 10.876 10.6505 9.81 9.063
## cvpred 10.49 11.22 9.995 11.109 10.831 10.6083 9.82 9.135
## log(AverageWinnings) 10.06 12.36 10.910 10.556 10.725 10.5533 8.34 9.952
## CV residual -0.43 1.14 0.915 -0.553 -0.106 -0.0551 -1.48 0.817
## 41 42 44 48 56 59 60 64
## Predicted 9.5411 9.622 10.702 9.649 10.462 10.256 10.388 10.650
## cvpred 9.5409 9.597 10.686 9.671 10.427 10.227 10.377 10.579
## log(AverageWinnings) 9.4895 9.330 10.049 9.189 9.665 10.523 10.955 11.377
## CV residual -0.0514 -0.267 -0.637 -0.482 -0.762 0.296 0.578 0.798
## 70 82 83 85 91 102 107 110
## Predicted 9.430 10.08 9.981 10.963 10.848 9.70 10.529 10.049
## cvpred 9.453 10.09 9.966 10.892 10.793 9.70 10.496 10.061
## log(AverageWinnings) 9.018 9.88 9.337 11.408 11.076 11.12 11.369 9.957
## CV residual -0.435 -0.21 -0.629 0.516 0.283 1.42 0.873 -0.104
## 129 138 139 148 150 153 154 165
## Predicted 10.546 9.425 10.0130 9.82 10.317 9.808 9.610 11.023
## cvpred 10.522 9.457 10.0158 9.81 10.313 9.805 9.607 10.976
## log(AverageWinnings) 10.359 9.144 10.0264 10.97 10.060 9.032 9.286 11.391
## CV residual -0.163 -0.313 0.0106 1.16 -0.252 -0.773 -0.321 0.415
## 167 168 176 181 184 187 194
## Predicted 9.2322 10.836 11.530 10.6632 10.1651 9.56 10.7
## cvpred 9.2728 10.787 11.462 10.6186 10.1766 9.60 10.6
## log(AverageWinnings) 9.3360 11.715 11.976 10.6601 10.1541 9.33 12.8
## CV residual 0.0632 0.927 0.514 0.0416 -0.0225 -0.27 2.2
##
## Sum of squares = 20.1 Mean square = 0.52 n = 39
##
## Overall (Sum over all 39 folds)
## ms
## 0.618
# Append the prediction diagnostics of Model C as row 3 of the table
ModelList <- rbind.data.frame(
  ModelList,
  list(
    3,
    PRESS(ModelC),
    MSPE(ModelC),
    pred_r_squared(ModelC),
    summary(ModelC)$r.squared
  )
)
# Print the final comparison table
ModelList
## Model PRESS MSPE PredRsq Rsq
## 1 1 75.3 0.384 0.604 0.636
## 2 2 74.3 0.379 0.610 0.633
## 3 3 123.0 0.628 0.353 0.383
From the above table, we can observe that Models 1 and 2 are better, since their relative drop from R-squared to predicted R-squared is much smaller than that of the last model. Between the two, Model 2 is better, with a lower PRESS and a higher predicted R-squared.
iris.csv is a multivariate data set introduced by the British statistician and biologist Ronald Fisher. It consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Please read the data set into R and visualize the data.

iris <- read.csv("iris.csv")
# Lay the four feature histograms out on a 2 x 2 grid
par(mfrow = c(2, 2))
hist(
  iris$Sepal.Length,
  col = "light green",
  xlab = "Sepal Length",
  main = "Distribution of Sepal Length"
)
hist(
  iris$Sepal.Width,
  col = "light green",
  xlab = "Sepal Width",
  main = "Distribution of Sepal Width"
)
hist(
  iris$Petal.Length,
  col = "light green",
  xlab = "Petal Length",
  main = "Distribution of Petal Length"
)
hist(
  iris$Petal.Width,
  col = "light green",
  xlab = "Petal Width",
  main = "Distribution of Petal Width"
)
# Scatter-plot matrix of every column of the data set
# NOTE(review): pairs() stops on non-numeric columns; under R >= 4.0
# read.csv() presumably leaves Species as character -- confirm before upgrading
pairs(iris, col = "blue")
# Simple regression of sepal length on sepal width, all species pooled
iris_modelA <- lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
summary(iris_modelA)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.556 -0.633 -0.112 0.558 2.223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.526 0.479 13.63 <2e-16 ***
## Sepal.Width -0.223 0.155 -1.44 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.825 on 148 degrees of freedom
## Multiple R-squared: 0.0138, Adjusted R-squared: 0.00716
## F-statistic: 2.07 on 1 and 148 DF, p-value: 0.152
The covariate Sepal.Width is statistically “not significant” (p = 0.15), whereas the intercept is statistically “significant” (p < 2e-16).
# Keep only the setosa rows, then refit sepal length on sepal width within
# that species.
iris_setosa <- iris[iris[["Species"]] == "setosa", ]
iris_setosa_modelA <- lm(Sepal.Length ~ Sepal.Width, data = iris_setosa)
summary(iris_setosa_modelA)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_setosa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5248 -0.1629 0.0217 0.1383 0.4443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6390 0.3100 8.51 3.7e-11 ***
## Sepal.Width 0.6905 0.0899 7.68 6.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.239 on 48 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.542
## F-statistic: 59 on 1 and 48 DF, p-value: 6.71e-10
Both the covariate Sepal.Width and the intercept are statistically “highly significant”.
# Keep only the versicolor rows, then refit sepal length on sepal width
# within that species.
iris_versicolor <- iris[iris[["Species"]] == "versicolor", ]
iris_versicolor_modelA <- lm(Sepal.Length ~ Sepal.Width, data = iris_versicolor)
summary(iris_versicolor_modelA)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_versicolor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7350 -0.2856 -0.0754 0.4367 0.8380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.540 0.563 6.29 9.1e-08 ***
## Sepal.Width 0.865 0.202 4.28 8.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 48 degrees of freedom
## Multiple R-squared: 0.277, Adjusted R-squared: 0.262
## F-statistic: 18.4 on 1 and 48 DF, p-value: 8.77e-05
With the filter Species = versicolor, both the intercept and the covariate Sepal.Width are “highly significant”.
# Keep only the virginica rows, then refit sepal length on sepal width
# within that species.
iris_virginica <- iris[iris[["Species"]] == "virginica", ]
iris_virginica_modelA <- lm(Sepal.Length ~ Sepal.Width, data = iris_virginica)
summary(iris_virginica_modelA)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris_virginica)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2607 -0.3692 -0.0361 0.1984 1.4492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.907 0.757 5.16 4.7e-06 ***
## Sepal.Width 0.902 0.253 3.56 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.571 on 48 degrees of freedom
## Multiple R-squared: 0.209, Adjusted R-squared: 0.193
## F-statistic: 12.7 on 1 and 48 DF, p-value: 0.000843
With filter Species = Virginica, both Intercept and Covariate: Sepal Width are “Highly Significant”
No, the results differ between the model built on the whole data set and the models built on the data stratified by species. This means the data contain distinct groups, each with its own relationship between the response and the regressor.
When the model is fitted to the whole data set, the differences between species confound this relationship (the pooled slope is slightly negative and not significant). When each species is fitted separately, a positive relationship is identified and found to be statistically “highly significant” in every group.
This can be very clearly observed in the below visualization
# Scatter plots of sepal length against sepal width, coloured by species,
# with the fitted regression lines overlaid.

# col= needs colour names or integer palette indices. read.csv() returns
# Species as a factor on R < 4.0 but as character on R >= 4.0, where passing
# it directly fails with "invalid color name". factor() yields integer codes
# in both cases and is a no-op when Species is already a factor.
species_col <- factor(iris$Species)

# Plot 1: all observations with the pooled (whole-data) regression line.
plot(iris$Sepal.Width, iris$Sepal.Length, pch = 20, col = species_col)
abline(iris_modelA, col = "red")

# Plot 2: the same points with one regression line per species.
plot(iris$Sepal.Width, iris$Sepal.Length, pch = 20, col = species_col)
abline(iris_setosa_modelA, col = "blue")
abline(iris_versicolor_modelA, col = "red")
abline(iris_virginica_modelA, col = "green")
# Pooled simple regression: sepal length on petal length, using every row of
# iris regardless of species.
iris_modelB <- lm(Sepal.Length ~ Petal.Length, data = iris)
summary(iris_modelB)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2468 -0.2966 -0.0152 0.2768 1.0027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3066 0.0784 54.9 <2e-16 ***
## Petal.Length 0.4089 0.0189 21.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.407 on 148 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.758
## F-statistic: 469 on 1 and 148 DF, p-value: <2e-16
# Within-species fit (setosa rows only): sepal length on petal length.
iris_setosa_modelB <- lm(Sepal.Length ~ Petal.Length, data = iris_setosa)
summary(iris_setosa_modelB)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_setosa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5724 -0.2067 -0.0308 0.1734 0.9361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.213 0.416 10.14 1.6e-13 ***
## Petal.Length 0.542 0.282 1.92 0.061 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.0714, Adjusted R-squared: 0.052
## F-statistic: 3.69 on 1 and 48 DF, p-value: 0.0607
# Within-species fit (versicolor rows only): sepal length on petal length.
iris_versicolor_modelB <- lm(Sepal.Length ~ Petal.Length, data = iris_versicolor)
summary(iris_versicolor_modelB)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_versicolor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7348 -0.2027 -0.0206 0.2609 0.6996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.408 0.446 5.39 2.1e-06 ***
## Petal.Length 0.828 0.104 7.95 2.6e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.343 on 48 degrees of freedom
## Multiple R-squared: 0.569, Adjusted R-squared: 0.56
## F-statistic: 63.3 on 1 and 48 DF, p-value: 2.59e-10
# Within-species fit (virginica rows only): sepal length on petal length.
iris_virginica_modelB <- lm(Sepal.Length ~ Petal.Length, data = iris_virginica)
summary(iris_virginica_modelB)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris_virginica)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7341 -0.2364 -0.0313 0.2377 0.7621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0597 0.4668 2.27 0.028 *
## Petal.Length 0.9957 0.0837 11.90 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.323 on 48 degrees of freedom
## Multiple R-squared: 0.747, Adjusted R-squared: 0.742
## F-statistic: 142 on 1 and 48 DF, p-value: 6.3e-16
# Scatter plots of sepal length against petal length, coloured by species,
# with the fitted regression lines overlaid.

# col= needs colour names or integer palette indices. read.csv() returns
# Species as a factor on R < 4.0 but as character on R >= 4.0, where passing
# it directly fails with "invalid color name". factor() yields integer codes
# in both cases and is a no-op when Species is already a factor.
species_col <- factor(iris$Species)

# Plot 1: all observations with the pooled (whole-data) regression line.
plot(iris$Petal.Length, iris$Sepal.Length, pch = 20, col = species_col)
abline(iris_modelB, col = "red")

# Plot 2: the same points with one regression line per species.
plot(iris$Petal.Length, iris$Sepal.Length, pch = 20, col = species_col)
abline(iris_setosa_modelB, col = "blue")
abline(iris_versicolor_modelB, col = "red")
abline(iris_virginica_modelB, col = "green")
In this case, the full model captures the variation in the data quite well, and the covariate is “highly significant”.
But when we fit the model to the individual groups, the covariate is statistically “not significant” for at least one group (setosa, p = 0.061). We can observe this in the visualization above: the blue regression line does not capture the black cluster (setosa) well.