# Load the PGA data set from its CSV file into the data frame `pga`.
pga <- read.csv(
  "C:/Users/Billa/Google Drive/MSIS/FLEX 2/DAM/Homeworks/Homework 3/PGA.csv"
)
# List the column names of the loaded data frame.
colnames(pga)
## [1] "Name" "Age" "AverageDrive"
## [4] "DrivingAccuracy" "GreensonRegulation" "AverageNumofPutts"
## [7] "SavePercent" "MoneyRank" "NumEvents"
## [10] "TotalWinnings" "AverageWinnings"
# Print descriptive statistics for every column of the data frame.
summary(object = pga)
## Name Age AverageDrive DrivingAccuracy
## Aaron Baddeley : 1 Min. :21.00 Min. :268.2 Min. :49.90
## Adam Scott : 1 1st Qu.:31.00 1st Qu.:281.3 1st Qu.:60.95
## Alex Cejka : 1 Median :35.50 Median :287.2 Median :64.30
## Andre Stolz : 1 Mean :35.96 Mean :287.2 Mean :64.08
## Arjun Atwal : 1 3rd Qu.:40.25 3rd Qu.:292.1 3rd Qu.:67.80
## Arron Oberholser: 1 Max. :51.00 Max. :314.4 Max. :77.20
## (Other) :190
## GreensonRegulation AverageNumofPutts SavePercent MoneyRank
## Min. :54.70 Min. :1.723 Min. :31.80 Min. : 1.00
## 1st Qu.:63.00 1st Qu.:1.762 1st Qu.:45.67 1st Qu.: 49.75
## Median :64.90 Median :1.776 Median :49.00 Median : 99.50
## Mean :64.90 Mean :1.778 Mean :48.97 Mean :101.80
## 3rd Qu.:66.83 3rd Qu.:1.796 3rd Qu.:52.42 3rd Qu.:151.25
## Max. :73.30 Max. :1.847 Max. :62.30 Max. :245.00
##
## NumEvents TotalWinnings AverageWinnings
## Min. :15.00 Min. : 21250 Min. : 850
## 1st Qu.:23.00 1st Qu.: 436617 1st Qu.: 15749
## Median :27.00 Median : 814989 Median : 30849
## Mean :26.19 Mean : 1134632 Mean : 46549
## 3rd Qu.:30.00 3rd Qu.: 1407922 3rd Qu.: 56209
## Max. :36.00 Max. :10905167 Max. :376040
##
# Pairwise correlation matrix of all columns except the first one (Name).
cor(pga[-1])
## Age AverageDrive DrivingAccuracy
## Age 1.00000000 -0.403540292 0.24340496
## AverageDrive -0.40354029 1.000000000 -0.61057290
## DrivingAccuracy 0.24340496 -0.610572899 1.00000000
## GreensonRegulation 0.04502772 0.192967032 0.29264516
## AverageNumofPutts 0.02349291 -0.004033659 0.07392944
## SavePercent 0.05739343 -0.163324369 0.05440433
## MoneyRank 0.05781969 -0.100771685 -0.05178986
## NumEvents -0.15997652 0.064939115 0.02423528
## TotalWinnings -0.05656756 0.195029075 -0.09179806
## AverageWinnings -0.05378647 0.197897841 -0.12835117
## GreensonRegulation AverageNumofPutts SavePercent
## Age 0.04502772 0.023492914 0.05739343
## AverageDrive 0.19296703 -0.004033659 -0.16332437
## DrivingAccuracy 0.29264516 0.073929437 0.05440433
## GreensonRegulation 1.00000000 -0.109282664 -0.09131139
## AverageNumofPutts -0.10928266 1.000000000 -0.33820386
## SavePercent -0.09131139 -0.338203855 1.00000000
## MoneyRank -0.53132270 0.542715767 -0.23441510
## NumEvents -0.04280127 0.091718103 -0.10112236
## TotalWinnings 0.43713537 -0.438274327 0.19897065
## AverageWinnings 0.39765144 -0.435545246 0.21347893
## MoneyRank NumEvents TotalWinnings AverageWinnings
## Age 0.05781969 -0.15997652 -0.05656756 -0.05378647
## AverageDrive -0.10077168 0.06493912 0.19502907 0.19789784
## DrivingAccuracy -0.05178986 0.02423528 -0.09179806 -0.12835117
## GreensonRegulation -0.53132270 -0.04280127 0.43713537 0.39765144
## AverageNumofPutts 0.54271577 0.09171810 -0.43827433 -0.43554525
## SavePercent -0.23441510 -0.10112236 0.19897065 0.21347893
## MoneyRank 1.00000000 0.09381435 -0.73930964 -0.70434981
## NumEvents 0.09381435 1.00000000 -0.15030655 -0.32728005
## TotalWinnings -0.73930964 -0.15030655 1.00000000 0.95403024
## AverageWinnings -0.70434981 -0.32728005 0.95403024 1.00000000
Looking at the correlation matrix, we can observe that total winnings is highly correlated with average winnings (0.954). Total winnings is also strongly negatively correlated with money rank (-0.739). (P.S.: only the significant correlations are considered here.)
# Scatterplot matrix of every pair of columns. Only the lower triangle is
# drawn, and each panel gets a smoothed trend line from panel.smooth.
pairs(
  pga,
  gap = 0,
  cex = 0.1,
  pch = 16,
  upper.panel = NULL,
  lower.panel = panel.smooth
)
# Plot one histogram per variable on a single 3 x 4 grid using hist().
# Columns are accessed as pga$<column> instead of through attach(pga), so the
# search path is left untouched and no existing names can be masked.
old_par <- par(mfrow = c(3, 4))  # par() returns the previous settings
hist(pga$Age, xlab = "Age", ylab = "Frequency", col = "Orange", main = "Age")
hist(pga$AverageDrive, xlab = "Average Drive", ylab = "Frequency", col = "Green", main = "Average Drive (Yd)")
hist(pga$DrivingAccuracy, xlab = "Driving accuracy (%)", ylab = "Frequency", col = "blue", main = "Driving accuracy")
hist(pga$GreensonRegulation, xlab = "Greens on regulation (%)", ylab = "Frequency", col = "cyan", main = "Greens On Regulation")
hist(pga$AverageNumofPutts, xlab = "Average # of putts", ylab = "Frequency", col = "pink", main = "Average # of putts")
hist(pga$SavePercent, xlab = "Save Percent", ylab = "Frequency", col = "yellow", main = "Save Percent")
hist(pga$MoneyRank, xlab = "Money Rank", ylab = "Frequency", col = "green2", main = "Money Rank")
hist(pga$NumEvents, xlab = "# Events", ylab = "Frequency", col = "magenta", main = "# Events")
hist(pga$TotalWinnings, xlab = "Total Winnings ($)", ylab = "Frequency", col = "grey", main = "Total Winnings ($)")
hist(pga$AverageWinnings, xlab = "Average winnings ($)", ylab = "Frequency", col = "red", main = "Average winnings ($)")
# Restore the previous plot layout so later plots are not forced onto the grid.
par(old_par)
# Fit the full linear regression model "avgwin": average winnings regressed
# on the 7 candidate predictors.
avgwin <- lm(
  pga$AverageWinnings ~ pga$Age + pga$AverageDrive + pga$DrivingAccuracy +
    pga$GreensonRegulation + pga$AverageNumofPutts + pga$SavePercent +
    pga$NumEvents
)
# Print the fitted model's coefficient table and goodness-of-fit statistics.
summary(avgwin)
##
## Call:
## lm(formula = pga$AverageWinnings ~ pga$Age + pga$AverageDrive +
## pga$DrivingAccuracy + pga$GreensonRegulation + pga$AverageNumofPutts +
## pga$SavePercent + pga$NumEvents)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71690 -22176 -6735 17147 247928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 945579.88 305886.59 3.091 0.00230 **
## pga$Age -587.13 519.32 -1.131 0.25968
## pga$AverageDrive -94.76 567.42 -0.167 0.86755
## pga$DrivingAccuracy -2360.57 854.02 -2.764 0.00628 **
## pga$GreensonRegulation 8466.04 1303.87 6.493 7.30e-10 ***
## pga$AverageNumofPutts -694226.49 138155.99 -5.025 1.17e-06 ***
## pga$SavePercent 1395.67 587.54 2.375 0.01853 *
## pga$NumEvents -3159.22 644.24 -4.904 2.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41430 on 188 degrees of freedom
## Multiple R-squared: 0.4527, Adjusted R-squared: 0.4323
## F-statistic: 22.21 on 7 and 188 DF, p-value: < 2.2e-16
# Draw the four lm diagnostic plots (Residuals vs Fitted, Normal Q-Q,
# Scale-Location, Residuals vs Leverage) on one 2 x 2 page.
# par() returns the previous settings; they are restored afterwards so the
# reduced margins and the grid layout do not leak into later plots.
old_par <- par(mfrow = c(2, 2), mai = c(.3, .3, .3, .3))
plot(avgwin, pch = 21, cex = 1, col = "orange")
par(old_par)
# t statistics of the coefficients, taken from the model summary.
# coef() plus the column name avoids partial matching of `$coef` against
# `$coefficients` and the positional column index 3.
coef(summary(avgwin))[, "t value"]
## (Intercept) pga$Age pga$AverageDrive
## 3.0912760 -1.1305567 -0.1670039
## pga$DrivingAccuracy pga$GreensonRegulation pga$AverageNumofPutts
## -2.7640660 6.4930148 -5.0249465
## pga$SavePercent pga$NumEvents
## 2.3754460 -4.9037814
Null hypothesis: H0: \(\beta_{1}\) = \(\beta_{2}\) = … = \(\beta_{k}\) = 0. Alternative hypothesis: H1: \(\beta_{j} \neq 0\) for at least one j. (The intercept \(\beta_{0}\) is not part of the test.)
This means that our null hypothesis is that none of the predictor variables is related to the response, i.e. their corresponding coefficients are zero. We decide whether to reject the null hypothesis by comparing the p-value with alpha (0.05). The p-value is the probability of observing a test statistic at least as extreme as ours if the null hypothesis were true. If the p-value is less than 0.05, such a result would be very unlikely under the null hypothesis, so we reject it; otherwise we fail to reject the null hypothesis.
# p-values of the coefficients, taken from the model summary.
# coef() plus the column name avoids partial matching of `$coef` against
# `$coefficients` and the positional column index 4.
coef(summary(avgwin))[, "Pr(>|t|)"]
## (Intercept) pga$Age pga$AverageDrive
## 2.296050e-03 2.596820e-01 8.675464e-01
## pga$DrivingAccuracy pga$GreensonRegulation pga$AverageNumofPutts
## 6.276772e-03 7.300592e-10 1.167423e-06
## pga$SavePercent pga$NumEvents
## 1.853368e-02 2.026906e-06
Observing these p-values, we can see that the p-value of Age is 0.260 and that of AverageDrive is 0.868, both of which are greater than 0.05. Hence, for these two coefficients, we fail to reject the null hypothesis. AverageDrive has the largest p-value, so we eliminate it from the equation first. After fitting a new model without this variable, the adjusted R-squared is 0.4352, compared with 0.4323 previously, which means we now have a slightly better fit.
# Refit the model without AverageDrive and print the p-values of the
# remaining coefficients.
avgwin2 <- lm(
  pga$AverageWinnings ~ pga$Age + pga$DrivingAccuracy +
    pga$GreensonRegulation + pga$AverageNumofPutts + pga$SavePercent +
    pga$NumEvents
)
# coef() plus the column name avoids partial matching of `$coef` against
# `$coefficients` and the positional column index 4.
coef(summary(avgwin2))[, "Pr(>|t|)"]
## (Intercept) pga$Age pga$DrivingAccuracy
## 8.649821e-04 2.523241e-01 2.784576e-04
## pga$GreensonRegulation pga$AverageNumofPutts pga$SavePercent
## 2.504737e-12 8.693815e-07 1.748792e-02
## pga$NumEvents
## 1.700236e-06
The p-value for Age is 0.25, which is again greater than 0.05. Hence, we remove this variable too, and the new model has an adjusted R-squared of 0.4343.
# Refit the model without Age and AverageDrive, then print its full summary.
avgwin3 <- lm(
  pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
    pga$AverageNumofPutts + pga$SavePercent + pga$NumEvents
)
summary(avgwin3)
##
## Call:
## lm(formula = pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
## pga$AverageNumofPutts + pga$SavePercent + pga$NumEvents)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69302 -23747 -6472 17736 243585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 916418.5 272719.9 3.360 0.000941 ***
## pga$DrivingAccuracy -2430.3 592.9 -4.099 6.14e-05 ***
## pga$GreensonRegulation 8391.1 1115.2 7.524 2.06e-12 ***
## pga$AverageNumofPutts -701063.0 137013.1 -5.117 7.57e-07 ***
## pga$SavePercent 1380.2 585.0 2.359 0.019322 *
## pga$NumEvents -3041.6 632.0 -4.812 3.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41360 on 190 degrees of freedom
## Multiple R-squared: 0.4488, Adjusted R-squared: 0.4343
## F-statistic: 30.94 on 5 and 190 DF, p-value: < 2.2e-16
Here the F value is compared with the critical value; equivalently, we can compare the p-value with alpha, which is 0.05 (95% confidence).
We can use the anova() function to test this.
# Analysis-of-variance table for the full model: F statistic and p-value
# for each term.
anova(object = avgwin)
## Analysis of Variance Table
##
## Response: pga$AverageWinnings
## Df Sum Sq Mean Sq F value Pr(>F)
## pga$Age 1 1.7059e+09 1.7059e+09 0.9937 0.3201135
## pga$AverageDrive 1 2.1867e+10 2.1867e+10 12.7378 0.0004548 ***
## pga$DrivingAccuracy 1 5.1862e+07 5.1862e+07 0.0302 0.8621994
## pga$GreensonRegulation 1 1.1345e+11 1.1345e+11 66.0847 5.666e-14 ***
## pga$AverageNumofPutts 1 7.5694e+10 7.5694e+10 44.0933 3.268e-10 ***
## pga$SavePercent 1 1.2892e+10 1.2892e+10 7.5101 0.0067270 **
## pga$NumEvents 1 4.1281e+10 4.1281e+10 24.0471 2.027e-06 ***
## Residuals 188 3.2273e+11 1.7167e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
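The p-value for each of these terms is less than 0.05 except for Age and Driving Accuracy. Hence, in this table, Age and Driving Accuracy appear insignificant while all of the remaining variables appear significant. Note that anova() applied to a single model uses sequential (Type I) sums of squares, so each F-test depends on the order in which the terms enter the model; this is why its verdicts on Average Drive and Driving Accuracy differ from the t-tests above, on which the final model is based. We have compared these values with 0.05 because we are testing at the 5% significance level (95% confidence). The final equation is as follows: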
AverageWinnings = \(\beta_{0}\) + \(\beta_{1}\) DrivingAccuracy + \(\beta_{2}\) GreensonRegulation + \(\beta_{3}\) AverageNumofPutts + \(\beta_{4}\) SavePercent + \(\beta_{5}\) NumEvents
AverageWinnings = 916418.5 - 2430.3 DrivingAccuracy + 8391.1 GreensonRegulation - 701063.0 AverageNumofPutts + 1380.2 SavePercent - 3041.6 NumEvents
# Reduced model without Age and Average Drive, compared against the full
# model with a partial F-test.
reduced2 <- lm(
  pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
    pga$AverageNumofPutts + pga$SavePercent + pga$NumEvents
)
anova(reduced2, avgwin)
## Analysis of Variance Table
##
## Model 1: pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
## pga$AverageNumofPutts + pga$SavePercent + pga$NumEvents
## Model 2: pga$AverageWinnings ~ pga$Age + pga$AverageDrive + pga$DrivingAccuracy +
## pga$GreensonRegulation + pga$AverageNumofPutts + pga$SavePercent +
## pga$NumEvents
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 190 3.2503e+11
## 2 188 3.2273e+11 2 2299556206 0.6698 0.513
The p-value is 0.513 (> 0.05), so we fail to reject the null hypothesis that the coefficients of Age and Average Drive are both zero; that is, these two variables together do not significantly improve the model.
# Reduced model without Age, Average Drive (Yards) and Save Percent,
# compared against the full model with a partial F-test.
reduced3 <- lm(
  pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
    pga$AverageNumofPutts + pga$NumEvents
)
anova(reduced3, avgwin)
## Analysis of Variance Table
##
## Model 1: pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
## pga$AverageNumofPutts + pga$NumEvents
## Model 2: pga$AverageWinnings ~ pga$Age + pga$AverageDrive + pga$DrivingAccuracy +
## pga$GreensonRegulation + pga$AverageNumofPutts + pga$SavePercent +
## pga$NumEvents
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 191 3.3456e+11
## 2 188 3.2273e+11 3 1.1822e+10 2.2956 0.0792 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is 0.0792 (> 0.05), so we fail to reject the null hypothesis that the coefficients of Age, Average Drive and Save Percent are all zero; that is, at the 5% level these three variables together do not significantly improve the model.
Standardized regression coefficients can be obtained using lm.beta, from the lm.beta package
# Standardized regression coefficients of the full model (7 predictors).
library(lm.beta)
## Warning: package 'lm.beta' was built under R version 3.3.2
lm.beta(object = avgwin)
##
## Call:
## lm(formula = pga$AverageWinnings ~ pga$Age + pga$AverageDrive +
## pga$DrivingAccuracy + pga$GreensonRegulation + pga$AverageNumofPutts +
## pga$SavePercent + pga$NumEvents)
##
## Standardized Coefficients::
## (Intercept) pga$Age pga$AverageDrive
## 0.00000000 -0.06831285 -0.01425906
## pga$DrivingAccuracy pga$GreensonRegulation pga$AverageNumofPutts
## -0.22790216 0.43883140 -0.29706575
## pga$SavePercent pga$NumEvents
## 0.13960500 -0.27161322
We can determine the influence of each of the variables using the standardized regression coefficients. Comparing their absolute values, we can conclude that Greens on regulation has the highest influence (0.439), followed by Average # of putts (-0.297).
Multicollinearity can be checked using the VIF (Variance Inflation Factor). Applying this to our final model:
# Fit the final 5-predictor model and compute the variance inflation factor
# of each predictor with car::vif.
final <- lm(
  pga$AverageWinnings ~ pga$DrivingAccuracy + pga$GreensonRegulation +
    pga$AverageNumofPutts + pga$SavePercent + pga$NumEvents
)
library(car)
## Warning: package 'car' was built under R version 3.3.2
vif(mod = final)
## pga$DrivingAccuracy pga$GreensonRegulation pga$AverageNumofPutts
## 1.129399 1.151848 1.184846
## pga$SavePercent pga$NumEvents
## 1.180236 1.017764
The largest VIF value is 1.184846 (< 10), so this model doesn’t suffer from multicollinearity.