library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#A: Import the data
candy<-read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv",
header=TRUE)
#View(candy)
#B: SLR
ggplot(candy, aes(sugarpercent, winpercent))+
geom_point()

#It looks like there is a weak or no relationship between winpercent and sugarpercent. If there is a form and direction, it would be linear and weakly positive. No obvious outlires.
mod1 <- lm(winpercent ~ sugarpercent, candy)
summary(mod1)
##
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.924 -11.066 -1.168 9.252 36.851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.609 3.086 14.455 <2e-16 ***
## sugarpercent 11.924 5.560 2.145 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared: 0.05251, Adjusted R-squared: 0.04109
## F-statistic: 4.6 on 1 and 83 DF, p-value: 0.0349
#as sugarpercent increases by 1 unit, inpercent increases by 11.924 units and there is a significnat relationship between sugarpercent and winpercent (p=0.0349)
ggplot(candy, aes(sugarpercent, winpercent))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(candy, aes(sugarpercent, mod1$residuals))+
geom_point()+
geom_hline(yintercept = 0, color = "green")

#This residual plot seems to show that the mean zero assumtion is met (there seems to be an equal amount of - and + error, which would average to zero)
#The residual plot also shows that the residuals are spead evenly, meeting the homoscasticity assumption.
qqnorm(mod1$residuals)
qqline(mod1$residuals)

#The qq plot shows that the residulas are relativly normally distributed with a little bit of deviation on the tails which is usually okay.
#C: Catagorical Explanatory Variables and Ineractions
mod2 <- lm(winpercent~sugarpercent+chocolate, candy)
summary(mod2)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.6981 -8.5153 -0.4489 7.7602 27.4820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.262 2.552 14.990 < 2e-16 ***
## sugarpercent 8.567 4.355 1.967 0.0525 .
## chocolate 18.273 2.469 7.401 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 82 degrees of freedom
## Multiple R-squared: 0.432, Adjusted R-squared: 0.4181
## F-statistic: 31.18 on 2 and 82 DF, p-value: 8.5e-11
#With all else held constant, as sugarpercent increases by 1 unit, winpercent increases by 8.567 units
#chocolate: Y=1, N=0 (so, N=0 is the reference group)
#No chocolate: Y = 38.262 + 8.567(sugarpercent)
#Yes chocolate: Y = 38.262 + 8.567(sugarpercent) + 18.273(1) = 56.535 + 8.567(sugarpercent)
ggplot(candy, aes(sugarpercent, winpercent))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
geom_abline(slope=8.567, intercept=38.262, color = "red")+
geom_abline(slope=8.567, intercept = 56.535, color = "red")
## `geom_smooth()` using formula 'y ~ x'

#Candy that contains chocolate has a significantly higher winpercent than candy that doesn't have chocolate (with all else held constant).
mod3 <- lm(winpercent ~ sugarpercent*chocolate, candy)
summary(mod3)
##
## Call:
## lm(formula = winpercent ~ sugarpercent * chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.4007 -8.0463 -0.7059 6.2815 28.5003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.778 2.878 13.820 <2e-16 ***
## sugarpercent 5.221 5.257 0.993 0.3236
## chocolate 13.051 5.230 2.495 0.0146 *
## sugarpercent:chocolate 10.586 9.350 1.132 0.2609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.21 on 81 degrees of freedom
## Multiple R-squared: 0.4408, Adjusted R-squared: 0.4201
## F-statistic: 21.28 on 3 and 81 DF, p-value: 2.916e-10
#When chocolate is a factor, the suagrpercent relates slightly differently to winpercent. When there is chocolate, the slope is 10.586 more units than if there was not chocolate.
#No Chocolate: 39.778 + 5.221(sugarpercent)
#Yes Chocolate: (39.778 + 13.051) + (5.221+10.586)(sugarpercent) = 52.829 + 15.807(sugarpercent)
ggplot(candy, aes(sugarpercent, winpercent))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
geom_abline(slope=5.221, intercept=39.778, color="purple")+
geom_abline(slope = 15.807, intercept = 52.829, color = "purple")
## `geom_smooth()` using formula 'y ~ x'

#There is not a significant interaction between sugarpercent and chocolate
#D: Model Selection
mod4 <- lm (winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent+sugarpercent, candy)
summary(mod4)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## nougat + crispedricewafer + hard + bar + pluribus + pricepercent +
## sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2244 -6.6247 0.1986 6.8420 23.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5340 4.3199 7.994 1.44e-11 ***
## chocolate 19.7481 3.8987 5.065 2.96e-06 ***
## fruity 9.4223 3.7630 2.504 0.01452 *
## caramel 2.2245 3.6574 0.608 0.54493
## peanutyalmondy 10.0707 3.6158 2.785 0.00681 **
## nougat 0.8043 5.7164 0.141 0.88849
## crispedricewafer 8.9190 5.2679 1.693 0.09470 .
## hard -6.1653 3.4551 -1.784 0.07852 .
## bar 0.4415 5.0611 0.087 0.93072
## pluribus -0.8545 3.0401 -0.281 0.77945
## pricepercent -5.9284 5.5132 -1.075 0.28578
## sugarpercent 9.0868 4.6595 1.950 0.05500 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.4709
## F-statistic: 7.797 on 11 and 73 DF, p-value: 9.504e-09
#chocolate, fruity, and peanutyalmondy are all significant at p<=0.05
#The reduced model: Y = 34.53 + 10.07(peanutyalmondy) + 9.42(fruity) + 19.75(chocolate)
Bwkmod1 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+pluribus+pricepercent+sugarpercent, candy)
summary(Bwkmod1)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## nougat + crispedricewafer + hard + pluribus + pricepercent +
## sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2429 -6.5991 0.1894 6.8524 23.9094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5938 4.2364 8.166 6.29e-12 ***
## chocolate 19.8199 3.7853 5.236 1.48e-06 ***
## fruity 9.4125 3.7360 2.519 0.01391 *
## caramel 2.2055 3.6263 0.608 0.54492
## peanutyalmondy 10.0518 3.5851 2.804 0.00645 **
## nougat 1.0626 4.8574 0.219 0.82744
## crispedricewafer 9.0877 4.8669 1.867 0.06583 .
## hard -6.1936 3.4168 -1.813 0.07393 .
## pluribus -0.9759 2.6845 -0.364 0.71723
## pricepercent -5.7803 5.2104 -1.109 0.27085
## sugarpercent 9.0699 4.6241 1.961 0.05359 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.63 on 74 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.478
## F-statistic: 8.693 on 10 and 74 DF, p-value: 3.102e-09
Bwkmod2 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+crispedricewafer+hard+pluribus+pricepercent+sugarpercent, candy)
summary(Bwkmod2)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## crispedricewafer + hard + pluribus + pricepercent + sugarpercent,
## data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2915 -6.8117 0.2741 6.8864 23.9675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.701 4.181 8.299 3.22e-12 ***
## chocolate 19.886 3.749 5.304 1.11e-06 ***
## fruity 9.383 3.710 2.529 0.01353 *
## caramel 2.398 3.496 0.686 0.49496
## peanutyalmondy 10.102 3.555 2.842 0.00577 **
## crispedricewafer 8.815 4.675 1.886 0.06322 .
## hard -6.259 3.382 -1.851 0.06814 .
## pluribus -1.120 2.586 -0.433 0.66638
## pricepercent -5.811 5.175 -1.123 0.26510
## sugarpercent 9.170 4.572 2.006 0.04849 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.56 on 75 degrees of freedom
## Multiple R-squared: 0.5399, Adjusted R-squared: 0.4846
## F-statistic: 9.777 on 9 and 75 DF, p-value: 9.705e-10
Bwkmod3 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+crispedricewafer+hard+pricepercent+sugarpercent, candy)
summary(Bwkmod3)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## crispedricewafer + hard + pricepercent + sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.8771 -6.7201 0.5461 6.5427 23.6202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.978 3.813 8.911 1.98e-13 ***
## chocolate 20.141 3.683 5.469 5.56e-07 ***
## fruity 9.358 3.689 2.536 0.01325 *
## caramel 2.726 3.395 0.803 0.42453
## peanutyalmondy 10.302 3.506 2.938 0.00437 **
## crispedricewafer 9.045 4.620 1.958 0.05392 .
## hard -6.005 3.313 -1.813 0.07382 .
## pricepercent -5.685 5.139 -1.106 0.27214
## sugarpercent 8.823 4.477 1.971 0.05239 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.51 on 76 degrees of freedom
## Multiple R-squared: 0.5387, Adjusted R-squared: 0.4902
## F-statistic: 11.09 on 8 and 76 DF, p-value: 3.031e-10
Bwkmod4 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+hard+pricepercent+sugarpercent, candy)
summary(Bwkmod4)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy +
## crispedricewafer + hard + pricepercent + sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.3149 -5.9715 0.8818 6.5006 23.7828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.393 3.769 9.125 6.88e-14 ***
## chocolate 19.987 3.669 5.447 5.91e-07 ***
## fruity 8.623 3.566 2.418 0.0180 *
## peanutyalmondy 10.044 3.483 2.883 0.0051 **
## crispedricewafer 9.424 4.585 2.055 0.0432 *
## hard -6.046 3.305 -1.829 0.0712 .
## pricepercent -5.463 5.120 -1.067 0.2893
## sugarpercent 9.540 4.377 2.180 0.0323 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.48 on 77 degrees of freedom
## Multiple R-squared: 0.5348, Adjusted R-squared: 0.4925
## F-statistic: 12.65 on 7 and 77 DF, p-value: 1.093e-10
Bwkmod5 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+hard+sugarpercent, candy)
summary(Bwkmod5)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy +
## crispedricewafer + hard + sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5030 -6.7026 0.5668 5.8802 24.0107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.941 3.518 9.365 2.12e-14 ***
## chocolate 19.147 3.587 5.338 8.98e-07 ***
## fruity 8.881 3.561 2.494 0.01473 *
## peanutyalmondy 9.483 3.446 2.752 0.00737 **
## crispedricewafer 8.385 4.484 1.870 0.06525 .
## hard -5.669 3.289 -1.724 0.08871 .
## sugarpercent 7.979 4.129 1.932 0.05693 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.49 on 78 degrees of freedom
## Multiple R-squared: 0.5279, Adjusted R-squared: 0.4916
## F-statistic: 14.54 on 6 and 78 DF, p-value: 4.622e-11
Bwkmod6 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+sugarpercent, candy)
summary(Bwkmod6)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy +
## crispedricewafer + sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.2424 -7.0503 0.5657 7.3837 25.6678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.605 3.556 9.170 4.54e-14 ***
## chocolate 19.670 3.618 5.436 5.89e-07 ***
## fruity 7.702 3.538 2.177 0.03246 *
## peanutyalmondy 9.851 3.482 2.829 0.00592 **
## crispedricewafer 8.654 4.537 1.907 0.06010 .
## sugarpercent 7.044 4.144 1.700 0.09308 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.62 on 79 degrees of freedom
## Multiple R-squared: 0.5099, Adjusted R-squared: 0.4789
## F-statistic: 16.44 on 5 and 79 DF, p-value: 4.295e-11
Bwkmod7 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer, candy)
summary(Bwkmod7)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy +
## crispedricewafer, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2846 -6.2386 0.5941 7.8253 23.4505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.411 3.186 11.114 < 2e-16 ***
## chocolate 20.240 3.645 5.552 3.56e-07 ***
## fruity 8.176 3.568 2.292 0.02456 *
## peanutyalmondy 10.291 3.514 2.929 0.00443 **
## crispedricewafer 9.049 4.584 1.974 0.05184 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.75 on 80 degrees of freedom
## Multiple R-squared: 0.492, Adjusted R-squared: 0.4666
## F-statistic: 19.37 on 4 and 80 DF, p-value: 3.545e-11
Bwkmod8 <- lm(winpercent~chocolate+fruity+peanutyalmondy, candy)
summary(Bwkmod8)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy,
## data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0497 -7.3084 -0.4523 7.9446 23.8712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.788 3.237 11.057 < 2e-16 ***
## chocolate 21.983 3.599 6.108 3.34e-08 ***
## fruity 7.753 3.625 2.139 0.0354 *
## peanutyalmondy 9.066 3.520 2.576 0.0118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.94 on 81 degrees of freedom
## Multiple R-squared: 0.4673, Adjusted R-squared: 0.4475
## F-statistic: 23.68 on 3 and 81 DF, p-value: 4.209e-11
#This Bwk model includes chcoclate, fruity, peanutyalmondy
#mod4 vs Bwkmod8
#Varibles are the same: chocolate, fruity, and peanutyalmondy
#MSEmod4 = 10.7 while MSEBwkmod8 = 10.94
#I would choose the mod4 because it's MSE is slightly smaller and it's R^2 and adjR^2 are bigger (meaning it explains more error that Bwkmod8)
#E: Inference and Prediction
confint(mod4)
## 2.5 % 97.5 %
## (Intercept) 25.9244325 43.1435243
## chocolate 11.9779198 27.5182141
## fruity 1.9226936 16.9219505
## caramel -5.0646342 9.5135969
## peanutyalmondy 2.8643149 17.2770620
## nougat -10.5884911 12.1971523
## crispedricewafer -1.5798982 19.4178378
## hard -13.0514159 0.7207629
## bar -9.6451266 10.5282068
## pluribus -6.9134540 5.2044550
## pricepercent -16.9162393 5.0595165
## sugarpercent -0.1995199 18.3730456
#My grandmother used to make homemade peanutbutter bonbons (bite sized peanutbutter/ricecrispy balls dipped in chocolate)
mod5 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus, candy)
summary(mod5)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## nougat + crispedricewafer + hard + bar + pluribus, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.6779 -5.6765 0.3966 7.0583 21.9144
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.0155 4.0781 8.586 9.13e-13 ***
## chocolate 19.9058 3.8975 5.107 2.41e-06 ***
## fruity 10.2677 3.7887 2.710 0.00833 **
## caramel 3.3843 3.6034 0.939 0.35065
## peanutyalmondy 10.1410 3.5949 2.821 0.00612 **
## nougat 2.4163 5.6897 0.425 0.67229
## crispedricewafer 8.9915 5.3279 1.688 0.09564 .
## hard -4.8726 3.4394 -1.417 0.16072
## bar -0.7220 4.8707 -0.148 0.88256
## pluribus -0.1599 3.0115 -0.053 0.95779
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.85 on 75 degrees of freedom
## Multiple R-squared: 0.5148, Adjusted R-squared: 0.4566
## F-statistic: 8.842 on 9 and 75 DF, p-value: 6.052e-09
bonbons <- data.frame(chocolate=1, fruity=0, caramel = 0, peanutyalmondy = 1, nougat = 0, crispedricewafer=1, hard = 0, bar = 0, pluribus = 1)
view(bonbons)
predict(mod5, bonbons)
## 1
## 73.8939
#I would create a confidence interval since they are more precise (prediction intervals will be wider)
confband <- predict(mod5, bonbons, interval = "confidence")
confband
## fit lwr upr
## 1 73.8939 60.84254 86.94526