library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
candy<-read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv", header=TRUE)
candy<-subset(candy, select = -c(competitorname))
#View(candy)
ggplot(candy, aes(sugarpercent, winpercent)) + geom_point()
The scatter plot above shows very weak or no correlation between sugar percent and win percent. There is no real discernable form or direction. There is a possible outlier around sugar percent of 2 and win percent 82 but is hard to tell with such little shape.
According to the model, for each increase in percentage of sugar, win percent increases by 11.92.
mod <- lm(formula = winpercent ~ sugarpercent, data = candy)
summary(mod)
##
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.924 -11.066 -1.168 9.252 36.851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.609 3.086 14.455 <2e-16 ***
## sugarpercent 11.924 5.560 2.145 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared: 0.05251, Adjusted R-squared: 0.04109
## F-statistic: 4.6 on 1 and 83 DF, p-value: 0.0349
With a p-value of 0.035 there is a significant relationship between sugar and win percentage.
ggplot(candy, aes(sugarpercent, winpercent)) + geom_point() + geom_abline(slope = mod$coefficients[2], intercept = mod$coefficients[1], color = "blue" )
d)
i) It looks like the mean zero assumptions hold true as the residuals seem to be evenly distributed above and below zero. Additionally, the spread of the residuals seems to be constant.
res <- mod$residuals
fitted = mod$fitted.values
ggplot(NULL, aes(fitted, res)) + geom_point() + geom_abline(slope = 0, intercept = 0, lty = 2, color = "red")
qqnorm(mod$residuals)
qqline(mod$residuals)
mod2 <- lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
summary(mod2)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.6981 -8.5153 -0.4489 7.7602 27.4820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.262 2.552 14.990 < 2e-16 ***
## sugarpercent 8.567 4.355 1.967 0.0525 .
## chocolate 18.273 2.469 7.401 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 82 degrees of freedom
## Multiple R-squared: 0.432, Adjusted R-squared: 0.4181
## F-statistic: 31.18 on 2 and 82 DF, p-value: 8.5e-11
ggplot(data=candy, aes(x=sugarpercent, y=winpercent, color=chocolate))+
geom_point()+
geom_abline(slope=mod2$coefficients[2], intercept = mod2$coefficients[1], col="black")+
geom_abline(slope=mod2$coefficients[2], intercept = mod2$coefficients[1]+mod2$coefficients[3], col=4)
mod3 <- lm(formula = winpercent ~ sugarpercent + chocolate + sugarpercent*chocolate, data = candy)
summary(mod3)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate + sugarpercent *
## chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.4007 -8.0463 -0.7059 6.2815 28.5003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.778 2.878 13.820 <2e-16 ***
## sugarpercent 5.221 5.257 0.993 0.3236
## chocolate 13.051 5.230 2.495 0.0146 *
## sugarpercent:chocolate 10.586 9.350 1.132 0.2609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.21 on 81 degrees of freedom
## Multiple R-squared: 0.4408, Adjusted R-squared: 0.4201
## F-statistic: 21.28 on 3 and 81 DF, p-value: 2.916e-10
ggplot(data=candy, aes(x=sugarpercent, y=winpercent, color=chocolate))+
geom_point()+
geom_abline(slope=mod3$coefficients[2], intercept = mod3$coefficients[1], col="black")+
geom_abline(slope=mod3$coefficients[2] + mod3$coefficients[4], intercept = mod3$coefficients[1]+mod3$coefficients[3], col=4)
saturated_model <- lm(winpercent ~ chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+sugarpercent+pricepercent, data = candy)
summary(saturated_model)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## nougat + crispedricewafer + hard + bar + pluribus + sugarpercent +
## pricepercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2244 -6.6247 0.1986 6.8420 23.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5340 4.3199 7.994 1.44e-11 ***
## chocolate 19.7481 3.8987 5.065 2.96e-06 ***
## fruity 9.4223 3.7630 2.504 0.01452 *
## caramel 2.2245 3.6574 0.608 0.54493
## peanutyalmondy 10.0707 3.6158 2.785 0.00681 **
## nougat 0.8043 5.7164 0.141 0.88849
## crispedricewafer 8.9190 5.2679 1.693 0.09470 .
## hard -6.1653 3.4551 -1.784 0.07852 .
## bar 0.4415 5.0611 0.087 0.93072
## pluribus -0.8545 3.0401 -0.281 0.77945
## sugarpercent 9.0868 4.6595 1.950 0.05500 .
## pricepercent -5.9284 5.5132 -1.075 0.28578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.4709
## F-statistic: 7.797 on 11 and 73 DF, p-value: 9.504e-09
reduced_model <- lm(winpercent ~ chocolate + peanutyalmondy + fruity, data = candy)
summary(reduced_model)
##
## Call:
## lm(formula = winpercent ~ chocolate + peanutyalmondy + fruity,
## data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0497 -7.3084 -0.4523 7.9446 23.8712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.788 3.237 11.057 < 2e-16 ***
## chocolate 21.983 3.599 6.108 3.34e-08 ***
## peanutyalmondy 9.066 3.520 2.576 0.0118 *
## fruity 7.753 3.625 2.139 0.0354 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.94 on 81 degrees of freedom
## Multiple R-squared: 0.4673, Adjusted R-squared: 0.4475
## F-statistic: 23.68 on 3 and 81 DF, p-value: 4.209e-11
library(leaps)
regfit.fwd<-regsubsets(winpercent~., data=candy,
method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(winpercent ~ ., data = candy, method = "forward")
## 11 Variables (and intercept)
## Forced in Forced out
## chocolate FALSE FALSE
## fruity FALSE FALSE
## caramel FALSE FALSE
## peanutyalmondy FALSE FALSE
## nougat FALSE FALSE
## crispedricewafer FALSE FALSE
## hard FALSE FALSE
## bar FALSE FALSE
## pluribus FALSE FALSE
## sugarpercent FALSE FALSE
## pricepercent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## chocolate fruity caramel peanutyalmondy nougat crispedricewafer hard
## 1 ( 1 ) "*" " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " "*" " " " " " "
## 3 ( 1 ) "*" "*" " " "*" " " " " " "
## 4 ( 1 ) "*" "*" " " "*" " " "*" " "
## 5 ( 1 ) "*" "*" " " "*" " " "*" " "
## 6 ( 1 ) "*" "*" " " "*" " " "*" "*"
## 7 ( 1 ) "*" "*" " " "*" " " "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## bar pluribus sugarpercent pricepercent
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " "*" " "
## 6 ( 1 ) " " " " "*" " "
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
forward_sel_mod <- lm(winpercent ~ chocolate+peanutyalmondy+fruity, data = candy)
summary(forward_sel_mod)
##
## Call:
## lm(formula = winpercent ~ chocolate + peanutyalmondy + fruity,
## data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0497 -7.3084 -0.4523 7.9446 23.8712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.788 3.237 11.057 < 2e-16 ***
## chocolate 21.983 3.599 6.108 3.34e-08 ***
## peanutyalmondy 9.066 3.520 2.576 0.0118 *
## fruity 7.753 3.625 2.139 0.0354 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.94 on 81 degrees of freedom
## Multiple R-squared: 0.4673, Adjusted R-squared: 0.4475
## F-statistic: 23.68 on 3 and 81 DF, p-value: 4.209e-11
I would include chocolate, peanutyalmondy and fruity.
c)
i) The variables are the same.
ii) The MSEs are the same.
iii) The models are identical.
confint(forward_sel_mod, level = .95)
## 2.5 % 97.5 %
## (Intercept) 29.3481409 42.22839
## chocolate 14.8220911 29.14475
## peanutyalmondy 2.0632117 16.06867
## fruity 0.5412327 14.96470