library(ISLR) library(tidyverse)
##A
library(ISLR)
library(tidyverse)
candy<-read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv",
header=TRUE)
summary(candy)
## competitorname chocolate fruity caramel
## Length:85 Min. :0.0000 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.4353 Mean :0.4471 Mean :0.1647
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## peanutyalmondy nougat crispedricewafer hard
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.1647 Mean :0.08235 Mean :0.08235 Mean :0.1765
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## bar pluribus sugarpercent pricepercent
## Min. :0.0000 Min. :0.0000 Min. :0.0110 Min. :0.0110
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.2200 1st Qu.:0.2550
## Median :0.0000 Median :1.0000 Median :0.4650 Median :0.4650
## Mean :0.2471 Mean :0.5176 Mean :0.4786 Mean :0.4689
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.7320 3rd Qu.:0.6510
## Max. :1.0000 Max. :1.0000 Max. :0.9880 Max. :0.9760
## winpercent
## Min. :22.45
## 1st Qu.:39.14
## Median :47.83
## Mean :50.32
## 3rd Qu.:59.86
## Max. :84.18
mod1<-lm(formula = winpercent ~ chocolate + fruity +caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus,data=candy)
summary(mod1)
##
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy +
## nougat + crispedricewafer + hard + bar + pluribus, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.6779 -5.6765 0.3966 7.0583 21.9144
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.0155 4.0781 8.586 9.13e-13 ***
## chocolate 19.9058 3.8975 5.107 2.41e-06 ***
## fruity 10.2677 3.7887 2.710 0.00833 **
## caramel 3.3843 3.6034 0.939 0.35065
## peanutyalmondy 10.1410 3.5949 2.821 0.00612 **
## nougat 2.4163 5.6897 0.425 0.67229
## crispedricewafer 8.9915 5.3279 1.688 0.09564 .
## hard -4.8726 3.4394 -1.417 0.16072
## bar -0.7220 4.8707 -0.148 0.88256
## pluribus -0.1599 3.0115 -0.053 0.95779
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.85 on 75 degrees of freedom
## Multiple R-squared: 0.5148, Adjusted R-squared: 0.4566
## F-statistic: 8.842 on 9 and 75 DF, p-value: 6.052e-09
##B
#a
ggplot(candy, aes(x= sugarpercent, y=winpercent, color = sugarpercent))+geom_point()
#I added color to the graph to make it more readable
#b
#Form: There seems to be no relationship between winpercent and sugarpercent
#Direction: The direction indicates towards linear
#Strength: Very weak, points are very scattered.
#Outlier: No real noticeable outliers, the points are so scattered its hard to tell
# where an outlier could be. The point at ~ (.05,83) is a potential outlier
#c
SugVSWin<-lm(winpercent~sugarpercent, data = candy)
summary(SugVSWin)
##
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.924 -11.066 -1.168 9.252 36.851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.609 3.086 14.455 <2e-16 ***
## sugarpercent 11.924 5.560 2.145 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared: 0.05251, Adjusted R-squared: 0.04109
## F-statistic: 4.6 on 1 and 83 DF, p-value: 0.0349
#i. The slope is 11.924 for sugarpercent. Increasing sugar increases the win % 11.94%. A p value of .03 tells us this is significant. When there is no sugar the predicted winscore is 44.61.
#ii. A p value of .035 for sugarpercent tells us this is significant.
#iii.
ggplot(candy, aes(x= sugarpercent, y=winpercent, color = sugarpercent))+geom_point()+geom_abline(intercept=44.609, slope =11.924, color = "red", lwd=1)
#d
#Yes we should trust the previous model
par(mfrow = c(2, 2))
plot(SugVSWin)
#ii.
qplot(x = sugarpercent, y = winpercent, data = candy,
geom = "point", alpha = I(.1), ylab = "winpercent") +
stat_smooth(method = "lm",
se = FALSE)
##C
#a
model2<-lm(winpercent~sugarpercent+chocolate, candy)
#i
summary(model2)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.6981 -8.5153 -0.4489 7.7602 27.4820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.262 2.552 14.990 < 2e-16 ***
## sugarpercent 8.567 4.355 1.967 0.0525 .
## chocolate 18.273 2.469 7.401 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 82 degrees of freedom
## Multiple R-squared: 0.432, Adjusted R-squared: 0.4181
## F-statistic: 31.18 on 2 and 82 DF, p-value: 8.5e-11
#If the candy contains no sugar or chocolate the predicted win score is 38.26%. Increasing by 8.567 if sugar is added.
#If the candy has chocolate and no sugar than the predicted win score is 56.54%, increasing by 8.567% when sugar is added.
#ii
#First equation is for no chocolate but contains sugar
#Second equation is for chocolate and sugar.
\[\hat{y}= 38.262+8.567x\] \[\hat{y}= 56.54+8.567x\]
#iii
ggplot(candy, aes(x=sugarpercent, y=winpercent, color="blue"))+
geom_point()+
geom_abline(intercept = 38.262, slope=8.567,color="red", lwd=1)+
geom_abline(intercept = 56.54, slope=8.567)
#iv
#While chocolate does boost the winpercent of a piece of candy by 18.3%, the candy containing sugar has an insignificant relation to winpercent.
#b.
#i
model3<-lm(winpercent~sugarpercent*chocolate, candy)
summary(model3)
##
## Call:
## lm(formula = winpercent ~ sugarpercent * chocolate, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.4007 -8.0463 -0.7059 6.2815 28.5003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.778 2.878 13.820 <2e-16 ***
## sugarpercent 5.221 5.257 0.993 0.3236
## chocolate 13.051 5.230 2.495 0.0146 *
## sugarpercent:chocolate 10.586 9.350 1.132 0.2609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.21 on 81 degrees of freedom
## Multiple R-squared: 0.4408, Adjusted R-squared: 0.4201
## F-statistic: 21.28 on 3 and 81 DF, p-value: 2.916e-10
#The slope coefficient is 5.221%.
#ii
#The first model looks at sugar content but not chocolate. The second model includes chocolate.
\[\hat{y}= 39.788+5.221x\] \[\hat{y}= 52.829+5.221x\]
#iii
ggplot(candy, aes(x=sugarpercent, y=winpercent))+
geom_point()+
geom_abline(intercept = 39.788, slope=5.221,color="red", lwd=1)+
geom_abline(intercept = 56.54, slope=5.221, color="brown", lwd=1)
#iv
#The two graphs look very similar, there is only a slight change in the interactions in this model and the previous model.
##D
library(leaps)
#a
model4<-lm(winpercent~sugarpercent+chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent, candy)
summary(model4)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate + fruity +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2244 -6.6247 0.1986 6.8420 23.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5340 4.3199 7.994 1.44e-11 ***
## sugarpercent 9.0868 4.6595 1.950 0.05500 .
## chocolate 19.7481 3.8987 5.065 2.96e-06 ***
## fruity 9.4223 3.7630 2.504 0.01452 *
## caramel 2.2245 3.6574 0.608 0.54493
## peanutyalmondy 10.0707 3.6158 2.785 0.00681 **
## nougat 0.8043 5.7164 0.141 0.88849
## crispedricewafer 8.9190 5.2679 1.693 0.09470 .
## hard -6.1653 3.4551 -1.784 0.07852 .
## bar 0.4415 5.0611 0.087 0.93072
## pluribus -0.8545 3.0401 -0.281 0.77945
## pricepercent -5.9284 5.5132 -1.075 0.28578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.4709
## F-statistic: 7.797 on 11 and 73 DF, p-value: 9.504e-09
#Chocolate, fruity, and peanutyalmondy
smolModel4<-lm(winpercent~sugarpercent+chocolate+fruity+peanutyalmondy, candy)
summary(smolModel4)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate + fruity +
## peanutyalmondy, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0204 -7.9139 -0.0392 7.4155 25.8389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.803 3.612 9.081 6.09e-14 ***
## sugarpercent 7.449 4.206 1.771 0.0804 .
## chocolate 21.301 3.574 5.961 6.45e-08 ***
## fruity 7.271 3.588 2.026 0.0461 *
## peanutyalmondy 8.657 3.482 2.486 0.0150 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.8 on 80 degrees of freedom
## Multiple R-squared: 0.4874, Adjusted R-squared: 0.4617
## F-statistic: 19.01 on 4 and 80 DF, p-value: 5.056e-11
#b
forwardselect<-regsubsets(winpercent~sugarpercent+chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent, data=candy,
method="forward")
summary(forwardselect)
## Subset selection object
## Call: regsubsets.formula(winpercent ~ sugarpercent + chocolate + fruity +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent, data = candy, method = "forward")
## 11 Variables (and intercept)
## Forced in Forced out
## sugarpercent FALSE FALSE
## chocolate FALSE FALSE
## fruity FALSE FALSE
## caramel FALSE FALSE
## peanutyalmondy FALSE FALSE
## nougat FALSE FALSE
## crispedricewafer FALSE FALSE
## hard FALSE FALSE
## bar FALSE FALSE
## pluribus FALSE FALSE
## pricepercent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## sugarpercent chocolate fruity caramel peanutyalmondy nougat
## 1 ( 1 ) " " "*" " " " " " " " "
## 2 ( 1 ) " " "*" " " " " "*" " "
## 3 ( 1 ) " " "*" "*" " " "*" " "
## 4 ( 1 ) " " "*" "*" " " "*" " "
## 5 ( 1 ) "*" "*" "*" " " "*" " "
## 6 ( 1 ) "*" "*" "*" " " "*" " "
## 7 ( 1 ) "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " "
## crispedricewafer hard bar pluribus pricepercent
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*"
## 8 ( 1 ) "*" "*" " " " " "*"
#c
#i
#I would use chocolate, fruity, and peanutyalmondy because they are statistically significant for our first model.
#ii
#For the reduced model the mse is:
mse <- mean(smolModel4$residuals^2)
mse
## [1] 109.6847
#iii
#I prefer the reduced model because it is easier to understand at a glance. I am also more familiar with the reduced version.
##E
#a
#Intercept: 6.09e^-14
#Sugarpercent: .804
#Chocolate: 6.45e^-8
#Fruity: .0461
#Peanutyalmondy: .0150
#b
#i
#I CHOOSE LOLLIPOPS
model6<-lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus, data=candy)
summary
## function (object, ...)
## UseMethod("summary")
## <bytecode: 0x0000000015e3e410>
## <environment: namespace:base>
#35.0155(intercept)+10.2677(fruity)-4.8726
# =40.4 winpercent
#ii
#Many of these predictors are not statistically significant therefore I wouldn't trust this model for new types of candies.