library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#A: Import the data
candy<-read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv",
                header=TRUE)
#View(candy)

#B: SLR
ggplot(candy, aes(sugarpercent, winpercent))+
  geom_point()

  #It looks like there is a weak or no relationship between winpercent and sugarpercent. If there is a form and direction, it would be linear and weakly positive. No obvious outlires. 

mod1 <- lm(winpercent ~ sugarpercent, candy)
summary(mod1)
## 
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.924 -11.066  -1.168   9.252  36.851 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    44.609      3.086  14.455   <2e-16 ***
## sugarpercent   11.924      5.560   2.145   0.0349 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared:  0.05251,    Adjusted R-squared:  0.04109 
## F-statistic:   4.6 on 1 and 83 DF,  p-value: 0.0349
#as sugarpercent increases by 1 unit, inpercent increases by 11.924 units and there is a significnat relationship between sugarpercent and winpercent (p=0.0349)

ggplot(candy, aes(sugarpercent, winpercent))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(candy, aes(sugarpercent, mod1$residuals))+
  geom_point()+
  geom_hline(yintercept = 0, color = "green")

  #This residual plot seems to show that the mean zero assumtion is met (there seems to be an equal amount of - and + error, which would average to zero)
  #The residual plot also shows that the residuals are spead evenly, meeting the homoscasticity assumption. 
qqnorm(mod1$residuals)
qqline(mod1$residuals)

  #The qq plot shows that the residulas are relativly normally distributed with a little bit of deviation on the tails which is usually okay.

#C: Catagorical Explanatory Variables and Ineractions
mod2 <- lm(winpercent~sugarpercent+chocolate, candy)
summary(mod2)
## 
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.6981  -8.5153  -0.4489   7.7602  27.4820 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    38.262      2.552  14.990  < 2e-16 ***
## sugarpercent    8.567      4.355   1.967   0.0525 .  
## chocolate      18.273      2.469   7.401 1.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.22 on 82 degrees of freedom
## Multiple R-squared:  0.432,  Adjusted R-squared:  0.4181 
## F-statistic: 31.18 on 2 and 82 DF,  p-value: 8.5e-11
  #With all else held constant, as sugarpercent increases by 1 unit, winpercent increases by 8.567 units
  #chocolate: Y=1, N=0 (so, N=0 is the reference group)
    #No chocolate: Y = 38.262 + 8.567(sugarpercent) 
    #Yes chocolate: Y = 38.262 + 8.567(sugarpercent) + 18.273(1) = 56.535 + 8.567(sugarpercent) 
ggplot(candy, aes(sugarpercent, winpercent))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  geom_abline(slope=8.567, intercept=38.262, color = "red")+
  geom_abline(slope=8.567, intercept = 56.535, color = "red")
## `geom_smooth()` using formula 'y ~ x'

  #Candy that contains chocolate has a significantly higher winpercent than candy that doesn't have chocolate (with all else held constant).
  
mod3 <- lm(winpercent ~ sugarpercent*chocolate, candy)
summary(mod3) 
## 
## Call:
## lm(formula = winpercent ~ sugarpercent * chocolate, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.4007  -8.0463  -0.7059   6.2815  28.5003 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              39.778      2.878  13.820   <2e-16 ***
## sugarpercent              5.221      5.257   0.993   0.3236    
## chocolate                13.051      5.230   2.495   0.0146 *  
## sugarpercent:chocolate   10.586      9.350   1.132   0.2609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.21 on 81 degrees of freedom
## Multiple R-squared:  0.4408, Adjusted R-squared:  0.4201 
## F-statistic: 21.28 on 3 and 81 DF,  p-value: 2.916e-10
  #When chocolate is a factor, the suagrpercent relates slightly differently to winpercent. When there is chocolate, the slope is 10.586 more units than if there was not chocolate.
  #No Chocolate: 39.778 + 5.221(sugarpercent)
  #Yes Chocolate: (39.778 + 13.051) + (5.221+10.586)(sugarpercent) = 52.829 + 15.807(sugarpercent)
ggplot(candy, aes(sugarpercent, winpercent))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  geom_abline(slope=5.221, intercept=39.778, color="purple")+
  geom_abline(slope = 15.807, intercept = 52.829, color = "purple")
## `geom_smooth()` using formula 'y ~ x'

  #There is not a significant interaction between sugarpercent and chocolate

#D: Model Selection
mod4 <- lm (winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent+sugarpercent, candy)
summary(mod4)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     nougat + crispedricewafer + hard + bar + pluribus + pricepercent + 
##     sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2244  -6.6247   0.1986   6.8420  23.8680 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       34.5340     4.3199   7.994 1.44e-11 ***
## chocolate         19.7481     3.8987   5.065 2.96e-06 ***
## fruity             9.4223     3.7630   2.504  0.01452 *  
## caramel            2.2245     3.6574   0.608  0.54493    
## peanutyalmondy    10.0707     3.6158   2.785  0.00681 ** 
## nougat             0.8043     5.7164   0.141  0.88849    
## crispedricewafer   8.9190     5.2679   1.693  0.09470 .  
## hard              -6.1653     3.4551  -1.784  0.07852 .  
## bar                0.4415     5.0611   0.087  0.93072    
## pluribus          -0.8545     3.0401  -0.281  0.77945    
## pricepercent      -5.9284     5.5132  -1.075  0.28578    
## sugarpercent       9.0868     4.6595   1.950  0.05500 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared:  0.5402, Adjusted R-squared:  0.4709 
## F-statistic: 7.797 on 11 and 73 DF,  p-value: 9.504e-09
  #chocolate, fruity, and peanutyalmondy are all significant at p<=0.05
  #The reduced model: Y = 34.53 + 10.07(peanutyalmondy) + 9.42(fruity) + 19.75(chocolate)

Bwkmod1 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+pluribus+pricepercent+sugarpercent, candy)
summary(Bwkmod1)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     nougat + crispedricewafer + hard + pluribus + pricepercent + 
##     sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2429  -6.5991   0.1894   6.8524  23.9094 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       34.5938     4.2364   8.166 6.29e-12 ***
## chocolate         19.8199     3.7853   5.236 1.48e-06 ***
## fruity             9.4125     3.7360   2.519  0.01391 *  
## caramel            2.2055     3.6263   0.608  0.54492    
## peanutyalmondy    10.0518     3.5851   2.804  0.00645 ** 
## nougat             1.0626     4.8574   0.219  0.82744    
## crispedricewafer   9.0877     4.8669   1.867  0.06583 .  
## hard              -6.1936     3.4168  -1.813  0.07393 .  
## pluribus          -0.9759     2.6845  -0.364  0.71723    
## pricepercent      -5.7803     5.2104  -1.109  0.27085    
## sugarpercent       9.0699     4.6241   1.961  0.05359 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.63 on 74 degrees of freedom
## Multiple R-squared:  0.5402, Adjusted R-squared:  0.478 
## F-statistic: 8.693 on 10 and 74 DF,  p-value: 3.102e-09
Bwkmod2 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+crispedricewafer+hard+pluribus+pricepercent+sugarpercent, candy)
summary(Bwkmod2)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     crispedricewafer + hard + pluribus + pricepercent + sugarpercent, 
##     data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2915  -6.8117   0.2741   6.8864  23.9675 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        34.701      4.181   8.299 3.22e-12 ***
## chocolate          19.886      3.749   5.304 1.11e-06 ***
## fruity              9.383      3.710   2.529  0.01353 *  
## caramel             2.398      3.496   0.686  0.49496    
## peanutyalmondy     10.102      3.555   2.842  0.00577 ** 
## crispedricewafer    8.815      4.675   1.886  0.06322 .  
## hard               -6.259      3.382  -1.851  0.06814 .  
## pluribus           -1.120      2.586  -0.433  0.66638    
## pricepercent       -5.811      5.175  -1.123  0.26510    
## sugarpercent        9.170      4.572   2.006  0.04849 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.56 on 75 degrees of freedom
## Multiple R-squared:  0.5399, Adjusted R-squared:  0.4846 
## F-statistic: 9.777 on 9 and 75 DF,  p-value: 9.705e-10
Bwkmod3 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+crispedricewafer+hard+pricepercent+sugarpercent, candy)
summary(Bwkmod3)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     crispedricewafer + hard + pricepercent + sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.8771  -6.7201   0.5461   6.5427  23.6202 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        33.978      3.813   8.911 1.98e-13 ***
## chocolate          20.141      3.683   5.469 5.56e-07 ***
## fruity              9.358      3.689   2.536  0.01325 *  
## caramel             2.726      3.395   0.803  0.42453    
## peanutyalmondy     10.302      3.506   2.938  0.00437 ** 
## crispedricewafer    9.045      4.620   1.958  0.05392 .  
## hard               -6.005      3.313  -1.813  0.07382 .  
## pricepercent       -5.685      5.139  -1.106  0.27214    
## sugarpercent        8.823      4.477   1.971  0.05239 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.51 on 76 degrees of freedom
## Multiple R-squared:  0.5387, Adjusted R-squared:  0.4902 
## F-statistic: 11.09 on 8 and 76 DF,  p-value: 3.031e-10
Bwkmod4 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+hard+pricepercent+sugarpercent, candy)
summary(Bwkmod4)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy + 
##     crispedricewafer + hard + pricepercent + sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.3149  -5.9715   0.8818   6.5006  23.7828 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        34.393      3.769   9.125 6.88e-14 ***
## chocolate          19.987      3.669   5.447 5.91e-07 ***
## fruity              8.623      3.566   2.418   0.0180 *  
## peanutyalmondy     10.044      3.483   2.883   0.0051 ** 
## crispedricewafer    9.424      4.585   2.055   0.0432 *  
## hard               -6.046      3.305  -1.829   0.0712 .  
## pricepercent       -5.463      5.120  -1.067   0.2893    
## sugarpercent        9.540      4.377   2.180   0.0323 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.48 on 77 degrees of freedom
## Multiple R-squared:  0.5348, Adjusted R-squared:  0.4925 
## F-statistic: 12.65 on 7 and 77 DF,  p-value: 1.093e-10
Bwkmod5 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+hard+sugarpercent, candy)
summary(Bwkmod5)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy + 
##     crispedricewafer + hard + sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5030  -6.7026   0.5668   5.8802  24.0107 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        32.941      3.518   9.365 2.12e-14 ***
## chocolate          19.147      3.587   5.338 8.98e-07 ***
## fruity              8.881      3.561   2.494  0.01473 *  
## peanutyalmondy      9.483      3.446   2.752  0.00737 ** 
## crispedricewafer    8.385      4.484   1.870  0.06525 .  
## hard               -5.669      3.289  -1.724  0.08871 .  
## sugarpercent        7.979      4.129   1.932  0.05693 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.49 on 78 degrees of freedom
## Multiple R-squared:  0.5279, Adjusted R-squared:  0.4916 
## F-statistic: 14.54 on 6 and 78 DF,  p-value: 4.622e-11
Bwkmod6 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer+sugarpercent, candy)
summary(Bwkmod6)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy + 
##     crispedricewafer + sugarpercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2424  -7.0503   0.5657   7.3837  25.6678 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        32.605      3.556   9.170 4.54e-14 ***
## chocolate          19.670      3.618   5.436 5.89e-07 ***
## fruity              7.702      3.538   2.177  0.03246 *  
## peanutyalmondy      9.851      3.482   2.829  0.00592 ** 
## crispedricewafer    8.654      4.537   1.907  0.06010 .  
## sugarpercent        7.044      4.144   1.700  0.09308 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.62 on 79 degrees of freedom
## Multiple R-squared:  0.5099, Adjusted R-squared:  0.4789 
## F-statistic: 16.44 on 5 and 79 DF,  p-value: 4.295e-11
Bwkmod7 <- lm(winpercent~chocolate+fruity+peanutyalmondy+crispedricewafer, candy)
summary(Bwkmod7)  
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy + 
##     crispedricewafer, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2846  -6.2386   0.5941   7.8253  23.4505 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        35.411      3.186  11.114  < 2e-16 ***
## chocolate          20.240      3.645   5.552 3.56e-07 ***
## fruity              8.176      3.568   2.292  0.02456 *  
## peanutyalmondy     10.291      3.514   2.929  0.00443 ** 
## crispedricewafer    9.049      4.584   1.974  0.05184 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.75 on 80 degrees of freedom
## Multiple R-squared:  0.492,  Adjusted R-squared:  0.4666 
## F-statistic: 19.37 on 4 and 80 DF,  p-value: 3.545e-11
Bwkmod8 <- lm(winpercent~chocolate+fruity+peanutyalmondy, candy)
summary(Bwkmod8)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + peanutyalmondy, 
##     data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.0497  -7.3084  -0.4523   7.9446  23.8712 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      35.788      3.237  11.057  < 2e-16 ***
## chocolate        21.983      3.599   6.108 3.34e-08 ***
## fruity            7.753      3.625   2.139   0.0354 *  
## peanutyalmondy    9.066      3.520   2.576   0.0118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.94 on 81 degrees of freedom
## Multiple R-squared:  0.4673, Adjusted R-squared:  0.4475 
## F-statistic: 23.68 on 3 and 81 DF,  p-value: 4.209e-11
  #This Bwk model includes chcoclate, fruity, peanutyalmondy

  #mod4 vs Bwkmod8
    #Varibles are the same: chocolate, fruity, and peanutyalmondy
    #MSEmod4 = 10.7 while MSEBwkmod8 = 10.94
    #I would choose the mod4 because it's MSE is slightly smaller and it's R^2 and adjR^2 are bigger (meaning it explains more error that Bwkmod8)

#E: Inference and Prediction
confint(mod4)
##                        2.5 %     97.5 %
## (Intercept)       25.9244325 43.1435243
## chocolate         11.9779198 27.5182141
## fruity             1.9226936 16.9219505
## caramel           -5.0646342  9.5135969
## peanutyalmondy     2.8643149 17.2770620
## nougat           -10.5884911 12.1971523
## crispedricewafer  -1.5798982 19.4178378
## hard             -13.0514159  0.7207629
## bar               -9.6451266 10.5282068
## pluribus          -6.9134540  5.2044550
## pricepercent     -16.9162393  5.0595165
## sugarpercent      -0.1995199 18.3730456
  #My grandmother used to make homemade peanutbutter bonbons (bite sized peanutbutter/ricecrispy balls dipped in chocolate)
mod5 <- lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus, candy)
summary(mod5)
## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     nougat + crispedricewafer + hard + bar + pluribus, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.6779  -5.6765   0.3966   7.0583  21.9144 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       35.0155     4.0781   8.586 9.13e-13 ***
## chocolate         19.9058     3.8975   5.107 2.41e-06 ***
## fruity            10.2677     3.7887   2.710  0.00833 ** 
## caramel            3.3843     3.6034   0.939  0.35065    
## peanutyalmondy    10.1410     3.5949   2.821  0.00612 ** 
## nougat             2.4163     5.6897   0.425  0.67229    
## crispedricewafer   8.9915     5.3279   1.688  0.09564 .  
## hard              -4.8726     3.4394  -1.417  0.16072    
## bar               -0.7220     4.8707  -0.148  0.88256    
## pluribus          -0.1599     3.0115  -0.053  0.95779    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.85 on 75 degrees of freedom
## Multiple R-squared:  0.5148, Adjusted R-squared:  0.4566 
## F-statistic: 8.842 on 9 and 75 DF,  p-value: 6.052e-09
bonbons <- data.frame(chocolate=1, fruity=0, caramel = 0, peanutyalmondy = 1, nougat = 0, crispedricewafer=1, hard = 0, bar = 0, pluribus = 1)
   view(bonbons)
predict(mod5, bonbons)
##       1 
## 73.8939
  #I would create a confidence interval since they are more precise (prediction intervals will be wider)
  confband <- predict(mod5, bonbons, interval = "confidence")
  confband
##       fit      lwr      upr
## 1 73.8939 60.84254 86.94526