Midterm #2

library(ISLR) library(tidyverse)

##A

library(ISLR)
library(tidyverse)


candy<-read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv",
                header=TRUE)

summary(candy)

##  competitorname       chocolate          fruity          caramel      
##  Length:85          Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000   Median :0.0000   Median :0.0000  
##                     Mean   :0.4353   Mean   :0.4471   Mean   :0.1647  
##                     3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  peanutyalmondy       nougat        crispedricewafer       hard       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.1647   Mean   :0.08235   Mean   :0.08235   Mean   :0.1765  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##       bar            pluribus       sugarpercent     pricepercent   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0110   Min.   :0.0110  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.2200   1st Qu.:0.2550  
##  Median :0.0000   Median :1.0000   Median :0.4650   Median :0.4650  
##  Mean   :0.2471   Mean   :0.5176   Mean   :0.4786   Mean   :0.4689  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.7320   3rd Qu.:0.6510  
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.9880   Max.   :0.9760  
##    winpercent   
##  Min.   :22.45  
##  1st Qu.:39.14  
##  Median :47.83  
##  Mean   :50.32  
##  3rd Qu.:59.86  
##  Max.   :84.18

 mod1<-lm(formula = winpercent ~ chocolate + fruity +caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus,data=candy)
summary(mod1)

## 
## Call:
## lm(formula = winpercent ~ chocolate + fruity + caramel + peanutyalmondy + 
##     nougat + crispedricewafer + hard + bar + pluribus, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.6779  -5.6765   0.3966   7.0583  21.9144 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       35.0155     4.0781   8.586 9.13e-13 ***
## chocolate         19.9058     3.8975   5.107 2.41e-06 ***
## fruity            10.2677     3.7887   2.710  0.00833 ** 
## caramel            3.3843     3.6034   0.939  0.35065    
## peanutyalmondy    10.1410     3.5949   2.821  0.00612 ** 
## nougat             2.4163     5.6897   0.425  0.67229    
## crispedricewafer   8.9915     5.3279   1.688  0.09564 .  
## hard              -4.8726     3.4394  -1.417  0.16072    
## bar               -0.7220     4.8707  -0.148  0.88256    
## pluribus          -0.1599     3.0115  -0.053  0.95779    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.85 on 75 degrees of freedom
## Multiple R-squared:  0.5148, Adjusted R-squared:  0.4566 
## F-statistic: 8.842 on 9 and 75 DF,  p-value: 6.052e-09

##B

#a
ggplot(candy, aes(x= sugarpercent, y=winpercent, color = sugarpercent))+geom_point()

#I added color to the graph to make it more readable

#b
#Form: There seems to be no relationship between winpercent and sugarpercent
#Direction: The direction indicates towards linear
#Strength: Very weak, points are very scattered.
#Outlier: No real noticeable outliers, the points are so scattered its hard to tell
#       where an outlier could be. The point at ~ (.05,83) is a potential outlier


#c
SugVSWin<-lm(winpercent~sugarpercent, data = candy)
summary(SugVSWin)

## 
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.924 -11.066  -1.168   9.252  36.851 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    44.609      3.086  14.455   <2e-16 ***
## sugarpercent   11.924      5.560   2.145   0.0349 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared:  0.05251,    Adjusted R-squared:  0.04109 
## F-statistic:   4.6 on 1 and 83 DF,  p-value: 0.0349

  #i. The slope is 11.924 for sugarpercent. Increasing sugar increases the win % 11.94%. A p value of .03 tells us this is significant. When there is no sugar the predicted winscore is 44.61.
  #ii. A p value of .035 for sugarpercent tells us this is significant.
  #iii. 
ggplot(candy, aes(x= sugarpercent, y=winpercent, color = sugarpercent))+geom_point()+geom_abline(intercept=44.609, slope =11.924, color = "red", lwd=1)

#d
#Yes we should trust the previous model
par(mfrow = c(2, 2))
plot(SugVSWin)

#ii. 
qplot(x = sugarpercent, y = winpercent, data = candy, 
      geom = "point", alpha = I(.1), ylab = "winpercent") +
  stat_smooth(method = "lm",
              se = FALSE)

##C

#a
model2<-lm(winpercent~sugarpercent+chocolate, candy)
#i
summary(model2)

## 
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.6981  -8.5153  -0.4489   7.7602  27.4820 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    38.262      2.552  14.990  < 2e-16 ***
## sugarpercent    8.567      4.355   1.967   0.0525 .  
## chocolate      18.273      2.469   7.401 1.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.22 on 82 degrees of freedom
## Multiple R-squared:  0.432,  Adjusted R-squared:  0.4181 
## F-statistic: 31.18 on 2 and 82 DF,  p-value: 8.5e-11

#If the candy contains no sugar or chocolate the predicted win score is 38.26%. Increasing by 8.567 if sugar is added.
#If the candy has chocolate and no sugar than the predicted win score is 56.54%, increasing by 8.567% when sugar is added.
#ii

#First equation is for no chocolate but contains sugar
#Second equation is for chocolate and sugar.

\[\hat{y}= 38.262+8.567x\] \[\hat{y}= 56.54+8.567x\]

#iii
ggplot(candy, aes(x=sugarpercent, y=winpercent, color="blue"))+
  geom_point()+
  geom_abline(intercept = 38.262, slope=8.567,color="red", lwd=1)+
  geom_abline(intercept = 56.54, slope=8.567)

#iv
#While chocolate does boost the winpercent of a piece of candy by 18.3%, the candy containing sugar has an insignificant relation to winpercent.

#b.

#i
model3<-lm(winpercent~sugarpercent*chocolate, candy)
summary(model3)

## 
## Call:
## lm(formula = winpercent ~ sugarpercent * chocolate, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.4007  -8.0463  -0.7059   6.2815  28.5003 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              39.778      2.878  13.820   <2e-16 ***
## sugarpercent              5.221      5.257   0.993   0.3236    
## chocolate                13.051      5.230   2.495   0.0146 *  
## sugarpercent:chocolate   10.586      9.350   1.132   0.2609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.21 on 81 degrees of freedom
## Multiple R-squared:  0.4408, Adjusted R-squared:  0.4201 
## F-statistic: 21.28 on 3 and 81 DF,  p-value: 2.916e-10

#The slope coefficient is 5.221%. 

#ii
#The first model looks at sugar content but not chocolate. The second model includes chocolate.

\[\hat{y}= 39.788+5.221x\] \[\hat{y}= 52.829+5.221x\]

#iii
ggplot(candy, aes(x=sugarpercent, y=winpercent))+
  geom_point()+
  geom_abline(intercept = 39.788, slope=5.221,color="red", lwd=1)+
  geom_abline(intercept = 56.54, slope=5.221, color="brown", lwd=1)

#iv
#The two graphs look very similar, there is only a slight change in the interactions in this model and the previous model.

##D

library(leaps)
#a
model4<-lm(winpercent~sugarpercent+chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent, candy)
summary(model4)

## 
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate + fruity + 
##     caramel + peanutyalmondy + nougat + crispedricewafer + hard + 
##     bar + pluribus + pricepercent, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2244  -6.6247   0.1986   6.8420  23.8680 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       34.5340     4.3199   7.994 1.44e-11 ***
## sugarpercent       9.0868     4.6595   1.950  0.05500 .  
## chocolate         19.7481     3.8987   5.065 2.96e-06 ***
## fruity             9.4223     3.7630   2.504  0.01452 *  
## caramel            2.2245     3.6574   0.608  0.54493    
## peanutyalmondy    10.0707     3.6158   2.785  0.00681 ** 
## nougat             0.8043     5.7164   0.141  0.88849    
## crispedricewafer   8.9190     5.2679   1.693  0.09470 .  
## hard              -6.1653     3.4551  -1.784  0.07852 .  
## bar                0.4415     5.0611   0.087  0.93072    
## pluribus          -0.8545     3.0401  -0.281  0.77945    
## pricepercent      -5.9284     5.5132  -1.075  0.28578    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared:  0.5402, Adjusted R-squared:  0.4709 
## F-statistic: 7.797 on 11 and 73 DF,  p-value: 9.504e-09

#Chocolate, fruity, and peanutyalmondy
smolModel4<-lm(winpercent~sugarpercent+chocolate+fruity+peanutyalmondy, candy)
summary(smolModel4)

## 
## Call:
## lm(formula = winpercent ~ sugarpercent + chocolate + fruity + 
##     peanutyalmondy, data = candy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0204  -7.9139  -0.0392   7.4155  25.8389 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      32.803      3.612   9.081 6.09e-14 ***
## sugarpercent      7.449      4.206   1.771   0.0804 .  
## chocolate        21.301      3.574   5.961 6.45e-08 ***
## fruity            7.271      3.588   2.026   0.0461 *  
## peanutyalmondy    8.657      3.482   2.486   0.0150 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.8 on 80 degrees of freedom
## Multiple R-squared:  0.4874, Adjusted R-squared:  0.4617 
## F-statistic: 19.01 on 4 and 80 DF,  p-value: 5.056e-11

#b
forwardselect<-regsubsets(winpercent~sugarpercent+chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus+pricepercent, data=candy,
                       method="forward")
summary(forwardselect)

## Subset selection object
## Call: regsubsets.formula(winpercent ~ sugarpercent + chocolate + fruity + 
##     caramel + peanutyalmondy + nougat + crispedricewafer + hard + 
##     bar + pluribus + pricepercent, data = candy, method = "forward")
## 11 Variables  (and intercept)
##                  Forced in Forced out
## sugarpercent         FALSE      FALSE
## chocolate            FALSE      FALSE
## fruity               FALSE      FALSE
## caramel              FALSE      FALSE
## peanutyalmondy       FALSE      FALSE
## nougat               FALSE      FALSE
## crispedricewafer     FALSE      FALSE
## hard                 FALSE      FALSE
## bar                  FALSE      FALSE
## pluribus             FALSE      FALSE
## pricepercent         FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          sugarpercent chocolate fruity caramel peanutyalmondy nougat
## 1  ( 1 ) " "          "*"       " "    " "     " "            " "   
## 2  ( 1 ) " "          "*"       " "    " "     "*"            " "   
## 3  ( 1 ) " "          "*"       "*"    " "     "*"            " "   
## 4  ( 1 ) " "          "*"       "*"    " "     "*"            " "   
## 5  ( 1 ) "*"          "*"       "*"    " "     "*"            " "   
## 6  ( 1 ) "*"          "*"       "*"    " "     "*"            " "   
## 7  ( 1 ) "*"          "*"       "*"    " "     "*"            " "   
## 8  ( 1 ) "*"          "*"       "*"    "*"     "*"            " "   
##          crispedricewafer hard bar pluribus pricepercent
## 1  ( 1 ) " "              " "  " " " "      " "         
## 2  ( 1 ) " "              " "  " " " "      " "         
## 3  ( 1 ) " "              " "  " " " "      " "         
## 4  ( 1 ) "*"              " "  " " " "      " "         
## 5  ( 1 ) "*"              " "  " " " "      " "         
## 6  ( 1 ) "*"              "*"  " " " "      " "         
## 7  ( 1 ) "*"              "*"  " " " "      "*"         
## 8  ( 1 ) "*"              "*"  " " " "      "*"

#c
#i
#I would use chocolate, fruity, and peanutyalmondy because they are statistically significant for our first model.
#ii 
#For the reduced model the mse is:
mse <- mean(smolModel4$residuals^2)
mse

## [1] 109.6847

#iii
#I prefer the reduced model because it is easier to understand at a glance. I am also more familiar with the reduced version.

##E

#a
#Intercept: 6.09e^-14
#Sugarpercent: .804
#Chocolate: 6.45e^-8
#Fruity: .0461
#Peanutyalmondy: .0150  


#b
#i
#I CHOOSE LOLLIPOPS
model6<-lm(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+hard+bar+pluribus, data=candy) 
summary

## function (object, ...) 
## UseMethod("summary")
## <bytecode: 0x0000000015e3e410>
## <environment: namespace:base>

#35.0155(intercept)+10.2677(fruity)-4.8726
# =40.4 winpercent

#ii
#Many of these predictors are not statistically significant therefore I wouldn't trust this model for new types of candies.

Midterm #2

Aidan Ramirez

11/11/2020