library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#install.packages("leaps")
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.3
#part 1
# Ratio of the two given quantities, used below as a t statistic.
tvalue <- 11.82 / 5.432
tvalue
## [1] 2.175994
# Two-sided p-value: twice the upper-tail probability of a t distribution
# with 50 degrees of freedom.
pt(tvalue, df = 50, lower.tail = FALSE) * 2
## [1] 0.03430773
#part b
# Builds an F-type ratio step by step, printing every intermediate value.
# NOTE(review): 59/50 look like degrees of freedom and 12485.91/5487 like
# sums of squares from two nested models -- confirm against the assignment.
A <- 59 - 50
A
## [1] 9
B <- 12485.91 - 5487
B
## [1] 6998.91
# Numerator: the difference B divided by the difference A.
C <- B / A
C
## [1] 777.6567
# Denominator: 5487 divided by 50.
D <- 5487 / 50
D
## [1] 109.74
# Final ratio of numerator to denominator.
E <- C / D
E
## [1] 7.086356
#part 2
#part a
# Load the FiveThirtyEight candy power ranking data directly from GitHub.
candy <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv",
  header = TRUE
)
# View() opens a spreadsheet-style viewer, which needs a GUI/display. Only
# call it in an interactive session so batch or knit runs of this script do
# not stop here.
if (interactive()) {
  View(candy)
}
#part B
#part a
# Scatterplot of winpercent against sugarpercent.
ggplot(data = candy, mapping = aes(x = sugarpercent, y = winpercent)) +
  geom_point()

# part b: written answer, auto-printed because it is a bare string
"in my eyes I see the one could see that as sugar percent increases we see a general shift tward higher winpercentage, to before i dont think its a very strong corilation
and the form still seams really spread apart, and because the points are so spead apart i dont see any outliers
"
## [1] "in my eyes I see the one could see that as sugar percent increases we see a general shift tward higher winpercentage, to before i dont think its a very strong corilation\nand the form still seams really spread apart, and because the points are so spead apart i dont see any outliers\n"
# part c
#(i)
# Simple linear regression of winpercent on sugarpercent.
candy_lm <- lm(formula = winpercent ~ sugarpercent, data = candy)
# The fitted slope is 11.92 and the intercept is 44.609.
# (ii)
summary(candy_lm)
##
## Call:
## lm(formula = winpercent ~ sugarpercent, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.924 -11.066 -1.168 9.252 36.851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.609 3.086 14.455 <2e-16 ***
## sugarpercent 11.924 5.560 2.145 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.41 on 83 degrees of freedom
## Multiple R-squared: 0.05251, Adjusted R-squared: 0.04109
## F-statistic: 4.6 on 1 and 83 DF, p-value: 0.0349
# There is a relationship, but it is not very strong: the p-value for the slope is about 0.035, which is significant at the 5% level but not overwhelmingly so.
# Scatterplot with the fitted regression line overlaid. The slope and
# intercept are read from candy_lm instead of being retyped as rounded
# constants, so the line always matches the fitted model exactly.
ggplot(candy, aes(sugarpercent, winpercent)) +
  geom_point() +
  geom_abline(
    slope = coef(candy_lm)[["sugarpercent"]],
    intercept = coef(candy_lm)[["(Intercept)"]]
  )

# part d
#(i)
# Attach the fitted values and residuals of candy_lm to the data so the
# residuals can be plotted against the predictor.
candy <- cbind(
  candy,
  fit = fitted(candy_lm),
  residual = residuals(candy_lm)
)
ggplot(candy, aes(x = sugarpercent, y = residual)) +
  geom_point() +
  # Dashed blue reference line at a residual of zero.
  geom_hline(yintercept = 0, color = "blue", lty = 2, lwd = 1) +
  labs(title = "Residual Plot", x = "sugar percent", y = "Residuals") +
  theme_bw()

# Average residual of the simple regression (numerically zero, see output).
mean(residuals(candy_lm))
## [1] 7.755336e-16
# Written answer, auto-printed because it is a bare string.
"when we look at the mean of the residuales we see that it is very small, from this we can asume that the the data point are distributed relitively even above
and below the linear regetion. as far as the homoscedasticity it looks very uniformly spread around the extimate, I dont see the residuales really changing"
## [1] "when we look at the mean of the residuales we see that it is very small, from this we can asume that the the data point are distributed relitively even above\nand below the linear regetion. as far as the homoscedasticity it looks very uniformly spread around the extimate, I dont see the residuales really changing"
#(ii)
# Normal Q-Q plot of the residuals, with its reference line added on top.
qqnorm(residuals(candy_lm))
qqline(residuals(candy_lm))

# Written answer, auto-printed because it is a bare string.
"over all it looks like what we expect to see the begining and end point stray abit far away from the line but this is what we expect to see. There is no abnormal
trends in the chart"
## [1] "over all it looks like what we expect to see the begining and end point stray abit far away from the line but this is what we expect to see. There is no abnormal \ntrends in the chart"
#Part C
#a
# Additive model: winpercent explained by sugarpercent plus the fruity
# indicator (same slope for both groups, different intercepts).
candy_Fuitlm <- lm(formula = winpercent ~ sugarpercent + fruity, data = candy)
summary(candy_Fuitlm)
##
## Call:
## lm(formula = winpercent ~ sugarpercent + fruity, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.947 -10.031 1.162 7.603 31.642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.842 3.187 15.638 < 2e-16 ***
## sugarpercent 11.255 5.169 2.178 0.032313 *
## fruity -10.989 2.922 -3.760 0.000317 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.39 on 82 degrees of freedom
## Multiple R-squared: 0.1918, Adjusted R-squared: 0.1721
## F-statistic: 9.733 on 2 and 82 DF, p-value: 0.0001612
# Coefficients of the additive model at full precision.
coef(candy_Fuitlm)
## (Intercept) sugarpercent fruity
## 49.84189 11.25544 -10.98851
# (i) written answer, auto-printed because it is a bare string
"if you make your candy fuity you are 10% less likely to win, thus you should not make you candy fuity if you want to win"
## [1] "if you make your candy fuity you are 10% less likely to win, thus you should not make you candy fuity if you want to win"
# (ii) fitted equations for non-fruity and fruity candy
"only with sugar percentage: winpercentage= 11.255*sugarpercent+49.842
with both sugar and fruity: winpercentage = 11.255*sugarpercentage+(49.842-10.989)"
## [1] "only with sugar percentage: winpercentage= 11.255*sugarpercent+49.842\nwith both sugar and fruity: winpercentage = 11.255*sugarpercentage+(49.842-10.989)"
#(iii)
# Scatterplot with the two parallel fitted lines of the additive model
# winpercent ~ sugarpercent + fruity.
# fruity is mapped as a factor so the legend shows two groups rather than a
# continuous gradient (assumes fruity is coded 0/1, as the model above
# treats it), and each line is drawn in the colour of the group it describes.
# Coefficients are looked up by name so the plot fails loudly if
# candy_Fuitlm does not hold the additive model.
additive_coef <- coef(candy_Fuitlm)
ggplot(
  data = candy,
  aes(x = sugarpercent, y = winpercent, color = factor(fruity))
) +
  geom_point() +
  # Non-fruity candy (fruity = 0): baseline intercept.
  geom_abline(
    slope = additive_coef[["sugarpercent"]],
    intercept = additive_coef[["(Intercept)"]],
    color = "red"
  ) +
  # Fruity candy (fruity = 1): same slope, intercept shifted by the fruity
  # coefficient.
  geom_abline(
    slope = additive_coef[["sugarpercent"]],
    intercept = additive_coef[["(Intercept)"]] + additive_coef[["fruity"]],
    color = "blue"
  ) +
  scale_color_manual(
    name = "fruity",
    values = c("0" = "red", "1" = "blue")
  ) +
  ggtitle("Scatterplot of sugar percentage vs winpercentage") +
  theme_bw()

#(iv)
"acording to the t test this is pretty significant. and i think you can see the shift pretty clearly"
## [1] "acording to the t test this is pretty significant. and i think you can see the shift pretty clearly"
#b
# Interaction model: sugarpercent, fruity and their product, so both the
# intercept and the slope may differ between fruity and non-fruity candy.
# Note that this reuses (overwrites) the name of the additive model above.
candy_Fuitlm <- lm(formula = winpercent ~ sugarpercent * fruity, data = candy)
summary(candy_Fuitlm)
##
## Call:
## lm(formula = winpercent ~ sugarpercent * fruity, data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.313 -8.936 1.451 8.002 33.290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.070 4.097 11.732 <2e-16 ***
## sugarpercent 14.890 7.384 2.017 0.0471 *
## fruity -7.562 5.757 -1.313 0.1927
## sugarpercent:fruity -7.172 10.372 -0.691 0.4912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.43 on 81 degrees of freedom
## Multiple R-squared: 0.1966, Adjusted R-squared: 0.1668
## F-statistic: 6.607 on 3 and 81 DF, p-value: 0.0004753
# Coefficients of the interaction model at full precision.
coef(candy_Fuitlm)
## (Intercept) sugarpercent fruity sugarpercent:fruity
## 48.070388 14.890483 -7.562184 -7.171786
#(i)
"if a candy is fuity the win percentage increase less quickly with sugarpercentage. thus you should not make you candy fuity if you want to win"
## [1] "if a candy is fuity the win percentage increase less quickly with sugarpercentage. thus you should not make you candy fuity if you want to win"
#(ii)
# For fruity candy the fruity coefficient (-7.562184) shifts the intercept and
# the sugarpercent:fruity coefficient (-7.171786) shifts the slope. The
# original answer had these two swapped.
"only with sugar percentage: winpercentage= 14.89*sugarpercent+48.0703
with both sugar and fruity: winpercentage = (14.89-7.171786)*sugarpercentage+(48.0703-7.562184)"
## [1] "only with sugar percentage: winpercentage= 14.89*sugarpercent+48.0703\nwith both sugar and fruity: winpercentage = (14.89-7.171786)*sugarpercentage+(48.0703-7.562184)"
#(iii)
# Scatterplot with the two fitted lines of the interaction model
# winpercent ~ sugarpercent * fruity.
# fruity is mapped as a factor so the legend shows two groups rather than a
# continuous gradient (assumes fruity is coded 0/1, as the model above
# treats it), and each line is drawn in the colour of the group it describes.
# Coefficients are looked up by name so the plot fails loudly if
# candy_Fuitlm does not hold the interaction model.
interaction_coef <- coef(candy_Fuitlm)
ggplot(
  data = candy,
  aes(x = sugarpercent, y = winpercent, color = factor(fruity))
) +
  geom_point() +
  # Non-fruity candy (fruity = 0): baseline intercept and slope.
  geom_abline(
    slope = interaction_coef[["sugarpercent"]],
    intercept = interaction_coef[["(Intercept)"]],
    color = "red"
  ) +
  # Fruity candy (fruity = 1): intercept shifted by the fruity coefficient,
  # slope shifted by the interaction coefficient.
  geom_abline(
    slope = interaction_coef[["sugarpercent"]] +
      interaction_coef[["sugarpercent:fruity"]],
    intercept = interaction_coef[["(Intercept)"]] +
      interaction_coef[["fruity"]],
    color = "blue"
  ) +
  scale_color_manual(
    name = "fruity",
    values = c("0" = "red", "1" = "blue")
  ) +
  ggtitle("Scatterplot of sugar percentage vs winpercentage") +
  theme_bw()

#(iv)
"I would argue that it is not significant because of the score on the p test, the are both greater than .1. although i cant help but feel a bit
hurt that people dont like fuity candys"
## [1] "I would argue that it is not significant because of the score on the p test, the are both greater than .1. although i cant help but feel a bit \nhurt that people dont like fuity candys"
# Part D
#part a
# Full model: winpercent regressed on all eleven listed predictors at once.
# The parentheses around the right-hand side are kept so the printed Call
# stays the same.
candy_Fuit_all <- lm(
  winpercent ~ (sugarpercent + fruity + chocolate + caramel +
    peanutyalmondy + nougat + crispedricewafer + hard + bar + pluribus +
    pricepercent),
  data = candy
)
summary(candy_Fuit_all)
##
## Call:
## lm(formula = winpercent ~ (sugarpercent + fruity + chocolate +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent), data = candy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2244 -6.6247 0.1986 6.8420 23.8680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5340 4.3199 7.994 1.44e-11 ***
## sugarpercent 9.0868 4.6595 1.950 0.05500 .
## fruity 9.4223 3.7630 2.504 0.01452 *
## chocolate 19.7481 3.8987 5.065 2.96e-06 ***
## caramel 2.2245 3.6574 0.608 0.54493
## peanutyalmondy 10.0707 3.6158 2.785 0.00681 **
## nougat 0.8043 5.7164 0.141 0.88849
## crispedricewafer 8.9190 5.2679 1.693 0.09470 .
## hard -6.1653 3.4551 -1.784 0.07852 .
## bar 0.4415 5.0611 0.087 0.93072
## pluribus -0.8545 3.0401 -0.281 0.77945
## pricepercent -5.9284 5.5132 -1.075 0.28578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 73 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.4709
## F-statistic: 7.797 on 11 and 73 DF, p-value: 9.504e-09
# Written answer for part a, based on the coefficient table above; it is
# auto-printed because it is a bare string at top level.
"the varibles that retain importance is chocolate,fruity, and peanuty/almondy"
## [1] "the varibles that retain importance is chocolate,fruity, and peanuty/almondy"
#part b
# Forward stepwise selection over the eleven candidate predictors.
# nvmax is left at its default, so only model sizes up to 8 are reported.
regfit.fwd <- regsubsets(
  winpercent ~ sugarpercent + fruity + chocolate + caramel +
    peanutyalmondy + nougat + crispedricewafer + hard + bar + pluribus +
    pricepercent,
  data = candy,
  method = "forward"
)
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(winpercent ~ sugarpercent + fruity + chocolate +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent, data = candy, method = "forward")
## 11 Variables (and intercept)
## Forced in Forced out
## sugarpercent FALSE FALSE
## fruity FALSE FALSE
## chocolate FALSE FALSE
## caramel FALSE FALSE
## peanutyalmondy FALSE FALSE
## nougat FALSE FALSE
## crispedricewafer FALSE FALSE
## hard FALSE FALSE
## bar FALSE FALSE
## pluribus FALSE FALSE
## pricepercent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## sugarpercent fruity chocolate caramel peanutyalmondy nougat
## 1 ( 1 ) " " " " "*" " " " " " "
## 2 ( 1 ) " " " " "*" " " "*" " "
## 3 ( 1 ) " " "*" "*" " " "*" " "
## 4 ( 1 ) " " "*" "*" " " "*" " "
## 5 ( 1 ) "*" "*" "*" " " "*" " "
## 6 ( 1 ) "*" "*" "*" " " "*" " "
## 7 ( 1 ) "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " "
## crispedricewafer hard bar pluribus pricepercent
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*"
## 8 ( 1 ) "*" "*" " " " " "*"
# Backward stepwise selection over the eleven candidate predictors.
# nvmax is left at its default, so only model sizes up to 8 are reported.
regfit.bwd <- regsubsets(
  winpercent ~ sugarpercent + fruity + chocolate + caramel +
    peanutyalmondy + nougat + crispedricewafer + hard + bar + pluribus +
    pricepercent,
  data = candy,
  method = "backward"
)
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(winpercent ~ sugarpercent + fruity + chocolate +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent, data = candy, method = "backward")
## 11 Variables (and intercept)
## Forced in Forced out
## sugarpercent FALSE FALSE
## fruity FALSE FALSE
## chocolate FALSE FALSE
## caramel FALSE FALSE
## peanutyalmondy FALSE FALSE
## nougat FALSE FALSE
## crispedricewafer FALSE FALSE
## hard FALSE FALSE
## bar FALSE FALSE
## pluribus FALSE FALSE
## pricepercent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## sugarpercent fruity chocolate caramel peanutyalmondy nougat
## 1 ( 1 ) " " " " "*" " " " " " "
## 2 ( 1 ) " " " " "*" " " "*" " "
## 3 ( 1 ) " " "*" "*" " " "*" " "
## 4 ( 1 ) " " "*" "*" " " "*" " "
## 5 ( 1 ) "*" "*" "*" " " "*" " "
## 6 ( 1 ) "*" "*" "*" " " "*" " "
## 7 ( 1 ) "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " "
## crispedricewafer hard bar pluribus pricepercent
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*"
## 8 ( 1 ) "*" "*" " " " " "*"
# Best-subset selection: no method is given, and the summary output reports
# the selection algorithm as "exhaustive".
# nvmax is left at its default, so only model sizes up to 8 are reported.
regfit.full <- regsubsets(
  winpercent ~ sugarpercent + fruity + chocolate + caramel +
    peanutyalmondy + nougat + crispedricewafer + hard + bar + pluribus +
    pricepercent,
  data = candy
)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(winpercent ~ sugarpercent + fruity + chocolate +
## caramel + peanutyalmondy + nougat + crispedricewafer + hard +
## bar + pluribus + pricepercent, data = candy)
## 11 Variables (and intercept)
## Forced in Forced out
## sugarpercent FALSE FALSE
## fruity FALSE FALSE
## chocolate FALSE FALSE
## caramel FALSE FALSE
## peanutyalmondy FALSE FALSE
## nougat FALSE FALSE
## crispedricewafer FALSE FALSE
## hard FALSE FALSE
## bar FALSE FALSE
## pluribus FALSE FALSE
## pricepercent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## sugarpercent fruity chocolate caramel peanutyalmondy nougat
## 1 ( 1 ) " " " " "*" " " " " " "
## 2 ( 1 ) " " " " "*" " " "*" " "
## 3 ( 1 ) " " "*" "*" " " "*" " "
## 4 ( 1 ) " " "*" "*" " " "*" " "
## 5 ( 1 ) "*" "*" "*" " " "*" " "
## 6 ( 1 ) "*" "*" "*" " " "*" " "
## 7 ( 1 ) "*" "*" "*" " " "*" " "
## 8 ( 1 ) "*" "*" "*" "*" "*" " "
## crispedricewafer hard bar pluribus pricepercent
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " " " "*"
## 8 ( 1 ) "*" "*" " " " " "*"
#c)
# (i) written answer, auto-printed because it is a bare string
"after preforming both forward,best subset, and backward selection and all four agree, chocolate , nut ,and fuity have the most influence."
## [1] "after preforming both forward,best subset, and backward selection and all four agree, chocolate , nut ,and fuity have the most influence."
# (ii) sequential ANOVA table for the three-predictor model
anova(lm(winpercent ~ chocolate + peanutyalmondy + fruity, data = candy))
## Analysis of Variance Table
##
## Response: winpercent
## Df Sum Sq Mean Sq F value Pr(>F)
## chocolate 1 7368.5 7368.5 61.6029 1.491e-11 ***
## peanutyalmondy 1 582.5 582.5 4.8700 0.03016 *
## fruity 1 547.3 547.3 4.5754 0.03545 *
## Residuals 81 9688.7 119.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Written answer to (ii); auto-printed because it is a bare string at top level.
'I dont know if i did something wrong but all my models came up with the same varibles in order to predict win percent. '
## [1] "I dont know if i did something wrong but all my models came up with the same varibles in order to predict win percent. "
#(iii)
# Written answer to (iii): which selection method the author prefers and why.
"to be honest with you i personally prefer the exhaustive method becusae the data set is so small. But I would have to chose either fwd or bwd method over the basic
linear regesion model. I just feel like they are less prone to human error. they are also quicker to do than any manually"
## [1] "to be honest with you i personally prefer the exhaustive method becusae the data set is so small. But I would have to chose either fwd or bwd method over the basic\nlinear regesion model. I just feel like they are less prone to human error. they are also quicker to do than any manually"
#Part E
#part a
#part b
# Written answer: a hand-computed win-rate estimate for the author's favourite
# candy; auto-printed because it is a bare string at top level.
# NOTE(review): the figures 50, 10.3, 0.2 and 4.9 are not produced by any code
# visible in this script -- confirm which model they were read from.
"my fravorite candy is zolts they are sour candys flavored after fruit with fizz on the inside. it is also a hard candy. its also technically multipeice. so we would have a and estimated win rate of
(50+10.3-0.2-4.9)=55.2 its not bad but not good"
## [1] "my fravorite candy is zolts they are sour candys flavored after fruit with fizz on the inside. it is also a hard candy. its also technically multipeice. so we would have a and estimated win rate of \n(50+10.3-0.2-4.9)=55.2 its not bad but not good"
#(ii)
# Written answer to (ii), also auto-printed.
"we would use a predictor becuse we are useing catigorical varibles It also convays the data more cleanly to the reader"
## [1] "we would use a predictor becuse we are useing catigorical varibles It also convays the data more cleanly to the reader"