#loading library plyr to help rename factor levels
library(plyr)
# Load the raw data set. The first line of the file is read as data
# (header = FALSE); column names are assigned afterwards.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# keeps text columns as character by default, but this script goes on to
# rename factor levels and to use Class as a binomial glm() response, which
# must be a factor (a character response makes glm() stop with an error).
mushrooms <- read.csv(
  "agaricus-lepiota.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Give the 23 positional columns descriptive names: the Class label first,
# followed by the 22 attribute columns in the order they appear in the file.
# NOTE: the four "Stalk... Above/Below Ring" names contain spaces, so they
# have to be accessed with backticks or [[ ]] rather than a bare $name.
names(mushrooms) <- c(
  "Class",
  # cap attributes
  "CapShape", "CapSurface", "CapColor",
  "Bruises", "Odor",
  # gill attributes
  "GillAttachment", "GillSpacing", "GillSize", "GillColor",
  # stalk attributes
  "StalkShape", "StalkRoot",
  "StalkSurface Above Ring", "StalkSurface Below Ring",
  "StalkColor Above Ring", "StalkColor Below Ring",
  # veil and ring attributes
  "VeilType", "VeilColor",
  "RingNumber", "RingType",
  # remaining attributes
  "SporePrintColor", "Population", "Habitat"
)
# Replace the single-letter codes of the first four columns with readable
# labels. plyr's revalue() takes a named character vector of the form
# c("old" = "new") and leaves any value not listed unchanged.
mushrooms[["Class"]] <- revalue(
  mushrooms[["Class"]],
  c("e" = "edible", "p" = "poisonous")
)

mushrooms[["CapShape"]] <- revalue(
  mushrooms[["CapShape"]],
  c(
    "b" = "bell", "c" = "conical", "x" = "convex",
    "f" = "flat", "k" = "knobbed", "s" = "sunken"
  )
)

mushrooms[["CapSurface"]] <- revalue(
  mushrooms[["CapSurface"]],
  c("f" = "fibrous", "g" = "grooves", "y" = "scaly", "s" = "smooth")
)

mushrooms[["CapColor"]] <- revalue(
  mushrooms[["CapColor"]],
  c(
    "n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray",
    "r" = "green", "p" = "pink", "u" = "purple", "e" = "red",
    "w" = "white", "y" = "yellow"
  )
)

# Preview the first rows to confirm the new names and labels
head(mushrooms)
## Class CapShape CapSurface CapColor Bruises Odor GillAttachment
## 1 poisonous convex smooth brown t p f
## 2 edible convex smooth yellow t a f
## 3 edible bell smooth white t l f
## 4 poisonous convex scaly white t p f
## 5 edible convex smooth gray f n f
## 6 edible convex scaly yellow t a f
## GillSpacing GillSize GillColor StalkShape StalkRoot
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## StalkSurface Above Ring StalkSurface Below Ring StalkColor Above Ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## StalkColor Below Ring VeilType VeilColor RingNumber RingType
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## SporePrintColor Population Habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Build a four-column copy holding Class and the three cap attributes.
# The columns are named explicitly with the "mushrooms." prefix, which is
# what data.frame() derives automatically from `mushrooms$<column>` arguments.
MushroomSubset <- data.frame(
  mushrooms.Class = mushrooms[["Class"]],
  mushrooms.CapShape = mushrooms[["CapShape"]],
  mushrooms.CapSurface = mushrooms[["CapSurface"]],
  mushrooms.CapColor = mushrooms[["CapColor"]]
)
# Preview the first rows of the subset
head(MushroomSubset)
## mushrooms.Class mushrooms.CapShape mushrooms.CapSurface
## 1 poisonous convex smooth
## 2 edible convex smooth
## 3 edible bell smooth
## 4 poisonous convex scaly
## 5 edible convex smooth
## 6 edible convex scaly
## mushrooms.CapColor
## 1 brown
## 2 yellow
## 3 white
## 4 white
## 5 gray
## 6 yellow
# Exploratory logistic regression of Class on the three cap attributes only.
# The warning recorded below, together with the very large standard errors
# reported for some levels in the summary, is consistent with categories that
# (almost) perfectly predict the class, so those estimates should not be
# read at face value.
MushroomPartialModel <- glm(
  formula = Class ~ CapShape + CapSurface + CapColor,
  family = binomial(link = logit),
  data = mushrooms
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(MushroomPartialModel)
##
## Call:
## glm(formula = Class ~ CapShape + CapSurface + CapColor, family = binomial(link = logit),
## data = mushrooms)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0877 -1.0427 -0.2862 1.0875 2.7470
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.68041 0.25382 -10.560 < 2e-16 ***
## CapShapeconical 17.33853 617.50856 0.028 0.97760
## CapShapeflat 2.67108 0.16913 15.793 < 2e-16 ***
## CapShapeknobbed 3.55429 0.18507 19.205 < 2e-16 ***
## CapShapesunken -12.02817 255.38629 -0.047 0.96244
## CapShapeconvex 2.54187 0.16778 15.150 < 2e-16 ***
## CapSurfacegrooves 17.87103 578.44289 0.031 0.97535
## CapSurfacesmooth 1.18538 0.06817 17.390 < 2e-16 ***
## CapSurfacescaly 0.88260 0.06103 14.462 < 2e-16 ***
## CapColorcinnamon -1.90336 0.39613 -4.805 1.55e-06 ***
## CapColorred -0.55348 0.19294 -2.869 0.00412 **
## CapColorgray -0.65803 0.19250 -3.418 0.00063 ***
## CapColorbrown -1.06939 0.18948 -5.644 1.66e-08 ***
## CapColorpink -0.14602 0.26179 -0.558 0.57700
## CapColorgreen -16.37514 363.54452 -0.045 0.96407
## CapColorpurple -16.37514 363.54452 -0.045 0.96407
## CapColorwhite -1.37665 0.19675 -6.997 2.62e-12 ***
## CapColoryellow 0.31866 0.19903 1.601 0.10936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11251.8 on 8123 degrees of freedom
## Residual deviance: 9929.8 on 8106 degrees of freedom
## AIC: 9965.8
##
## Number of Fisher Scoring iterations: 14
# Stepwise selection by AIC, allowing terms to be dropped and re-added
# (direction = "both"), to see whether a smaller model would be preferred.
# The result is printed, not stored.
library(MASS)
stepAIC(object = MushroomPartialModel, direction = "both")
## Start: AIC=9965.84
## Class ~ CapShape + CapSurface + CapColor
##
## Df Deviance AIC
## <none> 9929.8 9965.8
## - CapSurface 3 10289.5 10319.5
## - CapColor 9 10399.8 10417.8
## - CapShape 5 10497.0 10523.0
##
## Call: glm(formula = Class ~ CapShape + CapSurface + CapColor, family = binomial(link = logit),
## data = mushrooms)
##
## Coefficients:
## (Intercept) CapShapeconical CapShapeflat
## -2.6804 17.3385 2.6711
## CapShapeknobbed CapShapesunken CapShapeconvex
## 3.5543 -12.0282 2.5419
## CapSurfacegrooves CapSurfacesmooth CapSurfacescaly
## 17.8710 1.1854 0.8826
## CapColorcinnamon CapColorred CapColorgray
## -1.9034 -0.5535 -0.6580
## CapColorbrown CapColorpink CapColorgreen
## -1.0694 -0.1460 -16.3751
## CapColorpurple CapColorwhite CapColoryellow
## -16.3751 -1.3767 0.3187
##
## Degrees of Freedom: 8123 Total (i.e. Null); 8106 Residual
## Null Deviance: 11250
## Residual Deviance: 9930 AIC: 9966