First, our data frame is loaded from a raw CSV file hosted on GitHub. We then add a 24th column that flags each mushroom as poisonous (1) or edible (0).
# Read the headerless mushroom CSV from GitHub; with header = FALSE the
# columns are auto-named V1, V2, ... by read.csv.
MushroomDataBase <- read.csv(
  url("https://raw.githubusercontent.com/WigodskyD/Data-607/master/MushroomData.csv"),
  header = FALSE
)

# Append column 24 as a factor outcome: "1" where column 1 is "p", "0" for
# every other row. %in% is used instead of == so a missing class code is
# treated as "not p" rather than propagating NA.
MushroomDataBase[[24]] <- factor(as.integer(MushroomDataBase[[1]] %in% "p"))

# Show the first two rows, including the new V24 column.
head(MushroomDataBase, n = 2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## V21 V22 V23 V24
## 1 k s u 1
## 2 n n g 0
We use a set of logit models to determine which variables to keep. The four most promising predictors (cap shape, cap surface, stalk surface below the ring, and habitat) are shown in the model below.
# Fit a logistic regression (binomial family) of the 0/1 poison factor in
# column 24 on four predictors taken by position from MushroomDataBase:
# columns 2, 3, 14 and 23. Because the columns are referenced directly in the
# formula rather than through a `data =` argument, the coefficient names in the
# summary carry the full `MushroomDataBase[[i]]` expression plus the level code.
GeneralizedLinearModel<-glm(MushroomDataBase[[24]] ~ MushroomDataBase[[2]]+MushroomDataBase[[3]]+MushroomDataBase[[14]]+MushroomDataBase[[23]], family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): this warning typically points to (quasi-)complete separation,
# i.e. some predictor level occurring with only one outcome; that would match
# the few coefficients below with very large standard errors -- confirm with a
# cross-tabulation before interpreting those rows.
# Print the call, coefficient table and deviance statistics for the fit.
summary(GeneralizedLinearModel)
##
## Call:
## glm(formula = MushroomDataBase[[24]] ~ MushroomDataBase[[2]] +
## MushroomDataBase[[3]] + MushroomDataBase[[14]] + MushroomDataBase[[23]],
## family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8947 -0.7032 -0.0004 0.4698 3.2098
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.87595 0.26507 -18.395 < 2e-16 ***
## MushroomDataBase[[2]]c 18.80599 1020.35599 0.018 0.985295
## MushroomDataBase[[2]]f 2.13292 0.20494 10.408 < 2e-16 ***
## MushroomDataBase[[2]]k 2.79753 0.22625 12.365 < 2e-16 ***
## MushroomDataBase[[2]]s -14.48857 424.18367 -0.034 0.972752
## MushroomDataBase[[2]]x 2.12536 0.20236 10.503 < 2e-16 ***
## MushroomDataBase[[3]]g 18.67673 952.17713 0.020 0.984351
## MushroomDataBase[[3]]s 2.28858 0.10607 21.577 < 2e-16 ***
## MushroomDataBase[[3]]y 1.47953 0.09706 15.243 < 2e-16 ***
## MushroomDataBase[[14]]k 4.74934 0.16753 28.350 < 2e-16 ***
## MushroomDataBase[[14]]s 0.45611 0.13304 3.428 0.000607 ***
## MushroomDataBase[[14]]y -0.26955 0.21187 -1.272 0.203293
## MushroomDataBase[[23]]g -0.78517 0.09359 -8.390 < 2e-16 ***
## MushroomDataBase[[23]]l 0.62566 0.11301 5.537 3.09e-08 ***
## MushroomDataBase[[23]]m -0.93699 0.20003 -4.684 2.81e-06 ***
## MushroomDataBase[[23]]p 1.48206 0.12655 11.711 < 2e-16 ***
## MushroomDataBase[[23]]u 2.34234 0.16132 14.520 < 2e-16 ***
## MushroomDataBase[[23]]w -16.47005 169.36673 -0.097 0.922532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11251.8 on 8123 degrees of freedom
## Residual deviance: 6032.1 on 8106 degrees of freedom
## AIC: 6068.1
##
## Number of Fisher Scoring iterations: 15
Finally, we rename columns and levels so they are easier to read.
## Could_I_Die Cap_Shape Cap_Surface Stalk_Surface_Below_Ring Habitat
## 1 poison convex smooth smooth urban
## 2 eat it convex smooth smooth grasses
## 3 eat it bell smooth smooth meadows
## 4 poison convex scaly smooth urban
## 5 eat it convex smooth smooth grasses
## 6 eat it convex scaly smooth grasses
## 7 eat it bell smooth smooth meadows
## 8 eat it bell scaly smooth meadows
## 9 poison convex scaly smooth grasses
## 10 eat it bell smooth smooth meadows
## 11 eat it convex scaly smooth grasses
## 12 eat it convex scaly smooth meadows
## 13 eat it bell smooth smooth grasses
## 14 poison convex scaly smooth urban
## 15 eat it convex fibrous fibrous grasses
## 16 eat it sunken fibrous smooth urban
## 17 eat it flat fibrous smooth grasses
## 18 poison convex smooth smooth grasses
## 19 poison convex scaly smooth urban
## 20 poison convex smooth smooth urban
# Build a compact, human-readable frame holding the outcome plus the four
# predictors kept from the model (source columns 24, 2, 3, 14 and 23).
# Everything is assembled inside local() so that PoisonMushrooms is the only
# object this chunk creates.
PoisonMushrooms <- local({
  readable <- data.frame(
    Could_I_Die = MushroomDataBase[[24]],
    Cap_Shape = MushroomDataBase[[2]],
    Cap_Surface = MushroomDataBase[[3]],
    Stalk_Surface_Below_Ring = MushroomDataBase[[14]],
    Habitat = MushroomDataBase[[23]]
  )

  # Work on plain character vectors so codes can be overwritten with labels.
  readable[] <- lapply(readable, as.character)

  # One lookup table per column: names are the raw codes, values the labels.
  code_labels <- list(
    Could_I_Die = c("1" = "poison", "0" = "eat it"),
    Cap_Shape = c(
      x = "convex", b = "bell", c = "conical",
      f = "flat", k = "knobbed", s = "sunken"
    ),
    Cap_Surface = c(
      f = "fibrous", g = "grooved", y = "scaly", s = "smooth"
    ),
    Stalk_Surface_Below_Ring = c(
      f = "fibrous", y = "scaly", k = "silky", s = "smooth"
    ),
    Habitat = c(
      g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste", d = "woods"
    )
  )

  # Replace each recognised code with its label in a single pass per column.
  # Codes without an entry (and NAs) are left untouched. No label equals any
  # code, so one pass gives the same result as replacing codes one at a time.
  for (column in names(code_labels)) {
    labels <- code_labels[[column]]
    values <- readable[[column]]
    position <- match(values, names(labels))
    recognised <- !is.na(position)
    values[recognised] <- unname(labels[position[recognised]])
    readable[[column]] <- values
  }

  readable
})

# Preview the first 20 relabelled rows.
head(PoisonMushrooms, n = 20)