# Setup and data loading.
# This script is meant to be run in a fresh R session. It deliberately does
# not call rm(list = ls()), which would delete the caller's objects if the
# script were sourced from an existing session.

# Report the working directory; write.csv() below writes its file there.
getwd()
## [1] "C:/Users/Stephen Jones/Documents"

# data.table is needed for fread().
suppressWarnings(library(data.table))
# RCurl is needed for getURL().
library(RCurl)

# Download the .data file from the UCI repository.
# The file has no header row and every field is a single letter, so fread()
# would otherwise promote the first record to column names and drop it from
# the data. header = FALSE keeps every record as a data row.
mushdata <- fread(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE
)

# Write a local copy, to be uploaded to GitHub. write.csv() adds a row-name
# column, which read.csv() later reads back as the column "X".
write.csv(mushdata, "agaricus-lepiota.csv")

# Download the data from GitHub.
# stringsAsFactors = TRUE keeps the columns as factors on every R version, so
# table() on a class subset still reports levels with a zero count.
# NOTE(review): the column names in the summary below (p, x, s, n, ...) are
# values of the first record, so the hosted copy looks like it was written
# from a header-detected read -- consider re-uploading the file written above.
mushdata <- read.csv(
  text = getURL("https://raw.githubusercontent.com/sigmasigmaiota/mushroom/master/agaricus-lepiota.csv"),
  stringsAsFactors = TRUE
)

# Examine the data; the knitted console output follows as ## comments.
summary(mushdata)
## X p x s n t
## Min. : 1 e:4208 b: 452 f:2320 n :2283 f:4748
## 1st Qu.:2032 p:3915 c: 4 g: 4 g :1840 t:3375
## Median :4062 f:3152 s:2555 e :1500
## Mean :4062 k: 828 y:3244 y :1072
## 3rd Qu.:6092 s: 32 w :1040
## Max. :8123 x:3655 b : 168
## (Other): 220
## p.1 f c n.1 k e
## n :3528 a: 210 c:6811 b:5612 b :1728 e:3515
## f :2160 f:7913 w:1312 n:2511 p :1492 t:4608
## s : 576 w :1202
## y : 576 n :1048
## a : 400 g : 752
## l : 400 h : 732
## (Other): 483 (Other):1169
## e.1 s.1 s.2 w w.1 p.2
## ?:2480 f: 552 f: 600 w :4463 w :4383 p:8123
## b:3776 k:2372 k:2304 p :1872 p :1872
## c: 556 s:5175 s:4935 g : 576 g : 576
## e:1119 y: 24 y: 284 n : 448 n : 512
## r: 192 b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## w.2 o p.3 k.1 s.3 u
## n: 96 n: 36 e:2776 w :2388 a: 384 d:3148
## o: 96 o:7487 f: 48 n :1968 c: 340 g:2148
## w:7923 t: 600 l:1296 k :1871 n: 400 l: 832
## y: 8 n: 36 h :1632 s:1247 m: 292
## p:3967 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 367
## (Other): 144 w: 192
# Preview the first six rows; the knitted console output follows as ## comments.
head(mushdata)
## X p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 1 e x s y t a f c b k e c s s w w p w o p n n g
## 2 2 e b s w t l f c b n e c s s w w p w o p n n m
## 3 3 p x y w t p f c n n e e s s w w p w o p k s u
## 4 4 e x s g f n f w b k t e s s w w p w o e n a g
## 5 5 e x y y t a f c b n e c s s w w p w o p k n g
## 6 6 e b s w t a f c b g e c s s w w p w o p k n m
# Remove the first column, which serves as a redundant row counter.
mushdata$X <- NULL

# List the distinct values observed in each column. lapply() always returns a
# named list, whatever the number of distinct values per column.
# The knitted console output follows as ## comments.
lapply(mushdata, unique)
## $p
## [1] e p
## Levels: e p
##
## $x
## [1] x b s f k c
## Levels: b c f k s x
##
## $s
## [1] s y f g
## Levels: f g s y
##
## $n
## [1] y w g n e p b u c r
## Levels: b c e g n p r u w y
##
## $t
## [1] t f
## Levels: f t
##
## $p.1
## [1] a l p n f c y s m
## Levels: a c f l m n p s y
##
## $f
## [1] f a
## Levels: a f
##
## $c
## [1] c w
## Levels: c w
##
## $n.1
## [1] b n
## Levels: b n
##
## $k
## [1] k n g p w h u e b r y o
## Levels: b e g h k n o p r u w y
##
## $e
## [1] e t
## Levels: e t
##
## $e.1
## [1] c e b r ?
## Levels: ? b c e r
##
## $s.1
## [1] s f k y
## Levels: f k s y
##
## $s.2
## [1] s f y k
## Levels: f k s y
##
## $w
## [1] w g p n b e o c y
## Levels: b c e g n o p w y
##
## $w.1
## [1] w p g b n e y o c
## Levels: b c e g n o p w y
##
## $p.2
## [1] p
## Levels: p
##
## $w.2
## [1] w n o y
## Levels: n o w y
##
## $o
## [1] o t n
## Levels: n o t
##
## $p.3
## [1] p e l f n
## Levels: e f l n p
##
## $k.1
## [1] n k u h w r o y b
## Levels: b h k n o r u w y
##
## $s.3
## [1] n s a v y c
## Levels: a c n s v y
##
## $u
## [1] g m u d p w l
## Levels: d g l m p u w
# Give readable names to the class label and the six attributes analysed
# below. The positions are those of the data frame without the X row counter.
colnames(mushdata)[c(1, 2, 3, 4, 6, 22, 23)] <- c(
  "Edible", "CapShape", "CapSurface", "CapColor",
  "Odor", "Population", "Habitat"
)

# The data marks unknown values with "?" rather than NA (column e.1 has a "?"
# level), so is.na() alone cannot see them. Count the placeholder per column.
vapply(mushdata, function(col) sum(col == "?", na.rm = TRUE), integer(1))

# Check missing values: number of NA entries per column.
# The knitted console output follows as ## comments.
vapply(mushdata, function(col) sum(is.na(col)), integer(1))
## Edible CapShape CapSurface CapColor t Odor
## 0 0 0 0 0 0
## f c n.1 k e e.1
## 0 0 0 0 0 0
## s.1 s.2 w w.1 p.2 w.2
## 0 0 0 0 0 0
## o p.3 k.1 Population Habitat
## 0 0 0 0 0
# Replace the single-letter codes of the class label and the six analysed
# attributes with descriptive labels. Each recode() specification is a
# ";"-separated list of 'code'='label' pairs.
library(car)

# Class label.
mushdata$Edible<-recode(mushdata$Edible,
"'e'='edible';
'p'='poisonous'")

# Cap shape.
mushdata$CapShape<-recode(mushdata$CapShape,
"'b'='bell';
'c'='conical';
'x'='convex';
'f'='flat';
'k'='knobbed';
's'='sunken'")

# Cap surface.
mushdata$CapSurface<-recode(mushdata$CapSurface,
"'f'='fibrous';
'g'='grooves';
'y'='scaly';
's'='smooth'")

# Cap color.
mushdata$CapColor<-recode(mushdata$CapColor,
"'n'='brown';
'b'='buff';
'c'='cinnamon';
'g'='gray';
'r'='green';
'p'='pink';
'u'='purple';
'e'='red';
'w'='white';
'y'='yellow'")

# Odor.
mushdata$Odor<-recode(mushdata$Odor,
"'a'='almond';
'l'='anise';
'c'='creosote';
'y'='fishy';
'f'='foul';
'm'='musty';
'n'='none';
'p'='pungent';
's'='spicy'")

# Population.
mushdata$Population<-recode(mushdata$Population,
"'a'='abundant';
'c'='clustered';
'n'='numerous';
's'='scattered';
'v'='several';
'y'='solitary'")

# Habitat.
mushdata$Habitat<-recode(mushdata$Habitat,
"'g'='grasses';
'l'='leaves';
'm'='meadows';
'p'='paths';
'u'='urban';
'w'='waste';
'd'='woods'")

# Re-check for NA values: a code missing from a specification above would
# otherwise go unnoticed. The knitted console output follows as ## comments.
vapply(mushdata, function(col) sum(is.na(col)), integer(1))
## Edible CapShape CapSurface CapColor t Odor
## 0 0 0 0 0 0
## f c n.1 k e e.1
## 0 0 0 0 0 0
## s.1 s.2 w w.1 p.2 w.2
## 0 0 0 0 0 0
## o p.3 k.1 Population Habitat
## 0 0 0 0 0
# Keep only the class label and the six attributes compared below.
mushdata.sub <- mushdata[, c("Edible", "CapShape", "CapSurface", "CapColor",
                             "Odor", "Population", "Habitat")]

# Split the six attribute columns (positions 2 to 7) by class label.
Edible.Pois <- split(mushdata.sub[2:7], mushdata$Edible)
pois <- as.data.frame(Edible.Pois$poisonous)
edib <- as.data.frame(Edible.Pois$edible)

# Frequency of each attribute value among the poisonous mushrooms ...
CS.p <- table(pois$CapShape)
CSur.p <- table(pois$CapSurface)
CC.p <- table(pois$CapColor)
OD.p <- table(pois$Odor)
POP.p <- table(pois$Population)
HB.p <- table(pois$Habitat)

# ... and among the edible mushrooms.
CS.e <- table(edib$CapShape)
CSur.e <- table(edib$CapSurface)
CC.e <- table(edib$CapColor)
OD.e <- table(edib$Odor)
POP.e <- table(edib$Population)
HB.e <- table(edib$Habitat)

# Combine two one-way frequency tables into one data frame of within-class
# proportions.
#
# pois_counts: table of attribute values among poisonous mushrooms.
# edib_counts: table of attribute values among edible mushrooms.
# Returns a data frame with columns Quality (attribute value), Pois and Edib
# (share of that value within each class).
class_proportions <- function(pois_counts, edib_counts) {
  props <- merge(
    as.data.frame(prop.table(pois_counts)),
    as.data.frame(prop.table(edib_counts)),
    by = "Var1",
    # Keep values that occur in only one class instead of dropping them.
    all = TRUE
  )
  names(props) <- c("Quality", "Pois", "Edib")
  # A value that never occurs in a class has proportion zero there.
  props$Pois[is.na(props$Pois)] <- 0
  props$Edib[is.na(props$Edib)] <- 0
  props
}

Prop.CapShape <- class_proportions(CS.p, CS.e)
Prop.CapSurface <- class_proportions(CSur.p, CSur.e)
Prop.CapColor <- class_proportions(CC.p, CC.e)
Prop.Odor <- class_proportions(OD.p, OD.e)
Prop.Population <- class_proportions(POP.p, POP.e)
Prop.Habitat <- class_proportions(HB.p, HB.e)

# ggplot2 draws the comparison plots that follow.
library(ggplot2)
# Plot the within-class proportion of every value of one attribute.
#
# props:     data frame with columns Quality, Edib and Pois.
# attribute: attribute name used in the plot title, e.g. "Cap Shape".
# Returns the ggplot object; a top-level call prints it.
plot_class_proportions <- function(props, attribute) {
  ggplot(props) +
    # Shape codes 69 and 80 are the character codes of "E" and "P", so edible
    # points are drawn as a green E and poisonous points as a red P.
    geom_point(aes(Quality, Edib), shape = 69, fill = "darkgreen",
               color = "green", size = 3, alpha = .8) +
    geom_point(aes(Quality, Pois), shape = 80, fill = "darkred",
               color = "red", size = 3, alpha = .8) +
    theme_bw() +
    theme(panel.border = element_blank(),
          axis.line = element_line(colour = "black")) +
    ggtitle(
      paste0("Mushroom ", attribute, ", Edible vs Poisonous"),
      subtitle = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
    ) +
    xlab("Quality") +
    ylab("Percent")
}

# One plot per attribute, each titled after the attribute it shows.
plot_class_proportions(Prop.CapShape, "Cap Shape")
plot_class_proportions(Prop.CapColor, "Cap Color")
plot_class_proportions(Prop.CapSurface, "Cap Surface")
plot_class_proportions(Prop.Habitat, "Habitat")
plot_class_proportions(Prop.Odor, "Odor")
plot_class_proportions(Prop.Population, "Population")

# Link to repository .Rmd file: https://github.com/sigmasigmaiota/mushroom/blob/master/Assignment1_StephenJones.Rmd
# Direct link to this .Rmd: https://raw.githubusercontent.com/sigmasigmaiota/mushroom/master/Assignment1_StephenJones.Rmd