Assignment1

Stephen Jones

January 31, 2019

Choose from the tabs below to view the code and links to the files on GitHub.

Assignment 1

#the workspace is no longer cleared here: rm(list=ls()) wipes the caller's
#objects without giving a truly clean session (packages and options persist).
getwd()
## [1] "C:/Users/Stephen Jones/Documents"
#need data.table for fread.
suppressWarnings(library(data.table))
#download .data file from url, write to local drive, then upload to github.
#the raw file has no header row and every field is a letter code, so fread's
#header auto-detection treats the first record as column names; header=FALSE
#keeps that record as data (columns are then named V1, V2, ...).
mushdata<-fread("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",header=FALSE)
#row names are written on purpose: they become the "X" column that is dropped
#again after the file is read back.
write.csv(mushdata,'agaricus-lepiota.csv')
#download data from github
#NOTE(review): the copy on github was written before the header fix (its
#column names are the first record); re-upload the file written above so the
#missing row is restored. Later steps rename columns by position, so they work
#with either set of column names.
library(RCurl)
#stringsAsFactors=TRUE keeps the columns as factors on R >= 4.0 as well, which
#is what the summary and level listings further down rely on.
mushdata<-read.csv(text=getURL("https://raw.githubusercontent.com/sigmasigmaiota/mushroom/master/agaricus-lepiota.csv"),stringsAsFactors=TRUE)

#examine data: summarise every column of the data frame read from github
#(the output below lists how often each letter code occurs per column).
summary(mushdata)
##        X        p        x        s              n        t       
##  Min.   :   1   e:4208   b: 452   f:2320   n      :2283   f:4748  
##  1st Qu.:2032   p:3915   c:   4   g:   4   g      :1840   t:3375  
##  Median :4062            f:3152   s:2555   e      :1500           
##  Mean   :4062            k: 828   y:3244   y      :1072           
##  3rd Qu.:6092            s:  32            w      :1040           
##  Max.   :8123            x:3655            b      : 168           
##                                            (Other): 220           
##       p.1       f        c        n.1            k        e       
##  n      :3528   a: 210   c:6811   b:5612   b      :1728   e:3515  
##  f      :2160   f:7913   w:1312   n:2511   p      :1492   t:4608  
##  s      : 576                              w      :1202           
##  y      : 576                              n      :1048           
##  a      : 400                              g      : 752           
##  l      : 400                              h      : 732           
##  (Other): 483                              (Other):1169           
##  e.1      s.1      s.2            w             w.1       p.2     
##  ?:2480   f: 552   f: 600   w      :4463   w      :4383   p:8123  
##  b:3776   k:2372   k:2304   p      :1872   p      :1872           
##  c: 556   s:5175   s:4935   g      : 576   g      : 576           
##  e:1119   y:  24   y: 284   n      : 448   n      : 512           
##  r: 192                     b      : 432   b      : 432           
##                             o      : 192   o      : 192           
##                             (Other): 140   (Other): 156           
##  w.2      o        p.3           k.1       s.3      u       
##  n:  96   n:  36   e:2776   w      :2388   a: 384   d:3148  
##  o:  96   o:7487   f:  48   n      :1968   c: 340   g:2148  
##  w:7923   t: 600   l:1296   k      :1871   n: 400   l: 832  
##  y:   8            n:  36   h      :1632   s:1247   m: 292  
##                    p:3967   r      :  72   v:4040   p:1144  
##                             b      :  48   y:1712   u: 367  
##                             (Other): 144            w: 192
#preview the first rows; the leading X column is the row counter that
#write.csv stored alongside the data.
head(mushdata)
##   X p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 1 e x s y t   a f c   b k e   c   s   s w   w   p   w o   p   n   n g
## 2 2 e b s w t   l f c   b n e   c   s   s w   w   p   w o   p   n   n m
## 3 3 p x y w t   p f c   n n e   e   s   s w   w   p   w o   p   k   s u
## 4 4 e x s g f   n f w   b k t   e   s   s w   w   p   w o   e   n   a g
## 5 5 e x y y t   a f c   b n e   c   s   s w   w   p   w o   p   k   n g
## 6 6 e b s w t   a f c   b g e   c   s   s w   w   p   w o   p   k   n m
# Column X merely repeats the row number, so discard it.
mushdata[["X"]] <- NULL
Using the codebook as a reference, list the unique values in each column to verify that the variables match the codebook.
#list the distinct codes found in each column. lapply always returns a named
#list (one element per column); sapply would instead collapse the result to a
#matrix whenever every column happened to have the same number of codes.
lapply(mushdata, unique)
## $p
## [1] e p
## Levels: e p
## 
## $x
## [1] x b s f k c
## Levels: b c f k s x
## 
## $s
## [1] s y f g
## Levels: f g s y
## 
## $n
##  [1] y w g n e p b u c r
## Levels: b c e g n p r u w y
## 
## $t
## [1] t f
## Levels: f t
## 
## $p.1
## [1] a l p n f c y s m
## Levels: a c f l m n p s y
## 
## $f
## [1] f a
## Levels: a f
## 
## $c
## [1] c w
## Levels: c w
## 
## $n.1
## [1] b n
## Levels: b n
## 
## $k
##  [1] k n g p w h u e b r y o
## Levels: b e g h k n o p r u w y
## 
## $e
## [1] e t
## Levels: e t
## 
## $e.1
## [1] c e b r ?
## Levels: ? b c e r
## 
## $s.1
## [1] s f k y
## Levels: f k s y
## 
## $s.2
## [1] s f y k
## Levels: f k s y
## 
## $w
## [1] w g p n b e o c y
## Levels: b c e g n o p w y
## 
## $w.1
## [1] w p g b n e y o c
## Levels: b c e g n o p w y
## 
## $p.2
## [1] p
## Levels: p
## 
## $w.2
## [1] w n o y
## Levels: n o w y
## 
## $o
## [1] o t n
## Levels: n o t
## 
## $p.3
## [1] p e l f n
## Levels: e f l n p
## 
## $k.1
## [1] n k u h w r o y b
## Levels: b h k n o r u w y
## 
## $s.3
## [1] n s a v y c
## Levels: a c n s v y
## 
## $u
## [1] g m u d p w l
## Levels: d g l m p u w
I’d like to examine the data describing traits observable to pedestrians or hikers, renaming those variables which have been verified with the codebook.
# Rename, by position, the columns that were verified against the codebook;
# the positions and the new names are paired in order.
colnames(mushdata)[c(1, 2, 3, 4, 6, 22, 23)] <- c(
  "Edible", "CapShape", "CapSurface", "CapColor",
  "Odor", "Population", "Habitat"
)
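Check for missing values before recoding. No cells are NA; note, however, that column e.1 contains 2480 “?” entries (see the summary above), which is.na() does not count.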
#Check missing values
#count the NA cells in each column; vapply guarantees exactly one integer per
#column, whereas the shape of sapply's result depends on its input.
#NOTE(review): column e.1 holds "?" codes that is.na() does not flag - confirm
#against the codebook whether they mark missing values.
vapply(mushdata, function(x) sum(is.na(x)), integer(1))
##     Edible   CapShape CapSurface   CapColor          t       Odor 
##          0          0          0          0          0          0 
##          f          c        n.1          k          e        e.1 
##          0          0          0          0          0          0 
##        s.1        s.2          w        w.1        p.2        w.2 
##          0          0          0          0          0          0 
##          o        p.3        k.1 Population    Habitat 
##          0          0          0          0          0
Recode the single-letter codes as descriptive labels using the recode() function from the “car” package.
library(car)

# One recode specification per renamed column, keyed by column name. Each
# specification maps the single-letter codes to descriptive labels.
recode_specs <- list(
  Edible = "'e'='edible';
                        'p'='poisonous'",
  CapShape = "'b'='bell';
                        'c'='conical';
                        'x'='convex';
                        'f'='flat';
                        'k'='knobbed';
                        's'='sunken'",
  CapSurface = "'f'='fibrous';
                        'g'='grooves';
                        'y'='scaly';
                        's'='smooth'",
  CapColor = "'n'='brown';
                        'b'='buff';
                        'c'='cinnamon';
                        'g'='gray';
                        'r'='green';
                        'p'='pink';
                        'u'='purple';
                        'e'='red';
                        'w'='white';
                        'y'='yellow'",
  Odor = "'a'='almond';
                        'l'='anise';
                        'c'='creosote';
                        'y'='fishy';
                        'f'='foul';
                        'm'='musty';
                        'n'='none';
                        'p'='pungent';
                        's'='spicy'",
  Population = "'a'='abundant';
                        'c'='clustered';
                        'n'='numerous';
                        's'='scattered';
                        'v'='several';
                        'y'='solitary'",
  Habitat = "'g'='grasses';
                         'l'='leaves';
                         'm'='meadows';
                         'p'='paths';
                         'u'='urban';
                         'w'='waste';
                         'd'='woods'"
)

# Apply each specification to its column, in the order listed above.
for (column in names(recode_specs)) {
  mushdata[[column]] <- recode(mushdata[[column]], recode_specs[[column]])
}
Check missing values after recoding.
#recount the NA cells per column: a code that the recode step turned into NA
#would show up here as a non-zero count. vapply guarantees exactly one integer
#per column, whereas the shape of sapply's result depends on its input.
vapply(mushdata, function(x) sum(is.na(x)), integer(1))
##     Edible   CapShape CapSurface   CapColor          t       Odor 
##          0          0          0          0          0          0 
##          f          c        n.1          k          e        e.1 
##          0          0          0          0          0          0 
##        s.1        s.2          w        w.1        p.2        w.2 
##          0          0          0          0          0          0 
##          o        p.3        k.1 Population    Habitat 
##          0          0          0          0          0
Create subset with selected variables.
# Keep the outcome column followed by the six renamed trait columns.
mushdata.sub <- mushdata[, c(
  "Edible", "CapShape", "CapSurface", "CapColor",
  "Odor", "Population", "Habitat"
)]
Split the data frame by edibility, in preparation for comparing edible and poisonous mushrooms in the plots.
# Split the six trait columns (columns 2-7 of mushdata.sub) by edibility so
# that each group can be tabulated on its own.
Edible.Pois <- split(mushdata.sub[2:7], mushdata.sub$Edible)
pois <- as.data.frame(Edible.Pois$poisonous)
edib <- as.data.frame(Edible.Pois$edible)

# Counts of every trait level within the poisonous group ...
CS.p <- table(pois$CapShape)
CSur.p <- table(pois$CapSurface)
CC.p <- table(pois$CapColor)
OD.p <- table(pois$Odor)
POP.p <- table(pois$Population)
HB.p <- table(pois$Habitat)
# ... and within the edible group.
CS.e <- table(edib$CapShape)
CSur.e <- table(edib$CapSurface)
CC.e <- table(edib$CapColor)
OD.e <- table(edib$Odor)
POP.e <- table(edib$Population)
HB.e <- table(edib$Habitat)

# Turn a pair of count tables (poisonous, edible) for one trait into a data
# frame with one row per trait level and the columns Quality, Pois and Edib,
# where Pois/Edib are the within-group proportions of that level.
compare_props <- function(pois_counts, edib_counts) {
  props <- merge(
    as.data.frame(prop.table(pois_counts)),
    as.data.frame(prop.table(edib_counts)),
    by = "Var1",
    # Keep levels that occur in only one group; the default inner join would
    # silently drop them whenever the trait columns are not factors.
    all = TRUE
  )
  names(props) <- c("Quality", "Pois", "Edib")
  # A level absent from a group has proportion zero there, not NA.
  props$Pois[is.na(props$Pois)] <- 0
  props$Edib[is.na(props$Edib)] <- 0
  props
}

Prop.CapShape <- compare_props(CS.p, CS.e)
Prop.CapSurface <- compare_props(CSur.p, CSur.e)
Prop.CapColor <- compare_props(CC.p, CC.e)
Prop.Odor <- compare_props(OD.p, OD.e)
Prop.Population <- compare_props(POP.p, POP.e)
Prop.Habitat <- compare_props(HB.p, HB.e)
library(ggplot2)

# Draw the edible and poisonous proportions of one trait on a shared axis.
#   props: data frame with the columns Quality, Pois and Edib.
#   trait: trait name used in the plot title, e.g. "Cap Shape".
# Returns the ggplot object; calling it at top level prints the plot.
plot_trait_props <- function(props, trait) {
  ggplot(props) +
    # Shapes 69 and 80 are the character codes for "E" and "P", so every point
    # is drawn as the initial of its group. Character shapes have no fill,
    # hence only a colour is set.
    geom_point(aes(Quality, Edib), shape = 69, color = "green",
               size = 3, alpha = .8) +
    geom_point(aes(Quality, Pois), shape = 80, color = "red",
               size = 3, alpha = .8) +
    theme_bw() +
    theme(panel.border = element_blank(),
          axis.line = element_line(colour = "black")) +
    # The title names the trait actually plotted; previously every plot after
    # the second carried the "Cap Color" title.
    ggtitle(paste0("Mushroom ", trait, ", Edible vs Poisonous"),
            subtitle = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data") +
    xlab("Quality") +
    # The plotted values are within-group proportions (0-1), not percentages.
    ylab("Proportion")
}

plot_trait_props(Prop.CapShape, "Cap Shape")

plot_trait_props(Prop.CapColor, "Cap Color")

plot_trait_props(Prop.CapSurface, "Cap Surface")

plot_trait_props(Prop.Habitat, "Habitat")

plot_trait_props(Prop.Odor, "Odor")

plot_trait_props(Prop.Population, "Population")

Links

Link to repository .Rmd file: https://github.com/sigmasigmaiota/mushroom/blob/master/Assignment1_StephenJones.Rmd

Direct link to this .Rmd: https://raw.githubusercontent.com/sigmasigmaiota/mushroom/master/Assignment1_StephenJones.Rmd