Assignment Instructions

Mushrooms Dataset. A famous—if slightly moldy—dataset about mushrooms can be found in the UCI repository here: https://archive.ics.uci.edu/ml/datasets/Mushroom.

Your task is to study the dataset and the associated description of the data (i.e. “data dictionary”). You may need to look around a bit, but it’s there! You should take the data, and create a data frame with a subset of the columns in the dataset. You should include the column that indicates edible or poisonous and three or four other columns. You should also add meaningful column names and replace the abbreviations used in the data—for example, in the appropriate column, “e” might become “edible.” Your deliverable is the R code to perform these transformation tasks.

Please place your solution into a single R Markdown (.Rmd) file and publish your solution out to rpubs.com. You should post the .Rmd file in your GitHub repository, and provide the appropriate URLs to your GitHub repository and your rpubs.com file in your assignment link. You should also have the original data file accessible through your code—for example, stored in a GitHub repository and referenced in your code.

Load the Data into R

library(RCurl) 
## Loading required package: bitops
# Download the raw, header-less CSV from the GitHub repository as one string.
x <- getURL(
  "https://raw.githubusercontent.com/betsyrosalen/DATA_607_Data_Acquisition_and_Management/master/Assignment1/Mushrooms/agaricus-lepiota.csv"
)
# Parse the downloaded text. read.csv() already returns a data frame, so no
# extra data.frame() wrapper is needed. Since R 4.0 strings are no longer
# converted to factors by default; request factors explicitly because the
# levels() calls later in this document rely on factor columns.
mushrooms <- read.csv(text = x, header = FALSE, stringsAsFactors = TRUE)
dim(mushrooms)
## [1] 8124   23
head(mushrooms, 3)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m

Create Header Row

# Read the data dictionary from GitHub. Fields are split on ":" and the first
# field becomes the row name (row.names = 1); quote = "" turns off quote
# handling so quote characters in the text are read literally.
# stringsAsFactors = FALSE keeps the remaining text column as plain character
# strings on every R version instead of a factor.
mushrooms_dictionary <- read.table(
  "https://raw.githubusercontent.com/betsyrosalen/DATA_607_Data_Acquisition_and_Management/master/Assignment1/Mushrooms/Data_Dictionary.txt",
  row.names = 1,
  sep = ":",
  header = FALSE,
  quote = "",
  stringsAsFactors = FALSE
)
row.names(mushrooms_dictionary)
##  [1] "classes"                  "cap-shape"               
##  [3] "cap-surface"              "cap-color"               
##  [5] "bruises?"                 "odor"                    
##  [7] "gill-attachment"          "gill-spacing"            
##  [9] "gill-size"                "gill-color"              
## [11] "stalk-shape"              "stalk-root"              
## [13] "stalk-surface-above-ring" "stalk-surface-below-ring"
## [15] "stalk-color-above-ring"   "stalk-color-below-ring"  
## [17] "veil-type"                "veil-color"              
## [19] "ring-number"              "ring-type"               
## [21] "spore-print-color"        "population"              
## [23] "habitat"
# The dictionary must describe exactly one attribute per data column;
# otherwise the assignment below would mislabel columns or leave NA names.
stopifnot(nrow(mushrooms_dictionary) == ncol(mushrooms))
# Use the attribute names from the dictionary as the column headers.
names(mushrooms) <- row.names(mushrooms_dictionary)
head(mushrooms, 3)
##   classes cap-shape cap-surface cap-color bruises? odor gill-attachment
## 1       p         x           s         n        t    p               f
## 2       e         x           s         y        t    a               f
## 3       e         b           s         w        t    l               f
##   gill-spacing gill-size gill-color stalk-shape stalk-root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
##   stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
##   stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
##   spore-print-color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m

Subset the Data for Class, Gill Characteristics, and Spore Color

# Show the codes used in the class column.
levels(mushrooms[["classes"]])
## [1] "e" "p"
# Keep the class (column 1), the four gill attributes (columns 7-10) and the
# spore print color (column 21).
gills_spores <- mushrooms[, c(1, 7, 8, 9, 10, 21)]
dim(gills_spores)
## [1] 8124    6

Rename Data Levels in Subsetted Dataframe

I commented out the code that prints the values in the dictionary. I spent hours on this section and couldn’t figure out how to print or return just the value at a given position without also printing the list of all 21 factor levels each time, or how to use that value directly to rename the factor levels in the gills_spores data frame.
# Look up the dictionary text for each column kept in gills_spores
# (dictionary rows 1, 7-10 and 21).
v1 <- mushrooms_dictionary[1, "V2"]
v7 <- mushrooms_dictionary[7, "V2"]
v8 <- mushrooms_dictionary[8, "V2"]
v9 <- mushrooms_dictionary[9, "V2"]
v10 <- mushrooms_dictionary[10, "V2"]
v21 <- mushrooms_dictionary[21, "V2"]
# Printing of the looked-up values is left disabled.
#v1
#v7
#v8
#v9
#v10
#v21

# Recode the single-letter abbreviations into full names (name = code).
# factor(levels =, labels =) is used instead of `levels<-` so the recoding
# also works when the columns were read as character vectors (the read.csv()
# default since R 4.0); `levels<-` on a character vector only attaches an
# attribute and leaves the codes untouched. Codes that are listed here but
# never occur in the data are kept as empty levels.
recode_map <- list(
  "classes" = c("edible" = "e", "poisonous" = "p"),
  "gill-attachment" = c(
    "attached" = "a", "descending" = "d", "free" = "f", "notched" = "n"
  ),
  "gill-spacing" = c("close" = "c", "crowded" = "w", "distant" = "d"),
  "gill-size" = c("broad" = "b", "narrow" = "n"),
  "gill-color" = c(
    "black" = "k", "brown" = "n", "buff" = "b", "chocolate" = "h",
    "gray" = "g", "green" = "r", "orange" = "o", "pink" = "p",
    "purple" = "u", "red" = "e", "white" = "w", "yellow" = "y"
  ),
  "spore-print-color" = c(
    "black" = "k", "brown" = "n", "buff" = "b", "chocolate" = "h",
    "green" = "r", "orange" = "o", "purple" = "u", "white" = "w",
    "yellow" = "y"
  )
)
for (column in names(recode_map)) {
  codes <- recode_map[[column]]
  # Level order follows the order of the lookup table, not the data.
  gills_spores[[column]] <- factor(
    gills_spores[[column]],
    levels = unname(codes),
    labels = names(codes)
  )
}
head(gills_spores)
##     classes gill-attachment gill-spacing gill-size gill-color
## 1 poisonous            free        close    narrow      black
## 2    edible            free        close     broad      black
## 3    edible            free        close     broad      brown
## 4 poisonous            free        close    narrow      brown
## 5    edible            free      crowded     broad      black
## 6    edible            free        close     broad      brown
##   spore-print-color
## 1             black
## 2             brown
## 3             brown
## 4             black
## 5             brown
## 6             black

Further Subset the Data by Class

# Split the recoded data into one data frame per class. which() discards NA
# comparison results, so rows are selected exactly as subset() selects them.
edible <- gills_spores[which(gills_spores$classes == "edible"), ]
poisonous <- gills_spores[which(gills_spores$classes == "poisonous"), ]
dim(edible)
## [1] 4208    6
dim(poisonous)
## [1] 3916    6
# Per-column level counts for each class.
summary(edible)
##       classes       gill-attachment  gill-spacing   gill-size   
##  edible   :4208   attached  : 192   close  :3008   broad :3920  
##  poisonous:   0   descending:   0   crowded:1200   narrow: 288  
##                   free      :4016   distant:   0                
##                   notched   :   0                               
##                                                                 
##                                                                 
##                                                                 
##    gill-color  spore-print-color
##  white  :956   brown    :1744   
##  brown  :936   black    :1648   
##  pink   :852   white    : 576   
##  purple :444   buff     :  48   
##  black  :344   chocolate:  48   
##  gray   :248   orange   :  48   
##  (Other):428   (Other)  :  96
summary(poisonous)
##       classes       gill-attachment  gill-spacing   gill-size   
##  edible   :   0   attached  :  18   close  :3804   broad :1692  
##  poisonous:3916   descending:   0   crowded: 112   narrow:2224  
##                   free      :3898   distant:   0                
##                   notched   :   0                               
##                                                                 
##                                                                 
##                                                                 
##      gill-color   spore-print-color
##  buff     :1728   white    :1812   
##  pink     : 640   chocolate:1584   
##  chocolate: 528   black    : 224   
##  gray     : 504   brown    : 224   
##  white    : 246   green    :  72   
##  brown    : 112   buff     :   0   
##  (Other)  : 158   (Other)  :   0

Plot Some Data!

Spore Print Color of Edible Mushrooms

# Bar chart of spore print color counts for edible mushrooms; las = 3 draws
# the axis labels perpendicular to the axis.
plot(edible[["spore-print-color"]], las = 3)

Spore Print Color of Poisonous Mushrooms

# Bar chart of spore print color counts for poisonous mushrooms; las = 3
# draws the axis labels perpendicular to the axis.
plot(poisonous[["spore-print-color"]], las = 3)

Gill Color of Edible Mushrooms

# Bar chart of gill color counts for edible mushrooms; las = 3 draws the
# axis labels perpendicular to the axis.
plot(edible[["gill-color"]], las = 3)

Gill Color of Poisonous Mushrooms

# Bar chart of gill color counts for poisonous mushrooms; las = 3 draws the
# axis labels perpendicular to the axis.
plot(poisonous[["gill-color"]], las = 3)

There’s probably a better plot for this, but it was the best I could do for now…

I am trying to show how the combination of gill color and spore print color relates to whether a mushroom is edible.

library(ggplot2)
# Jittered scatter of gill color against spore print color, one point per
# mushroom, colored by class. Jitter spreads out points that would otherwise
# overlap at each pair of categories.
ggplot(
  gills_spores,
  aes(x = `gill-color`, y = `spore-print-color`, color = classes)
) +
  geom_jitter()