CUNY 607 Week 1 Assignment

Importing the Mushroom Dataset from GitHub:

Load Libraries:

library(RCurl)
## Loading required package: bitops
# Read the mushroom CSV straight from GitHub. The file is read with
# header = FALSE, so descriptive column names are attached afterwards.
mushroom <- read.csv(
  file = "https://raw.githubusercontent.com/jcp9010/MSDA/master/agaricus-lepiota.data.csv",
  header = FALSE
)

# One name per column, in file order.
mushroom.names <- c(
  "Deadly",
  "Cap.Shape", "Cap.Surface", "Cap.Color",
  "Bruises?", "Odor",
  "Gill.Attachment", "Gill.Spacing", "Gill.Size", "Gill.Color",
  "Stalk.Shape", "Stalk.Root",
  "Stalk.Surface.Above.Ring", "Stalk.Surface.Below.Ring",
  "Stalk.Color.Above.Ring", "Stalk.Color.Below.Ring",
  "Veil.Type", "Veil.Color",
  "Ring.Number", "Ring.Type",
  "Spore.Print.Color", "Population", "Habitat"
)
names(mushroom) <- mushroom.names

Take a subset of columns in the dataset. This should include the column that indicates edible or poisonous and three or four other columns.

# Keep the edible/poisonous indicator plus four descriptive attributes.
mushroom.subset <- mushroom[
  ,
  c("Deadly", "Cap.Shape", "Cap.Color", "Odor", "Stalk.Shape"),
  drop = FALSE
]

To start converting the letters in the data.frame into something more readable, I replaced each single-letter code with its full word, one column at a time.

# Work on a copy whose columns are all plain character vectors, so the
# single-letter codes can be overwritten with arbitrary label strings.
mushroom.subset2 <- as.data.frame(
  lapply(mushroom.subset, as.character),
  stringsAsFactors = FALSE
)

# Recode the single-letter class codes in `Deadly` to readable labels.
#
# This replaces the former row-by-row for loop with a named lookup vector that
# is indexed by the whole column at once. That avoids thousands of per-cell
# data.frame assignments and the `1:length(...)` sequence, which iterates over
# c(1, 0) when the data frame has no rows.
#
# Every code is listed explicitly: a value that is not in the lookup becomes NA
# instead of being silently labelled "Edible" by a catch-all `else`.
deadly.labels <- c(p = "Poisonous", e = "Edible")
mushroom.subset2$Deadly <- unname(
  deadly.labels[as.character(mushroom.subset2$Deadly)]
)

# Recode the single-letter codes in `Cap.Shape` to readable labels.
#
# This replaces the former row-by-row for loop with a named lookup vector that
# is indexed by the whole column at once. That avoids thousands of per-cell
# data.frame assignments and the `1:length(...)` sequence, which iterates over
# c(1, 0) when the data frame has no rows.
#
# Every code is listed explicitly ("s" is the dataset's code for sunken): a
# value that is not in the lookup becomes NA instead of being silently
# labelled "Sunken" by a catch-all `else`.
cap.shape.labels <- c(
  b = "Bell",
  c = "Conical",
  x = "Convex",
  f = "Flat",
  k = "Knobbed",
  s = "Sunken"
)
mushroom.subset2$Cap.Shape <- unname(
  cap.shape.labels[as.character(mushroom.subset2$Cap.Shape)]
)

# Recode the single-letter codes in `Cap.Color` to readable labels.
#
# This replaces the former row-by-row for loop with a named lookup vector that
# is indexed by the whole column at once. That avoids thousands of per-cell
# data.frame assignments and the `1:length(...)` sequence, which iterates over
# c(1, 0) when the data frame has no rows.
#
# Every code is listed explicitly ("y" is the dataset's code for yellow): a
# value that is not in the lookup becomes NA instead of being silently
# labelled "Yellow" by a catch-all `else`.
cap.color.labels <- c(
  n = "Brown",
  b = "Buff",
  c = "Cinnamon",
  g = "Gray",
  r = "Green",
  p = "Pink",
  u = "Purple",
  e = "Red",
  w = "White",
  y = "Yellow"
)
mushroom.subset2$Cap.Color <- unname(
  cap.color.labels[as.character(mushroom.subset2$Cap.Color)]
)

# Recode the single-letter codes in `Odor` to readable labels.
#
# This replaces the former row-by-row for loop with a named lookup vector that
# is indexed by the whole column at once. That avoids thousands of per-cell
# data.frame assignments and the `1:length(...)` sequence, which iterates over
# c(1, 0) when the data frame has no rows.
#
# Every code is listed explicitly ("s" is the dataset's code for spicy): a
# value that is not in the lookup becomes NA instead of being silently
# labelled "Spicy" by a catch-all `else`.
odor.labels <- c(
  a = "Almond",
  l = "Anise",
  c = "Creosote",
  y = "Fishy",
  f = "Foul",
  m = "Musty",
  n = "None",
  p = "Pungent",
  s = "Spicy"
)
mushroom.subset2$Odor <- unname(
  odor.labels[as.character(mushroom.subset2$Odor)]
)

# Recode the single-letter codes in `Stalk.Shape` to readable labels.
#
# This replaces the former row-by-row for loop with a named lookup vector that
# is indexed by the whole column at once. That avoids thousands of per-cell
# data.frame assignments and the `1:length(...)` sequence, which iterates over
# c(1, 0) when the data frame has no rows.
#
# Every code is listed explicitly ("t" is the dataset's code for tapering): a
# value that is not in the lookup becomes NA instead of being silently
# labelled "Tapering" by a catch-all `else`.
stalk.shape.labels <- c(e = "Enlarging", t = "Tapering")
mushroom.subset2$Stalk.Shape <- unname(
  stalk.shape.labels[as.character(mushroom.subset2$Stalk.Shape)]
)

Below is a printout of mushroom.subset2, using the head() function to limit the output to the first ten rows so that it does not overwhelm your screen.

# Show only the first ten recoded rows.
print(x = head(x = mushroom.subset2, n = 10))
##       Deadly Cap.Shape Cap.Color    Odor Stalk.Shape
## 1  Poisonous    Convex     Brown Pungent   Enlarging
## 2     Edible    Convex    Yellow  Almond   Enlarging
## 3     Edible      Bell     White   Anise   Enlarging
## 4  Poisonous    Convex     White Pungent   Enlarging
## 5     Edible    Convex      Gray    None    Tapering
## 6     Edible    Convex    Yellow  Almond   Enlarging
## 7     Edible      Bell     White  Almond   Enlarging
## 8     Edible      Bell     White   Anise   Enlarging
## 9  Poisonous    Convex     White Pungent   Enlarging
## 10    Edible      Bell    Yellow  Almond   Enlarging

However, we run into a problem: we have essentially converted all of the data.frame's factors into characters, which makes data analysis somewhat more difficult.

# Summarise the columns as they currently stand in mushroom.subset2.
print(x = summary(object = mushroom.subset2))
##     Deadly           Cap.Shape          Cap.Color        
##  Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      Odor           Stalk.Shape       
##  Length:8124        Length:8124       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

To convert the characters back into factors, I will use the “dplyr” package.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Turn every character column into a factor so that summary() reports
# per-level counts.
# NOTE(review): newer dplyr releases favour mutate(across(where(...), ...))
# over mutate_if(); confirm the installed dplyr version before switching.
mushroom.subset3 <- mutate_if(mushroom.subset2, is.character, as.factor)
print(x = summary(object = mushroom.subset3))
##        Deadly       Cap.Shape      Cap.Color         Odor     
##  Edible   :4208   Bell   : 452   Brown  :2284   None   :3528  
##  Poisonous:3916   Conical:   4   Gray   :1840   Foul   :2160  
##                   Convex :3656   Red    :1500   Fishy  : 576  
##                   Flat   :3152   Yellow :1072   Spicy  : 576  
##                   Knobbed: 828   White  :1040   Almond : 400  
##                   Sunken :  32   Buff   : 168   Anise  : 400  
##                                  (Other): 220   (Other): 484  
##     Stalk.Shape  
##  Enlarging:3516  
##  Tapering :4608  
##                  
##                  
##                  
##                  
## 

As you can see, this data is now more usable for creating even further subsets for analysis.

Visualization of the Data

# Bar chart of the raw count of mushrooms for each cap shape.
barplot(
  height = table(mushroom.subset3$Cap.Shape),
  main = "Mushroom Cap Shapes"
)

Frequency Bar Plot

# Count mushrooms per cap shape, then divide each count by the total number
# of rows to get the share of the dataset that each shape represents.
Shape.Table <- table(mushroom.subset3$Cap.Shape)
Shape.Table.Ratios <- Shape.Table / nrow(mushroom.subset3)
print(x = Shape.Table.Ratios)
## 
##         Bell      Conical       Convex         Flat      Knobbed 
## 0.0556376169 0.0004923683 0.4500246184 0.3879862137 0.1019202363 
##       Sunken 
## 0.0039389463
# Same bar chart as before, but with each shape's share of all rows on the
# y-axis instead of its raw count.
barplot(
  height = Shape.Table.Ratios,
  main = "Mushroom Cap Shapes Frequency Bar Plot"
)

Pie Chart

# Pie chart of the cap-shape ratios computed in Shape.Table.Ratios.
pie(
  x = Shape.Table.Ratios,
  main = "Mushroom Cap Shapes Pie Chart"
)