CUNY MSDS Data 607

Your task is to study the dataset and the associated description of the data (i.e. “data dictionary”). You may need to look around a bit, but it’s there! You should take the data, and create a data frame with a subset of the columns in the dataset. You should include the column that indicates edible or poisonous and three or four other columns. You should also add meaningful column names and replace the abbreviations used in the data—for example, in the appropriate column, “e” might become “edible.” Your deliverable is the R code to perform these transformation tasks.

library(RCurl)
library(tidyverse)

# Download the raw mushroom data as a single string, then parse that string
# as comma-separated text. header = FALSE tells read.csv() the first line is
# data, not column names.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom <- getURL(url)
mushroom_df <- read.csv(
  text = mushroom,
  header = FALSE,           # spelled out: T/F are plain variables and can be reassigned
  sep = ",",
  stringsAsFactors = FALSE  # keep the one-letter codes as character, not factor
)

Create mushroom_names to hold the new column names, then assign those names to mushroom_df using the colnames function.

# Descriptive names for the 23 columns, listed in the order the columns
# appear in the data frame and grouped here by attribute family.
mushroom_names <- c(
  "class",
  "cap-shape", "cap-surface", "cap-color",
  "bruises?", "odor",
  "gill-attachment", "gill-spacing", "gill-size", "gill-color",
  "stalk-shape", "stalk-root",
  "stalk-surface-above-ring", "stalk-surface-below-ring",
  "stalk-color-above-ring", "stalk-color-below-ring",
  "veil-type", "veil-color",
  "ring-number", "ring-type",
  "spore-print-color", "population", "habitat"
)

# Replace the existing column names with the descriptive ones.
colnames(mushroom_df) <- mushroom_names

Run a for-loop to rename class values:

# Recode the one-letter class codes to full words. Rather than visiting each
# row and tracking its position with a hand-maintained counter, loop over the
# code/label pairs and replace every matching row in one vectorized assignment.
class_labels <- c(p = "poisonous", e = "edible")
for (code in names(class_labels)) {
  # %in% never yields NA, so missing values are skipped instead of raising an
  # error; values with no listed code are left unchanged.
  mushroom_df$class[mushroom_df$class %in% code] <- class_labels[[code]]
}

Run a for-loop to rename population values:

# Recode the one-letter population codes to full words. Rather than visiting
# each row and tracking its position with a hand-maintained counter, loop over
# the code/label pairs and replace every matching row in one vectorized
# assignment.
population_labels <- c(
  a = "abundant",
  c = "clustered",
  n = "numerous",
  s = "scattered",
  v = "several",
  y = "solitary"
)
for (code in names(population_labels)) {
  # %in% never yields NA, so missing values are skipped instead of raising an
  # error; values with no listed code are left unchanged.
  mushroom_df$population[mushroom_df$population %in% code] <-
    population_labels[[code]]
}

Run a for-loop to rename habitat values:

# Recode the one-letter habitat codes to full words. Rather than visiting each
# row and tracking its position with a hand-maintained counter, loop over the
# code/label pairs and replace every matching row in one vectorized assignment.
habitat_labels <- c(
  g = "grasses",
  l = "leaves",
  m = "meadows",
  p = "paths",
  u = "urban",
  w = "waste",
  d = "woods"
)
for (code in names(habitat_labels)) {
  # %in% never yields NA, so missing values are skipped instead of raising an
  # error; values with no listed code are left unchanged.
  mushroom_df$habitat[mushroom_df$habitat %in% code] <- habitat_labels[[code]]
}

Run a for-loop to rename gill-size values:

# Recode the one-letter gill-size codes to full words. Rather than visiting
# each row and tracking its position with a hand-maintained counter, loop over
# the code/label pairs and replace every matching row in one vectorized
# assignment.
gill_size_labels <- c(b = "broad", n = "narrow")
for (code in names(gill_size_labels)) {
  # %in% never yields NA, so missing values are skipped instead of raising an
  # error; values with no listed code are left unchanged.
  mushroom_df$`gill-size`[mushroom_df$`gill-size` %in% code] <-
    gill_size_labels[[code]]
}

Create a subset of the data, grouped by class, using the %>% operator.

# Keep the class column plus three descriptive columns, grouped by class.
mushroom_subset <- mushroom_df %>%
  select(class, population, habitat, `gill-size`) %>%
  group_by(class)

# Preview the first rows of the subset.
mushroom_subset %>%
  head()

## # A tibble: 6 x 4
## # Groups: class [2]
##   class     population habitat `gill-size`
##   <chr>     <chr>      <chr>   <chr>      
## 1 poisonous scattered  urban   narrow     
## 2 edible    numerous   grasses broad      
## 3 edible    numerous   meadows broad      
## 4 poisonous scattered  urban   narrow     
## 5 edible    abundant   grasses broad      
## 6 edible    numerous   grasses broad

The total number of edible vs. poisonous mushrooms in our dataset:

# Count the rows that fall into each class.
mushroom_subset %>%
  group_by(class) %>%
  summarise(count = n())

## # A tibble: 2 x 2
##   class     count
##   <chr>     <int>
## 1 edible     4208
## 2 poisonous  3916

Total mushrooms per habitat:

# Count the rows that fall into each habitat.
mushroom_subset %>%
  group_by(habitat) %>%
  summarise(count = n())

## # A tibble: 7 x 2
##   habitat count
##   <chr>   <int>
## 1 grasses  2148
## 2 leaves    832
## 3 meadows   292
## 4 paths    1144
## 5 urban     368
## 6 waste     192
## 7 woods    3148

Create a graph with ggplot on Edible vs. Poisonous Mushrooms, based on Habitat

# Stacked bar chart of mushroom counts per habitat, with each bar split by
# class. Columns are referenced by bare name inside aes() so ggplot2 looks
# them up in the data passed to ggplot(); writing `mushroom_subset$habitat`
# bypasses that lookup and ggplot2 warns against it.
ggplot(mushroom_subset, aes(x = habitat)) +
  geom_bar(aes(fill = class)) +
  ggtitle("Mushrooms - Edible vs. Poisonous based on Habitat") +
  xlab("Mushroom Habitat") +
  ylab("Count") +
  labs(fill = "Class") +
  # Label each bar with its total. fill is mapped only in geom_bar(), so the
  # count computed here is per habitat rather than per class.
  # NOTE(review): ggplot2 >= 3.4 deprecates `..count..` in favour of
  # after_stat(count); switch once the installed version is confirmed.
  geom_text(
    stat = "count",
    aes(label = ..count.., y = ..count..),
    vjust = -0.5
  ) +
  theme_dark()

Questions

Modified the code below from Niteen in the data606 Slack channel - changed the dataset to work with this assignment and used the %>% operator. It seems to work the same as the for loop with less code. My question is about performance compared with the for loop: in real life, which would work better?

It also seems you have to assign the result to a variable, or else the entire data frame will be printed.

# Vectorized recode of gill-size with case_when(). Without a fallback,
# case_when() returns NA for any value that matches no condition, so rows
# that are already spelled out (or carry an unexpected code) would be wiped.
# The final TRUE branch keeps such values as they are, which is what the
# for-loop version does for values it does not recognise.
mushroom_df1 <- mushroom_df %>%
  mutate(
    `gill-size` = case_when(
      `gill-size` == "b" ~ "broad",
      `gill-size` == "n" ~ "narrow",
      TRUE ~ `gill-size`
    )
  )

CUNY MSDS Data 607 - HW 1

Nicholas Schettini

January 29, 2018