Loading Data into a Data Frame

Emrah Akin

August 28, 2017


# Column names for the mushroom data set, in file order (23 attributes).
columns <- c(
  "Edible",
  "cap_shape", "cap_surface", "cap_color",
  "Bruises", "Odor",
  "gill_attach", "gill_spacing", "gill_size", "Gill_Color",
  "stalk_shape", "stalk_root",
  "stalk_surface_above", "stalk_surface_below",
  "stalk_color_above", "stalk_color_below",
  "veil_type", "veil_color",
  "ring_number", "ring_type",
  "spore_print_color", "population", "habitat"
)


Reading the dataset into R:

# Download the raw UCI mushroom data as one text string.
# NOTE(review): getURL() is presumably RCurl::getURL, attached in a chunk not
# shown here -- confirm.
url <- getURL("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
# The data file has no header row. With the default header = TRUE the first
# record is consumed as a header and lost (the recorded counts add up to 8123
# rows); header = FALSE keeps every record.
# stringsAsFactors = TRUE keeps the columns as factors on R >= 4.0, which the
# summary()/levels() steps of this analysis rely on.
# Printed results recorded further down predate this fix and will shift by one
# record when the document is re-run.
mushr <- read.csv(text = url, header = FALSE, col.names = columns,
                  stringsAsFactors = TRUE)


Subsetting the dataset to include 4 variables:

# Keep only the four variables used in the analysis (columns 1, 5, 6 and 10).
mushrSub <- mushr[c("Edible", "Bruises", "Odor", "Gill_Color")]
head(mushrSub, n = 5)
##   Edible Bruises Odor Gill_Color
## 1      e       t    a          k
## 2      e       t    l          n
## 3      p       t    p          n
## 4      e       f    n          k
## 5      e       t    a          n


Displaying the order of the factor levels before renaming them:

# Counts per level; the column order of each result is the factor's level order.
summary(mushr[["Edible"]])
##    e    p 
## 4208 3915
summary(mushr[["Gill_Color"]])
##    b    e    g    h    k    n    o    p    r    u    w    y 
## 1728   96  752  732  407 1048   64 1492   24  492 1202   86
summary(mushr[["Odor"]])
##    a    c    f    l    m    n    p    s    y 
##  400  192 2160  400   36 3528  255  576  576
summary(mushr[["Bruises"]])
##    f    t 
## 4748 3375


Re-naming factor levels

# Recode single-letter codes into descriptive labels using an explicit
# code -> label mapping, so the result does not depend on the alphabetical
# order of the existing levels and works for character or factor input.
#
# x           Vector (factor or character) of single-letter codes.
# code_labels Named character vector: names are codes, values are labels.
#             The order of the names defines the order of the levels.
# Returns a factor with the descriptive labels as levels.
# Stops if x contains a non-missing code that is absent from code_labels.
relabel_levels <- function(x, code_labels) {
  codes <- as.character(x)
  stopifnot(all(is.na(codes) | codes %in% names(code_labels)))
  factor(codes, levels = names(code_labels), labels = unname(code_labels))
}

mushrSub$Edible <- relabel_levels(
  mushrSub$Edible,
  c(e = "edible", p = "poisonous")
)
mushrSub$Bruises <- relabel_levels(
  mushrSub$Bruises,
  c(f = "no bruise", t = "bruised")
)
mushrSub$Gill_Color <- relabel_levels(
  mushrSub$Gill_Color,
  c(b = "buff", e = "red", g = "gray", h = "chocolate", k = "black",
    n = "brown", o = "orange", p = "pink", r = "green", u = "purple",
    w = "white", y = "yellow")
)
mushrSub$Odor <- relabel_levels(
  mushrSub$Odor,
  c(a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
    n = "none", p = "pungent", s = "spicy", y = "fishy")
)


Analyzing the data with the dplyr package to show how well each variable predicts whether a mushroom is poisonous:

# Share of poisonous mushrooms, and group size, for each odor category.
summarise(
  group_by(mushrSub, Odor),
  Toxicity_Rate_by_Odor = sum(Edible == "poisonous") / n(),
  Number_of_Mushrooms = n()
)
## # A tibble: 9 x 3
##       Odor Toxicity_Rate_by_Odor Number_of_Mushrooms
##     <fctr>                 <dbl>               <int>
## 1   almond            0.00000000                 400
## 2 creosote            1.00000000                 192
## 3     foul            1.00000000                2160
## 4    anise            0.00000000                 400
## 5    musty            1.00000000                  36
## 6     none            0.03401361                3528
## 7  pungent            1.00000000                 255
## 8    spicy            1.00000000                 576
## 9    fishy            1.00000000                 576
# According to the dataset, a mushroom with no odor is poisonous with a probability of about 3.4%


# Share of poisonous mushrooms, and group size, for bruised vs. unbruised.
summarise(
  group_by(mushrSub, Bruises),
  Toxicity_Rate_By_Bruises = sum(Edible == "poisonous") / n(),
  Number_of_Mushrooms = n()
)
## # A tibble: 2 x 3
##     Bruises Toxicity_Rate_By_Bruises Number_of_Mushrooms
##      <fctr>                    <dbl>               <int>
## 1 no bruise                0.6933446                4748
## 2   bruised                0.1845926                3375


# Share of poisonous mushrooms, and group size, for each gill color.
summarise(
  group_by(mushrSub, Gill_Color),
  Toxicity_Rate_By_Gill_Color = sum(Edible == "poisonous") / n(),
  Number_of_Mushrooms = n()
)
## # A tibble: 12 x 3
##    Gill_Color Toxicity_Rate_By_Gill_Color Number_of_Mushrooms
##        <fctr>                       <dbl>               <int>
##  1       buff                  1.00000000                1728
##  2        red                  0.00000000                  96
##  3       gray                  0.67021277                 752
##  4  chocolate                  0.72131148                 732
##  5      black                  0.15479115                 407
##  6      brown                  0.10687023                1048
##  7     orange                  0.00000000                  64
##  8       pink                  0.42895442                1492
##  9      green                  1.00000000                  24
## 10     purple                  0.09756098                 492
## 11      white                  0.20465890                1202
## 12     yellow                  0.25581395                  86
# According to the dataset, a mushroom with the gill color buff is poisonous with a probability of 100%