# Names for the 23 fields of the mushroom data, applied positionally as
# col.names when the file is read.
columns <- c(
  # class label and cap attributes
  "Edible", "cap_shape", "cap_surface", "cap_color",
  # bruising and odor
  "Bruises", "Odor",
  # gill attributes
  "gill_attach", "gill_spacing", "gill_size", "Gill_Color",
  # stalk attributes
  "stalk_shape", "stalk_root",
  "stalk_surface_above", "stalk_surface_below",
  "stalk_color_above", "stalk_color_below",
  # veil and ring attributes
  "veil_type", "veil_color", "ring_number", "ring_type",
  # spore print, population and habitat
  "spore_print_color", "population", "habitat"
)
Reading the dataset into R:
# Download the raw UCI mushroom data as a single text string.
url <- getURL("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
# Parse the downloaded text into a data frame.
# - header = FALSE: the file has no header row, so the first record must be
#   kept as data instead of being consumed as column names.
# - stringsAsFactors = TRUE: the level renaming and factor summaries in this
#   analysis need factor columns; since R 4.0.0 read.csv() would otherwise
#   return character columns.
# NOTE: recorded output produced without header = FALSE covers 8123 rows,
# one fewer than the complete file.
mushr <- read.csv(
  text = url,
  header = FALSE,
  col.names = columns,
  stringsAsFactors = TRUE
)
Subsetting the dataset to include 4 variables:
# Keep only columns 1, 5, 6 and 10 (Edible, Bruises, Odor, Gill_Color).
keep_cols <- c(1, 5, 6, 10)
mushrSub <- mushr[, keep_cols]
# Preview the first five rows of the reduced data frame.
head(mushrSub, n = 5)
## Edible Bruises Odor Gill_Color
## 1 e t a k
## 2 e t l n
## 3 p t p n
## 4 e f n k
## 5 e t a n
Displaying the order of the factor levels (and their counts) before renaming them:
# Per-level counts for each factor; the column order of the output is the
# order of the factor levels.
summary(mushr[["Edible"]])
## e p
## 4208 3915
summary(mushr[["Gill_Color"]])
## b e g h k n o p r u w y
## 1728 96 752 732 407 1048 64 1492 24 492 1202 86
summary(mushr[["Odor"]])
## a c f l m n p s y
## 400 192 2160 400 36 3528 255 576 576
summary(mushr[["Bruises"]])
## f t
## 4748 3375
Renaming the factor levels:
# Replace the single-letter codes with descriptive labels.
# Each code is mapped to its label explicitly (name = code, value = label)
# rather than by position, so the result does not depend on the current
# level order and also works if a column was read as character.
level_labels <- list(
  Edible = c(e = "edible", p = "poisonous"),
  Bruises = c(f = "no bruise", t = "bruised"),
  Gill_Color = c(
    b = "buff", e = "red", g = "gray", h = "chocolate", k = "black",
    n = "brown", o = "orange", p = "pink", r = "green", u = "purple",
    w = "white", y = "yellow"
  ),
  Odor = c(
    a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
    n = "none", p = "pungent", s = "spicy", y = "fishy"
  )
)
for (col_name in names(level_labels)) {
  code_map <- level_labels[[col_name]]
  # A code that is not in the mapping becomes NA instead of being mislabelled.
  mushrSub[[col_name]] <- factor(
    mushrSub[[col_name]],
    levels = names(code_map),
    labels = unname(code_map)
  )
}
Analyzing the data with the dplyr package to show how effective each variable is at determining whether a mushroom is poisonous:
# For each odor: the share of poisonous mushrooms and the group size.
group_by(mushrSub, Odor) %>%
  summarise(
    Toxicity_Rate_by_Odor = sum(Edible == "poisonous") / n(),
    Number_of_Mushrooms = n()
  )
## # A tibble: 9 x 3
## Odor Toxicity_Rate_by_Odor Number_of_Mushrooms
## <fctr> <dbl> <int>
## 1 almond 0.00000000 400
## 2 creosote 1.00000000 192
## 3 foul 1.00000000 2160
## 4 anise 0.00000000 400
## 5 musty 1.00000000 36
## 6 none 0.03401361 3528
## 7 pungent 1.00000000 255
## 8 spicy 1.00000000 576
## 9 fishy 1.00000000 576
# According to the dataset, a mushroom with no odor is poisonous with a probability of about 3.4%.
# For each bruising category: the share of poisonous mushrooms and the
# group size.
group_by(mushrSub, Bruises) %>%
  summarise(
    Toxicity_Rate_By_Bruises = sum(Edible == "poisonous") / n(),
    Number_of_Mushrooms = n()
  )
## # A tibble: 2 x 3
## Bruises Toxicity_Rate_By_Bruises Number_of_Mushrooms
## <fctr> <dbl> <int>
## 1 no bruise 0.6933446 4748
## 2 bruised 0.1845926 3375
# For each gill color: the share of poisonous mushrooms and the group size.
group_by(mushrSub, Gill_Color) %>%
  summarise(
    Toxicity_Rate_By_Gill_Color = sum(Edible == "poisonous") / n(),
    Number_of_Mushrooms = n()
  )
## # A tibble: 12 x 3
## Gill_Color Toxicity_Rate_By_Gill_Color Number_of_Mushrooms
## <fctr> <dbl> <int>
## 1 buff 1.00000000 1728
## 2 red 0.00000000 96
## 3 gray 0.67021277 752
## 4 chocolate 0.72131148 732
## 5 black 0.15479115 407
## 6 brown 0.10687023 1048
## 7 orange 0.00000000 64
## 8 pink 0.42895442 1492
## 9 green 1.00000000 24
## 10 purple 0.09756098 492
## 11 white 0.20465890 1202
## 12 yellow 0.25581395 86
# According to the dataset, a mushroom with a buff gill color is poisonous with a probability of 100%.