At first glance, the raw data set is hard to interpret, because every attribute is stored as a single-letter code.
# Download the UCI mushroom data set. The file has no header row, so R
# assigns the default column names V1..V23.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
originalMdf <- read.csv(
  url,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE,
  # Every field is a letter code; read all columns as text so that none of
  # them can be auto-converted to another type.
  colClasses = "character"
)
# Preview the first 10 raw rows.
head(originalMdf, 10)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## 7 e b s w t a f c b g e c s s w w p w o p
## 8 e b y w t l f c b n e c s s w w p w o p
## 9 p x y w t p f c n p e e s s w w p w o p
## 10 e b s y t a f c b g e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
## 7 k n m
## 8 n s m
## 9 k v g
## 10 k s m
For reference, the columns are given descriptive names and the single-letter codes are replaced with readable labels below.
# Work on a copy so the raw download in originalMdf stays untouched.
Mdf <- originalMdf

# Descriptive column names, in the same order as the 23 raw columns.
colnames(Mdf) <- c(
  "Class", "C_shape", "C_surface", "C_color", "Bruises", "Odor",
  "G_Attachment", "G_spacing", "G_size", "G_color", "S_shape", "S_root",
  "S_surfaceAR", "S_surfaceBR", "S_colorAR", "S_colorBR", "V_type",
  "V_color", "R_number", "R_type", "SporeCol", "Population", "Habitat"
)

# The stalk surface / stalk colour columns above and below the ring use the
# same code tables, so each table is defined once and reused.
stalk_surface_labels <- c(
  f = "Fibrous", y = "Scaly", k = "Silky", s = "Smooth"
)
# 'b' is Buff and 'c' is Cinnamon (not Brown), matching the cap colour codes.
stalk_color_labels <- c(
  n = "Brown", b = "Buff", c = "Cinnamon", g = "Gray", o = "Orange",
  p = "Pink", e = "Red", w = "White", y = "Yellow"
)

# Code -> label lookup for every column (vector names are the raw codes).
value_labels <- list(
  Class = c(p = "Poisonous", e = "Edible"),
  C_shape = c(
    b = "Bell", c = "Conical", x = "Convex", f = "Flat", k = "Knobbed",
    s = "Sunken"
  ),
  C_surface = c(f = "Fibrous", g = "Grooves", y = "Scaly", s = "Smooth"),
  C_color = c(
    n = "Brown", b = "Buff", c = "Cinnamon", g = "Gray", r = "Green",
    p = "Pink", u = "Purple", e = "Red", w = "White", y = "Yellow"
  ),
  Bruises = c(t = "Yes", f = "No"),
  Odor = c(
    a = "Almond", l = "Anise", c = "Creosote", y = "Fishy", f = "Foul",
    m = "Musty", n = "None", p = "Pungent", s = "Spicy"
  ),
  G_Attachment = c(
    a = "Attached", d = "Descending", f = "Free", n = "Notched"
  ),
  G_spacing = c(c = "Closed", w = "Crowded", d = "Distant"),
  G_size = c(b = "Broad", n = "Narrow"),
  G_color = c(
    k = "Black", n = "Brown", b = "Buff", h = "Chocolate", g = "Gray",
    r = "Green", o = "Orange", p = "Pink", u = "Purple", e = "Red",
    w = "White", y = "Yellow"
  ),
  S_shape = c(e = "Enlarging", t = "Tapering"),
  S_root = c(
    b = "Bulbous", c = "Club", u = "Cup", e = "Equal",
    z = "Rhizomorphs", r = "Rooted"
  ),
  S_surfaceAR = stalk_surface_labels,
  S_surfaceBR = stalk_surface_labels,
  S_colorAR = stalk_color_labels,
  S_colorBR = stalk_color_labels,
  V_type = c(p = "Partial", u = "Universal"),
  V_color = c(n = "Brown", o = "Orange", w = "White", y = "Yellow"),
  R_number = c(n = "None", o = "One", t = "Two"),
  R_type = c(
    c = "Cobwebby", e = "Evanescent", f = "Flaring", l = "Large",
    n = "None", p = "Pendant", s = "Sheathing", z = "Zone"
  ),
  SporeCol = c(
    k = "Black", n = "Brown", b = "Buff", h = "Chocolate", r = "Green",
    o = "Orange", u = "Purple", w = "White", y = "Yellow"
  ),
  Population = c(
    a = "Abundant", c = "Clustered", n = "Numerous", s = "Scattered",
    v = "Several", y = "Solitary"
  ),
  Habitat = c(
    g = "Grasses", l = "Leaves", m = "Meadows", p = "Paths", u = "Urban",
    w = "Waste", d = "Woods"
  )
)

# Replace each raw code with its label. Any value that has no entry in the
# lookup for its column is left unchanged.
for (column in names(value_labels)) {
  lookup <- value_labels[[column]]
  raw <- Mdf[[column]]
  known <- raw %in% names(lookup)
  Mdf[[column]][known] <- unname(lookup[raw[known]])
}

# Preview the relabelled data.
head(Mdf)
## Class C_shape C_surface C_color Bruises Odor G_Attachment
## 1 Poisonous Convex Smooth Brown Yes Pungent Free
## 2 Edible Convex Smooth Yellow Yes Almond Free
## 3 Edible Bell Smooth White Yes Anise Free
## 4 Poisonous Convex Scaly White Yes Pungent Free
## 5 Edible Convex Smooth Gray No None Free
## 6 Edible Convex Scaly Yellow Yes Almond Free
## G_spacing G_size G_color S_shape S_root S_surfaceAR S_surfaceBR
## 1 Closed Narrow Black Enlarging Equal Smooth Smooth
## 2 Closed Broad Black Enlarging Club Smooth Smooth
## 3 Closed Broad Brown Enlarging Club Smooth Smooth
## 4 Closed Narrow Brown Enlarging Equal Smooth Smooth
## 5 Crowded Broad Black Tapering Equal Smooth Smooth
## 6 Closed Broad Brown Enlarging Club Smooth Smooth
## S_colorAR S_colorBR V_type V_color R_number R_type SporeCol
## 1 White White Partial White One Pendant Black
## 2 White White Partial White One Pendant Brown
## 3 White White Partial White One Pendant Brown
## 4 White White Partial White One Pendant Black
## 5 White White Partial White One Evanescent Brown
## 6 White White Partial White One Pendant Black
## Population Habitat
## 1 Scattered Urban
## 2 Numerous Grasses
## 3 Numerous Meadows
## 4 Scattered Urban
## 5 Abundant Grasses
## 6 Numerous Grasses
library(plyr)
library(dplyr)
# Keep only the class label, the three cap attributes and the habitat.
keep_cols <- c("Class", "C_shape", "C_surface", "C_color", "Habitat")
M2df <- Mdf[, keep_cols]
# Preview the first 25 rows of the reduced data frame.
head(M2df, n = 25)
## Class C_shape C_surface C_color Habitat
## 1 Poisonous Convex Smooth Brown Urban
## 2 Edible Convex Smooth Yellow Grasses
## 3 Edible Bell Smooth White Meadows
## 4 Poisonous Convex Scaly White Urban
## 5 Edible Convex Smooth Gray Grasses
## 6 Edible Convex Scaly Yellow Grasses
## 7 Edible Bell Smooth White Meadows
## 8 Edible Bell Scaly White Meadows
## 9 Poisonous Convex Scaly White Grasses
## 10 Edible Bell Smooth Yellow Meadows
## 11 Edible Convex Scaly Yellow Grasses
## 12 Edible Convex Scaly Yellow Meadows
## 13 Edible Bell Smooth Yellow Grasses
## 14 Poisonous Convex Scaly White Urban
## 15 Edible Convex Fibrous Brown Grasses
## 16 Edible Sunken Fibrous Gray Urban
## 17 Edible Flat Fibrous White Grasses
## 18 Poisonous Convex Smooth Brown Grasses
## 19 Poisonous Convex Scaly White Urban
## 20 Poisonous Convex Smooth Brown Urban
## 21 Edible Bell Smooth Yellow Meadows
## 22 Poisonous Convex Scaly Brown Grasses
## 23 Edible Bell Scaly Yellow Meadows
## 24 Edible Bell Scaly White Meadows
## 25 Edible Bell Smooth White Meadows
# Totals for the values in each column.
# Helper: count how often each distinct value occurs in a column.
level_counts <- function(values) {
  summary(factor(values))
}
level_counts(M2df$Class)
## Edible Poisonous
## 4208 3916
level_counts(M2df$C_shape)
## Bell Conical Convex Flat Knobbed Sunken
## 452 4 3656 3152 828 32
level_counts(M2df$C_surface)
## Fibrous Grooves Scaly Smooth
## 2320 4 3244 2556
level_counts(M2df$C_color)
## Brown Buff Cinnamon Gray Green Pink Purple Red
## 2284 168 44 1840 16 144 16 1500
## White Yellow
## 1040 1072
level_counts(M2df$Habitat)
## Grasses Leaves Meadows Paths Urban Waste Woods
## 2148 832 292 1144 368 192 3148
# Rows of M2df whose Class is "Poisonous", followed by a 20-row preview.
poison <- filter(M2df, Class == "Poisonous")
head(poison, n = 20)
## Class C_shape C_surface C_color Habitat
## 1 Poisonous Convex Smooth Brown Urban
## 2 Poisonous Convex Scaly White Urban
## 3 Poisonous Convex Scaly White Grasses
## 4 Poisonous Convex Scaly White Urban
## 5 Poisonous Convex Smooth Brown Grasses
## 6 Poisonous Convex Scaly White Urban
## 7 Poisonous Convex Smooth Brown Urban
## 8 Poisonous Convex Scaly Brown Grasses
## 9 Poisonous Flat Smooth White Grasses
## 10 Poisonous Convex Scaly White Urban
## 11 Poisonous Convex Scaly Brown Urban
## 12 Poisonous Convex Scaly White Grasses
## 13 Poisonous Convex Scaly Brown Urban
## 14 Poisonous Convex Smooth White Urban
## 15 Poisonous Convex Scaly Brown Urban
## 16 Poisonous Convex Scaly White Grasses
## 17 Poisonous Convex Scaly White Urban
## 18 Poisonous Convex Smooth White Grasses
## 19 Poisonous Flat Scaly Brown Grasses
## 20 Poisonous Convex Scaly White Urban
# Number of poisonous samples per habitat.
table(poison[["Habitat"]])
##
## Grasses Leaves Meadows Paths Urban Woods
## 740 592 36 1008 272 1268
Poisonous mushrooms are found most often in the Woods (1268 of the 3916 poisonous samples).
# Rows of M2df whose Class is "Edible", followed by a 20-row preview.
edible <- filter(M2df, Class == "Edible")
head(edible, n = 20)
## Class C_shape C_surface C_color Habitat
## 1 Edible Convex Smooth Yellow Grasses
## 2 Edible Bell Smooth White Meadows
## 3 Edible Convex Smooth Gray Grasses
## 4 Edible Convex Scaly Yellow Grasses
## 5 Edible Bell Smooth White Meadows
## 6 Edible Bell Scaly White Meadows
## 7 Edible Bell Smooth Yellow Meadows
## 8 Edible Convex Scaly Yellow Grasses
## 9 Edible Convex Scaly Yellow Meadows
## 10 Edible Bell Smooth Yellow Grasses
## 11 Edible Convex Fibrous Brown Grasses
## 12 Edible Sunken Fibrous Gray Urban
## 13 Edible Flat Fibrous White Grasses
## 14 Edible Bell Smooth Yellow Meadows
## 15 Edible Bell Scaly Yellow Meadows
## 16 Edible Bell Scaly White Meadows
## 17 Edible Bell Smooth White Meadows
## 18 Edible Convex Scaly Yellow Meadows
## 19 Edible Convex Scaly White Meadows
## 20 Edible Flat Fibrous Brown Urban
# Number of edible samples per cap surface type.
table(edible[["C_surface"]])
##
## Fibrous Scaly Smooth
## 1560 1504 1144
Mushrooms with a fibrous cap surface are mostly edible (1560 of the 2320 fibrous-capped samples).