As part of Assignment 1, we will be studying the “Mushroom dataset”, available at https://archive.ics.uci.edu/ml/datasets/Mushroom.
# Download the UCI mushroom data. The file has no header row, so the columns
# receive R's default names V1..V23.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads strings as
# character by default, whereas the str() summary shown further down lists
# every column as a factor.
URL <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
MushroomsData <- read.table(
  URL,
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)
The ‘head’ command gives a quick look at the first few rows of the data without crowding your console.
# Preview the first six rows to check that the file was parsed as expected.
head(MushroomsData, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Compact per-column summary: dimensions, column types and leading values.
str(object = MushroomsData)
## 'data.frame': 8124 obs. of 23 variables:
## $ V1 : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ V2 : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ V3 : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
## $ V4 : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ V5 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ V6 : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ V7 : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ V8 : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ V9 : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ V10: Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ V11: Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ V12: Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ V13: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ V14: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ V15: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ V16: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ V17: Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ V18: Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ V19: Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ V20: Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ V21: Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ V22: Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ V23: Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# Make sure we are holding a plain data frame, then keep only the first four
# columns (V1 to V4), which are named as the class label and the three cap
# attributes further down.
MushroomsData <- data.frame(MushroomsData)
subdata <- MushroomsData[, c("V1", "V2", "V3", "V4"), drop = FALSE]
head(subdata, n = 6L)
## V1 V2 V3 V4
## 1 p x s n
## 2 e x s y
## 3 e b s w
## 4 p x y w
## 5 e x s g
## 6 e x y y
# Give the four columns descriptive names. They are non-syntactic (they
# contain "/" and "-"), so they need backticks when accessed with `$`.
colnames(subdata) <- c(
  "Edible/Poisonous",
  "cap-shape",
  "cap-surface",
  "cap-color"
)
head(subdata, n = 6L)
## Edible/Poisonous cap-shape cap-surface cap-color
## 1 p x s n
## 2 e x s y
## 3 e b s w
## 4 p x y w
## 5 e x s g
## 6 e x y y
C.1) The ‘table’ command comes in handy whenever we need the total count of each value of an attribute in the dataset.
# Class balance: edible (e) vs poisonous (p).
table(subdata[["Edible/Poisonous"]])
##
## e p
## 4208 3916
# Frequency of each cap-shape code.
table(subdata[["cap-shape"]])
##
## b c f k s x
## 452 4 3152 828 32 3656
# Frequency of each cap-surface code.
table(subdata[["cap-surface"]])
##
## f g s y
## 2320 4 2556 3244
# Frequency of each cap-color code.
table(subdata[["cap-color"]])
##
## b c e g n p r u w y
## 168 44 1500 1840 2284 144 16 16 1040 1072
C.2) “gsub” can be used to expand the abbreviations, but we won't be using it on the other columns, as ‘gsub’ replaces every occurrence of a letter, including letters inside words that have already been expanded, and can create unnecessary duplication of words.
# Expand the class codes to full words. The patterns are anchored (^...$) so
# that only a value that is exactly "e" or "p" is rewritten; an unanchored
# single letter would also match inside words that were already expanded.
subdata$`Edible/Poisonous` <- gsub(
  "^e$", "edible", subdata$`Edible/Poisonous`
)
subdata$`Edible/Poisonous` <- gsub(
  "^p$", "poisonous", subdata$`Edible/Poisonous`
)
# data.frame() sanitises the non-syntactic column names ("/" and "-" become
# "."), giving Edible.Poisonous, cap.shape, cap.surface and cap.color. The
# code that follows accesses the columns by these dotted names.
subdata1 <- data.frame(subdata, check.names = TRUE)
head(subdata1)
## Edible.Poisonous cap.shape cap.surface cap.color
## 1 poisonous x s n
## 2 edible x s y
## 3 edible b s w
## 4 poisonous x y w
## 5 edible x s g
## 6 edible x y y
C.3) Given below is another, neater and cleaner, way to replace the abbreviations.
# Lookup tables mapping each single-letter code to its descriptive label,
# one table per cap attribute.
cap_labels <- list(
  cap.shape = c(
    x = "convex", b = "bell", c = "conical",
    f = "flat", k = "knobbed", s = "sunken"
  ),
  cap.surface = c(
    f = "fibrous", g = "grooves", y = "scaly", s = "smooth"
  ),
  cap.color = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  )
)

# Replace every value of `codes` that appears among names(labels) with the
# corresponding label. The input is coerced to character first; values with
# no entry in `labels` (including NA) are returned unchanged.
expand_codes <- function(codes, labels) {
  codes <- as.character(codes)
  known <- codes %in% names(labels)
  codes[known] <- unname(labels[codes[known]])
  codes
}

subdata1$cap.shape <- expand_codes(subdata1$cap.shape, cap_labels$cap.shape)
subdata1$cap.surface <- expand_codes(
  subdata1$cap.surface, cap_labels$cap.surface
)
subdata1$cap.color <- expand_codes(subdata1$cap.color, cap_labels$cap.color)
head(subdata1)
## Edible.Poisonous cap.shape cap.surface cap.color
## 1 poisonous convex smooth brown
## 2 edible convex smooth yellow
## 3 edible bell smooth white
## 4 poisonous convex scaly white
## 5 edible convex smooth gray
## 6 edible convex scaly yellow
This is the final dataset required by the task.