Load the data into R
# Read the comma-separated mushroom data file; the file has no header row,
# so read.table() assigns the default column names V1, V2, ...
dat <- read.table(file = "agaricus-lepiota.data", sep = ",")
# Preview the first six rows.
head(x = dat, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Compare the dataset to the dataset description
# Inspect the structure of the loaded data frame. The object created by
# read.table() is `dat`; `data` is the base R function utils::data(), so
# str(data) printed that function's signature instead of the dataset.
str(dat)
## 'data.frame': 8124 obs. of 23 variables:
## (per-column listing omitted; re-knit the document to regenerate it)
The data set contains 23 columns. The first column states whether the mushroom is poisonous (p) or edible (e); the next 22 columns are the attributes listed in the data description. The data set also has 8124 observations, as indicated in the data description file.
Choose the columns to work on and name them
# Keep columns 1, 5, 11, 17 and 19 of the raw data and give them readable
# names in a single step (subsetting a data frame already yields a data frame).
mushroom_dataset <- setNames(
  dat[c(1, 5, 11, 17, 19)],
  c("Class", "bruises?", "stalk-shape", "veil-type", "ring-number")
)
# Show the structure of the reduced data set.
str(object = mushroom_dataset)
## 'data.frame': 8124 obs. of 5 variables:
## $ Class : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ bruises? : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ stalk-shape: Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ veil-type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ ring-number: Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
These are all factors; we will convert them to character so they are easier to work with.
# Convert every column (Class, bruises?, stalk-shape, veil-type, ring-number)
# to character. Assigning into `[]` keeps the data frame and its column names.
mushroom_dataset[] <- lapply(mushroom_dataset, as.character)
Recode the values in each column so they are easier to read, then convert the columns back to easier-to-use data types.
# Recode the class labels: "e" becomes "edible", any other value becomes
# "poisonous", and the result is stored as a factor.
# Vectorised with ifelse() so the recode covers every row of the column,
# whatever the row count, instead of looping over a hard-coded 1:8124.
mushroom_dataset$Class <- as.factor(
  ifelse(mushroom_dataset$Class == "e", "edible", "poisonous")
)
# Recode the bruises flag: "t" becomes "TRUE", any other value becomes
# "FALSE". The labels are kept as strings and stored as a factor, matching
# the levels "FALSE"/"TRUE" shown in the structure output.
# Vectorised with ifelse() instead of looping over a hard-coded 1:8124.
mushroom_dataset$`bruises?` <- as.factor(
  ifelse(mushroom_dataset$`bruises?` == "t", "TRUE", "FALSE")
)
# Recode the stalk shape: "e" becomes "enlarging", any other value becomes
# "tapering", and the result is stored as a factor.
# Vectorised with ifelse() instead of looping over a hard-coded 1:8124.
mushroom_dataset$`stalk-shape` <- as.factor(
  ifelse(mushroom_dataset$`stalk-shape` == "e", "enlarging", "tapering")
)
# Recode the veil type: "p" becomes "partial", any other value becomes
# "universal", and the result is stored as a factor.
# Fix: the previous else-branch assigned "universal" to `stalk-shape` rather
# than `veil-type`, which would have corrupted the stalk-shape column and
# left the veil-type value unrecoded for any non-"p" row.
# Vectorised with ifelse() instead of looping over a hard-coded 1:8124.
mushroom_dataset$`veil-type` <- as.factor(
  ifelse(mushroom_dataset$`veil-type` == "p", "partial", "universal")
)
# Recode the ring count as a number: "n" becomes 0, "o" becomes 1, and any
# other value becomes 2 (the same catch-all as the original else-branch).
# Vectorised with nested ifelse() instead of looping over a hard-coded
# 1:8124 and round-tripping the digits through strings; as.numeric() keeps
# the column type numeric even for an empty column.
mushroom_dataset$`ring-number` <- as.numeric(
  ifelse(
    mushroom_dataset$`ring-number` == "n",
    0,
    ifelse(mushroom_dataset$`ring-number` == "o", 1, 2)
  )
)
Check the new dataset
# Show the column types and levels of the recoded data set.
str(object = mushroom_dataset)
## 'data.frame': 8124 obs. of 5 variables:
## $ Class : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ bruises? : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 1 2 2 2 2 2 ...
## $ stalk-shape: Factor w/ 2 levels "enlarging","tapering": 1 1 1 1 2 1 1 1 1 1 ...
## $ veil-type : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
## $ ring-number: num 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first six rows of the recoded data set.
head(x = mushroom_dataset, n = 6L)
## Class bruises? stalk-shape veil-type ring-number
## 1 poisonous TRUE enlarging partial 1
## 2 edible TRUE enlarging partial 1
## 3 edible TRUE enlarging partial 1
## 4 poisonous TRUE enlarging partial 1
## 5 edible FALSE tapering partial 1
## 6 edible TRUE enlarging partial 1