- Load the selected dataset into R.
# Original data location: "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Personal Git for this assignment: "https://raw.githubusercontent.com/ChadRyanBailey/607-Week1-Assignment/master/Mushroom%20Data.csv"
fileLocation <- "https://raw.githubusercontent.com/ChadRyanBailey/607-Week1-Assignment/master/Mushroom%20Data.csv"
# The file is comma-separated with no header row; column names are assigned
# separately after loading.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0. Without it, newer R versions read every column as
# character and summary() no longer reports per-level counts.
mushrooms <- read.table(
  file = fileLocation,
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)
- Add meaningful column names (using provided data dictionary).
# Label the 23 unnamed columns, in file order, grouped here by the part of the
# mushroom each attribute describes.
colnames(mushrooms) <- c(
  # class label
  "edible-or-poisonous",
  # cap
  "cap-shape", "cap-surface", "cap-color",
  # general
  "bruises?", "odor",
  # gills
  "gill-attachment", "gill-spacing", "gill-size", "gill-color",
  # stalk
  "stalk-shape", "stalk-root",
  "stalk-surface-above-ring", "stalk-surface-below-ring",
  "stalk-color-above-ring", "stalk-color-below-ring",
  # veil and ring
  "veil-type", "veil-color", "ring-number", "ring-type",
  # spores and surroundings
  "spore-print-color", "population", "habitat"
)
- Review the first few rows of the data and a summary of the data.
# Preview the first six rows to check that the new column names line up with
# the single-letter codes in each column.
head(mushrooms)
## edible-or-poisonous cap-shape cap-surface cap-color bruises? odor
## 1 p x s n t p
## 2 e x s y t a
## 3 e b s w t l
## 4 p x y w t p
## 5 e x s g f n
## 6 e x y y t a
## gill-attachment gill-spacing gill-size gill-color stalk-shape stalk-root
## 1 f c n k e e
## 2 f c b k e c
## 3 f c b n e c
## 4 f c n n e e
## 5 f w b k t e
## 6 f c b n e c
## stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore-print-color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Tabulate the level counts of every column. Things to look for in the output:
# stalk-root has a "?" level (presumably the dataset's missing-value marker;
# confirm against the data dictionary) and veil-type has only a single level.
summary(mushrooms)
## edible-or-poisonous cap-shape cap-surface cap-color bruises?
## e:4208 b: 452 f:2320 n :2284 f:4748
## p:3916 c: 4 g: 4 g :1840 t:3376
## f:3152 s:2556 e :1500
## k: 828 y:3244 y :1072
## s: 32 w :1040
## x:3656 b : 168
## (Other): 220
## odor gill-attachment gill-spacing gill-size gill-color
## n :3528 a: 210 c:6812 b:5612 b :1728
## f :2160 f:7914 w:1312 n:2512 p :1492
## s : 576 w :1202
## y : 576 n :1048
## a : 400 g : 752
## l : 400 h : 732
## (Other): 484 (Other):1170
## stalk-shape stalk-root stalk-surface-above-ring stalk-surface-below-ring
## e:3516 ?:2480 f: 552 f: 600
## t:4608 b:3776 k:2372 k:2304
## c: 556 s:5176 s:4936
## e:1120 y: 24 y: 284
## r: 192
##
##
## stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## w :4464 w :4384 p:8124 n: 96
## p :1872 p :1872 o: 96
## g : 576 g : 576 w:7924
## n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## ring-number ring-type spore-print-color population habitat
## n: 36 e:2776 w :2388 a: 384 d:3148
## o:7488 f: 48 n :1968 c: 340 g:2148
## t: 600 l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
- Subset the data, including the column “edible-or-poisonous” and three or four other columns.
# Keep the class label together with two gill attributes, population and
# habitat; all rows are retained.
mushrooms_subset <- mushrooms[
  c("edible-or-poisonous", "gill-spacing", "gill-size", "population", "habitat")
]
# Quick look at the first rows of the reduced data frame.
head(mushrooms_subset)
## edible-or-poisonous gill-spacing gill-size population habitat
## 1 p c n s u
## 2 e c b n g
## 3 e c b n m
## 4 p c n s u
## 5 e w b a g
## 6 e c b n g
# Level counts for the five retained columns; they should match the counts
# reported for the same columns in the full-data summary.
summary(mushrooms_subset)
## edible-or-poisonous gill-spacing gill-size population habitat
## e:4208 c:6812 b:5612 a: 384 d:3148
## p:3916 w:1312 n:2512 c: 340 g:2148
## n: 400 l: 832
## s:1248 m: 292
## v:4040 p:1144
## y:1712 u: 368
## w: 192
- Replace the single-letter abbreviations used in the data with their full descriptive labels.
# Translate the single-letter codes of each column into readable labels.
#
# Every column is decoded by exact match against a named lookup vector instead
# of a chain of sub() calls. Chained regex substitutions only give the right
# answer when no later pattern can match inside a label produced by an earlier
# step, so their correctness depends on the order of the calls.

# Map each value of `codes` to its label in `lookup`, a character vector whose
# names are the codes. Values without an entry in `lookup` (including NA) are
# returned unchanged, so an unexpected code stays visible rather than mangled.
decode_codes <- function(codes, lookup) {
  # Factors must be matched by their labels, not by their integer codes.
  codes <- as.character(codes)
  labels <- unname(lookup[codes])
  unmatched <- is.na(labels)
  labels[unmatched] <- codes[unmatched]
  labels
}

eop <- decode_codes(
  mushrooms_subset[, "edible-or-poisonous"],
  c(e = "edible", p = "poisonous")
)
mushrooms_subset[, "edible-or-poisonous"] <- eop

gspacing <- decode_codes(
  mushrooms_subset[, "gill-spacing"],
  c(c = "close", w = "crowded")
)
mushrooms_subset[, "gill-spacing"] <- gspacing

gsize <- decode_codes(
  mushrooms_subset[, "gill-size"],
  c(b = "broad", n = "narrow")
)
mushrooms_subset[, "gill-size"] <- gsize

population <- decode_codes(
  mushrooms_subset[, "population"],
  c(
    a = "abundant", c = "clustered", n = "numerous",
    s = "scattered", v = "several", y = "solitary"
  )
)
mushrooms_subset[, "population"] <- population

habitat <- decode_codes(
  mushrooms_subset[, "habitat"],
  c(
    d = "woods", g = "grasses", l = "leaves", m = "meadows",
    p = "paths", u = "urban", w = "waste"
  )
)
mushrooms_subset[, "habitat"] <- habitat

# Rebuild the data frame so the character columns become factors again.
# stringsAsFactors is explicit because its default is FALSE from R 4.0.0 on,
# in which case the columns would silently stay character.
# The rebuild also turns the hyphenated column names into syntactic ones
# (e.g. edible.or.poisonous).
mushrooms_subset <- as.data.frame(
  unclass(mushrooms_subset),
  stringsAsFactors = TRUE
)
# Review the transformed data: the codes should now read as full labels, and
# the column names are dotted (e.g. edible.or.poisonous) after the rebuild.
head(mushrooms_subset)
## edible.or.poisonous gill.spacing gill.size population habitat
## 1 poisonous close narrow scattered urban
## 2 edible close broad numerous grasses
## 3 edible close broad numerous meadows
## 4 poisonous close narrow scattered urban
## 5 edible crowded broad abundant grasses
## 6 edible close broad numerous grasses
# Level counts per column; the totals for each label should equal the counts
# of the corresponding single-letter code in the subset before decoding.
summary(mushrooms_subset)
## edible.or.poisonous gill.spacing gill.size population
## edible :4208 close :6812 broad :5612 abundant : 384
## poisonous:3916 crowded:1312 narrow:2512 clustered: 340
## numerous : 400
## scattered:1248
## several :4040
## solitary :1712
##
## habitat
## grasses:2148
## leaves : 832
## meadows: 292
## paths :1144
## urban : 368
## waste : 192
## woods :3148