library(tidyverse)
A. Read in dataset from GitHub repository and review raw data
# Raw URL of the agaricus-lepiota data file in the GitHub repository.
url <- "https://raw.githubusercontent.com/kecbenson/DATA_607_Wk1/master/agaricus-lepiota.data.txt"
# The file has no header row, so read.csv() assigns the default names V1..V23.
# stringsAsFactors = TRUE is spelled out because the factor columns shown in
# the output below were the read.csv() default only before R 4.0.0.
# as_tibble() replaces the deprecated as.tibble().
df_raw <- as_tibble(read.csv(url, header = FALSE, stringsAsFactors = TRUE))
# Print the first rows to eyeball the raw codes.
df_raw
## # A tibble: 8,124 x 23
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 p x s n t p f c n k e e
## 2 e x s y t a f c b k e c
## 3 e b s w t l f c b n e c
## 4 p x y w t p f c n n e e
## 5 e x s g f n f w b k t e
## 6 e x y y t a f c b n e c
## 7 e b s w t a f c b g e c
## 8 e b y w t l f c b n e c
## 9 p x y w t p f c n p e e
## 10 e b s y t a f c b g e c
## # ... with 8,114 more rows, and 11 more variables: V13 <fct>, V14 <fct>,
## # V15 <fct>, V16 <fct>, V17 <fct>, V18 <fct>, V19 <fct>, V20 <fct>,
## # V21 <fct>, V22 <fct>, V23 <fct>
# Per-column summary of the raw data; for factor columns this lists the
# count of each level.
df_raw %>% summary()
## V1 V2 V3 V4 V5 V6
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## V7 V8 V9 V10 V11 V12 V13
## a: 210 c:6812 b:5612 b :1728 e:3516 ?:2480 f: 552
## f:7914 w:1312 n:2512 p :1492 t:4608 b:3776 k:2372
## w :1202 c: 556 s:5176
## n :1048 e:1120 y: 24
## g : 752 r: 192
## h : 732
## (Other):1170
## V14 V15 V16 V17 V18 V19
## f: 600 w :4464 w :4384 p:8124 n: 96 n: 36
## k:2304 p :1872 p :1872 o: 96 o:7488
## s:4936 g : 576 g : 576 w:7924 t: 600
## y: 284 n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## V20 V21 V22 V23
## e:2776 w :2388 a: 384 d:3148
## f: 48 n :1968 c: 340 g:2148
## l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
B. Create dataframe with character instead of factor data
# Re-read the same file, keeping every column as character instead of factor
# so individual codes can be overwritten with free text later on.
# as_tibble() replaces the deprecated as.tibble().
df1 <- as_tibble(read.csv(url, header = FALSE, stringsAsFactors = FALSE))
df1
## # A tibble: 8,124 x 23
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x s n t p f c n k e e
## 2 e x s y t a f c b k e c
## 3 e b s w t l f c b n e c
## 4 p x y w t p f c n n e e
## 5 e x s g f n f w b k t e
## 6 e x y y t a f c b n e c
## 7 e b s w t a f c b g e c
## 8 e b y w t l f c b n e c
## 9 p x y w t p f c n p e e
## 10 e b s y t a f c b g e c
## # ... with 8,114 more rows, and 11 more variables: V13 <chr>, V14 <chr>,
## # V15 <chr>, V16 <chr>, V17 <chr>, V18 <chr>, V19 <chr>, V20 <chr>,
## # V21 <chr>, V22 <chr>, V23 <chr>
C. Select a subset of the columns in the dataset and add meaningful column names
# Keep only the six columns used in the rest of the analysis, in this order.
df2 <- dplyr::select(df1, V1, V2, V4, V6, V22, V23)
df2
## # A tibble: 8,124 x 6
## V1 V2 V4 V6 V22 V23
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x n p s u
## 2 e x y a n g
## 3 e b w l n m
## 4 p x w p s u
## 5 e x g n a g
## 6 e x y a n g
## 7 e b w a n m
## 8 e b w l s m
## 9 p x w p v g
## 10 e b y a s m
## # ... with 8,114 more rows
# Give the six retained columns descriptive names (positional assignment, so
# the order must match the column order of df2).
names(df2) <- c(
  "edible",
  "cap_shape",
  "cap_color",
  "odor",
  "population",
  "habitat"
)
df2
## # A tibble: 8,124 x 6
## edible cap_shape cap_color odor population habitat
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x n p s u
## 2 e x y a n g
## 3 e b w l n m
## 4 p x w p s u
## 5 e x g n a g
## 6 e x y a n g
## 7 e b w a n m
## 8 e b w l s m
## 9 p x w p v g
## 10 e b y a s m
## # ... with 8,114 more rows
D. Replace abbreviations in the data with descriptive text
# edible: e -> edible, p -> poisonous
# recode() leaves any value without a listed replacement unchanged.
df2$edible <- dplyr::recode(
  df2$edible,
  e = "edible",
  p = "poisonous"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$edible)
##
## edible poisonous
## 4208 3916
sum(table(df2$edible))
## [1] 8124
# cap_shape: b -> bell, c -> conical, x -> convex, f -> flat, k -> knobbed,
# s -> sunken
# recode() leaves any value without a listed replacement unchanged.
df2$cap_shape <- dplyr::recode(
  df2$cap_shape,
  b = "bell",
  c = "conical",
  x = "convex",
  f = "flat",
  k = "knobbed",
  s = "sunken"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$cap_shape)
##
## bell conical convex flat knobbed sunken
## 452 4 3656 3152 828 32
sum(table(df2$cap_shape))
## [1] 8124
# cap_color: n -> brown, b -> buff, c -> cinnamon, g -> gray, r -> green,
# p -> pink, u -> purple, e -> red, w -> white, y -> yellow
# recode() leaves any value without a listed replacement unchanged.
df2$cap_color <- dplyr::recode(
  df2$cap_color,
  n = "brown",
  b = "buff",
  c = "cinnamon",
  g = "gray",
  r = "green",
  p = "pink",
  u = "purple",
  e = "red",
  w = "white",
  y = "yellow"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$cap_color)
##
## brown buff cinnamon gray green pink purple red
## 2284 168 44 1840 16 144 16 1500
## white yellow
## 1040 1072
sum(table(df2$cap_color))
## [1] 8124
# odor: a -> almond, l -> anise, c -> creosote, y -> fishy, f -> foul,
# m -> musty, n -> none, p -> pungent, s -> spicy
# recode() leaves any value without a listed replacement unchanged.
df2$odor <- dplyr::recode(
  df2$odor,
  a = "almond",
  l = "anise",
  c = "creosote",
  y = "fishy",
  f = "foul",
  m = "musty",
  n = "none",
  p = "pungent",
  s = "spicy"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$odor)
##
## almond anise creosote fishy foul musty none pungent
## 400 400 192 576 2160 36 3528 256
## spicy
## 576
sum(table(df2$odor))
## [1] 8124
# population: a -> abundant, c -> clustered, n -> numerous, s -> scattered,
# v -> several, y -> solitary
# recode() leaves any value without a listed replacement unchanged.
df2$population <- dplyr::recode(
  df2$population,
  a = "abundant",
  c = "clustered",
  n = "numerous",
  s = "scattered",
  v = "several",
  y = "solitary"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$population)
##
## abundant clustered numerous scattered several solitary
## 384 340 400 1248 4040 1712
sum(table(df2$population))
## [1] 8124
# habitat: g -> grasses, l -> leaves, m -> meadows, p -> paths, u -> urban,
# w -> waste, d -> woods
# recode() leaves any value without a listed replacement unchanged.
df2$habitat <- dplyr::recode(
  df2$habitat,
  g = "grasses",
  l = "leaves",
  m = "meadows",
  p = "paths",
  u = "urban",
  w = "waste",
  d = "woods"
)
# tabulate the labels; the counts should add up to the number of rows
table(df2$habitat)
##
## grasses leaves meadows paths urban waste woods
## 2148 832 292 1144 368 192 3148
sum(table(df2$habitat))
## [1] 8124
# Show the finished data frame with every code replaced by its label.
print(df2)
## # A tibble: 8,124 x 6
## edible cap_shape cap_color odor population habitat
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 poisonous convex brown pungent scattered urban
## 2 edible convex yellow almond numerous grasses
## 3 edible bell white anise numerous meadows
## 4 poisonous convex white pungent scattered urban
## 5 edible convex gray none abundant grasses
## 6 edible convex yellow almond numerous grasses
## 7 edible bell white almond numerous meadows
## 8 edible bell white anise scattered meadows
## 9 poisonous convex white pungent several grasses
## 10 edible bell yellow almond scattered meadows
## # ... with 8,114 more rows
# Compact structure listing: class, dimensions and the type of each column.
df2 %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 8124 obs. of 6 variables:
## $ edible : chr "poisonous" "edible" "edible" "poisonous" ...
## $ cap_shape : chr "convex" "convex" "bell" "convex" ...
## $ cap_color : chr "brown" "yellow" "white" "white" ...
## $ odor : chr "pungent" "almond" "anise" "pungent" ...
## $ population: chr "scattered" "numerous" "numerous" "scattered" ...
## $ habitat : chr "urban" "grasses" "meadows" "urban" ...
E. Graph and review distribution of column data
# Draw one bar chart of value counts for each column of df2.
# Titles are keyed by column name so every plot is labelled after the column
# it shows; the odor chart is titled "odor" (it was "cap odor", but the
# column is plain `odor`, not a cap attribute).
plot_titles <- c(
  edible = "edible / poisonous",
  cap_shape = "cap shape",
  cap_color = "cap color",
  odor = "odor",
  population = "population",
  habitat = "habitat"
)

for (col_name in names(plot_titles)) {
  # [[ ]] extracts the column as a plain vector for table().
  barplot(table(df2[[col_name]]), main = plot_titles[[col_name]])
}
