# load packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1.9000 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# download data
# Source: UCI "mushroom" data set (comma-separated, no header row).
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# No explicit `method`: "wininet" exists only on Windows, so hard-coding it
# breaks the script on other platforms. R's default picks a method that
# works on the current OS.
download.file(url, "mushroom.txt", quiet = FALSE, mode = "w", cacheOK = TRUE)
# read, extract & transform data

# Lookup tables translating the single-letter codes of the raw file into
# readable labels (names are the raw codes, values are the labels).
class_labels <- c(e = "edible", p = "poisonous")
cap_shape_labels <- c(
  b = "bell", c = "conical", x = "convex",
  f = "flat", k = "knobbed", s = "sunken"
)
cap_surface_labels <- c(
  f = "fibrous", g = "grooves", y = "scaly", s = "smooth"
)
cap_color_labels <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
  p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
)

# Translate a vector of raw codes through a named lookup vector.
# Codes without an entry in `labels` (including NA) become "others".
# `as.character()` guards against factor input, which would otherwise be
# used as integer positions when indexing.
decode_labels <- function(codes, labels) {
  decoded <- unname(labels[as.character(codes)])
  decoded[is.na(decoded)] <- "others"
  decoded
}

# `colClasses = "character"` keeps the codes as plain strings regardless of
# the R version's factor default or read.table's type guessing.
df <- read.table(
  "mushroom.txt",
  header = FALSE,
  sep = ",",
  colClasses = "character"
) %>%
  # select 4 columns only
  dplyr::select(V1, V2, V3, V4) %>%
  dplyr::rename(
    classes = V1,
    `cap-shape` = V2,
    `cap-surface` = V3,
    `cap-color` = V4
  ) %>%
  dplyr::mutate(
    classes = decode_labels(classes, class_labels),
    `cap-shape` = decode_labels(`cap-shape`, cap_shape_labels),
    `cap-surface` = decode_labels(`cap-surface`, cap_surface_labels),
    `cap-color` = decode_labels(`cap-color`, cap_color_labels)
  )
## Warning: package 'bindrcpp' was built under R version 3.4.4
# check summary, missing value & unique values by column
# Each element of the list is a check that takes the whole data frame; the
# outer map applies every check to `df` and the resulting list is printed.
purrr::map(
  list(
    # per-column summary
    summary,
    # number of missing values in each column
    function(x) colSums(is.na(x)),
    # distinct values of each column; lapply (not sapply) so the result is
    # always a named list, even if all columns had equally many values
    function(x) lapply(x, unique)
  ),
  function(check) check(df)
)
## [[1]]
## classes cap-shape cap-surface
## Length:8124 Length:8124 Length:8124
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## cap-color
## Length:8124
## Class :character
## Mode :character
##
## [[2]]
## classes cap-shape cap-surface cap-color
## 0 0 0 0
##
## [[3]]
## [[3]]$classes
## [1] "poisonous" "edible"
##
## [[3]]$`cap-shape`
## [1] "convex" "bell" "sunken" "flat" "knobbed" "conical"
##
## [[3]]$`cap-surface`
## [1] "smooth" "scaly" "fibrous" "grooves"
##
## [[3]]$`cap-color`
## [1] "brown" "yellow" "white" "gray" "red" "pink"
## [7] "buff" "purple" "cinnamon" "green"
# print & plot distribution by column
# Relative frequency of every level, per column, rounded to 2 decimals.
# lapply (not sapply) keeps the result a named list whatever the data.
distributions <- lapply(df, function(x) round(prop.table(ftable(x)), 2))
print(distributions)

# 2 x 2 grid, one panel per column; remember the previous graphics settings
# so they can be restored once the plots are drawn.
old_par <- par(mfrow = c(2, 2))
for (col_name in names(distributions)) {
  shares <- distributions[[col_name]]
  # ftable objects carry their level names in the "col.vars" attribute
  # rather than in dimnames, so pass them explicitly to label the bars.
  barplot(
    shares,
    names.arg = attr(shares, "col.vars")[[1]],
    main = col_name,
    las = 2
  )
}
par(old_par)
## $classes
## x edible poisonous
##
## 0.52 0.48
##
## $`cap-shape`
## x bell conical convex flat knobbed sunken
##
## 0.06 0.00 0.45 0.39 0.10 0.00
##
## $`cap-surface`
## x fibrous grooves scaly smooth
##
## 0.29 0.00 0.40 0.31
##
## $`cap-color`
## x brown buff cinnamon gray green pink purple red white yellow
##
## 0.28 0.02 0.01 0.23 0.00 0.02 0.00 0.18 0.13 0.13
