library(readr)
library(ggplot2)
The Mushrooms data is a well-known dataset in the data science community, which makes it a good dataset to use for comparative benchmarking. For example, if someone were working to build a better decision tree algorithm (or other predictive classifier) to analyze categorical data, this dataset could be useful. (https://archive.ics.uci.edu/ml/datasets/Mushroom)
Display original dataset columns.
# The raw file has no header row, so read it with col_names = FALSE; otherwise
# the first observation is consumed as (duplicated) column names and lost.
# Force every column to character so that single-letter codes such as "t"/"f"
# are not guessed as logicals, which would turn other codes into NA.
mushrooms <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
## Warning: Duplicated column names deduplicated: 'p' => 'p_1' [6], 'n' =>
## 'n_1' [9], 'e' => 'e_1' [12], 's' => 's_1' [13], 's' => 's_2' [14], 'w' =>
## 'w_1' [16], 'p' => 'p_2' [17], 'w' => 'w_2' [18], 'p' => 'p_3' [20], 'k' =>
## 'k_1' [21], 's' => 's_3' [22]
## Parsed with column specification:
## cols(
## .default = col_character(),
## t = col_logical(),
## f = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 210 parsing failures.
## row col expected actual file
## 6038 f 1/0/T/F/TRUE/FALSE a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6040 f 1/0/T/F/TRUE/FALSE a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6375 f 1/0/T/F/TRUE/FALSE a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6424 f 1/0/T/F/TRUE/FALSE a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6434 f 1/0/T/F/TRUE/FALSE a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## .... ... .................. ...... ..........................................................................................
## See problems(...) for more details.
1. Use the summary function to gain an overview of the data set. Then display attributes.
# Print a per-column summary (length, class, mode or value counts) of the raw data.
summary(mushrooms)
## p x s
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## n t p_1 f
## Length:8123 Mode :logical Length:8123 Mode :logical
## Class :character FALSE:4748 Class :character FALSE:7913
## Mode :character TRUE :3375 Mode :character NA's :210
## c n_1 k
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## e e_1 s_1
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## s_2 w w_1
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## p_2 w_2 o
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## p_3 k_1 s_3
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## u
## Length:8123
## Class :character
## Mode :character
2. Create a new data frame with a subset of the columns
# Column positions in the raw file: 1 = class, 5 = bruises, 9 = gill-size,
# 23 = habitat. Select exactly the columns that are named in the next step
# (positions 2-4 are cap-shape, cap-surface and cap-color).
subsetMushrooms <- mushrooms[c(1, 5, 9, 23)]
3. Create new column names for the new data frame.
# Label the four selected columns, then show the result as a paged HTML table.
names(subsetMushrooms) <- c("class", "bruises", "gill-size", "habitat")
DT::datatable(
  subsetMushrooms,
  options = list(pageLength = 10)
)
4. Use the summary function to create an overview of your new data frame and print attributes.
# Print a per-column summary of the renamed four-column subset.
summary(subsetMushrooms)
## class bruises gill-size
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## habitat
## Length:8123
## Class :character
## Mode :character
5. Replace abbreviations with meaningful names (e.g., e = edible).
#newdata <- subset(SubsetMush, class >= "e" & Class <= "p")
# Keep only rows whose class is one of the two known codes. Use set membership
# rather than a string inequality, which would match any code sorting before "p".
newdata <- subset(subsetMushrooms, class %in% c("e", "p"))
# Recode the class column only. Matching against the whole data frame would
# also rewrite unrelated cells in other columns that happen to hold "e" or "p"
# (for example, the habitat code "p").
newdata$class[newdata$class == "e"] <- "Edible"
newdata$class[newdata$class == "p"] <- "Poisonous"
6. See examples of all of steps 1-5 above.
# Show the recoded data as an interactive table, ten rows per page.
DT::datatable(newdata, options = list(pageLength = 10))
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v purrr 0.2.5 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(ggplot2)
#library(dplyr)
#factorMushrooms <- mushrooms[c(1, 3, 9, 23)]
#factorMushrooms <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
# Rebuild the working subset from columns 1, 3, 9 and 23 and give them plain,
# syntactic names in a single step, then print a per-column summary.
subsetMushrooms <- setNames(
  mushrooms[c(1, 3, 9, 23)],
  c("class", "capsurface", "gillsize", "habitat")
)
summary(subsetMushrooms)
## class capsurface gillsize
## Length:8123 Length:8123 Length:8123
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## habitat
## Length:8123
## Class :character
## Mode :character
# Convert every column of the subset (class, capsurface, gillsize, habitat)
# to a factor in place, so that summary() reports per-level counts.
subsetMushrooms[] <- lapply(subsetMushrooms, as.factor)
summary(subsetMushrooms)
## class capsurface gillsize habitat
## e:4208 f:2320 b:5612 d:3148
## p:3915 g: 4 n:2511 g:2148
## s:2555 l: 832
## y:3244 m: 292
## p:1144
## u: 367
## w: 192
# List the distinct codes (factor levels) present in each column of the subset.
levels(subsetMushrooms$class)
## [1] "e" "p"
levels(subsetMushrooms$capsurface)
## [1] "f" "g" "s" "y"
levels(subsetMushrooms$gillsize)
## [1] "b" "n"
levels(subsetMushrooms$habitat)
## [1] "d" "g" "l" "m" "p" "u" "w"
# Set up factors
#df <- data.frame(a = factorMushrooms$class, factorMushrooms$bruises)
# Plot class against habitat; both are factors, so identical rows overplot and
# each visible point marks a class/habitat combination present in the data.
ggplot(subsetMushrooms, aes(x = class, y = habitat)) +
  geom_point()
#subsetMushrooms <- subsetMushrooms[c(1, 3, 9, 23)]
# Attach with library() so that a missing package stops the script with an
# error; require() only warns and returns FALSE, letting the script continue.
library(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# Cross-tabulate all four factors: one row per level combination plus a Freq
# column holding the number of observations with that combination.
counts <- data.frame(table(subsetMushrooms))
DT::datatable(counts, options = list(pageLength = 10))
# Pairwise plot matrix of the four factor columns.
plot(subsetMushrooms)
# Plot class against cap surface using the observations themselves. The rows
# of `counts` are the full cross product of levels, so plotting its columns
# would ignore Freq and weight every combination equally.
plot(
  subsetMushrooms$class,
  subsetMushrooms$capsurface,
  xlab = "class",
  ylab = "capsurface"
)
Please email kleber.perez@live.com with any suggestions.