Very often, we’re tasked with taking data in one form and transforming it for easier downstream analysis. We will spend several weeks in this course on tidying and transformation operations. Some of this work could be done in SQL or R (or Python or.). Here, you are asked to use R-you may use base functions or packages as you like.

Mushroom Dataset

Load the tidyverse package

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the mushroom dataset into R

mushroom <- read_csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"))
## Warning: Duplicated column names deduplicated: 'p' => 'p_1' [6], 'n' =>
## 'n_1' [9], 'e' => 'e_1' [12], 's' => 's_1' [13], 's' => 's_2' [14], 'w' =>
## 'w_1' [16], 'p' => 'p_2' [17], 'w' => 'w_2' [18], 'p' => 'p_3' [20], 'k' =>
## 'k_1' [21], 's' => 's_3' [22]
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.

Use the glimpse functin from the tidverse to take a look at the data set.

glimpse(mushroom)
## Observations: 8,123
## Variables: 23
## $ p   <chr> "e", "e", "p", "e", "e", "e", "e", "p", "e", "e", "e", "e"...
## $ x   <chr> "x", "b", "x", "x", "x", "b", "b", "x", "b", "x", "x", "b"...
## $ s   <chr> "s", "s", "y", "s", "y", "s", "y", "y", "s", "y", "y", "s"...
## $ n   <chr> "y", "w", "w", "g", "y", "w", "w", "w", "y", "y", "y", "y"...
## $ t   <chr> "t", "t", "t", "f", "t", "t", "t", "t", "t", "t", "t", "t"...
## $ p_1 <chr> "a", "l", "p", "n", "a", "a", "l", "p", "a", "l", "a", "a"...
## $ f   <chr> "f", "f", "f", "f", "f", "f", "f", "f", "f", "f", "f", "f"...
## $ c   <chr> "c", "c", "c", "w", "c", "c", "c", "c", "c", "c", "c", "c"...
## $ n_1 <chr> "b", "b", "n", "b", "b", "b", "b", "n", "b", "b", "b", "b"...
## $ k   <chr> "k", "n", "n", "k", "n", "g", "n", "p", "g", "g", "n", "w"...
## $ e   <chr> "e", "e", "e", "t", "e", "e", "e", "e", "e", "e", "e", "e"...
## $ e_1 <chr> "c", "c", "e", "e", "c", "c", "c", "e", "c", "c", "c", "c"...
## $ s_1 <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s"...
## $ s_2 <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s"...
## $ w   <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ w_1 <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ p_2 <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p"...
## $ w_2 <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ o   <chr> "o", "o", "o", "o", "o", "o", "o", "o", "o", "o", "o", "o"...
## $ p_3 <chr> "p", "p", "p", "e", "p", "p", "p", "p", "p", "p", "p", "p"...
## $ k_1 <chr> "n", "n", "k", "n", "k", "k", "n", "k", "k", "n", "k", "n"...
## $ s_3 <chr> "n", "n", "s", "a", "n", "n", "s", "v", "s", "n", "s", "s"...
## $ u   <chr> "g", "m", "u", "g", "g", "m", "m", "g", "m", "g", "m", "g"...

The glimpse function gives the end-user a look at the mushroom dataset. The mushroom dataset has 23 variables and 8,123 observations. In addition, the dataset has all character variables.

Create a subset of the mushroom data set containing the following variables along with the attribute variable: cap-shape, cap-color, habitat, odor, and ring-number

#return subsets of vectors, matrices or data frames which meet conditions
mushroom_sub <- subset(mushroom, select = c(p,x,n,u,p_1,o))

Rename the variables in the mushroom_sub data set using the colnames feature, and then list the names of the data set using the names function

#retrieve or set the row or column names of a matrix-like object
colnames(mushroom_sub) <- c("MushroomType","CapShape","CapColor",
                            "Habitat","Odor","RingNumber")
#functions to get or set the names of an object
names(mushroom_sub)
## [1] "MushroomType" "CapShape"     "CapColor"     "Habitat"     
## [5] "Odor"         "RingNumber"

Take a look at the first 3 observations using the head feature

#head Returns the first or last parts of a vector, matrix, table, data frame or function. Since head() and tail() are generic functions, they may also have been extended to other classes.
#n a single integer. If positive, size for the resulting object: number of elements for a vector (including lists), rows for a matrix or data frame or lines for a function. If negative, all but the n last/first number of elements of x.
head(mushroom_sub,n=3)
## # A tibble: 3 x 6
##   MushroomType CapShape CapColor Habitat Odor  RingNumber
##   <chr>        <chr>    <chr>    <chr>   <chr> <chr>     
## 1 e            x        y        g       a     o         
## 2 e            b        w        m       l     o         
## 3 p            x        w        u       p     o

Load the plyr package

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact

Rename the variables that correspond to the MushroomType variable using the revalue feature

#assign new values to the MushroomType variable
mushroom_sub$MushroomType <-revalue(mushroom_sub$MushroomType, c("e"="Edible", "p"="Poisonous"))

#assign new values to the CapShape variable
mushroom_sub$CapShape <-revalue(mushroom_sub$CapShape, c("b"="Bell","c"="Conical","x"="Convex","f"="Flat","k"="Knobbed", "s"="Sunken"))

#assign new values to the CapColor variable
mushroom_sub$CapColor <-revalue(mushroom_sub$CapColor, c("n"="Brown","b"="Buff","c"="Cinnamon","g"="Gray","r"="Green","p"="Pink","u"="Purple","e"="Red","w"="White","y"="Yellow"))

#assign new values to the Habitat variable
mushroom_sub$Habitat <-revalue(mushroom_sub$Habitat, c("g"="Grasses","l"="Leaves","m"="Meadows","p"="Paths","u"="Urban","w"="Waste","d"="Woods"))

#assign new values to the Odor variable
mushroom_sub$Odor <-revalue(mushroom_sub$Odor, c("a"="Almond","l"="Anise","c"="Creosote","y"="Fishy","f"="Foul","m"="Musty","n"="none","p"="Pungent","s"="Spicy"))

#assign new values to the RingNumber variable
mushroom_sub$RingNumber <-revalue(mushroom_sub$RingNumber, c("n"="None","o"="One","t"="Two"))

#view the first 20 variables using the head function
head(mushroom_sub,n=20)
## # A tibble: 20 x 6
##    MushroomType CapShape CapColor Habitat Odor    RingNumber
##    <chr>        <chr>    <chr>    <chr>   <chr>   <chr>     
##  1 Edible       Convex   Yellow   Grasses Almond  One       
##  2 Edible       Bell     White    Meadows Anise   One       
##  3 Poisonous    Convex   White    Urban   Pungent One       
##  4 Edible       Convex   Gray     Grasses none    One       
##  5 Edible       Convex   Yellow   Grasses Almond  One       
##  6 Edible       Bell     White    Meadows Almond  One       
##  7 Edible       Bell     White    Meadows Anise   One       
##  8 Poisonous    Convex   White    Grasses Pungent One       
##  9 Edible       Bell     Yellow   Meadows Almond  One       
## 10 Edible       Convex   Yellow   Grasses Anise   One       
## 11 Edible       Convex   Yellow   Meadows Almond  One       
## 12 Edible       Bell     Yellow   Grasses Almond  One       
## 13 Poisonous    Convex   White    Urban   Pungent One       
## 14 Edible       Convex   Brown    Grasses none    One       
## 15 Edible       Sunken   Gray     Urban   none    One       
## 16 Edible       Flat     White    Grasses none    One       
## 17 Poisonous    Convex   Brown    Grasses Pungent One       
## 18 Poisonous    Convex   White    Urban   Pungent One       
## 19 Poisonous    Convex   Brown    Urban   Pungent One       
## 20 Edible       Bell     Yellow   Meadows Almond  One

Obtain a count of each variable,build a contingency table, and then plot findings on a bar plot.

MushroomType

#table uses the cross-classifying factors to build a contingency table of the counts at each combination of factor levels.
Types <- table(mushroom_sub$MushroomType)
Types
## 
##    Edible Poisonous 
##      4208      3915
TypesFreqs <- Types/sum(Types)
TypesFreqs
## 
##    Edible Poisonous 
## 0.5180352 0.4819648
#bar plots creates a bar plot with vertical or horizontal bars.
barplot(TypesFreqs, main = "Mushroom Type Bar Plot", col = "Beige")

CapShape

Shape <- table(mushroom_sub$CapShape)
Shape
## 
##    Bell Conical  Convex    Flat Knobbed  Sunken 
##     452       4    3655    3152     828      32
ShapeFreq <- Shape/sum(Shape)
ShapeFreq
## 
##         Bell      Conical       Convex         Flat      Knobbed 
## 0.0556444663 0.0004924289 0.4499569125 0.3880339776 0.1019327835 
##       Sunken 
## 0.0039394312
barplot(ShapeFreq, main = "Cap Shape Bar Plot", col = "Yellow")

CapColor

Color <- table(mushroom_sub$CapColor)
Color
## 
##    Brown     Buff Cinnamon     Gray    Green     Pink   Purple      Red 
##     2283      168       44     1840       16      144       16     1500 
##    White   Yellow 
##     1040     1072
ColorFreq <- Color/sum(Color)
ColorFreq
## 
##       Brown        Buff    Cinnamon        Gray       Green        Pink 
## 0.281053798 0.020682014 0.005416718 0.226517297 0.001969716 0.017727441 
##      Purple         Red       White      Yellow 
## 0.001969716 0.184660840 0.128031515 0.131970947
barplot(ColorFreq, main = "Cap Color Bar Plot", col = "Lavender")

Habitat

HabCount <- table(mushroom_sub$Habitat)
HabCount
## 
## Grasses  Leaves Meadows   Paths   Urban   Waste   Woods 
##    2148     832     292    1144     367     192    3148
HabCountFreq <- HabCount/sum(HabCount)
HabCountFreq
## 
##    Grasses     Leaves    Meadows      Paths      Urban      Waste 
## 0.26443432 0.10242521 0.03594731 0.14083467 0.04518035 0.02363659 
##      Woods 
## 0.38754155
barplot(HabCountFreq, main = "Habitat Bar Plot", col = "Blue")

Odor

OdorCount <- table(mushroom_sub$Odor)
OdorCount
## 
##   Almond    Anise Creosote    Fishy     Foul    Musty     none  Pungent 
##      400      400      192      576     2160       36     3528      255 
##    Spicy 
##      576
OdorCountFreq <- OdorCount/sum(OdorCount)
OdorCountFreq
## 
##     Almond      Anise   Creosote      Fishy       Foul      Musty 
## 0.04924289 0.04924289 0.02363659 0.07090976 0.26591161 0.00443186 
##       none    Pungent      Spicy 
## 0.43432229 0.03139234 0.07090976
barplot(OdorCountFreq, main = "Odor Barplot", col = "Lightblue",
        names.arg=c("Alm...","Ani...","Creo...","Fishy","Foul","Musty","NA","Pung...","Spicy"))

RingNumber

RingNumCnt <- table(mushroom_sub$RingNumber)
RingNumCnt
## 
## None  One  Two 
##   36 7487  600
RingNumCntFreq <- RingNumCnt/sum(RingNumCnt)
RingNumCntFreq
## 
##       None        One        Two 
## 0.00443186 0.92170380 0.07386434
barplot(RingNumCntFreq, main = "Ring Number Bar Plot", col = "Cyan")