Very often, we’re tasked with taking data in one form and transforming it for easier downstream analysis. We will spend several weeks in this course on tidying and transformation operations. Some of this work could be done in SQL or R (or Python, or another tool). Here, you are asked to use R; you may use base functions or packages as you like.
Load the tidyverse package
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Load the mushroom dataset into R
# Read the UCI mushroom data. The file has no header row, so the column names
# are supplied explicitly; without col_names, read_csv() treats the first
# observation as the header, silently dropping one record and producing
# auto-deduplicated names. The names used here are exactly those
# de-duplicated names, so code that refers to p, x, n, u, p_1, o, ... keeps
# working. Every column holds single-letter codes, so all columns are read as
# character (some codes such as "t"/"f" could otherwise be guessed as logical).
mushroom <- read_csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  col_names = c(
    "p", "x", "s", "n", "t", "p_1", "f", "c", "n_1", "k", "e", "e_1",
    "s_1", "s_2", "w", "w_1", "p_2", "w_2", "o", "p_3", "k_1", "s_3", "u"
  ),
  col_types = cols(.default = col_character())
)
## Warning: Duplicated column names deduplicated: 'p' => 'p_1' [6], 'n' =>
## 'n_1' [9], 'e' => 'e_1' [12], 's' => 's_1' [13], 's' => 's_2' [14], 'w' =>
## 'w_1' [16], 'p' => 'p_2' [17], 'w' => 'w_2' [18], 'p' => 'p_3' [20], 'k' =>
## 'k_1' [21], 's' => 's_3' [22]
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
Use the glimpse function from the tidyverse to take a look at the data set.
# Print a compact summary of the raw data: one line per column with its type
# and first few values
mushroom %>% glimpse()
## Observations: 8,123
## Variables: 23
## $ p <chr> "e", "e", "p", "e", "e", "e", "e", "p", "e", "e", "e", "e"...
## $ x <chr> "x", "b", "x", "x", "x", "b", "b", "x", "b", "x", "x", "b"...
## $ s <chr> "s", "s", "y", "s", "y", "s", "y", "y", "s", "y", "y", "s"...
## $ n <chr> "y", "w", "w", "g", "y", "w", "w", "w", "y", "y", "y", "y"...
## $ t <chr> "t", "t", "t", "f", "t", "t", "t", "t", "t", "t", "t", "t"...
## $ p_1 <chr> "a", "l", "p", "n", "a", "a", "l", "p", "a", "l", "a", "a"...
## $ f <chr> "f", "f", "f", "f", "f", "f", "f", "f", "f", "f", "f", "f"...
## $ c <chr> "c", "c", "c", "w", "c", "c", "c", "c", "c", "c", "c", "c"...
## $ n_1 <chr> "b", "b", "n", "b", "b", "b", "b", "n", "b", "b", "b", "b"...
## $ k <chr> "k", "n", "n", "k", "n", "g", "n", "p", "g", "g", "n", "w"...
## $ e <chr> "e", "e", "e", "t", "e", "e", "e", "e", "e", "e", "e", "e"...
## $ e_1 <chr> "c", "c", "e", "e", "c", "c", "c", "e", "c", "c", "c", "c"...
## $ s_1 <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s"...
## $ s_2 <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s", "s"...
## $ w <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ w_1 <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ p_2 <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p"...
## $ w_2 <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "w"...
## $ o <chr> "o", "o", "o", "o", "o", "o", "o", "o", "o", "o", "o", "o"...
## $ p_3 <chr> "p", "p", "p", "e", "p", "p", "p", "p", "p", "p", "p", "p"...
## $ k_1 <chr> "n", "n", "k", "n", "k", "k", "n", "k", "k", "n", "k", "n"...
## $ s_3 <chr> "n", "n", "s", "a", "n", "n", "s", "v", "s", "n", "s", "s"...
## $ u <chr> "g", "m", "u", "g", "g", "m", "m", "g", "m", "g", "m", "g"...
The glimpse function gives the end-user a look at the mushroom dataset. The mushroom dataset has 23 variables and 8,123 observations. In addition, every variable in the dataset is of character type.
Create a subset of the mushroom data set containing the class variable (edible or poisonous) along with the following variables: cap-shape, cap-color, habitat, odor, and ring-number
# Keep only the six columns of interest, in the order in which they are
# renamed afterwards; all rows are retained
mushroom_sub <- mushroom[, c("p", "x", "n", "u", "p_1", "o")]
Rename the variables in the mushroom_sub data set using the colnames function, and then list the names of the data set using the names function
# Replace the single-letter column names with descriptive ones; the order
# must match the order of the columns in mushroom_sub
names(mushroom_sub) <- c(
  "MushroomType", "CapShape", "CapColor",
  "Habitat", "Odor", "RingNumber"
)
# Print the column names to confirm the rename
colnames(mushroom_sub)
## [1] "MushroomType" "CapShape" "CapColor" "Habitat"
## [5] "Odor" "RingNumber"
Take a look at the first 3 observations using the head function
# Preview the first three rows of the renamed subset
head(mushroom_sub, 3)
## # A tibble: 3 x 6
## MushroomType CapShape CapColor Habitat Odor RingNumber
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 e x y g a o
## 2 e b w m l o
## 3 p x w u p o
Load the plyr package
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
Recode the single-letter values of each variable in mushroom_sub (MushroomType, CapShape, CapColor, Habitat, Odor, and RingNumber) into descriptive labels using the revalue function
# Lookup tables, one per column, mapping each single-letter code to its
# descriptive label
value_labels <- list(
  MushroomType = c("e" = "Edible", "p" = "Poisonous"),
  CapShape = c(
    "b" = "Bell", "c" = "Conical", "x" = "Convex",
    "f" = "Flat", "k" = "Knobbed", "s" = "Sunken"
  ),
  CapColor = c(
    "n" = "Brown", "b" = "Buff", "c" = "Cinnamon", "g" = "Gray",
    "r" = "Green", "p" = "Pink", "u" = "Purple", "e" = "Red",
    "w" = "White", "y" = "Yellow"
  ),
  Habitat = c(
    "g" = "Grasses", "l" = "Leaves", "m" = "Meadows", "p" = "Paths",
    "u" = "Urban", "w" = "Waste", "d" = "Woods"
  ),
  Odor = c(
    "a" = "Almond", "l" = "Anise", "c" = "Creosote", "y" = "Fishy",
    "f" = "Foul", "m" = "Musty", "n" = "none", "p" = "Pungent",
    "s" = "Spicy"
  ),
  RingNumber = c("n" = "None", "o" = "One", "t" = "Two")
)
# Recode each listed column in place, in the order given above
for (column in names(value_labels)) {
  mushroom_sub[[column]] <- revalue(
    mushroom_sub[[column]],
    value_labels[[column]]
  )
}
# View the first 20 rows of the recoded data
head(mushroom_sub, n = 20)
## # A tibble: 20 x 6
## MushroomType CapShape CapColor Habitat Odor RingNumber
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Edible Convex Yellow Grasses Almond One
## 2 Edible Bell White Meadows Anise One
## 3 Poisonous Convex White Urban Pungent One
## 4 Edible Convex Gray Grasses none One
## 5 Edible Convex Yellow Grasses Almond One
## 6 Edible Bell White Meadows Almond One
## 7 Edible Bell White Meadows Anise One
## 8 Poisonous Convex White Grasses Pungent One
## 9 Edible Bell Yellow Meadows Almond One
## 10 Edible Convex Yellow Grasses Anise One
## 11 Edible Convex Yellow Meadows Almond One
## 12 Edible Bell Yellow Grasses Almond One
## 13 Poisonous Convex White Urban Pungent One
## 14 Edible Convex Brown Grasses none One
## 15 Edible Sunken Gray Urban none One
## 16 Edible Flat White Grasses none One
## 17 Poisonous Convex Brown Grasses Pungent One
## 18 Poisonous Convex White Urban Pungent One
## 19 Poisonous Convex Brown Urban Pungent One
## 20 Edible Bell Yellow Meadows Almond One
For each variable, build a contingency table of counts, convert the counts to relative frequencies, and then plot the frequencies on a bar plot.
# Tally the number of mushrooms for each value of MushroomType and print
# the counts
Types <- table(mushroom_sub[["MushroomType"]])
print(Types)
##
## Edible Poisonous
## 4208 3915
# Convert the counts to proportions of the total and print them
(TypesFreqs <- prop.table(Types))
##
## Edible Poisonous
## 0.5180352 0.4819648
# Draw the mushroom-type proportions as a bar plot
barplot(height = TypesFreqs, col = "Beige", main = "Mushroom Type Bar Plot")
# Tally the number of mushrooms for each cap shape and print the counts
Shape <- table(mushroom_sub[["CapShape"]])
print(Shape)
##
## Bell Conical Convex Flat Knobbed Sunken
## 452 4 3655 3152 828 32
# Convert the cap-shape counts to proportions of the total and print them
(ShapeFreq <- prop.table(Shape))
##
## Bell Conical Convex Flat Knobbed
## 0.0556444663 0.0004924289 0.4499569125 0.3880339776 0.1019327835
## Sunken
## 0.0039394312
# Draw the cap-shape proportions as a bar plot
barplot(height = ShapeFreq, col = "Yellow", main = "Cap Shape Bar Plot")
# Tally the number of mushrooms for each cap color and print the counts
Color <- table(mushroom_sub[["CapColor"]])
print(Color)
##
## Brown Buff Cinnamon Gray Green Pink Purple Red
## 2283 168 44 1840 16 144 16 1500
## White Yellow
## 1040 1072
# Convert the cap-color counts to proportions of the total and print them
(ColorFreq <- prop.table(Color))
##
## Brown Buff Cinnamon Gray Green Pink
## 0.281053798 0.020682014 0.005416718 0.226517297 0.001969716 0.017727441
## Purple Red White Yellow
## 0.001969716 0.184660840 0.128031515 0.131970947
# Draw the cap-color proportions as a bar plot
barplot(height = ColorFreq, col = "Lavender", main = "Cap Color Bar Plot")
# Tally the number of mushrooms for each habitat and print the counts
HabCount <- table(mushroom_sub[["Habitat"]])
print(HabCount)
##
## Grasses Leaves Meadows Paths Urban Waste Woods
## 2148 832 292 1144 367 192 3148
# Convert the habitat counts to proportions of the total and print them
(HabCountFreq <- prop.table(HabCount))
##
## Grasses Leaves Meadows Paths Urban Waste
## 0.26443432 0.10242521 0.03594731 0.14083467 0.04518035 0.02363659
## Woods
## 0.38754155
# Draw the habitat proportions as a bar plot
barplot(height = HabCountFreq, col = "Blue", main = "Habitat Bar Plot")
# Tally the number of mushrooms for each odor and print the counts
OdorCount <- table(mushroom_sub[["Odor"]])
print(OdorCount)
##
## Almond Anise Creosote Fishy Foul Musty none Pungent
## 400 400 192 576 2160 36 3528 255
## Spicy
## 576
# Convert the odor counts to proportions of the total and print them
(OdorCountFreq <- prop.table(OdorCount))
##
## Almond Anise Creosote Fishy Foul Musty
## 0.04924289 0.04924289 0.02363659 0.07090976 0.26591161 0.00443186
## none Pungent Spicy
## 0.43432229 0.03139234 0.07090976
# Draw the odor proportions as a bar plot, shortening the longer category
# names so the axis labels fit under the bars.
# The short labels are looked up by category name rather than listed by
# position: the order of the table's categories follows the locale's sort
# order (lowercase "none" sorts last in the C locale), so a positional label
# vector could silently end up under the wrong bars. Categories without a
# short form keep their own name. The "none" (no odor) bar is labelled "None"
# rather than "NA", which reads as missing data.
odor_short <- c(
  "Almond" = "Alm...", "Anise" = "Ani...", "Creosote" = "Creo...",
  "Pungent" = "Pung...", "none" = "None"
)
odor_names <- names(OdorCountFreq)
odor_axis_labels <- ifelse(
  odor_names %in% names(odor_short),
  odor_short[odor_names],
  odor_names
)
barplot(OdorCountFreq, main = "Odor Barplot", col = "Lightblue",
        names.arg = odor_axis_labels)
# Tally the number of mushrooms for each ring-number value and print the counts
RingNumCnt <- table(mushroom_sub$RingNumber)
RingNumCnt
##
## None One Two
## 36 7487 600
# Convert the ring-number counts to proportions of the total and print them
(RingNumCntFreq <- prop.table(RingNumCnt))
##
## None One Two
## 0.00443186 0.92170380 0.07386434
# Draw the ring-number proportions as a bar plot
barplot(height = RingNumCntFreq, col = "Cyan", main = "Ring Number Bar Plot")