DATA607_Assignment 1-Loading Data into a data frame

As data scientists, there are times when we’re tasked with taking data in one form and transforming it for easier downstream analysis. In this assignment, some simple transformations will be done on the mushrooms dataset (taken from the UCI Machine Learning Repository). A data frame with a subset of the columns in the original dataset will be created. Meaningful column names will be added to replace the default column names, and the abbreviated values used in the dataset will be replaced with descriptive labels.

Reading the mushroom dataset into R

# Download the raw mushroom data. The file has no header row, so R assigns
# the default column names V1, V2, ...
mushroom_dataset <- read.csv(
  file = url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE
)

A glimpse at the dataset

# Preview the first three rows of the raw data.
head(mushroom_dataset, n = 3)

##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m

Current column names

# List the default column names assigned when the file was read.
colnames(mushroom_dataset)

##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"

Subsetting the mushroom dataset

# Keep only the four columns used in this assignment (positions 1, 6, 22
# and 23, which are later named class, odor, population and habitat).
# Subsetting the data frame directly yields the same result as flattening it
# to a list and rebuilding it with data.frame(), without the extra copy or a
# throwaway intermediate variable.
mushroom_dataset2 <- mushroom_dataset[, c(1, 6, 22, 23)]
#head(mushroom_dataset2)

Renaming columns in the new dataset

# Replace the default V* headers with descriptive attribute names, then
# preview the first three rows to confirm the rename.
colnames(mushroom_dataset2) <- c("class", "odor", "population", "habitat")
head(mushroom_dataset2, n = 3)

##   class odor population habitat
## 1     p    p          s       u
## 2     e    a          n       g
## 3     e    l          n       m

Converting abbreviated data into meaningful data

# Converting abbreviated data into meaningful data.

# Replace the single-letter codes in `x` with descriptive labels.
#
# x:      a factor or character vector of codes.
# lookup: a named character vector; names are the codes, values the labels.
#
# Returns `x` with every code that appears in names(lookup) replaced by its
# label. Codes without a mapping, and NA, are left untouched. A factor stays
# a factor (its levels are relabelled, so no stale abbreviation levels are
# left behind); any other input is returned as a plain character vector.
decode_column <- function(x, lookup) {
  relabel <- function(codes) {
    known <- codes %in% names(lookup)
    codes[known] <- unname(lookup[codes[known]])
    codes
  }
  if (is.factor(x)) {
    # Renaming the levels recodes every element in one step.
    levels(x) <- relabel(levels(x))
    x
  } else {
    relabel(as.character(x))
  }
}

# One lookup table per column: code = "label".
label_maps <- list(
  # Class column
  class = c(p = "Poisonous", e = "Edible"),
  # Odor column
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  # Population column
  population = c(
    a = "abundant", c = "clustered", n = "numerous", s = "scattered",
    v = "several", y = "solitary"
  ),
  # Habitat column
  habitat = c(
    g = "Grasses", l = "Leaves", m = "Meadows", p = "Paths", u = "Urban",
    w = "Waste", d = "Woods"
  )
)

# Decode each column with its own lookup table.
mushroom_dataset2[names(label_maps)] <- Map(
  decode_column,
  mushroom_dataset2[names(label_maps)],
  label_maps
)

A glimpse at the transformed data set

# Preview the first five rows of the decoded data.
head(mushroom_dataset2, n = 5)

##       class    odor population habitat
## 1 Poisonous pungent  scattered   Urban
## 2    Edible  almond   numerous Grasses
## 3    Edible   anise   numerous Meadows
## 4 Poisonous pungent  scattered   Urban
## 5    Edible    none   abundant Grasses

Note about the mushroom dataset: This is such a well-known dataset in the data science community that it makes a good dataset to use for comparative benchmarking. For example, if someone was working to build a better decision tree algorithm (or other predictive classifier) to analyze categorical data, this dataset could be useful. (extract taken from assignment question document)