# Fetch the raw mushroom records from the UCI repository. The file is read as
# comma-separated text with no header row, keeping every field as a string.
theUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroomsData <- read.table(
  theUrl,
  sep = ",",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Keep a local comma-separated copy of what was downloaded.
write.table(mushroomsData, "mushroomsData.csv", sep = ",")
Once the data was written to a .csv file, I uploaded that file to GitHub; the data is now loaded from the GitHub link:
# Reload the data set from the copy hosted on GitHub, again keeping every
# field as a plain string.
giturl <- "https://raw.githubusercontent.com/ahussan/DATA_607_CUNY_SPS/master/Week_1_Assignment/mushroomsData.csv"
mushroomsData <- read.table(
  file = giturl,
  sep = ",",
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(mushroomsData, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Report the size of the data set: dim() gives c(rows, columns).
cat("Number of rows in the dataset: ", dim(mushroomsData)[1L], "\n")
## Number of rows in the dataset: 8124
cat("Number of columns in the dataset: ", dim(mushroomsData)[2L], "\n")
## Number of columns in the dataset: 23
Let’s select a few columns to work with. We are going to select the following columns: Edible/Poisonous, Cap-Shape, Cap-Surface, Cap-Color, and Odor.
###
# Columns 1 to 4 are taken together as a data frame; column 6 is pulled out
# on its own as a plain vector.
firstDF <- mushroomsData[, seq_len(4)]
secondDf <- mushroomsData[[6]]
# Bind the two pieces side by side; the lone vector's column is named after
# the variable that held it (secondDf).
mymushrooms <- cbind(firstDF, secondDf)
head(mymushrooms, n = 6L)
## V1 V2 V3 V4 secondDf
## 1 p x s n p
## 2 e x s y a
## 3 e b s w l
## 4 p x y w p
## 5 e x s g n
## 6 e x y y a
# Give the five selected columns descriptive names, in column order.
names(mymushrooms) <- c(
  "Edible/Poisonous",
  "Cap-Shape",
  "Cap-Surface",
  "Cap-Color",
  "Odor"
)
head(mymushrooms, n = 6L)
## Edible/Poisonous Cap-Shape Cap-Surface Cap-Color Odor
## 1 p x s n p
## 2 e x s y a
## 3 e b s w l
## 4 p x y w p
## 5 e x s g n
## 6 e x y y a
Let’s write a few functions to transform the data in each column:
#' Translate an edibility class code into its descriptive label.
#'
#' @param key Code(s) to translate: "p" or "e". The value is coerced with
#'   as.character(), so a factor is matched by its label.
#' @return A character vector the same length as `key` containing
#'   "poisonous" or "edible"; NA_character_ for any unrecognised code.
transformClassData <- function(key) {
  lookup <- c(
    "p" = "poisonous",
    "e" = "edible"
  )
  # Exact-name lookup: an unknown code yields NA instead of NULL, so the
  # result is always an atomic character vector.
  unname(lookup[as.character(key)])
}
#' Translate a cap-shape code into its descriptive label.
#'
#' @param key Code(s) to translate: one of "b", "c", "x", "f", "k", "s". The
#'   value is coerced with as.character(), so a factor is matched by its label.
#' @return A character vector the same length as `key` with the matching
#'   shape name; NA_character_ for any unrecognised code.
transformCapShapeData <- function(key) {
  lookup <- c(
    "b" = "bell",
    "c" = "conical",
    "x" = "convex",
    "f" = "flat",
    "k" = "knobbed",
    "s" = "sunken"
  )
  # Exact-name lookup: an unknown code yields NA instead of NULL, so the
  # result is always an atomic character vector.
  unname(lookup[as.character(key)])
}
#' Translate a cap-surface code into its descriptive label.
#'
#' @param key Code(s) to translate: one of "f", "g", "y", "s". The value is
#'   coerced with as.character(), so a factor is matched by its label.
#' @return A character vector the same length as `key` with the matching
#'   surface name; NA_character_ for any unrecognised code.
transformCapSurfaceData <- function(key) {
  lookup <- c(
    "f" = "fibrous",
    "g" = "grooves",
    "y" = "scaly",
    "s" = "smooth"
  )
  # Exact-name lookup: an unknown code yields NA instead of NULL, so the
  # result is always an atomic character vector.
  unname(lookup[as.character(key)])
}
#' Translate a cap-colour code into its descriptive label.
#'
#' @param key Code(s) to translate: one of "n", "b", "c", "g", "r", "p", "u",
#'   "e", "w", "y". The value is coerced with as.character(), so a factor is
#'   matched by its label.
#' @return A character vector the same length as `key` with the matching
#'   colour name; NA_character_ for any unrecognised code.
transformCapColorData <- function(key) {
  lookup <- c(
    "n" = "brown",
    "b" = "buff",
    "c" = "cinnamon",
    "g" = "gray",
    "r" = "green",
    "p" = "pink",
    "u" = "purple",
    "e" = "red",
    "w" = "white",
    "y" = "yellow"
  )
  # Exact-name lookup: an unknown code yields NA instead of NULL, so the
  # result is always an atomic character vector.
  unname(lookup[as.character(key)])
}
#' Translate an odor code into its descriptive label.
#'
#' @param key Code(s) to translate: one of "a", "l", "c", "y", "f", "m", "n",
#'   "p", "s". The value is coerced with as.character(), so a factor is
#'   matched by its label.
#' @return A character vector the same length as `key` with the matching
#'   odor name; NA_character_ for any unrecognised code.
transformOdorData <- function(key) {
  lookup <- c(
    "a" = "almond",
    "l" = "anise",
    "c" = "creosote",
    "y" = "fishy",
    "f" = "foul",
    "m" = "musty",
    "n" = "none",
    "p" = "pungent",
    "s" = "spicy"
  )
  # Exact-name lookup: an unknown code yields NA instead of NULL, so the
  # result is always an atomic character vector.
  unname(lookup[as.character(key)])
}
Now that we have all the functions in place, we can replace the abbreviations with the corresponding values:
# Replace each single-letter code with its descriptive label, one column at a
# time. vapply() requires exactly one string per row, so the columns stay
# atomic character vectors: a transformer returning anything else (e.g. NULL)
# raises an error here instead of silently producing a list column, and an
# empty data frame yields character(0) rather than list().
mymushrooms$`Edible/Poisonous` <- vapply(
  mymushrooms$`Edible/Poisonous`, transformClassData, character(1)
)
mymushrooms$`Cap-Shape` <- vapply(
  mymushrooms$`Cap-Shape`, transformCapShapeData, character(1)
)
mymushrooms$`Cap-Surface` <- vapply(
  mymushrooms$`Cap-Surface`, transformCapSurfaceData, character(1)
)
mymushrooms$`Cap-Color` <- vapply(
  mymushrooms$`Cap-Color`, transformCapColorData, character(1)
)
mymushrooms$Odor <- vapply(
  mymushrooms$Odor, transformOdorData, character(1)
)
Let’s take a look at the head of our data frame:
# Show the first six rows of the recoded data frame.
head(mymushrooms, n = 6L)
## Edible/Poisonous Cap-Shape Cap-Surface Cap-Color Odor
## 1 poisonous convex smooth brown pungent
## 2 edible convex smooth yellow almond
## 3 edible bell smooth white anise
## 4 poisonous convex scaly white pungent
## 5 edible convex smooth gray none
## 6 edible convex scaly yellow almond