In the week 1 assignment for DATA607, we will load the UCI Mushroom dataset (https://archive.ics.uci.edu/ml/datasets/Mushroom) into a data frame.
This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family. Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. In the data file this last class is combined with the poisonous one, so the Type column contains only two codes.
Attribute Information:
Type: edible=e,poisonous=p
cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
bruises: bruises=t,no=f
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
gill-attachment: attached=a,descending=d,free=f,notched=n
gill-spacing: close=c,crowded=w,distant=d
gill-size: broad=b,narrow=n
gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
stalk-shape: enlarging=e,tapering=t
stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
veil-type: partial=p,universal=u
veil-color: brown=n,orange=o,white=w,yellow=y
ring-number: none=n,one=o,two=t
ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d
Load the data from the UCI repository (https://archive.ics.uci.edu/ml/datasets/Mushroom) into R, then subset it to create a new data frame that keeps a few columns from the original, including the 1st column. Give the column headers meaningful names, and replace the coded values in each column with the descriptive values from the data dictionary.
library(stringr)
library(XML)
## Warning: package 'XML' was built under R version 3.5.2
library(maps)
library(httr)
# Data dictionary for the columns kept in mush_subset. Each entry is named
# after a column and maps the single-letter codes found in
# agaricus-lepiota.data to the descriptive label that replaces them.
mushroom_labels <- list(
  # Type: edible=e, poisonous=p
  Type = c(e = "Edible", p = "Poisonous"),
  # cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
  Shape = c(
    b = "bell", c = "conical", x = "convex", f = "flat", k = "knobbed",
    s = "sunken"
  ),
  # cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
  Surface = c(f = "fibrous", g = "grooves", y = "scaly", s = "smooth"),
  # cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,
  # red=e,white=w,yellow=y -- note that green (r) and red (e) are separate
  # codes and both need an entry.
  Color = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  ),
  # odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,
  # pungent=p,spicy=s
  Odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  # population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,
  # solitary=y
  Population_Type = c(
    a = "abundant", c = "clustered", n = "numerous", s = "scattered",
    v = "several", y = "solitary"
  ),
  # habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
  Habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
    w = "waste", d = "woods"
  )
)

# Translate a vector of single-letter codes into descriptive labels.
#
# codes:  character vector or factor of raw codes.
# labels: named character vector; names are codes, values are labels.
# Returns an unnamed character vector the same length as `codes`. A code
# that has no entry in `labels` becomes NA instead of being silently
# assigned some other label.
decode_mushroom_codes <- function(codes, labels) {
  # as.character() matters: indexing by a factor would use its integer
  # level numbers rather than the code letters.
  unname(labels[as.character(codes)])
}

# Read the raw, header-less, comma-separated data file.
mush_table <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ","
)
mushrooms <- as.data.frame(mush_table)

# Subset the data to create a new data frame with 7 columns: type, cap
# shape, cap surface, cap color, odor, population and habitat.
mush_subset <- subset(mushrooms, select = c(1, 2, 3, 4, 6, 22, 23))

# Give the columns meaningful names.
colnames(mush_subset) <- c(
  "Type", "Shape", "Surface", "Color", "Odor", "Population_Type", "Habitat"
)

# Replace the codes in every column with the labels from the dictionary.
# Each column is decoded from its own raw values only.
for (column in names(mushroom_labels)) {
  mush_subset[[column]] <- decode_mushroom_codes(
    mush_subset[[column]],
    mushroom_labels[[column]]
  )
}
After subsetting, renaming the columns, and replacing the coded values, the data frame looks as follows.
# Show the first 20 rows of the subset.
head(x = mush_subset, n = 20)
## Type Shape Surface Color Odor Population_Type Habitat
## 1 Poisonous convex smooth brown pungent scattered paths
## 2 Edible convex smooth yellow almond numerous grasses
## 3 Edible bell smooth white anise numerous meadows
## 4 Poisonous convex scaly white pungent scattered paths
## 5 Edible convex smooth gray none abundant grasses
## 6 Edible convex scaly yellow almond numerous grasses
## 7 Edible bell smooth white almond numerous meadows
## 8 Edible bell scaly white anise scattered meadows
## 9 Poisonous convex scaly white pungent several grasses
## 10 Edible bell smooth yellow almond scattered meadows
## 11 Edible convex scaly yellow anise numerous grasses
## 12 Edible convex scaly yellow almond scattered meadows
## 13 Edible bell smooth yellow almond scattered grasses
## 14 Poisonous convex scaly white pungent several paths
## 15 Edible convex fibrous brown none abundant grasses
## 16 Edible sunken fibrous gray none solitary urban
## 17 Edible flat fibrous white none abundant grasses
## 18 Poisonous convex smooth brown pungent scattered grasses
## 19 Poisonous convex scaly white pungent scattered paths
## 20 Poisonous convex smooth brown pungent scattered paths