Make sure to set the header argument to FALSE, since the data file doesn’t have column names.
# Load the raw mushroom records from GitHub. The file has no header row, so
# header is spelled out as FALSE (the alias `F` is an ordinary variable that
# can be reassigned). Text columns are kept as character vectors, not factors.
Mushroom.Data <- read.csv(
  "https://raw.githubusercontent.com/mlforsachid/MSDSQ1/master/Data607/Week1/agaricus-lepiota.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
We can use the names function to assign column names. The first column is the class label; let’s call it Edibility.
# Attach descriptive column names; the first column is the class label.
# NOTE(review): "Grill*" and "SporeSprintColor" look like misspellings of
# "Gill*" and "SporePrintColor" -- kept as-is because other code may refer to
# these names; confirm before renaming.
names(Mushroom.Data) <- c(
  "Edibility",
  "CapShape", "CapSurface", "CapColor",
  "Bruises", "Odor",
  "GrillAttachment", "GrillSpacing", "GrillSize", "GrillColor",
  "StalkShape", "StalkRoot",
  "StalkSurfaceAboveRing", "StalkSurfaceBelowRing",
  "StalkColorAboveRing", "StalkColorBelowRing",
  "VeilType", "VeilColor",
  "RingNumber", "RingType",
  "SporeSprintColor", "Population", "Habitat"
)
# Report the number of rows and columns.
dim(Mushroom.Data)
## [1] 8124 23
The Mushroom data frame has 8124 rows and 23 columns.
# Count how many records fall into each Edibility class.
table(Mushroom.Data[["Edibility"]])
##
## e p
## 4208 3916
Out of 8124 samples, 4208 are edible (e) and 3916 are poisonous (p).
**Let’s take the first four attributes along with the Edibility class variable.**
# Keep the class label plus the first four attributes; all rows are retained.
Mushroon.Sub <- Mushroom.Data[
  ,
  c("Edibility", "CapShape", "CapSurface", "CapColor", "Bruises"),
  drop = FALSE
]
# Report the number of rows and columns of the subset.
dim(Mushroon.Sub)
## [1] 8124 5
The subset has 8124 rows and 5 columns. The first column is the class variable, Edibility.
# Show the first ten rows of the subset.
head(Mushroon.Sub, n = 10)
## Edibility CapShape CapSurface CapColor Bruises
## 1 p x s n t
## 2 e x s y t
## 3 e b s w t
## 4 p x y w t
## 5 e x s g f
## 6 e x y y t
## 7 e b s w t
## 8 e b y w t
## 9 p x y w t
## 10 e b s y t
# Show the last ten rows of the subset.
tail(Mushroon.Sub, n = 10)
## Edibility CapShape CapSurface CapColor Bruises
## 8115 p f y c f
## 8116 e x s n f
## 8117 p k y n f
## 8118 p k s e f
## 8119 p k y n f
## 8120 e k s n f
## 8121 e x s n f
## 8122 e f s n f
## 8123 p k y n f
## 8124 e x s n f
Replace “e” with “Edible” and “p” with “Poisonous” in the class variable.
# Expand the one-letter class codes into readable labels; values other than
# "e" and "p" are left untouched.
Mushroon.Sub$Edibility <- replace(
  Mushroon.Sub$Edibility,
  Mushroon.Sub$Edibility == "e",
  "Edible"
)
Mushroon.Sub$Edibility <- replace(
  Mushroon.Sub$Edibility,
  Mushroon.Sub$Edibility == "p",
  "Poisonous"
)
# List the distinct labels that remain after recoding.
unique(Mushroon.Sub[["Edibility"]])
## [1] "Poisonous" "Edible"
We can see that the abbreviations were successfully replaced.