Loading the data from the source
# Download the raw comma-separated data file into the working directory, then
# read it into `df`. The file is read without a header row, so the columns are
# named V1, V2, ...; text columns are kept as character vectors.
download.file(
  url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
  destfile = 'agaricus-lepiota.data'
)
df <- read.table(
  file = 'agaricus-lepiota.data',
  sep = ',',
  stringsAsFactors = FALSE
)
Understanding the structure of the data in order to compare it with the data dictionary and to map the column names to its headings
# Preview the first six rows of the raw data.
head(df, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Number of rows followed by number of columns.
c(nrow(df), ncol(df))
## [1] 8124 23
# Column names assigned automatically when the file was read.
colnames(df)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"
Getting a subset of the data frame and substituting the column names
# Keep only V2, V3, V4 and V1 (in that order) and give all four columns
# descriptive names in a single assignment.
dfnew <- subset(df, select = c(V2, V3, V4, V1))
names(dfnew) <- c("shape", "surface", "color", "classes")

# Every column is still a character vector at this point.
summary(dfnew)
## shape surface color
## Length:8124 Length:8124 Length:8124
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## classes
## Length:8124
## Class :character
## Mode :character
# Preview the first six rows of the renamed subset.
head(dfnew, n = 6L)
## shape surface color classes
## 1 x s n p
## 2 x s y e
## 3 b s w e
## 4 x y w p
## 5 x s g e
## 6 x y y e
Substituting the single-letter codes with the understandable names defined in the data dictionary
# Translate the single-letter codes of each column into readable names and
# store every recoded column as a factor.
#
# The colour dictionary now also covers the codes "c", "p", "r" and "u",
# which were previously left untranslated as raw one-letter factor levels.

# Replace each code found in `dictionary` with its label.
#
# codes:      vector of codes (character or factor).
# dictionary: named character vector; names are the codes, values the labels.
#
# Codes that are absent from the dictionary, and NA values, are kept
# unchanged so that no observation is silently turned into NA.
# Returns a factor whose levels are sorted alphabetically (as.factor default).
decode_codes <- function(codes, dictionary) {
  codes <- as.character(codes)
  known <- codes %in% names(dictionary)
  codes[known] <- unname(dictionary[codes[known]])
  as.factor(codes)
}

dfnew$classes <- decode_codes(
  dfnew$classes,
  c(e = "edible", p = "poisonous")
)

dfnew$shape <- decode_codes(
  dfnew$shape,
  c(
    b = "bell", c = "conical", f = "flat",
    k = "knobbed", s = "sunken", x = "convex"
  )
)

dfnew$surface <- decode_codes(
  dfnew$surface,
  c(f = "fibrous", g = "grooves", s = "smooth", y = "scaly")
)

dfnew$color <- decode_codes(
  dfnew$color,
  c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  )
)
Then display a summary of the new data frame
# Summarise the recoded columns. With maxsum = 7 (the default), a factor with
# more than seven levels shows its six most frequent levels and collapses the
# remainder into "(Other)".
summary(dfnew, maxsum = 7L)
## shape surface color classes
## bell : 452 fibrous:2320 brown :2284 edible :4208
## conical: 4 grooves: 4 gray :1840 poisonous:3916
## convex :3656 scaly :3244 red :1500
## flat :3152 smooth :2556 yellow :1072
## knobbed: 828 white :1040
## sunken : 32 buff : 168
## (Other): 220