Assignment – Loading Data into a Data Frame
# Remote source of the data: a comma-separated text file, one record per line.
mashroom_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# To load from a local copy instead, point mashroom_url at it
# (adjust `header` below if the local copy has a header row):
#mashroom_url <- "data/mushrooms.csv"
# Load the data frame.
# - header = FALSE: the remote file has no header row. With header = TRUE the
#   first record is consumed as column names, which is why the output recorded
#   in this document shows names such as p, x, s, n and only 8123 rows.
#   Re-running now yields default names V1, V2, ... and one more row.
# - stringsAsFactors = TRUE: later steps relabel values through levels(),
#   which requires factor columns; since R 4.0.0 read.table() no longer
#   converts strings to factors by default.
df_mashroom <- read.table(
  file = mashroom_url,
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)
# Make sure the data is loaded
head(df_mashroom)
## p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 e x s y t a f c b k e c s s w w p w o p n n g
## 2 e b s w t l f c b n e c s s w w p w o p n n m
## 3 p x y w t p f c n n e e s s w w p w o p k s u
## 4 e x s g f n f w b k t e s s w w p w o e n a g
## 5 e x y y t a f c b n e c s s w w p w o p k n g
## 6 e b s w t a f c b g e c s s w w p w o p k n m
# View a per-column summary of the raw data; the recorded output lists how
# often each distinct value occurs in every column.
summary(df_mashroom)
## p x s n t p.1
## e:4208 b: 452 f:2320 n :2283 f:4748 n :3528
## p:3915 c: 4 g: 4 g :1840 t:3375 f :2160
## f:3152 s:2555 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3655 b : 168 l : 400
## (Other): 220 (Other): 483
## f c n.1 k e e.1 s.1
## a: 210 c:6811 b:5612 b :1728 e:3515 ?:2480 f: 552
## f:7913 w:1312 n:2511 p :1492 t:4608 b:3776 k:2372
## w :1202 c: 556 s:5175
## n :1048 e:1119 y: 24
## g : 752 r: 192
## h : 732
## (Other):1169
## s.2 w w.1 p.2 w.2 o
## f: 600 w :4463 w :4383 p:8123 n: 96 n: 36
## k:2304 p :1872 p :1872 o: 96 o:7487
## s:4935 g : 576 g : 576 w:7923 t: 600
## y: 284 n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## p.3 k.1 s.3 u
## e:2776 w :2388 a: 384 d:3148
## f: 48 n :1968 c: 340 g:2148
## l:1296 k :1871 n: 400 l: 832
## n: 36 h :1632 s:1247 m: 292
## p:3967 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 367
## (Other): 144 w: 192
# Count the number of rows in the loaded data frame
nrow(df_mashroom)
## [1] 8123
# Check for NA values by counting the rows that have no NA in any column;
# if the count equals the number of rows, nothing is NA.
# NOTE(review): the summary recorded above shows a "?" value (2480 rows) in
# one column. If "?" marks missing data it is not detected here, because the
# load step does not map "?" to NA -- confirm.
sum(complete.cases(df_mashroom))
## [1] 8123
As we can see, no rows contain NA values. Let's keep a few of the columns and rename them to make the data easier to read.
# Keep the first four columns plus the sixth one. The sixth column is handed
# to cbind() as a bare vector, so it is labelled with the expression text
# until the columns are renamed.
df_mashroom_new <- cbind(df_mashroom[seq_len(4)], df_mashroom[, 6])
head(df_mashroom_new, n = 6)
## p x s n df_mashroom[, 6]
## 1 e x s y a
## 2 e b s w l
## 3 p x y w p
## 4 e x s g n
## 5 e x y y a
## 6 e b s w a
# Show the current column names
colnames(df_mashroom_new)
## [1] "p" "x" "s"
## [4] "n" "df_mashroom[, 6]"
# Give the five selected columns descriptive names
names(df_mashroom_new) <- c(
  "edible_poisonous", "shape", "surface", "color", "odor"
)
# A single column can also be renamed by position, like this:
#colnames(df_mashroom)[2] <- "newname"
head(df_mashroom_new)
## edible_poisonous shape surface color odor
## 1 e x s y a
## 2 e b s w l
## 3 p x y w p
## 4 e x s g n
## 5 e x y y a
## 6 e b s w a
Example of extracting numerical value from a column
# Example of deriving numeric 0/1 indicator columns from the class column.
# match() returns 1 for "e", 2 for "p" and NA for anything else; that
# position then picks the indicator value (NA stays NA).
df_mashroom_new$IsEdible <- c(1, 0)[
  match(df_mashroom_new$edible_poisonous, c("e", "p"))
]
df_mashroom_new$IsPoisonous <- c(0, 1)[
  match(df_mashroom_new$edible_poisonous, c("e", "p"))
]
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 e x s y a 1 0
## 2 e b s w l 1 0
## 3 p x y w p 0 1
## 4 e x s g n 1 0
## 5 e x y y a 1 0
## 6 e b s w a 1 0
# Replace the single-letter class codes with readable labels. The levels are
# relabelled in place through a lookup table, so their order is unchanged and
# codes without an entry are left as they are. local() keeps the helper
# variables out of the workspace.
levels(df_mashroom_new$edible_poisonous) <- local({
  class_labels <- c(e = "edible", p = "poisonous")
  current <- levels(df_mashroom_new$edible_poisonous)
  known <- current %in% names(class_labels)
  current[known] <- unname(class_labels[current[known]])
  current
})
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible x s y a 1 0
## 2 edible b s w l 1 0
## 3 poisonous x y w p 0 1
## 4 edible x s g n 1 0
## 5 edible x y y a 1 0
## 6 edible b s w a 1 0
# Replace the single-letter shape codes with readable labels. The levels are
# relabelled in place through a lookup table, so their order is unchanged and
# codes without an entry are left as they are. local() keeps the helper
# variables out of the workspace.
levels(df_mashroom_new$shape) <- local({
  shape_labels <- c(
    b = "bell",
    c = "conical",
    x = "convex",
    f = "flat",
    k = "knobbed",
    s = "sunken"
  )
  current <- levels(df_mashroom_new$shape)
  known <- current %in% names(shape_labels)
  current[known] <- unname(shape_labels[current[known]])
  current
})
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible convex s y a 1 0
## 2 edible bell s w l 1 0
## 3 poisonous convex y w p 0 1
## 4 edible convex s g n 1 0
## 5 edible convex y y a 1 0
## 6 edible bell s w a 1 0
# Replace the single-letter surface codes with readable labels. The levels are
# relabelled in place through a lookup table, so their order is unchanged and
# codes without an entry are left as they are. local() keeps the helper
# variables out of the workspace.
levels(df_mashroom_new$surface) <- local({
  surface_labels <- c(
    f = "fibrous",
    g = "grooves",
    y = "scaly",
    s = "smooth"
  )
  current <- levels(df_mashroom_new$surface)
  known <- current %in% names(surface_labels)
  current[known] <- unname(surface_labels[current[known]])
  current
})
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible convex smooth y a 1 0
## 2 edible bell smooth w l 1 0
## 3 poisonous convex scaly w p 0 1
## 4 edible convex smooth g n 1 0
## 5 edible convex scaly y a 1 0
## 6 edible bell smooth w a 1 0
# Replace the single-letter color codes with readable labels. The levels are
# relabelled in place through a lookup table, so their order is unchanged and
# codes without an entry are left as they are. local() keeps the helper
# variables out of the workspace.
levels(df_mashroom_new$color) <- local({
  color_labels <- c(
    n = "brown",
    b = "buff",
    c = "cinnamon",
    g = "gray",
    r = "green",
    p = "pink",
    u = "purple",
    e = "red",
    w = "white",
    y = "yellow"
  )
  current <- levels(df_mashroom_new$color)
  known <- current %in% names(color_labels)
  current[known] <- unname(color_labels[current[known]])
  current
})
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible convex smooth yellow a 1 0
## 2 edible bell smooth white l 1 0
## 3 poisonous convex scaly white p 0 1
## 4 edible convex smooth gray n 1 0
## 5 edible convex scaly yellow a 1 0
## 6 edible bell smooth white a 1 0
# Replace the single-letter odor codes with readable labels. The levels are
# relabelled in place through a lookup table, so their order is unchanged and
# codes without an entry are left as they are. local() keeps the helper
# variables out of the workspace.
levels(df_mashroom_new$odor) <- local({
  odor_labels <- c(
    a = "almond",
    l = "anise",
    c = "creosote",
    y = "fishy",
    f = "foul",
    m = "musty",
    n = "none",
    p = "pungent",
    s = "spicy"
  )
  current <- levels(df_mashroom_new$odor)
  known <- current %in% names(odor_labels)
  current[known] <- unname(odor_labels[current[known]])
  current
})
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible convex smooth yellow almond 1 0
## 2 edible bell smooth white anise 1 0
## 3 poisonous convex scaly white pungent 0 1
## 4 edible convex smooth gray none 1 0
## 5 edible convex scaly yellow almond 1 0
## 6 edible bell smooth white almond 1 0
Check the levels
# Print the levels of each recoded column to confirm that every single-letter
# code was replaced. As the recorded output shows, the labels keep the position
# of the code they replaced (e.g. "convex" stays last, where "x" was) rather
# than being re-sorted alphabetically.
levels(df_mashroom_new$edible_poisonous)
## [1] "edible" "poisonous"
levels(df_mashroom_new$shape)
## [1] "bell" "conical" "flat" "knobbed" "sunken" "convex"
levels(df_mashroom_new$surface)
## [1] "fibrous" "grooves" "smooth" "scaly"
levels(df_mashroom_new$color)
## [1] "buff" "cinnamon" "red" "gray" "brown" "pink"
## [7] "green" "purple" "white" "yellow"
levels(df_mashroom_new$odor)
## [1] "almond" "creosote" "foul" "anise" "musty" "none"
## [7] "pungent" "spicy" "fishy"
After cleaning up, let's look at the summary
# Per-column summary of the cleaned-up data frame: the recorded output shows
# counts per label for the recoded columns and min/quartiles/mean/max for the
# two 0/1 indicator columns.
summary(df_mashroom_new)
## edible_poisonous shape surface color
## edible :4208 bell : 452 fibrous:2320 brown :2283
## poisonous:3915 conical: 4 grooves: 4 gray :1840
## flat :3152 smooth :2555 red :1500
## knobbed: 828 scaly :3244 yellow :1072
## sunken : 32 white :1040
## convex :3655 buff : 168
## (Other): 220
## odor IsEdible IsPoisonous
## none :3528 Min. :0.000 Min. :0.000
## foul :2160 1st Qu.:0.000 1st Qu.:0.000
## spicy : 576 Median :1.000 Median :0.000
## fishy : 576 Mean :0.518 Mean :0.482
## almond : 400 3rd Qu.:1.000 3rd Qu.:1.000
## anise : 400 Max. :1.000 Max. :1.000
## (Other): 483
Plot each feature
# Draw one plot per cleaned-up column, in column order. plot() is applied to
# each column in turn; invisible() suppresses printing of the values that
# plot() returns.
invisible(lapply(
  df_mashroom_new[c("edible_poisonous", "shape", "surface", "color", "odor")],
  plot
))

Final cleaned-up dataset
# Show the first rows of the finished data frame: five relabelled columns plus
# the two numeric indicator columns.
head(df_mashroom_new)
## edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1 edible convex smooth yellow almond 1 0
## 2 edible bell smooth white anise 1 0
## 3 poisonous convex scaly white pungent 0 1
## 4 edible convex smooth gray none 1 0
## 5 edible convex scaly yellow almond 1 0
## 6 edible bell smooth white almond 1 0