Assignment – Loading Data into a Data Frame

# Load from remote (UCI Machine Learning Repository)
mashroom_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Load from local
# NOTE(review): confirm whether the local copy starts with a header row; if it
# does, switch `header` back to TRUE when reading it.
#mashroom_url <- "data/mushrooms.csv"

# Load dataframe
# The remote file has no header row. With header = TRUE the first observation
# was consumed as column names (p, x, s, n, ...) and dropped from the data,
# leaving 8123 rows instead of 8124. header = FALSE keeps every row and gives
# the columns the default names V1, V2, ...; the later steps select columns by
# position, so they are unaffected.
# stringsAsFactors = TRUE is explicit because the levels(), summary() and
# plot() steps further down rely on factor columns, and R >= 4.0 no longer
# converts strings to factors by default.
# NOTE: the printed output shown below in this document was captured with the
# old call (header = TRUE) and will shift by one row when the script is re-run.
df_mashroom <- read.table(
  file = mashroom_url,
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Peek at the first six rows to confirm the data was loaded
head(df_mashroom, n = 6L)
##   p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 e x s y t   a f c   b k e   c   s   s w   w   p   w o   p   n   n g
## 2 e b s w t   l f c   b n e   c   s   s w   w   p   w o   p   n   n m
## 3 p x y w t   p f c   n n e   e   s   s w   w   p   w o   p   k   s u
## 4 e x s g f   n f w   b k t   e   s   s w   w   p   w o   e   n   a g
## 5 e x y y t   a f c   b n e   c   s   s w   w   p   w o   p   k   n g
## 6 e b s w t   a f c   b g e   c   s   s w   w   p   w o   p   k   n m
# Per-column summary of the raw data. Each factor column lists at most seven
# entries; less frequent levels are pooled under "(Other)".
summary(df_mashroom, maxsum = 7L)
##  p        x        s              n        t             p.1      
##  e:4208   b: 452   f:2320   n      :2283   f:4748   n      :3528  
##  p:3915   c:   4   g:   4   g      :1840   t:3375   f      :2160  
##           f:3152   s:2555   e      :1500            s      : 576  
##           k: 828   y:3244   y      :1072            y      : 576  
##           s:  32            w      :1040            a      : 400  
##           x:3655            b      : 168            l      : 400  
##                             (Other): 220            (Other): 483  
##  f        c        n.1            k        e        e.1      s.1     
##  a: 210   c:6811   b:5612   b      :1728   e:3515   ?:2480   f: 552  
##  f:7913   w:1312   n:2511   p      :1492   t:4608   b:3776   k:2372  
##                             w      :1202            c: 556   s:5175  
##                             n      :1048            e:1119   y:  24  
##                             g      : 752            r: 192           
##                             h      : 732                             
##                             (Other):1169                             
##  s.2            w             w.1       p.2      w.2      o       
##  f: 600   w      :4463   w      :4383   p:8123   n:  96   n:  36  
##  k:2304   p      :1872   p      :1872            o:  96   o:7487  
##  s:4935   g      : 576   g      : 576            w:7923   t: 600  
##  y: 284   n      : 448   n      : 512            y:   8           
##           b      : 432   b      : 432                             
##           o      : 192   o      : 192                             
##           (Other): 140   (Other): 156                             
##  p.3           k.1       s.3      u       
##  e:2776   w      :2388   a: 384   d:3148  
##  f:  48   n      :1968   c: 340   g:2148  
##  l:1296   k      :1871   n: 400   l: 832  
##  n:  36   h      :1632   s:1247   m: 292  
##  p:3967   r      :  72   v:4040   p:1144  
##           b      :  48   y:1712   u: 367  
##           (Other): 144            w: 192
# Count the number of rows
NROW(df_mashroom)
## [1] 8123
# Count the rows that contain no NA value
NROW(na.omit(df_mashroom))
## [1] 8123
# na.omit() only recognises real NA values. The summary above shows a "?" level
# (2480 rows in the 12th column), which is how this dataset marks missing
# entries, so count those markers per column as well.
colSums(df_mashroom == "?", na.rm = TRUE)

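As we can see, no row contains an NA value (note, however, that the summary above shows a "?" level in column e.1, which appears to mark missing entries). Let's keep a subset of the columns and rename them to make the data easier to read.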

# Keep columns 1-4 and column 6 of the raw data. Column 6 is passed as a bare
# vector, so until the columns are renamed it is labelled with the expression
# that produced it; check.names = FALSE keeps that label untouched.
df_mashroom_new <- data.frame(
  df_mashroom[, 1:4],
  df_mashroom[, 6],
  check.names = FALSE
)
head(df_mashroom_new, n = 6L)
##   p x s n df_mashroom[, 6]
## 1 e x s y                a
## 2 e b s w                l
## 3 p x y w                p
## 4 e x s g                n
## 5 e x y y                a
## 6 e b s w                a
# Read the current column names
colnames(df_mashroom_new)
## [1] "p"                "x"                "s"               
## [4] "n"                "df_mashroom[, 6]"
# Give the five selected columns descriptive names, in column order
names(df_mashroom_new) <- c(
  "edible_poisonous",
  "shape",
  "surface",
  "color",
  "odor"
)

# A single column can also be renamed by position, like this
#colnames(df_mashroom)[2] <- "newname"

head(df_mashroom_new, n = 6L)
##   edible_poisonous shape surface color odor
## 1                e     x       s     y    a
## 2                e     b       s     w    l
## 3                p     x       y     w    p
## 4                e     x       s     g    n
## 5                e     x       y     y    a
## 6                e     b       s     w    a

Example of deriving numeric indicator columns from a categorical column

# Example of deriving numeric 0/1 indicator columns from the class column.
# Each class code is looked up in a small named vector; a code other than
# "e" or "p" yields NA in both columns.
df_mashroom_new$IsEdible <- unname(
  c(e = 1, p = 0)[as.character(df_mashroom_new$edible_poisonous)]
)

df_mashroom_new$IsPoisonous <- unname(
  c(e = 0, p = 1)[as.character(df_mashroom_new$edible_poisonous)]
)

head(df_mashroom_new, n = 6L)
##   edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1                e     x       s     y    a        1           0
## 2                e     b       s     w    l        1           0
## 3                p     x       y     w    p        0           1
## 4                e     x       s     g    n        1           0
## 5                e     x       y     y    a        1           0
## 6                e     b       s     w    a        1           0
# Replace the one-letter class codes with readable labels. Each level keeps its
# position; levels without an entry in the lookup table are left unchanged.
levels(df_mashroom_new$edible_poisonous) <- local({
  class_labels <- c(e = "edible", p = "poisonous")
  codes <- levels(df_mashroom_new$edible_poisonous)
  known <- codes %in% names(class_labels)
  codes[known] <- class_labels[codes[known]]
  codes
})

head(df_mashroom_new, n = 6L)
##   edible_poisonous shape surface color odor IsEdible IsPoisonous
## 1           edible     x       s     y    a        1           0
## 2           edible     b       s     w    l        1           0
## 3        poisonous     x       y     w    p        0           1
## 4           edible     x       s     g    n        1           0
## 5           edible     x       y     y    a        1           0
## 6           edible     b       s     w    a        1           0
# Replace the one-letter shape codes with readable labels. Each level keeps its
# position; levels without an entry in the lookup table are left unchanged.
levels(df_mashroom_new$shape) <- local({
  shape_labels <- c(
    b = "bell",
    c = "conical",
    x = "convex",
    f = "flat",
    k = "knobbed",
    s = "sunken"
  )
  codes <- levels(df_mashroom_new$shape)
  known <- codes %in% names(shape_labels)
  codes[known] <- shape_labels[codes[known]]
  codes
})

head(df_mashroom_new, n = 6L)
##   edible_poisonous  shape surface color odor IsEdible IsPoisonous
## 1           edible convex       s     y    a        1           0
## 2           edible   bell       s     w    l        1           0
## 3        poisonous convex       y     w    p        0           1
## 4           edible convex       s     g    n        1           0
## 5           edible convex       y     y    a        1           0
## 6           edible   bell       s     w    a        1           0
# Replace the one-letter surface codes with readable labels. Each level keeps
# its position; levels without an entry in the lookup table are left unchanged.
levels(df_mashroom_new$surface) <- local({
  surface_labels <- c(
    f = "fibrous",
    g = "grooves",
    y = "scaly",
    s = "smooth"
  )
  codes <- levels(df_mashroom_new$surface)
  known <- codes %in% names(surface_labels)
  codes[known] <- surface_labels[codes[known]]
  codes
})

head(df_mashroom_new, n = 6L)
##   edible_poisonous  shape surface color odor IsEdible IsPoisonous
## 1           edible convex  smooth     y    a        1           0
## 2           edible   bell  smooth     w    l        1           0
## 3        poisonous convex   scaly     w    p        0           1
## 4           edible convex  smooth     g    n        1           0
## 5           edible convex   scaly     y    a        1           0
## 6           edible   bell  smooth     w    a        1           0
# Replace the one-letter color codes with readable labels. Each level keeps its
# position; levels without an entry in the lookup table are left unchanged.
levels(df_mashroom_new$color) <- local({
  color_labels <- c(
    n = "brown",
    b = "buff",
    c = "cinnamon",
    g = "gray",
    r = "green",
    p = "pink",
    u = "purple",
    e = "red",
    w = "white",
    y = "yellow"
  )
  codes <- levels(df_mashroom_new$color)
  known <- codes %in% names(color_labels)
  codes[known] <- color_labels[codes[known]]
  codes
})

head(df_mashroom_new, n = 6L)
##   edible_poisonous  shape surface  color odor IsEdible IsPoisonous
## 1           edible convex  smooth yellow    a        1           0
## 2           edible   bell  smooth  white    l        1           0
## 3        poisonous convex   scaly  white    p        0           1
## 4           edible convex  smooth   gray    n        1           0
## 5           edible convex   scaly yellow    a        1           0
## 6           edible   bell  smooth  white    a        1           0
# Replace the one-letter odor codes with readable labels. Each level keeps its
# position; levels without an entry in the lookup table are left unchanged.
levels(df_mashroom_new$odor) <- local({
  odor_labels <- c(
    a = "almond",
    l = "anise",
    c = "creosote",
    y = "fishy",
    f = "foul",
    m = "musty",
    n = "none",
    p = "pungent",
    s = "spicy"
  )
  codes <- levels(df_mashroom_new$odor)
  known <- codes %in% names(odor_labels)
  codes[known] <- odor_labels[codes[known]]
  codes
})

head(df_mashroom_new, n = 6L)
##   edible_poisonous  shape surface  color    odor IsEdible IsPoisonous
## 1           edible convex  smooth yellow  almond        1           0
## 2           edible   bell  smooth  white   anise        1           0
## 3        poisonous convex   scaly  white pungent        0           1
## 4           edible convex  smooth   gray    none        1           0
## 5           edible convex   scaly yellow  almond        1           0
## 6           edible   bell  smooth  white  almond        1           0

Check the levels

# Print the levels of each relabelled factor to confirm that no one-letter code
# is left. The levels were renamed in place, so they keep the order of the
# original codes rather than the alphabetical order of the new labels
# (e.g. "convex" is last in shape because its code was "x").
levels(df_mashroom_new$edible_poisonous)
## [1] "edible"    "poisonous"
levels(df_mashroom_new$shape)
## [1] "bell"    "conical" "flat"    "knobbed" "sunken"  "convex"
levels(df_mashroom_new$surface)
## [1] "fibrous" "grooves" "smooth"  "scaly"
levels(df_mashroom_new$color)
##  [1] "buff"     "cinnamon" "red"      "gray"     "brown"    "pink"    
##  [7] "green"    "purple"   "white"    "yellow"
levels(df_mashroom_new$odor)
## [1] "almond"   "creosote" "foul"     "anise"    "musty"    "none"    
## [7] "pungent"  "spicy"    "fishy"

After cleaning up, let's look at the summary

# Summary of the cleaned data: level counts for the factor columns (at most
# seven entries each, the rest pooled under "(Other)") and quartiles for the
# numeric indicator columns.
summary(df_mashroom_new, maxsum = 7L)
##   edible_poisonous     shape         surface         color     
##  edible   :4208    bell   : 452   fibrous:2320   brown  :2283  
##  poisonous:3915    conical:   4   grooves:   4   gray   :1840  
##                    flat   :3152   smooth :2555   red    :1500  
##                    knobbed: 828   scaly  :3244   yellow :1072  
##                    sunken :  32                  white  :1040  
##                    convex :3655                  buff   : 168  
##                                                  (Other): 220  
##       odor         IsEdible      IsPoisonous   
##  none   :3528   Min.   :0.000   Min.   :0.000  
##  foul   :2160   1st Qu.:0.000   1st Qu.:0.000  
##  spicy  : 576   Median :1.000   Median :0.000  
##  fishy  : 576   Mean   :0.518   Mean   :0.482  
##  almond : 400   3rd Qu.:1.000   3rd Qu.:1.000  
##  anise  : 400   Max.   :1.000   Max.   :1.000  
##  (Other): 483

Plot each feature

# Draw one plot per categorical feature, in column order. invisible() keeps the
# list of plot() return values from being printed.
invisible(lapply(
  df_mashroom_new[c("edible_poisonous", "shape", "surface", "color", "odor")],
  plot
))

Final cleaned up dataset

# First six rows of the cleaned dataset
head(df_mashroom_new, n = 6L)
##   edible_poisonous  shape surface  color    odor IsEdible IsPoisonous
## 1           edible convex  smooth yellow  almond        1           0
## 2           edible   bell  smooth  white   anise        1           0
## 3        poisonous convex   scaly  white pungent        0           1
## 4           edible convex  smooth   gray    none        1           0
## 5           edible convex   scaly yellow  almond        1           0
## 6           edible   bell  smooth  white  almond        1           0