Library

library(stringr)

Download the attribute definitions (column headers and their value codes) from my GitHub repository

# Fetch the attribute-definition file as a character vector (one element per
# line) and display it. warn = FALSE silences readLines() warnings such as a
# missing final newline.
mushroom_props <- readLines(
  con = "https://raw.githubusercontent.com/Sirwel/data607-week1/master/mushroom_names.txt",
  warn = FALSE
)
print(mushroom_props)
##  [1] "class:edible=e,poisonous=p"                                                                                  
##  [2] "cap-shape:bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s"                                               
##  [3] "cap-surface:fibrous=f,grooves=g,scaly=y,smooth=s"                                                            
##  [4] "cap-color:brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y"                   
##  [5] "bruises:bruises=t,no=f"                                                                                      
##  [6] "odor:almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s"                            
##  [7] "gill-attachment:attached=a,descending=d,free=f,notched=n"                                                    
##  [8] "gill-spacing:close=c,crowded=w,distant=d"                                                                    
##  [9] "gill-size:broad=b,narrow=n"                                                                                  
## [10] "gill-color:black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y"
## [11] "stalk-shape:enlarging=e,tapering=t"                                                                          
## [12] "stalk-root:bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?"                                  
## [13] "stalk-surface-above-ring:fibrous=f,scaly=y,silky=k,smooth=s"                                                 
## [14] "stalk-surface-below-ring:fibrous=f,scaly=y,silky=k,smooth=s"                                                 
## [15] "stalk-color-above-ring:brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"              
## [16] "stalk-color-below-ring:brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"              
## [17] "veil-type:partial=p,universal=u"                                                                             
## [18] "veil-color:brown=n,orange=o,white=w,yellow=y"                                                                
## [19] "ring-number:none=n,one=o,two=t"                                                                              
## [20] "ring-type:cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z"                     
## [21] "spore-print-color:black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y"             
## [22] "population:abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y"                               
## [23] "habitat:grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d  "

Extract headers from the file that defines the dataset’s properties

# Pull the header out of every definition line. The pattern matches from the
# first letter through the last ":" on the line, so each header keeps its
# trailing colon. str_extract_all() returns a list with one element per line;
# noquote() only changes how that list is printed.
mushroom_headers <- noquote(
  str_extract_all(string = mushroom_props, pattern = "[a-zA-Z].+:")
)
print(mushroom_headers)
## [[1]]
## [1] class:
## 
## [[2]]
## [1] cap-shape:
## 
## [[3]]
## [1] cap-surface:
## 
## [[4]]
## [1] cap-color:
## 
## [[5]]
## [1] bruises:
## 
## [[6]]
## [1] odor:
## 
## [[7]]
## [1] gill-attachment:
## 
## [[8]]
## [1] gill-spacing:
## 
## [[9]]
## [1] gill-size:
## 
## [[10]]
## [1] gill-color:
## 
## [[11]]
## [1] stalk-shape:
## 
## [[12]]
## [1] stalk-root:
## 
## [[13]]
## [1] stalk-surface-above-ring:
## 
## [[14]]
## [1] stalk-surface-below-ring:
## 
## [[15]]
## [1] stalk-color-above-ring:
## 
## [[16]]
## [1] stalk-color-below-ring:
## 
## [[17]]
## [1] veil-type:
## 
## [[18]]
## [1] veil-color:
## 
## [[19]]
## [1] ring-number:
## 
## [[20]]
## [1] ring-type:
## 
## [[21]]
## [1] spore-print-color:
## 
## [[22]]
## [1] population:
## 
## [[23]]
## [1] habitat:

Load the mushroom data frame from my GitHub repository

# Read the raw records. header = FALSE treats the first line as data, so the
# column names are taken from mushroom_headers instead; check.names = FALSE
# keeps those names exactly as extracted (hyphens and trailing ":" included).
mushroom_dt <- read.csv(
  file = "https://raw.githubusercontent.com/Sirwel/data607-week1/master/mushroom.csv",
  header = FALSE,
  col.names = mushroom_headers,
  stringsAsFactors = FALSE,
  check.names = FALSE
)

Unprocessed Mushroom Dataframe

# Show the first 30 rows of the still-encoded data frame.
print(head(mushroom_dt, n = 30))
##    class: cap-shape: cap-surface: cap-color: bruises: odor:
## 1       p          x            s          n        t     p
## 2       e          x            s          y        t     a
## 3       e          b            s          w        t     l
## 4       p          x            y          w        t     p
## 5       e          x            s          g        f     n
## 6       e          x            y          y        t     a
## 7       e          b            s          w        t     a
## 8       e          b            y          w        t     l
## 9       p          x            y          w        t     p
## 10      e          b            s          y        t     a
## 11      e          x            y          y        t     l
## 12      e          x            y          y        t     a
## 13      e          b            s          y        t     a
## 14      p          x            y          w        t     p
## 15      e          x            f          n        f     n
## 16      e          s            f          g        f     n
## 17      e          f            f          w        f     n
## 18      p          x            s          n        t     p
## 19      p          x            y          w        t     p
## 20      p          x            s          n        t     p
## 21      e          b            s          y        t     a
## 22      p          x            y          n        t     p
## 23      e          b            y          y        t     l
## 24      e          b            y          w        t     a
## 25      e          b            s          w        t     l
## 26      p          f            s          w        t     p
## 27      e          x            y          y        t     a
## 28      e          x            y          w        t     l
## 29      e          f            f          n        f     n
## 30      e          x            s          y        t     a
##    gill-attachment: gill-spacing: gill-size: gill-color: stalk-shape:
## 1                 f             c          n           k            e
## 2                 f             c          b           k            e
## 3                 f             c          b           n            e
## 4                 f             c          n           n            e
## 5                 f             w          b           k            t
## 6                 f             c          b           n            e
## 7                 f             c          b           g            e
## 8                 f             c          b           n            e
## 9                 f             c          n           p            e
## 10                f             c          b           g            e
## 11                f             c          b           g            e
## 12                f             c          b           n            e
## 13                f             c          b           w            e
## 14                f             c          n           k            e
## 15                f             w          b           n            t
## 16                f             c          n           k            e
## 17                f             w          b           k            t
## 18                f             c          n           n            e
## 19                f             c          n           n            e
## 20                f             c          n           k            e
## 21                f             c          b           k            e
## 22                f             c          n           n            e
## 23                f             c          b           k            e
## 24                f             c          b           w            e
## 25                f             c          b           g            e
## 26                f             c          n           n            e
## 27                f             c          b           n            e
## 28                f             c          b           w            e
## 29                f             c          n           k            e
## 30                f             w          n           n            t
##    stalk-root: stalk-surface-above-ring: stalk-surface-below-ring:
## 1            e                         s                         s
## 2            c                         s                         s
## 3            c                         s                         s
## 4            e                         s                         s
## 5            e                         s                         s
## 6            c                         s                         s
## 7            c                         s                         s
## 8            c                         s                         s
## 9            e                         s                         s
## 10           c                         s                         s
## 11           c                         s                         s
## 12           c                         s                         s
## 13           c                         s                         s
## 14           e                         s                         s
## 15           e                         s                         f
## 16           e                         s                         s
## 17           e                         s                         s
## 18           e                         s                         s
## 19           e                         s                         s
## 20           e                         s                         s
## 21           c                         s                         s
## 22           e                         s                         s
## 23           c                         s                         s
## 24           c                         s                         s
## 25           c                         s                         s
## 26           e                         s                         s
## 27           c                         s                         s
## 28           c                         s                         s
## 29           e                         s                         s
## 30           b                         s                         s
##    stalk-color-above-ring: stalk-color-below-ring: veil-type: veil-color:
## 1                        w                       w          p           w
## 2                        w                       w          p           w
## 3                        w                       w          p           w
## 4                        w                       w          p           w
## 5                        w                       w          p           w
## 6                        w                       w          p           w
## 7                        w                       w          p           w
## 8                        w                       w          p           w
## 9                        w                       w          p           w
## 10                       w                       w          p           w
## 11                       w                       w          p           w
## 12                       w                       w          p           w
## 13                       w                       w          p           w
## 14                       w                       w          p           w
## 15                       w                       w          p           w
## 16                       w                       w          p           w
## 17                       w                       w          p           w
## 18                       w                       w          p           w
## 19                       w                       w          p           w
## 20                       w                       w          p           w
## 21                       w                       w          p           w
## 22                       w                       w          p           w
## 23                       w                       w          p           w
## 24                       w                       w          p           w
## 25                       w                       w          p           w
## 26                       w                       w          p           w
## 27                       w                       w          p           w
## 28                       w                       w          p           w
## 29                       w                       w          p           w
## 30                       w                       w          p           w
##    ring-number: ring-type: spore-print-color: population: habitat:
## 1             o          p                  k           s        u
## 2             o          p                  n           n        g
## 3             o          p                  n           n        m
## 4             o          p                  k           s        u
## 5             o          e                  n           a        g
## 6             o          p                  k           n        g
## 7             o          p                  k           n        m
## 8             o          p                  n           s        m
## 9             o          p                  k           v        g
## 10            o          p                  k           s        m
## 11            o          p                  n           n        g
## 12            o          p                  k           s        m
## 13            o          p                  n           s        g
## 14            o          p                  n           v        u
## 15            o          e                  k           a        g
## 16            o          p                  n           y        u
## 17            o          e                  n           a        g
## 18            o          p                  k           s        g
## 19            o          p                  n           s        u
## 20            o          p                  n           s        u
## 21            o          p                  n           s        m
## 22            o          p                  n           v        g
## 23            o          p                  n           s        m
## 24            o          p                  n           n        m
## 25            o          p                  k           s        m
## 26            o          p                  n           v        g
## 27            o          p                  n           n        m
## 28            o          p                  n           n        m
## 29            o          p                  k           y        u
## 30            o          p                  n           v        d

Dimensions

# Report the number of rows and columns that were read.
print(dim(mushroom_dt))
## [1] 8124   23

Data Processing

This fragment uses regular expressions to read each field’s label/code pairs from the properties file, then replaces the single-letter codes in the data frame with their readable labels:

# Decode the data frame: replace every one-letter code with its descriptive
# label.
#
# Each line of mushroom_props has the form
#   "<header>:<label>=<code>,<label>=<code>,..."
# For every column we locate the line that defines it, parse its label/code
# pairs into a lookup table once, and translate the whole column in a single
# vectorised step. Parsing the definition line (instead of pasting each data
# value into a regular expression) means values that are regex metacharacters
# or longer than one character can never produce a wrong or invalid pattern.
for (col_pos in seq_along(mushroom_headers)) {
  header <- as.character(mushroom_headers[[col_pos]])

  # The header is literal text, so match it as a fixed string rather than as a
  # regular expression.
  definition <- grep(header, mushroom_props, value = TRUE, fixed = TRUE)
  if (length(definition) != 1L) {
    stop(
      "Expected exactly one definition line for '", header, "' but found ",
      length(definition),
      call. = FALSE
    )
  }

  # Column 2 of the match matrix holds the labels, column 3 the codes. When a
  # code is defined twice, name-based indexing below uses the first definition.
  pairs <- str_match_all(definition, "([a-z]+)=(.)")[[1]]
  code_labels <- setNames(pairs[, 2], pairs[, 3])

  column <- mushroom_dt[, header]
  codes <- as.character(column)
  labels <- unname(code_labels[codes])
  known <- !is.na(labels)

  # Write back only when something was translated, so a column without any
  # defined code is left completely untouched.
  if (any(known)) {
    column[known] <- labels[known]
    mushroom_dt[[header]] <- column
  }

  # Codes that have no definition stay as they are and are reported by name.
  unknown <- sort(unique(codes[!known & !is.na(codes)]))
  if (length(unknown) > 0L) {
    warning(
      "No label defined for '", header, "' code(s): ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
}

Dataframe Subset:

# Keep five of the decoded columns, selected by position: class, cap shape,
# cap colour, veil colour and spore print colour. data.frame() is called with
# its default check.names = TRUE, so the hyphens and colons in the column
# names are turned into dots.
mushroom_subset <- data.frame(mushroom_dt[c(1, 2, 4, 18, 21)])

print(head(mushroom_subset, n = 50))
##       class. cap.shape. cap.color. veil.color. spore.print.color.
## 1  poisonous     convex      brown       white              black
## 2     edible     convex     yellow       white              brown
## 3     edible       bell      white       white              brown
## 4  poisonous     convex      white       white              black
## 5     edible     convex       gray       white              brown
## 6     edible     convex     yellow       white              black
## 7     edible       bell      white       white              black
## 8     edible       bell      white       white              brown
## 9  poisonous     convex      white       white              black
## 10    edible       bell     yellow       white              black
## 11    edible     convex     yellow       white              brown
## 12    edible     convex     yellow       white              black
## 13    edible       bell     yellow       white              brown
## 14 poisonous     convex      white       white              brown
## 15    edible     convex      brown       white              black
## 16    edible     sunken       gray       white              brown
## 17    edible       flat      white       white              brown
## 18 poisonous     convex      brown       white              black
## 19 poisonous     convex      white       white              brown
## 20 poisonous     convex      brown       white              brown
## 21    edible       bell     yellow       white              brown
## 22 poisonous     convex      brown       white              brown
## 23    edible       bell     yellow       white              brown
## 24    edible       bell      white       white              brown
## 25    edible       bell      white       white              black
## 26 poisonous       flat      white       white              brown
## 27    edible     convex     yellow       white              brown
## 28    edible     convex      white       white              brown
## 29    edible       flat      brown       white              black
## 30    edible     convex     yellow       white              brown
## 31    edible       bell     yellow       white              brown
## 32 poisonous     convex      white       white              brown
## 33    edible     convex     yellow       white              brown
## 34    edible     convex      brown       white              brown
## 35    edible       bell     yellow       white              brown
## 36    edible     convex     yellow       white              brown
## 37    edible     sunken       gray       white              black
## 38 poisonous     convex      brown       white              brown
## 39    edible     convex     yellow       white              brown
## 40    edible       bell     yellow       white              black
## 41    edible       bell     yellow       white              brown
## 42    edible     convex     yellow       white              black
## 43    edible     convex      brown       white              black
## 44 poisonous     convex      white       white              brown
## 45    edible     convex     yellow       white              black
## 46    edible     convex      white       white              brown
## 47    edible     convex     yellow       white              black
## 48    edible     convex      white       white              brown
## 49    edible     convex     yellow       white              brown
## 50    edible       flat     yellow       white              black