Section 1: Read in DataSet and Report Summary

# Read the mushroom data with every column loaded as a factor, so that str()
# and summary() below report per-level counts and levels() can be edited.
# stringsAsFactors is set explicitly: its default became FALSE in R 4.0.0, and
# without it the columns load as character vectors.
newdata <- read.csv(
  file = "C:/Users/Banu/Documents/RScriptfiles/New folder/agaricus-lepiota.csv",
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)
# The file is read with header = FALSE, so the column names are assigned here.
names(newdata) <- c(
  "classes", "cap-shape:", "cap-surface:", "cap-color:", "bruises:", "odor:",
  "gill-attachment:", "gill-spacing:", "gill-size:", "gill-color:",
  "stalk-shape:", "stalk-root:", "stalk-surface-above-ring:",
  "stalk-surface-below-ring:", "stalk-color-above-ring:",
  "stalk-color-below-ring:", "veil-type:", "veil-color:", "ring-number:",
  "ring-type:", "spore-print-color:", "population:", "habitat:"
)
# Normalise the names: drop the ":" and turn "-" into "_" so that the columns
# can be referenced with `$` without backticks.
names(newdata) <- gsub(":", "", names(newdata))
names(newdata) <- gsub("-", "_", names(newdata))
names(newdata)
##  [1] "classes"                  "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_type"                "veil_color"              
## [19] "ring_number"              "ring_type"               
## [21] "spore_print_color"        "population"              
## [23] "habitat"
# Preview the first rows of the raw data under the cleaned column names.
head(newdata)
##   classes cap_shape cap_surface cap_color bruises odor gill_attachment
## 1       p         x           s         n       t    p               f
## 2       e         x           s         y       t    a               f
## 3       e         b           s         w       t    l               f
## 4       p         x           y         w       t    p               f
## 5       e         x           s         g       f    n               f
## 6       e         x           y         y       t    a               f
##   gill_spacing gill_size gill_color stalk_shape stalk_root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk_color_below_ring veil_type veil_color ring_number ring_type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         p
## 5                      w         p          w           o         e
## 6                      w         p          w           o         p
##   spore_print_color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m
## 4                 k          s       u
## 5                 n          a       g
## 6                 k          n       g
# Show each column's type and factor levels.
str(newdata)
## 'data.frame':    8124 obs. of  23 variables:
##  $ classes                 : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# Per-column counts of each factor level.
summary(newdata)   
##  classes  cap_shape cap_surface   cap_color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill_attachment gill_spacing gill_size   gill_color   stalk_shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk_root stalk_surface_above_ring stalk_surface_below_ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk_color_above_ring stalk_color_below_ring veil_type veil_color
##  w      :4464           w      :4384           p:8124    n:  96    
##  p      :1872           p      :1872                     o:  96    
##  g      : 576           g      : 576                     w:7924    
##  n      : 448           n      : 512                     y:   8    
##  b      : 432           b      : 432                               
##  o      : 192           o      : 192                               
##  (Other): 140           (Other): 156                               
##  ring_number ring_type spore_print_color population habitat 
##  n:  36      e:2776    w      :2388      a: 384     d:3148  
##  o:7488      f:  48    n      :1968      c: 340     g:2148  
##  t: 600      l:1296    k      :1872      n: 400     l: 832  
##              n:  36    h      :1632      s:1248     m: 292  
##              p:3968    r      :  72      v:4040     p:1144  
##                        b      :  48      y:1712     u: 368  
##                        (Other): 144                 w: 192

Section 2: Change levels for only 3 columns

# Replace the single-letter codes of classes, cap_shape and cap_surface with
# readable labels. Each entry maps an existing code to its new label; any level
# not listed in the lookup is left as it is.
level_labels <- list(
  classes = c(e = "edible", p = "poisonous"),
  cap_shape = c(
    b = "bell", c = "conical", f = "flat",
    k = "knobbed", s = "sunken", x = "convex"
  ),
  cap_surface = c(f = "fibrous", g = "grooves", y = "scaly", s = "smooth")
)
for (col in names(level_labels)) {
  lookup <- level_labels[[col]]
  current <- levels(newdata[[col]])
  hit <- current %in% names(lookup)
  # Only the level labels change; level order and row codes stay the same.
  current[hit] <- unname(lookup[current[hit]])
  levels(newdata[[col]]) <- current
}

# Print the name of every factor column followed by its current levels.
factor_cols <- Filter(function(nm) is.factor(newdata[[nm]]), names(newdata))
for (nm in factor_cols) {
  print(nm)
  print(levels(newdata[[nm]]))
}
## [1] "classes"
## [1] "edible"    "poisonous"
## [1] "cap_shape"
## [1] "bell"    "conical" "flat"    "knobbed" "sunken"  "convex" 
## [1] "cap_surface"
## [1] "fibrous" "grooves" "smooth"  "scaly"  
## [1] "cap_color"
##  [1] "b" "c" "e" "g" "n" "p" "r" "u" "w" "y"
## [1] "bruises"
## [1] "f" "t"
## [1] "odor"
## [1] "a" "c" "f" "l" "m" "n" "p" "s" "y"
## [1] "gill_attachment"
## [1] "a" "f"
## [1] "gill_spacing"
## [1] "c" "w"
## [1] "gill_size"
## [1] "b" "n"
## [1] "gill_color"
##  [1] "b" "e" "g" "h" "k" "n" "o" "p" "r" "u" "w" "y"
## [1] "stalk_shape"
## [1] "e" "t"
## [1] "stalk_root"
## [1] "?" "b" "c" "e" "r"
## [1] "stalk_surface_above_ring"
## [1] "f" "k" "s" "y"
## [1] "stalk_surface_below_ring"
## [1] "f" "k" "s" "y"
## [1] "stalk_color_above_ring"
## [1] "b" "c" "e" "g" "n" "o" "p" "w" "y"
## [1] "stalk_color_below_ring"
## [1] "b" "c" "e" "g" "n" "o" "p" "w" "y"
## [1] "veil_type"
## [1] "p"
## [1] "veil_color"
## [1] "n" "o" "w" "y"
## [1] "ring_number"
## [1] "n" "o" "t"
## [1] "ring_type"
## [1] "e" "f" "l" "n" "p"
## [1] "spore_print_color"
## [1] "b" "h" "k" "n" "o" "r" "u" "w" "y"
## [1] "population"
## [1] "a" "c" "n" "s" "v" "y"
## [1] "habitat"
## [1] "d" "g" "l" "m" "p" "u" "w"

Section 3: Use a tibble to recode a few more columns, since changing levels one at a time by index was taking a while

# Convert the data frame to a tibble for the dplyr recoding below.
# as_tibble() replaces the deprecated as.tibble().
dftibble <- tibble::as_tibble(newdata)
# Recode cap_color (col 4), bruises (col 5) and stalk_root (col 12) from their
# single-letter codes to readable labels. case_when() returns character
# vectors, so these three columns are character until converted back to factor.
# Any code without an explicit branch falls through to "other".
dftibble <- dftibble %>%
  mutate(
    cap_color = case_when(
      cap_color == "n" ~ "brown",
      cap_color == "b" ~ "buff",
      cap_color == "c" ~ "cinnamon",
      cap_color == "g" ~ "gray",
      # "r" is a level present in the data; without this branch green caps
      # were silently labelled "other".
      cap_color == "r" ~ "green",
      cap_color == "p" ~ "pink",
      cap_color == "u" ~ "purple",
      cap_color == "e" ~ "red",
      cap_color == "w" ~ "white",
      cap_color == "y" ~ "yellow",
      TRUE ~ "other"
    ),
    bruises = case_when(
      bruises == "t" ~ "bruises",
      bruises == "f" ~ "no",
      TRUE ~ "other"
    ),
    stalk_root = case_when(
      stalk_root == "b" ~ "bulbous",
      stalk_root == "c" ~ "club",
      stalk_root == "u" ~ "cup",
      stalk_root == "e" ~ "equal",
      stalk_root == "z" ~ "rhizomorphs",
      stalk_root == "r" ~ "rooted",
      # "?" is the code used for an unknown stalk root; it is kept as the
      # explicit label "missing" rather than converted to NA.
      stalk_root == "?" ~ "missing",
      TRUE ~ "other"
    )
  )

Section 3 (continued): Convert tibble back to data frame and report its structure

# Turn the three recoded columns back into factors so that they match the
# rest of the table.
recoded_cols <- c("cap_color", "bruises", "stalk_root")
dftibble[recoded_cols] <- lapply(dftibble[recoded_cols], as.factor)
head(dftibble)
## # A tibble: 6 x 23
##   classes cap_shape cap_surface cap_color bruises odor  gill_attachment
##   <fct>   <fct>     <fct>       <fct>     <fct>   <fct> <fct>          
## 1 poison~ convex    smooth      brown     bruises p     f              
## 2 edible  convex    smooth      yellow    bruises a     f              
## 3 edible  bell      smooth      white     bruises l     f              
## 4 poison~ convex    scaly       white     bruises p     f              
## 5 edible  convex    smooth      gray      no      n     f              
## 6 edible  convex    scaly       yellow    bruises a     f              
## # ... with 16 more variables: gill_spacing <fct>, gill_size <fct>,
## #   gill_color <fct>, stalk_shape <fct>, stalk_root <fct>,
## #   stalk_surface_above_ring <fct>, stalk_surface_below_ring <fct>,
## #   stalk_color_above_ring <fct>, stalk_color_below_ring <fct>,
## #   veil_type <fct>, veil_color <fct>, ring_number <fct>, ring_type <fct>,
## #   spore_print_color <fct>, population <fct>, habitat <fct>
# Return to a plain data frame, then check the row count and column types.
finaldata <- as.data.frame(dftibble)
nrow(finaldata)
## [1] 8124
str(finaldata)
## 'data.frame':    8124 obs. of  23 variables:
##  $ classes                 : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "bell","conical",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "fibrous","grooves",..: 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "bruises","no": 1 1 1 1 2 1 1 1 1 1 ...
##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "bulbous","club",..: 3 2 2 3 3 2 2 2 3 2 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

Section 4: Create subsets of the rows with missing and non-missing stalk_root

# Split finaldata on whether stalk_root carries the "missing" label, keeping
# only the class, the three cap attributes and stalk_root itself.
keep_cols <- c("classes", "cap_shape", "cap_surface", "cap_color", "stalk_root")
root_is_missing <- finaldata$stalk_root == "missing"
allnonmissing <- finaldata[!root_is_missing, keep_cols]  # rows with a known stalk_root
Submissingrows <- finaldata[root_is_missing, keep_cols]  # rows labelled "missing"
dim(allnonmissing)
## [1] 5644    5
dim(Submissingrows)
## [1] 2480    5
head(allnonmissing)
##     classes cap_shape cap_surface cap_color stalk_root
## 1 poisonous    convex      smooth     brown      equal
## 2    edible    convex      smooth    yellow       club
## 3    edible      bell      smooth     white       club
## 4 poisonous    convex       scaly     white      equal
## 5    edible    convex      smooth      gray      equal
## 6    edible    convex       scaly    yellow       club
# First rows of the subset whose stalk_root is "missing".
head(Submissingrows)
##        classes cap_shape cap_surface cap_color stalk_root
## 3985    edible    convex       scaly      buff    missing
## 4024 poisonous    convex       scaly       red    missing
## 4077    edible      flat       scaly    purple    missing
## 4101 poisonous    convex       scaly       red    missing
## 4105 poisonous    convex       scaly     brown    missing
## 4197 poisonous    convex       scaly     brown    missing

Section 5: create subset with some conditions on rows

# df1: scaly caps that are yellow; df2: scaly caps that are yellow or brown.
# which() drops rows where the condition is NA, as subset() does.
scaly_cap <- allnonmissing$cap_surface == "scaly"
df1 <- allnonmissing[which(allnonmissing$cap_color == "yellow" & scaly_cap), ]
df2 <- allnonmissing[
  which(allnonmissing$cap_color %in% c("yellow", "brown") & scaly_cap),
]
summary(df1)
##       classes      cap_shape    cap_surface     cap_color     stalk_root 
##  edible   :224   bell   : 66   fibrous:  0   yellow  :556   bulbous:324  
##  poisonous:332   conical:  2   grooves:  0   brown   :  0   club   :136  
##                  flat   :212   smooth :  0   buff    :  0   equal  :  0  
##                  knobbed:  2   scaly  :556   cinnamon:  0   missing:  0  
##                  sunken :  0                 gray    :  0   rooted : 96  
##                  convex :274                 other   :  0                
##                                              (Other) :  0
# Structure of df1; subsetting rows keeps every factor level, used or not.
str(df1)
## 'data.frame':    556 obs. of  5 variables:
##  $ classes    : Factor w/ 2 levels "edible","poisonous": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cap_shape  : Factor w/ 6 levels "bell","conical",..: 6 6 6 1 6 6 1 1 6 6 ...
##  $ cap_surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ cap_color  : Factor w/ 10 levels "brown","buff",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ stalk_root : Factor w/ 5 levels "bulbous","club",..: 2 2 2 2 2 2 2 2 5 2 ...
# First rows of df1; row names are the original row positions.
head(df1)
##    classes cap_shape cap_surface cap_color stalk_root
## 6   edible    convex       scaly    yellow       club
## 11  edible    convex       scaly    yellow       club
## 12  edible    convex       scaly    yellow       club
## 23  edible      bell       scaly    yellow       club
## 27  edible    convex       scaly    yellow       club
## 33  edible    convex       scaly    yellow       club
# Structure of df2 (yellow or brown scaly caps).
str(df2)
## 'data.frame':    1040 obs. of  5 variables:
##  $ classes    : Factor w/ 2 levels "edible","poisonous": 1 1 1 2 1 1 1 1 1 2 ...
##  $ cap_shape  : Factor w/ 6 levels "bell","conical",..: 6 6 6 6 1 6 6 6 1 6 ...
##  $ cap_surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ cap_color  : Factor w/ 10 levels "brown","buff",..: 10 10 10 1 10 10 10 1 10 1 ...
##  $ stalk_root : Factor w/ 5 levels "bulbous","club",..: 2 2 2 3 2 2 2 5 2 3 ...
# Per-level counts for df2.
summary(df2)
##       classes      cap_shape    cap_surface      cap_color     stalk_root 
##  edible   :632   bell   : 68   fibrous:   0   yellow  :556   bulbous:636  
##  poisonous:408   conical:  2   grooves:   0   brown   :484   club   :148  
##                  flat   :448   smooth :   0   buff    :  0   equal  : 64  
##                  knobbed: 12   scaly  :1040   cinnamon:  0   missing:  0  
##                  sunken :  0                  gray    :  0   rooted :192  
##                  convex :510                  other   :  0                
##                                               (Other) :  0
# First rows of df2.
head(df2)
##      classes cap_shape cap_surface cap_color stalk_root
## 6     edible    convex       scaly    yellow       club
## 11    edible    convex       scaly    yellow       club
## 12    edible    convex       scaly    yellow       club
## 22 poisonous    convex       scaly     brown      equal
## 23    edible      bell       scaly    yellow       club
## 27    edible    convex       scaly    yellow       club