Section 1: Read in DataSet and Report Summary
# Read the raw mushroom data. The file has no header row and every column
# holds single-letter category codes, so the columns are read as factors:
# summary() then reports per-level counts and the levels() edits further down
# have levels to work on.
# stringsAsFactors is passed explicitly because its default changed from TRUE
# to FALSE in R 4.0.0; relying on the default silently yields character columns.
newdata <- read.csv(
  file = "C:/Users/Banu/Documents/RScriptfiles/New folder/agaricus-lepiota.csv",
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Attribute names, one per column, in file order.
names(newdata) <- c(
  "classes", "cap-shape:", "cap-surface:", "cap-color:", "bruises:", "odor:",
  "gill-attachment:", "gill-spacing:", "gill-size:", "gill-color:",
  "stalk-shape:", "stalk-root:", "stalk-surface-above-ring:",
  "stalk-surface-below-ring:", "stalk-color-above-ring:",
  "stalk-color-below-ring:", "veil-type:", "veil-color:", "ring-number:",
  "ring-type:", "spore-print-color:", "population:", "habitat:"
)

# Drop the trailing colons and turn hyphens into underscores so the names are
# syntactic and can be used with `$`. fixed = TRUE matches the characters
# literally instead of treating them as regular expressions.
names(newdata) <- gsub(":", "", names(newdata), fixed = TRUE)
names(newdata) <- gsub("-", "_", names(newdata), fixed = TRUE)
names(newdata)
## [1] "classes" "cap_shape"
## [3] "cap_surface" "cap_color"
## [5] "bruises" "odor"
## [7] "gill_attachment" "gill_spacing"
## [9] "gill_size" "gill_color"
## [11] "stalk_shape" "stalk_root"
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring" "stalk_color_below_ring"
## [17] "veil_type" "veil_color"
## [19] "ring_number" "ring_type"
## [21] "spore_print_color" "population"
## [23] "habitat"
# Preview the first rows to confirm the cleaned column names line up with the values.
head(newdata)
## classes cap_shape cap_surface cap_color bruises odor gill_attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill_spacing gill_size gill_color stalk_shape stalk_root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk_color_below_ring veil_type veil_color ring_number ring_type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore_print_color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Inspect the column types and the number of levels per column.
str(newdata)
## 'data.frame': 8124 obs. of 23 variables:
## $ classes : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap_surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
## $ cap_color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill_attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ gill_spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill_size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ gill_color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ stalk_shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk_root : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_color_above_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ stalk_color_below_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ veil_type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil_color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ ring_number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ ring_type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ spore_print_color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
# Per-column counts of each level (summary() tabulates factor columns).
summary(newdata)
## classes cap_shape cap_surface cap_color bruises odor
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## gill_attachment gill_spacing gill_size gill_color stalk_shape
## a: 210 c:6812 b:5612 b :1728 e:3516
## f:7914 w:1312 n:2512 p :1492 t:4608
## w :1202
## n :1048
## g : 752
## h : 732
## (Other):1170
## stalk_root stalk_surface_above_ring stalk_surface_below_ring
## ?:2480 f: 552 f: 600
## b:3776 k:2372 k:2304
## c: 556 s:5176 s:4936
## e:1120 y: 24 y: 284
## r: 192
##
##
## stalk_color_above_ring stalk_color_below_ring veil_type veil_color
## w :4464 w :4384 p:8124 n: 96
## p :1872 p :1872 o: 96
## g : 576 g : 576 w:7924
## n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## ring_number ring_type spore_print_color population habitat
## n: 36 e:2776 w :2388 a: 384 d:3148
## o:7488 f: 48 n :1968 c: 340 g:2148
## t: 600 l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
Section 2: Change levels for only 3 columns
# Replace the single-letter codes of classes, cap_shape and cap_surface with
# readable labels. Each lookup is keyed by the original code; a code that is
# not among a column's levels is ignored, and levels without an entry are
# left as they are.
level_labels <- list(
  classes = c(e = "edible", p = "poisonous"),
  cap_shape = c(
    b = "bell", c = "conical", f = "flat",
    k = "knobbed", s = "sunken", x = "convex"
  ),
  cap_surface = c(f = "fibrous", g = "grooves", y = "scaly", s = "smooth")
)
for (col_name in names(level_labels)) {
  code_map <- level_labels[[col_name]]
  current <- levels(newdata[[col_name]])
  known <- current %in% names(code_map)
  # Only the labels change; level order and the underlying integer codes stay put.
  current[known] <- unname(code_map[current[known]])
  levels(newdata[[col_name]]) <- current
}
# Print each factor column's name followed by its levels, in column order.
factor_cols <- names(newdata)[vapply(newdata, is.factor, logical(1))]
for (col_name in factor_cols) {
  print(col_name)
  print(levels(newdata[[col_name]]))
}
## [1] "classes"
## [1] "edible" "poisonous"
## [1] "cap_shape"
## [1] "bell" "conical" "flat" "knobbed" "sunken" "convex"
## [1] "cap_surface"
## [1] "fibrous" "grooves" "smooth" "scaly"
## [1] "cap_color"
## [1] "b" "c" "e" "g" "n" "p" "r" "u" "w" "y"
## [1] "bruises"
## [1] "f" "t"
## [1] "odor"
## [1] "a" "c" "f" "l" "m" "n" "p" "s" "y"
## [1] "gill_attachment"
## [1] "a" "f"
## [1] "gill_spacing"
## [1] "c" "w"
## [1] "gill_size"
## [1] "b" "n"
## [1] "gill_color"
## [1] "b" "e" "g" "h" "k" "n" "o" "p" "r" "u" "w" "y"
## [1] "stalk_shape"
## [1] "e" "t"
## [1] "stalk_root"
## [1] "?" "b" "c" "e" "r"
## [1] "stalk_surface_above_ring"
## [1] "f" "k" "s" "y"
## [1] "stalk_surface_below_ring"
## [1] "f" "k" "s" "y"
## [1] "stalk_color_above_ring"
## [1] "b" "c" "e" "g" "n" "o" "p" "w" "y"
## [1] "stalk_color_below_ring"
## [1] "b" "c" "e" "g" "n" "o" "p" "w" "y"
## [1] "veil_type"
## [1] "p"
## [1] "veil_color"
## [1] "n" "o" "w" "y"
## [1] "ring_number"
## [1] "n" "o" "t"
## [1] "ring_type"
## [1] "e" "f" "l" "n" "p"
## [1] "spore_print_color"
## [1] "b" "h" "k" "n" "o" "r" "u" "w" "y"
## [1] "population"
## [1] "a" "c" "n" "s" "v" "y"
## [1] "habitat"
## [1] "d" "g" "l" "m" "p" "u" "w"
Section 3: Use a tibble to recode a few more columns, since renaming levels one at a time by index was taking too long
# Convert to a tibble so the recoding that follows can use dplyr verbs.
# as_tibble() replaces the deprecated as.tibble(); the deprecated function
# only forwarded to as_tibble() after emitting a warning, so the result for a
# data frame is unchanged and the warning is no longer produced.
dftibble <- tibble::as_tibble(newdata)
# Recode three more columns from single-letter codes to readable labels.
# case_when() returns character vectors, so these columns are no longer
# factors after this step. Any code without a rule falls through to "other".
# NOTE(review): %>%, mutate() and case_when() come from dplyr, which is not
# attached anywhere in the visible part of this file -- confirm it is loaded.
# Columns are referenced by bare name rather than `.$col`: inside mutate()
# the bare name is the documented form and keeps working on grouped data.
dftibble <- dftibble %>%
  mutate(
    # col 4
    cap_color = case_when(
      cap_color == "n" ~ "brown",
      cap_color == "b" ~ "buff",
      cap_color == "c" ~ "cinnamon",
      cap_color == "g" ~ "gray",
      # "r" is one of the ten cap_color levels in the data; without this rule
      # green caps were lumped into "other". With it, "other" is no longer
      # produced for this data set and summaries report "green" instead.
      cap_color == "r" ~ "green",
      cap_color == "p" ~ "pink",
      cap_color == "u" ~ "purple",
      cap_color == "e" ~ "red",
      cap_color == "w" ~ "white",
      cap_color == "y" ~ "yellow",
      TRUE ~ "other"
    ),
    # col 5
    bruises = case_when(
      bruises == "t" ~ "bruises",
      bruises == "f" ~ "no",
      TRUE ~ "other"
    ),
    # col 12
    # The "u" and "z" codes do not occur among this file's stalk_root levels
    # ("?", "b", "c", "e", "r"); the rules are kept for completeness.
    # "?" is the data set's marker for an unrecorded value.
    stalk_root = case_when(
      stalk_root == "b" ~ "bulbous",
      stalk_root == "c" ~ "club",
      stalk_root == "u" ~ "cup",
      stalk_root == "e" ~ "equal",
      stalk_root == "z" ~ "rhizomorphs",
      stalk_root == "r" ~ "rooted",
      stalk_root == "?" ~ "missing",
      TRUE ~ "other"
    )
  )
Section 3 (continued): Convert the tibble back to a data frame and report a summary
# The three recoded columns came back from case_when() as character vectors;
# turn them into factors again so they behave like the remaining columns.
dftibble <- dftibble %>%
  mutate_at(
    .vars = vars(cap_color, bruises, stalk_root),
    .funs = as.factor
  )
head(dftibble)
## # A tibble: 6 x 23
## classes cap_shape cap_surface cap_color bruises odor gill_attachment
## <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 poison~ convex smooth brown bruises p f
## 2 edible convex smooth yellow bruises a f
## 3 edible bell smooth white bruises l f
## 4 poison~ convex scaly white bruises p f
## 5 edible convex smooth gray no n f
## 6 edible convex scaly yellow bruises a f
## # ... with 16 more variables: gill_spacing <fct>, gill_size <fct>,
## # gill_color <fct>, stalk_shape <fct>, stalk_root <fct>,
## # stalk_surface_above_ring <fct>, stalk_surface_below_ring <fct>,
## # stalk_color_above_ring <fct>, stalk_color_below_ring <fct>,
## # veil_type <fct>, veil_color <fct>, ring_number <fct>, ring_type <fct>,
## # spore_print_color <fct>, population <fct>, habitat <fct>
# Convert the tibble back to a base data frame for the base-R subsetting that follows.
finaldata <- data.frame(dftibble)
# Row count after the conversion (the raw data had 8124 rows).
nrow(finaldata)
## [1] 8124
# Confirm the recoded columns are factors with readable labels.
str(finaldata)
## 'data.frame': 8124 obs. of 23 variables:
## $ classes : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_shape : Factor w/ 6 levels "bell","conical",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap_surface : Factor w/ 4 levels "fibrous","grooves",..: 3 3 3 4 3 4 3 4 4 3 ...
## $ cap_color : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
## $ bruises : Factor w/ 2 levels "bruises","no": 1 1 1 1 2 1 1 1 1 1 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill_attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ gill_spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill_size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ gill_color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ stalk_shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk_root : Factor w/ 5 levels "bulbous","club",..: 3 2 2 3 3 2 2 2 3 2 ...
## $ stalk_surface_above_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_surface_below_ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_color_above_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ stalk_color_below_ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ veil_type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil_color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ ring_number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ ring_type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ spore_print_color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
Section 4: Create subsets of the rows with and without a missing stalk_root value
# Split finaldata (factor columns) on whether stalk_root carries the "missing"
# label. Columns 1-4 and 12 are classes, cap_shape, cap_surface, cap_color and
# stalk_root.
keep_cols <- c(1:4, 12)
root_is_missing <- finaldata$stalk_root == "missing"
allnonmissing <- finaldata[!root_is_missing, keep_cols] # rows with a recorded stalk_root
Submissingrows <- finaldata[root_is_missing, keep_cols] # rows where stalk_root is "missing"
dim(allnonmissing)
## [1] 5644 5
dim(Submissingrows)
## [1] 2480 5
head(allnonmissing)
## classes cap_shape cap_surface cap_color stalk_root
## 1 poisonous convex smooth brown equal
## 2 edible convex smooth yellow club
## 3 edible bell smooth white club
## 4 poisonous convex scaly white equal
## 5 edible convex smooth gray equal
## 6 edible convex scaly yellow club
# Preview the rows whose stalk_root is labelled "missing"; original row numbers are kept.
head(Submissingrows)
## classes cap_shape cap_surface cap_color stalk_root
## 3985 edible convex scaly buff missing
## 4024 poisonous convex scaly red missing
## 4077 edible flat scaly purple missing
## 4101 poisonous convex scaly red missing
## 4105 poisonous convex scaly brown missing
## 4197 poisonous convex scaly brown missing
Section 5: Create subsets by filtering rows on cap_color and cap_surface
# Two subsets of the rows with a recorded stalk_root, both limited to scaly
# caps: df1 keeps yellow caps only, df2 keeps yellow or brown caps.
# which() drops any NA comparison results, matching what subset() does.
scaly_cap <- allnonmissing$cap_surface == "scaly"
yellow_cap <- allnonmissing$cap_color == "yellow"
yellow_or_brown_cap <- allnonmissing$cap_color %in% c("yellow", "brown")
df1 <- allnonmissing[which(yellow_cap & scaly_cap), ]
df2 <- allnonmissing[which(yellow_or_brown_cap & scaly_cap), ]
summary(df1)
## classes cap_shape cap_surface cap_color stalk_root
## edible :224 bell : 66 fibrous: 0 yellow :556 bulbous:324
## poisonous:332 conical: 2 grooves: 0 brown : 0 club :136
## flat :212 smooth : 0 buff : 0 equal : 0
## knobbed: 2 scaly :556 cinnamon: 0 missing: 0
## sunken : 0 gray : 0 rooted : 96
## convex :274 other : 0
## (Other) : 0
# Structure of df1; filtering keeps unused factor levels (cap_color still lists all 10).
str(df1)
## 'data.frame': 556 obs. of 5 variables:
## $ classes : Factor w/ 2 levels "edible","poisonous": 1 1 1 1 1 1 1 1 1 1 ...
## $ cap_shape : Factor w/ 6 levels "bell","conical",..: 6 6 6 1 6 6 1 1 6 6 ...
## $ cap_surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ cap_color : Factor w/ 10 levels "brown","buff",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ stalk_root : Factor w/ 5 levels "bulbous","club",..: 2 2 2 2 2 2 2 2 5 2 ...
# Preview the yellow, scaly-capped rows.
head(df1)
## classes cap_shape cap_surface cap_color stalk_root
## 6 edible convex scaly yellow club
## 11 edible convex scaly yellow club
## 12 edible convex scaly yellow club
## 23 edible bell scaly yellow club
## 27 edible convex scaly yellow club
## 33 edible convex scaly yellow club
# Structure of df2 (yellow or brown cap, scaly surface).
str(df2)
## 'data.frame': 1040 obs. of 5 variables:
## $ classes : Factor w/ 2 levels "edible","poisonous": 1 1 1 2 1 1 1 1 1 2 ...
## $ cap_shape : Factor w/ 6 levels "bell","conical",..: 6 6 6 6 1 6 6 6 1 6 ...
## $ cap_surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ cap_color : Factor w/ 10 levels "brown","buff",..: 10 10 10 1 10 10 10 1 10 1 ...
## $ stalk_root : Factor w/ 5 levels "bulbous","club",..: 2 2 2 3 2 2 2 5 2 3 ...
# Level counts for df2; levels that were filtered out are reported with a count of 0.
summary(df2)
## classes cap_shape cap_surface cap_color stalk_root
## edible :632 bell : 68 fibrous: 0 yellow :556 bulbous:636
## poisonous:408 conical: 2 grooves: 0 brown :484 club :148
## flat :448 smooth : 0 buff : 0 equal : 64
## knobbed: 12 scaly :1040 cinnamon: 0 missing: 0
## sunken : 0 gray : 0 rooted :192
## convex :510 other : 0
## (Other) : 0
# Preview the yellow-or-brown, scaly-capped rows.
head(df2)
## classes cap_shape cap_surface cap_color stalk_root
## 6 edible convex scaly yellow club
## 11 edible convex scaly yellow club
## 12 edible convex scaly yellow club
## 22 poisonous convex scaly brown equal
## 23 edible bell scaly yellow club
## 27 edible convex scaly yellow club