library(stringr)
# Download the attribute dictionary: one "name:label=code,..." line per column.
mushroom_props <- readLines(
  con = "https://raw.githubusercontent.com/Sirwel/data607-week1/master/mushroom_names.txt",
  warn = FALSE
)
mushroom_props
## [1] "class:edible=e,poisonous=p"
## [2] "cap-shape:bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s"
## [3] "cap-surface:fibrous=f,grooves=g,scaly=y,smooth=s"
## [4] "cap-color:brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y"
## [5] "bruises:bruises=t,no=f"
## [6] "odor:almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s"
## [7] "gill-attachment:attached=a,descending=d,free=f,notched=n"
## [8] "gill-spacing:close=c,crowded=w,distant=d"
## [9] "gill-size:broad=b,narrow=n"
## [10] "gill-color:black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y"
## [11] "stalk-shape:enlarging=e,tapering=t"
## [12] "stalk-root:bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?"
## [13] "stalk-surface-above-ring:fibrous=f,scaly=y,silky=k,smooth=s"
## [14] "stalk-surface-below-ring:fibrous=f,scaly=y,silky=k,smooth=s"
## [15] "stalk-color-above-ring:brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"
## [16] "stalk-color-below-ring:brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"
## [17] "veil-type:partial=p,universal=u"
## [18] "veil-color:brown=n,orange=o,white=w,yellow=y"
## [19] "ring-number:none=n,one=o,two=t"
## [20] "ring-type:cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z"
## [21] "spore-print-color:black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y"
## [22] "population:abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y"
## [23] "habitat:grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d "
# Pull the leading "name:" token (colon included) out of every dictionary
# line. str_extract_all() returns a list with one element per line, and
# noquote() makes the entries print without quotation marks.
mushroom_headers <- noquote(
  str_extract_all(string = mushroom_props, pattern = "[a-zA-Z].+:")
)
mushroom_headers
## [[1]]
## [1] class:
##
## [[2]]
## [1] cap-shape:
##
## [[3]]
## [1] cap-surface:
##
## [[4]]
## [1] cap-color:
##
## [[5]]
## [1] bruises:
##
## [[6]]
## [1] odor:
##
## [[7]]
## [1] gill-attachment:
##
## [[8]]
## [1] gill-spacing:
##
## [[9]]
## [1] gill-size:
##
## [[10]]
## [1] gill-color:
##
## [[11]]
## [1] stalk-shape:
##
## [[12]]
## [1] stalk-root:
##
## [[13]]
## [1] stalk-surface-above-ring:
##
## [[14]]
## [1] stalk-surface-below-ring:
##
## [[15]]
## [1] stalk-color-above-ring:
##
## [[16]]
## [1] stalk-color-below-ring:
##
## [[17]]
## [1] veil-type:
##
## [[18]]
## [1] veil-color:
##
## [[19]]
## [1] ring-number:
##
## [[20]]
## [1] ring-type:
##
## [[21]]
## [1] spore-print-color:
##
## [[22]]
## [1] population:
##
## [[23]]
## [1] habitat:
# Load the headerless data file and label its columns with the extracted
# names; check.names = FALSE keeps the "-" and ":" characters in those names.
mushroom_dt <- read.csv(
  file = "https://raw.githubusercontent.com/Sirwel/data607-week1/master/mushroom.csv",
  header = FALSE,
  col.names = mushroom_headers,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
head(mushroom_dt, n = 30)
## class: cap-shape: cap-surface: cap-color: bruises: odor:
## 1 p x s n t p
## 2 e x s y t a
## 3 e b s w t l
## 4 p x y w t p
## 5 e x s g f n
## 6 e x y y t a
## 7 e b s w t a
## 8 e b y w t l
## 9 p x y w t p
## 10 e b s y t a
## 11 e x y y t l
## 12 e x y y t a
## 13 e b s y t a
## 14 p x y w t p
## 15 e x f n f n
## 16 e s f g f n
## 17 e f f w f n
## 18 p x s n t p
## 19 p x y w t p
## 20 p x s n t p
## 21 e b s y t a
## 22 p x y n t p
## 23 e b y y t l
## 24 e b y w t a
## 25 e b s w t l
## 26 p f s w t p
## 27 e x y y t a
## 28 e x y w t l
## 29 e f f n f n
## 30 e x s y t a
## gill-attachment: gill-spacing: gill-size: gill-color: stalk-shape:
## 1 f c n k e
## 2 f c b k e
## 3 f c b n e
## 4 f c n n e
## 5 f w b k t
## 6 f c b n e
## 7 f c b g e
## 8 f c b n e
## 9 f c n p e
## 10 f c b g e
## 11 f c b g e
## 12 f c b n e
## 13 f c b w e
## 14 f c n k e
## 15 f w b n t
## 16 f c n k e
## 17 f w b k t
## 18 f c n n e
## 19 f c n n e
## 20 f c n k e
## 21 f c b k e
## 22 f c n n e
## 23 f c b k e
## 24 f c b w e
## 25 f c b g e
## 26 f c n n e
## 27 f c b n e
## 28 f c b w e
## 29 f c n k e
## 30 f w n n t
## stalk-root: stalk-surface-above-ring: stalk-surface-below-ring:
## 1 e s s
## 2 c s s
## 3 c s s
## 4 e s s
## 5 e s s
## 6 c s s
## 7 c s s
## 8 c s s
## 9 e s s
## 10 c s s
## 11 c s s
## 12 c s s
## 13 c s s
## 14 e s s
## 15 e s f
## 16 e s s
## 17 e s s
## 18 e s s
## 19 e s s
## 20 e s s
## 21 c s s
## 22 e s s
## 23 c s s
## 24 c s s
## 25 c s s
## 26 e s s
## 27 c s s
## 28 c s s
## 29 e s s
## 30 b s s
## stalk-color-above-ring: stalk-color-below-ring: veil-type: veil-color:
## 1 w w p w
## 2 w w p w
## 3 w w p w
## 4 w w p w
## 5 w w p w
## 6 w w p w
## 7 w w p w
## 8 w w p w
## 9 w w p w
## 10 w w p w
## 11 w w p w
## 12 w w p w
## 13 w w p w
## 14 w w p w
## 15 w w p w
## 16 w w p w
## 17 w w p w
## 18 w w p w
## 19 w w p w
## 20 w w p w
## 21 w w p w
## 22 w w p w
## 23 w w p w
## 24 w w p w
## 25 w w p w
## 26 w w p w
## 27 w w p w
## 28 w w p w
## 29 w w p w
## 30 w w p w
## ring-number: ring-type: spore-print-color: population: habitat:
## 1 o p k s u
## 2 o p n n g
## 3 o p n n m
## 4 o p k s u
## 5 o e n a g
## 6 o p k n g
## 7 o p k n m
## 8 o p n s m
## 9 o p k v g
## 10 o p k s m
## 11 o p n n g
## 12 o p k s m
## 13 o p n s g
## 14 o p n v u
## 15 o e k a g
## 16 o p n y u
## 17 o e n a g
## 18 o p k s g
## 19 o p n s u
## 20 o p n s u
## 21 o p n s m
## 22 o p n v g
## 23 o p n s m
## 24 o p n n m
## 25 o p k s m
## 26 o p n v g
## 27 o p n n m
## 28 o p n n m
## 29 o p k y u
## 30 o p n v d
# Report the number of rows and columns that were loaded.
dim(mushroom_dt)
## [1] 8124 23
This fragment uses regular expressions to read the value labels of each field from the properties file and then replaces the single-letter codes in the data frame with those labels, giving a readable format:
# Translate the single-letter codes in every column of mushroom_dt into the
# descriptive labels declared in mushroom_props.
#
# Each dictionary line looks like "name:label=code,label=code,...". For every
# column the matching line is parsed once into parallel label/code vectors and
# the whole column is recoded with one vectorised lookup. Codes are compared
# as literal strings, so a code that happens to be a regex metacharacter
# (for example the "?" used for a missing stalk-root) needs no escaping.
for (header_idx in mushroom_headers) {
  # Locate the dictionary line by literal text, not by treating the header
  # as a regular expression.
  prop_line <- grep(header_idx, mushroom_props, value = TRUE, fixed = TRUE)
  if (length(prop_line) != 1) {
    warning(
      "Expected exactly one dictionary line for column '", header_idx,
      "' but found ", length(prop_line), "; column left unchanged",
      call. = FALSE
    )
    next
  }

  # Columns 2 and 3 of the match matrix hold each label (the lowercase run
  # before "=") and its one-character code (the character after "=").
  pairs <- str_match_all(prop_line, "([a-z]+)=(.)")[[1]]
  labels <- pairs[, 2]
  codes <- pairs[, 3]

  values <- as.character(mushroom_dt[[header_idx]])
  hit <- match(values, codes) # first matching pair wins, NA when unmapped
  known <- !is.na(hit)

  # Report codes present in the data but absent from the dictionary; missing
  # values are not codes and are skipped.
  unmapped <- unique(values[!known & !is.na(values)])
  if (length(unmapped) > 0) {
    warning(
      "No label found in column '", header_idx, "' for code(s): ",
      paste0("'", unmapped, "'", collapse = ", "),
      call. = FALSE
    )
  }

  # Only write back when something was recoded, so untouched columns keep
  # their original type.
  if (any(known)) {
    values[known] <- labels[hit[known]]
    mushroom_dt[[header_idx]] <- values
  }
}
# Keep five columns by position: class, cap-shape, cap-color, veil-color and
# spore-print-color. data.frame() applies its default check.names = TRUE,
# which is why "-" and ":" show up as "." in the printed column names.
mushroom_subset <- data.frame(mushroom_dt[c(1, 2, 4, 18, 21)])
head(mushroom_subset, n = 50)
## class. cap.shape. cap.color. veil.color. spore.print.color.
## 1 poisonous convex brown white black
## 2 edible convex yellow white brown
## 3 edible bell white white brown
## 4 poisonous convex white white black
## 5 edible convex gray white brown
## 6 edible convex yellow white black
## 7 edible bell white white black
## 8 edible bell white white brown
## 9 poisonous convex white white black
## 10 edible bell yellow white black
## 11 edible convex yellow white brown
## 12 edible convex yellow white black
## 13 edible bell yellow white brown
## 14 poisonous convex white white brown
## 15 edible convex brown white black
## 16 edible sunken gray white brown
## 17 edible flat white white brown
## 18 poisonous convex brown white black
## 19 poisonous convex white white brown
## 20 poisonous convex brown white brown
## 21 edible bell yellow white brown
## 22 poisonous convex brown white brown
## 23 edible bell yellow white brown
## 24 edible bell white white brown
## 25 edible bell white white black
## 26 poisonous flat white white brown
## 27 edible convex yellow white brown
## 28 edible convex white white brown
## 29 edible flat brown white black
## 30 edible convex yellow white brown
## 31 edible bell yellow white brown
## 32 poisonous convex white white brown
## 33 edible convex yellow white brown
## 34 edible convex brown white brown
## 35 edible bell yellow white brown
## 36 edible convex yellow white brown
## 37 edible sunken gray white black
## 38 poisonous convex brown white brown
## 39 edible convex yellow white brown
## 40 edible bell yellow white black
## 41 edible bell yellow white brown
## 42 edible convex yellow white black
## 43 edible convex brown white black
## 44 poisonous convex white white brown
## 45 edible convex yellow white black
## 46 edible convex white white brown
## 47 edible convex yellow white black
## 48 edible convex white white brown
## 49 edible convex yellow white brown
## 50 edible flat yellow white black