Read the CSV file: [A] file name (here a URL) to read; [B] the file has no header row; [C] “,” is the field separator; [D] a character vector of strings to be interpreted as “NA” (not available, i.e. missing) — here “?”.
# Download the raw mushroom records. The file has no header row, fields are
# comma-separated, and "?" marks a missing value (read in as NA).
mushroom_full_dataFile <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ",",
  na.strings = "?"
)
# Show the first rows; columns still carry the default V1..Vn names.
head(mushroom_full_dataFile)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Load the data dictionary: each line is "<attribute>:<name=code,...>".
# The text before ":" (field 1) becomes the row name, leaving a single
# column (V2) that holds the name=code pairs.
mushroom_dictionay <- read.table(
  file = "./data_dictionary.txt",
  sep = ":",
  row.names = 1
)
head(mushroom_dictionay)
## V2
## class poison=p,edible=e
## cap-shape bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## cap-surface fibrous=f,grooves=g,scaly=y,smooth=s
## cap-color brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## bruises? bruises=t,no=f
## odor almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
Take the first column of the dictionary (read in as its row names) and use it as the column names of the data file
# Label the data columns with the attribute names stored as the dictionary's
# row names (positional: the i-th dictionary row names the i-th column).
colnames(mushroom_full_dataFile) <- rownames(mushroom_dictionay)
head(mushroom_full_dataFile)
## class cap-shape cap-surface cap-color bruises? odor gill-attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill-spacing gill-size gill-color stalk-shape stalk-root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore-print-color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Select a class column and four others
# Keep the class label plus four descriptive attributes, in this order.
mushroom_sub_df <- mushroom_full_dataFile[
  ,
  c("class", "cap-color", "habitat", "ring-number", "odor"),
  drop = FALSE
]
head(mushroom_sub_df)
## class cap-color habitat ring-number odor
## 1 p n u o p
## 2 e y g o a
## 3 e w m o l
## 4 p w u o p
## 5 e g g o n
## 6 e y g o a
Create a function that replaces each abbreviation with its translation from the data dictionary (by regex matching against the dictionary entry)
#' Translate one column's abbreviations using the data dictionary
#'
#' Looks up the dictionary entry for attribute `a` (a string of the form
#' "name=code,name=code,..."), and replaces every code found in column `a`
#' of `data` with the corresponding full name.
#'
#' @param a Name of the attribute: a column of `data` and a row name of
#'   `dictionary`.
#' @param data Data frame holding the abbreviated values. Defaults to the
#'   global `mushroom_sub_df`.
#' @param dictionary Data frame whose row names are attribute names and
#'   whose first column holds the "name=code,..." entries. Defaults to the
#'   global `mushroom_dictionay`.
#' @return An unnamed character vector with one element per row of `data`.
#'   Missing values and codes absent from the dictionary entry yield `NA`
#'   (rather than the whole dictionary string).
getTranslation <- function(a, data = mushroom_sub_df,
                           dictionary = mushroom_dictionay) {
  stopifnot(is.character(a), length(a) == 1L, a %in% names(data))

  # Parse the dictionary entry once, instead of running a regex per row.
  # as.character() copes with the entry having been read in as a factor.
  entry <- as.character(dictionary[a, 1])
  pairs <- strsplit(entry, ",", fixed = TRUE)[[1]]
  full_names <- sub("=[^=]*$", "", pairs)
  codes <- trimws(sub("^.*=", "", pairs))

  # Exact, vectorized lookup: no data value is ever spliced into a regex,
  # and NA / unknown codes come back as NA. Works for zero-row input too.
  values <- as.character(data[[a]])
  unname(full_names[match(values, codes)])
}
# Translate every selected column; sapply() binds the per-column character
# vectors into a character matrix whose column names are the attribute names.
mushroom_translated_df <- sapply(names(mushroom_sub_df), getTranslation)
head(mushroom_translated_df)
## class cap-color habitat ring-number odor
## [1,] "poison" "brown" "urban" "one" "pungent"
## [2,] "edible" "yellow" "grasses" "one" "almond"
## [3,] "edible" "white" "meadows" "one" "anise"
## [4,] "poison" "white" "urban" "one" "pungent"
## [5,] "edible" "gray" "grasses" "one" "none"
## [6,] "edible" "yellow" "grasses" "one" "almond"