Task 1 : Obtain and study the dataset provided on mushrooms.

#A wrapper for libcurl, provides functions of general HTTP requests
library(RCurl)
## Loading required package: bitops
# Let's fetch the raw data file over HTTPS and parse it as headerless CSV.
# stringsAsFactors is set explicitly: R >= 4.0 defaults to FALSE, which would
# load every column as character and change the summary() output (per-level
# counts) that the rest of this walkthrough relies on.
mushrooms <- read.csv(
  text = getURL("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)

Attribute (column) information is found on the dataset website. Each mushroom is first classified as poisonous, edible, or eat-at-your-own-risk (which is also treated as poisonous). Then 22 attributes (columns) are recorded for each case, so in total there are 23 columns, the first of which says whether or not the mushroom is edible.

# Per-column overview of the raw data; columns still carry the default
# V1..V23 names at this point.
summary(mushrooms)
##  V1       V2       V3             V4       V5             V6      
##  e:4208   b: 452   f:2320   n      :2284   f:4748   n      :3528  
##  p:3916   c:   4   g:   4   g      :1840   t:3376   f      :2160  
##           f:3152   s:2556   e      :1500            s      : 576  
##           k: 828   y:3244   y      :1072            y      : 576  
##           s:  32            w      :1040            a      : 400  
##           x:3656            b      : 168            l      : 400  
##                             (Other): 220            (Other): 484  
##  V7       V8       V9            V10       V11      V12      V13     
##  a: 210   c:6812   b:5612   b      :1728   e:3516   ?:2480   f: 552  
##  f:7914   w:1312   n:2512   p      :1492   t:4608   b:3776   k:2372  
##                             w      :1202            c: 556   s:5176  
##                             n      :1048            e:1120   y:  24  
##                             g      : 752            r: 192           
##                             h      : 732                             
##                             (Other):1170                             
##  V14           V15            V16       V17      V18      V19     
##  f: 600   w      :4464   w      :4384   p:8124   n:  96   n:  36  
##  k:2304   p      :1872   p      :1872            o:  96   o:7488  
##  s:4936   g      : 576   g      : 576            w:7924   t: 600  
##  y: 284   n      : 448   n      : 512            y:   8           
##           b      : 432   b      : 432                             
##           o      : 192   o      : 192                             
##           (Other): 140   (Other): 156                             
##  V20           V21       V22      V23     
##  e:2776   w      :2388   a: 384   d:3148  
##  f:  48   n      :1968   c: 340   g:2148  
##  l:1296   k      :1872   n: 400   l: 832  
##  n:  36   h      :1632   s:1248   m: 292  
##  p:3968   r      :  72   v:4040   p:1144  
##           b      :  48   y:1712   u: 368  
##           (Other): 144            w: 192

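The summary shows that V1 takes only the values e and p, which I'm guessing mean edible and poisonous. V2 takes the values b, c, f, k, s, and x, which are the same values listed for attribute 1 on the website. So I'm going to use the attribute list from the website, shifted by one column to account for the leading edible/poisonous column: column V(k+1) holds attribute k.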

Task 2 : Take the data and create a data frame with a subset of the columns.

# Replace the default V1..V23 names with descriptive ones: the class label
# first, followed by the 22 attributes from the website's attribute list.
names(mushrooms) <- c(
  "edible",
  "capshape", "capsurface", "capcolor",
  "bruises", "odor",
  "gillattach", "gillspace", "gillsize", "gillcolor",
  "stalkshape", "stalkroot",
  "stalksurfaceabovering", "stalksurfacebelowring",
  "stalkcolorabovering", "stalkcolorbelowring",
  "veiltype", "veilcolor",
  "ringnumber", "ringtype",
  "sporeprintcolor", "population", "habitat"
)

# Keep only the rows whose population code is "y", and only four columns.
# %in% never yields NA, so rows with a missing population are excluded,
# just as subset() would exclude them.
myMushrooms <- mushrooms[
  mushrooms$population %in% "y",
  c("edible", "veilcolor", "veiltype", "bruises"),
  drop = FALSE
]

Task 3 : Change variable values to something more meaningful.

# Human-readable label for each single-letter code, keyed by column name.
code_labels <- list(
  edible    = c(e = "edible", p = "poisonous"),
  veilcolor = c(n = "brown", o = "orange", w = "white", y = "yellow"),
  veiltype  = c(p = "partial", u = "universal"),
  bruises   = c(t = "true", f = "false")
)

# For each column: work on the character form, swap every known code for its
# label, leave anything not in the table (including NA) untouched, and store
# the result back as a factor.
for (column in names(code_labels)) {
  labels <- code_labels[[column]]
  values <- as.character(myMushrooms[[column]])
  known <- values %in% names(labels)
  values[known] <- unname(labels[values[known]])
  myMushrooms[[column]] <- as.factor(values)
}

# Show the first rows of the recoded subset; row names are the row numbers
# from the full data set.
head(myMushrooms)
##    edible veilcolor veiltype bruises
## 16 edible     white  partial   false
## 29 edible     white  partial   false
## 34 edible     white  partial    true
## 42 edible     white  partial    true
## 43 edible     white  partial   false
## 60 edible     white  partial    true
# Show the last rows of the recoded subset.
tail(myMushrooms)
##      edible veilcolor veiltype bruises
## 7941 edible     white  partial    true
## 7966 edible     white  partial    true
## 7985 edible     white  partial   false
## 7987 edible     white  partial   false
## 8002 edible     white  partial   false
## 8039 edible     white  partial    true
# Report the subset's dimensions as (rows, columns).
dim(myMushrooms)
## [1] 1712    4