DATA 607 - Week 1 Assignment

library(tidyverse)

A. Read in dataset from GitHub repository and review raw data

# Raw data file hosted on GitHub. It has no header row, so read.csv() names
# the columns V1, V2, ... automatically.
url <- "https://raw.githubusercontent.com/kecbenson/DATA_607_Wk1/master/agaricus-lepiota.data.txt"
# stringsAsFactors = TRUE is spelled out because read.csv() no longer converts
# strings to factors by default (changed in R 4.0.0); factor columns are what
# let summary(df_raw) report per-level counts.
# as_tibble() is used instead of the deprecated as.tibble().
df_raw <- as_tibble(read.csv(url, header = FALSE, stringsAsFactors = TRUE))
df_raw

## # A tibble: 8,124 x 23
##    V1    V2    V3    V4    V5    V6    V7    V8    V9    V10   V11   V12  
##    <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
##  1 p     x     s     n     t     p     f     c     n     k     e     e    
##  2 e     x     s     y     t     a     f     c     b     k     e     c    
##  3 e     b     s     w     t     l     f     c     b     n     e     c    
##  4 p     x     y     w     t     p     f     c     n     n     e     e    
##  5 e     x     s     g     f     n     f     w     b     k     t     e    
##  6 e     x     y     y     t     a     f     c     b     n     e     c    
##  7 e     b     s     w     t     a     f     c     b     g     e     c    
##  8 e     b     y     w     t     l     f     c     b     n     e     c    
##  9 p     x     y     w     t     p     f     c     n     p     e     e    
## 10 e     b     s     y     t     a     f     c     b     g     e     c    
## # ... with 8,114 more rows, and 11 more variables: V13 <fct>, V14 <fct>,
## #   V15 <fct>, V16 <fct>, V17 <fct>, V18 <fct>, V19 <fct>, V20 <fct>,
## #   V21 <fct>, V22 <fct>, V23 <fct>

# Per-column overview of the raw data; for factor columns this lists how many
# rows fall under each level.
summary(df_raw)

##  V1       V2       V3             V4       V5             V6      
##  e:4208   b: 452   f:2320   n      :2284   f:4748   n      :3528  
##  p:3916   c:   4   g:   4   g      :1840   t:3376   f      :2160  
##           f:3152   s:2556   e      :1500            s      : 576  
##           k: 828   y:3244   y      :1072            y      : 576  
##           s:  32            w      :1040            a      : 400  
##           x:3656            b      : 168            l      : 400  
##                             (Other): 220            (Other): 484  
##  V7       V8       V9            V10       V11      V12      V13     
##  a: 210   c:6812   b:5612   b      :1728   e:3516   ?:2480   f: 552  
##  f:7914   w:1312   n:2512   p      :1492   t:4608   b:3776   k:2372  
##                             w      :1202            c: 556   s:5176  
##                             n      :1048            e:1120   y:  24  
##                             g      : 752            r: 192           
##                             h      : 732                             
##                             (Other):1170                             
##  V14           V15            V16       V17      V18      V19     
##  f: 600   w      :4464   w      :4384   p:8124   n:  96   n:  36  
##  k:2304   p      :1872   p      :1872            o:  96   o:7488  
##  s:4936   g      : 576   g      : 576            w:7924   t: 600  
##  y: 284   n      : 448   n      : 512            y:   8           
##           b      : 432   b      : 432                             
##           o      : 192   o      : 192                             
##           (Other): 140   (Other): 156                             
##  V20           V21       V22      V23     
##  e:2776   w      :2388   a: 384   d:3148  
##  f:  48   n      :1968   c: 340   g:2148  
##  l:1296   k      :1872   n: 400   l: 832  
##  n:  36   h      :1632   s:1248   m: 292  
##  p:3968   r      :  72   v:4040   p:1144  
##           b      :  48   y:1712   u: 368  
##           (Other): 144            w: 192

B. Create dataframe with character instead of factor data

# Re-read the same file, this time keeping every column as character rather
# than factor, so values can later be overwritten with arbitrary text.
# as_tibble() is used instead of the deprecated as.tibble().
df1 <- as_tibble(read.csv(url, header = FALSE, stringsAsFactors = FALSE))
df1

## # A tibble: 8,124 x 23
##    V1    V2    V3    V4    V5    V6    V7    V8    V9    V10   V11   V12  
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 p     x     s     n     t     p     f     c     n     k     e     e    
##  2 e     x     s     y     t     a     f     c     b     k     e     c    
##  3 e     b     s     w     t     l     f     c     b     n     e     c    
##  4 p     x     y     w     t     p     f     c     n     n     e     e    
##  5 e     x     s     g     f     n     f     w     b     k     t     e    
##  6 e     x     y     y     t     a     f     c     b     n     e     c    
##  7 e     b     s     w     t     a     f     c     b     g     e     c    
##  8 e     b     y     w     t     l     f     c     b     n     e     c    
##  9 p     x     y     w     t     p     f     c     n     p     e     e    
## 10 e     b     s     y     t     a     f     c     b     g     e     c    
## # ... with 8,114 more rows, and 11 more variables: V13 <chr>, V14 <chr>,
## #   V15 <chr>, V16 <chr>, V17 <chr>, V18 <chr>, V19 <chr>, V20 <chr>,
## #   V21 <chr>, V22 <chr>, V23 <chr>

C. Select a subset of the columns in the dataset and add meaningful column names

# Keep only the six columns that the rest of the analysis works with.
df2 <- df1[c("V1", "V2", "V4", "V6", "V22", "V23")]
df2

## # A tibble: 8,124 x 6
##    V1    V2    V4    V6    V22   V23  
##    <chr> <chr> <chr> <chr> <chr> <chr>
##  1 p     x     n     p     s     u    
##  2 e     x     y     a     n     g    
##  3 e     b     w     l     n     m    
##  4 p     x     w     p     s     u    
##  5 e     x     g     n     a     g    
##  6 e     x     y     a     n     g    
##  7 e     b     w     a     n     m    
##  8 e     b     w     l     s     m    
##  9 p     x     w     p     v     g    
## 10 e     b     y     a     s     m    
## # ... with 8,114 more rows

# Swap the auto-generated V* names for descriptive ones (same column order).
names(df2) <- c(
  "edible", "cap_shape", "cap_color",
  "odor", "population", "habitat"
)
df2

## # A tibble: 8,124 x 6
##    edible cap_shape cap_color odor  population habitat
##    <chr>  <chr>     <chr>     <chr> <chr>      <chr>  
##  1 p      x         n         p     s          u      
##  2 e      x         y         a     n          g      
##  3 e      b         w         l     n          m      
##  4 p      x         w         p     s          u      
##  5 e      x         g         n     a          g      
##  6 e      x         y         a     n          g      
##  7 e      b         w         a     n          m      
##  8 e      b         w         l     s          m      
##  9 p      x         w         p     v          g      
## 10 e      b         y         a     s          m      
## # ... with 8,114 more rows

D. Replace abbreviations in the data with descriptive text

# edible: one-letter code -> descriptive label
edible_labels <- c(e = "edible", p = "poisonous")
# Apply the replacements one code at a time, in the order listed above.
for (code in names(edible_labels)) {
  df2$edible[df2$edible == code] <- edible_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$edible)

## 
##    edible poisonous 
##      4208      3916

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$edible))

## [1] 8124

# cap_shape: one-letter code -> descriptive label
cap_shape_labels <- c(
  b = "bell",
  c = "conical",
  x = "convex",
  f = "flat",
  k = "knobbed",
  s = "sunken"
)
# Apply the replacements one code at a time, in the order listed above.
for (code in names(cap_shape_labels)) {
  df2$cap_shape[df2$cap_shape == code] <- cap_shape_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$cap_shape)

## 
##    bell conical  convex    flat knobbed  sunken 
##     452       4    3656    3152     828      32

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$cap_shape))

## [1] 8124

# cap_color: one-letter code -> descriptive label
cap_color_labels <- c(
  n = "brown",
  b = "buff",
  c = "cinnamon",
  g = "gray",
  r = "green",
  p = "pink",
  u = "purple",
  e = "red",
  w = "white",
  y = "yellow"
)
# Apply the replacements one code at a time, in the order listed above.
for (code in names(cap_color_labels)) {
  df2$cap_color[df2$cap_color == code] <- cap_color_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$cap_color)

## 
##    brown     buff cinnamon     gray    green     pink   purple      red 
##     2284      168       44     1840       16      144       16     1500 
##    white   yellow 
##     1040     1072

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$cap_color))

## [1] 8124

# odor: one-letter code -> descriptive label
odor_labels <- c(
  a = "almond",
  l = "anise",
  c = "creosote",
  y = "fishy",
  f = "foul",
  m = "musty",
  n = "none",
  p = "pungent",
  s = "spicy"
)
# Apply the replacements one code at a time, in the order listed above.
for (code in names(odor_labels)) {
  df2$odor[df2$odor == code] <- odor_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$odor)

## 
##   almond    anise creosote    fishy     foul    musty     none  pungent 
##      400      400      192      576     2160       36     3528      256 
##    spicy 
##      576

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$odor))

## [1] 8124

# population: one-letter code -> descriptive label
population_labels <- c(
  a = "abundant",
  c = "clustered",
  n = "numerous",
  s = "scattered",
  v = "several",
  y = "solitary"
)
# Apply the replacements one code at a time, in the order listed above.
for (code in names(population_labels)) {
  df2$population[df2$population == code] <- population_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$population)

## 
##  abundant clustered  numerous scattered   several  solitary 
##       384       340       400      1248      4040      1712

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$population))

## [1] 8124

# habitat: one-letter code -> descriptive label
habitat_labels <- c(
  g = "grasses",
  l = "leaves",
  m = "meadows",
  p = "paths",
  u = "urban",
  w = "waste",
  d = "woods"
)
# Apply the replacements one code at a time, in the order listed above.
for (code in names(habitat_labels)) {
  df2$habitat[df2$habitat == code] <- habitat_labels[[code]]
}
# every value should now show up under a descriptive label
table(df2$habitat)

## 
## grasses  leaves meadows   paths   urban   waste   woods 
##    2148     832     292    1144     368     192    3148

# Total of the frequency table; compare with the row count of df2.
sum(table(df2$habitat))

## [1] 8124

# final dataframe: print df2 after the codes in all six columns were replaced
df2

## # A tibble: 8,124 x 6
##    edible    cap_shape cap_color odor    population habitat
##    <chr>     <chr>     <chr>     <chr>   <chr>      <chr>  
##  1 poisonous convex    brown     pungent scattered  urban  
##  2 edible    convex    yellow    almond  numerous   grasses
##  3 edible    bell      white     anise   numerous   meadows
##  4 poisonous convex    white     pungent scattered  urban  
##  5 edible    convex    gray      none    abundant   grasses
##  6 edible    convex    yellow    almond  numerous   grasses
##  7 edible    bell      white     almond  numerous   meadows
##  8 edible    bell      white     anise   scattered  meadows
##  9 poisonous convex    white     pungent several    grasses
## 10 edible    bell      yellow    almond  scattered  meadows
## # ... with 8,114 more rows

# Compact structure listing: column names, types and the first few values.
str(df2)

## Classes 'tbl_df', 'tbl' and 'data.frame':    8124 obs. of  6 variables:
##  $ edible    : chr  "poisonous" "edible" "edible" "poisonous" ...
##  $ cap_shape : chr  "convex" "convex" "bell" "convex" ...
##  $ cap_color : chr  "brown" "yellow" "white" "white" ...
##  $ odor      : chr  "pungent" "almond" "anise" "pungent" ...
##  $ population: chr  "scattered" "numerous" "numerous" "scattered" ...
##  $ habitat   : chr  "urban" "grasses" "meadows" "urban" ...

E. Graph and review distribution of column data

# One bar chart per column, with bar heights taken from the column's
# frequency table.
barplot(table(df2$edible), main = "edible / poisonous")

barplot(table(df2$cap_shape), main = "cap shape")

barplot(table(df2$cap_color), main = "cap color")

# Title corrected from "cap odor": the column is named `odor` and, unlike
# cap_shape and cap_color, is not a cap attribute.
barplot(table(df2$odor), main = "odor")

barplot(table(df2$population), main = "population")

barplot(table(df2$habitat), main = "habitat")

DATA 607 - Week 1 Assignment

Kevin Benson

August 31, 2018

A. Read in dataset from GitHub repository and review raw data

B. Create dataframe with character instead of factor data

C. Select a subset of the columns in the dataset and add meaningful column names

D. Replace abbreviations in the data with descriptive text

E. Graph and review distribution of column data