Set up workspace

# Start from a clean slate: drop every object currently visible in the
# environment this script is run in.
rm(list = ls())

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

# The input files below are read with relative paths, so the working
# directory has to be the project folder. This path is specific to the
# original author's machine, so only switch to it when it actually exists;
# on any other machine, run the script from the folder that holds the data
# files instead of failing here.
project_dir <- "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 1/Assignment1/Data607_Assignment1"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

Read in files

Read in data dictionary

  # Split every line of the dictionary file at ":" into two columns, both kept
  # as plain character vectors (presumably "N. field-name" on the left and the
  # "label=code" list on the right -- see the clean-up step that follows).
  dictionary <- read.table(
    file = "data-dictionary.txt",
    sep = ":",
    row.names = NULL,
    stringsAsFactors = FALSE
  )

Read in data set

  # Comma-separated observations, read without a header row; columns are kept
  # as character and receive their names from the data dictionary later on.
  dataset <- read.table(
    file = "dataSet.txt",
    sep = ",",
    row.names = NULL,
    stringsAsFactors = FALSE
  )

Attempt to parse data dictionary

# Name the two dictionary columns.
names(dictionary) <- c("field", "values")

# Field names: drop the "N. " numbering first, then every remaining space.
dictionary$field <- gsub(" ", "", gsub("[0-9]+\\. ", "", dictionary$field))

# Value lists: drop all spaces so each entry is a compact "label=code,..."
# string.
dictionary$values <- gsub(" ", "", dictionary$values)

# Show the cleaned dictionary.
dictionary
##                       field
## 1                 cap-shape
## 2               cap-surface
## 3                 cap-color
## 4                  bruises?
## 5                      odor
## 6           gill-attachment
## 7              gill-spacing
## 8                 gill-size
## 9                gill-color
## 10              stalk-shape
## 11               stalk-root
## 12 stalk-surface-above-ring
## 13 stalk-surface-below-ring
## 14   stalk-color-above-ring
## 15   stalk-color-below-ring
## 16                veil-type
## 17               veil-color
## 18              ring-number
## 19                ring-type
## 20        spore-print-color
## 21               population
## 22                  habitat
##                                                                                               values
## 1                                                bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## 2                                                               fibrous=f,grooves=g,scaly=y,smooth=s
## 3                    brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## 4                                                                                     bruises=t,no=f
## 5                        almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
## 6                                                           attached=a,descending=d,free=f,notched=n
## 7                                                                        close=c,crowded=w,distant=d
## 8                                                                                   broad=b,narrow=n
## 9  black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
## 10                                                                            enlarging=e,tapering=t
## 11                                   bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
## 12                                                                fibrous=f,scaly=y,silky=k,smooth=s
## 13                                                                fibrous=f,scaly=y,silky=k,smooth=s
## 14                           brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 15                           brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 16                                                                             partial=p,universal=u
## 17                                                                 brown=n,orange=o,white=w,yellow=y
## 18                                                                                none=n,one=o,two=t
## 19                     cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
## 20                     black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
## 21                                abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
## 22                                      grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

Apply column names from dictionary to dataset

# The first data column is the edible/poisonous class, which the dictionary
# does not list; the remaining columns take the dictionary's field names in
# order, with hyphens turned into underscores. Only hyphens are replaced, so
# `bruises?` keeps its question mark and needs back-quotes when used with `$`.
names(dataset) <- c("edibility", chartr("-", "_", dictionary$field))

Set edibility as a factor

  # Recode the class column: "e" becomes "edible", "p" becomes "poisonous".
  dataset[["edibility"]] <- factor(
    dataset[["edibility"]],
    levels = c("e", "p"),
    labels = c("edible", "poisonous")
  )

Parsing the rest of the columns using the data dictionary

I couldn’t see how to use the values column to create the factors without an explicit loop. Is there an “apply”-style way to do this?

  # Recode every coded column (all but `edibility`, column 1) as a factor whose
  # levels are the one-letter codes found in the data and whose labels are the
  # readable names given in the data dictionary.

  # Turn one dictionary entry such as "bell=b,conical=c" into a data frame with
  # one row per pair: `label` is the text before the first "=", `level` is the
  # text after it.
  parse_levels <- function(spec) {
    pairs <- strsplit(spec, ",", fixed = TRUE)[[1]]
    data.frame(
      label = sub("=.*$", "", pairs),
      level = sub("^[^=]*=", "", pairs),
      stringsAsFactors = FALSE
    )
  }

  # seq_along()[-1] is empty when there is nothing to recode, whereas
  # 2:length(dataset) would count backwards (2, 1) in that case.
  coded_cols <- seq_along(dataset)[-1]

  # Column k + 1 of the data is described by row k of the dictionary; fail
  # loudly if the two do not line up instead of producing all-NA factors.
  stopifnot(length(coded_cols) == nrow(dictionary))

  # Map() walks the data columns and the dictionary entries in parallel, which
  # is the "apply" way of writing the per-column loop.
  dataset[coded_cols] <- Map(
    function(codes, spec) {
      key <- parse_levels(spec)
      factor(codes, levels = key$level, labels = key$label)
    },
    dataset[coded_cols],
    dictionary$values
  )

  head(dataset)
##   edibility cap_shape cap_surface cap_color bruises?    odor
## 1 poisonous    convex      smooth     brown  bruises pungent
## 2    edible    convex      smooth    yellow  bruises  almond
## 3    edible      bell      smooth     white  bruises   anise
## 4 poisonous    convex       scaly     white  bruises pungent
## 5    edible    convex      smooth      gray       no    none
## 6    edible    convex       scaly    yellow  bruises  almond
##   gill_attachment gill_spacing gill_size gill_color stalk_shape stalk_root
## 1            free        close    narrow      black   enlarging      equal
## 2            free        close     broad      black   enlarging       club
## 3            free        close     broad      brown   enlarging       club
## 4            free        close    narrow      brown   enlarging      equal
## 5            free      crowded     broad      black    tapering      equal
## 6            free        close     broad      brown   enlarging       club
##   stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1                   smooth                   smooth                  white
## 2                   smooth                   smooth                  white
## 3                   smooth                   smooth                  white
## 4                   smooth                   smooth                  white
## 5                   smooth                   smooth                  white
## 6                   smooth                   smooth                  white
##   stalk_color_below_ring veil_type veil_color ring_number  ring_type
## 1                  white   partial      white         one    pendant
## 2                  white   partial      white         one    pendant
## 3                  white   partial      white         one    pendant
## 4                  white   partial      white         one    pendant
## 5                  white   partial      white         one evanescent
## 6                  white   partial      white         one    pendant
##   spore_print_color population habitat
## 1             black  scattered   urban
## 2             brown   numerous grasses
## 3             brown   numerous meadows
## 4             black  scattered   urban
## 5             brown   abundant grasses
## 6             black   numerous grasses

Keep only the first 4 columns

  # Keep only edibility plus the first three attribute columns.
  dataset <- dataset[seq_len(4)]

  # Preview the reduced data set.
  head(dataset)
##   edibility cap_shape cap_surface cap_color
## 1 poisonous    convex      smooth     brown
## 2    edible    convex      smooth    yellow
## 3    edible      bell      smooth     white
## 4 poisonous    convex       scaly     white
## 5    edible    convex      smooth      gray
## 6    edible    convex       scaly    yellow