# Start from an empty workspace: remove every object visible to ls() in the
# current environment.
# NOTE(review): this also wipes objects belonging to whoever sources the
# script; confirm the reset is really wanted here.
rm(list = ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Both input files are looked up relative to the working directory set here.
# NOTE(review): the absolute, user-specific path makes the script
# non-portable; confirm nothing else relies on it before replacing it with a
# project-relative path.
setwd("/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 1/Assignment1/Data607_Assignment1")

# Data dictionary: each line is split on ":" into two columns, named
# `field` and `values` below.
dictionary <- read.table(
  "data-dictionary.txt",
  sep = ":",
  row.names = NULL,
  stringsAsFactors = FALSE
)
names(dictionary) <- c("field", "values")

# Remove the "N. " numbering from the field names, then strip every space
# from both columns.
dictionary$field <- gsub(
  " ", "",
  gsub("[0-9]+\\. ", "", dictionary$field),
  fixed = TRUE
)
dictionary$values <- gsub(" ", "", dictionary$values, fixed = TRUE)

# Comma-separated observations, read as character columns (not factors).
dataset <- read.table(
  "dataSet.txt",
  sep = ",",
  row.names = NULL,
  stringsAsFactors = FALSE
)

dictionary
## field
## 1 cap-shape
## 2 cap-surface
## 3 cap-color
## 4 bruises?
## 5 odor
## 6 gill-attachment
## 7 gill-spacing
## 8 gill-size
## 9 gill-color
## 10 stalk-shape
## 11 stalk-root
## 12 stalk-surface-above-ring
## 13 stalk-surface-below-ring
## 14 stalk-color-above-ring
## 15 stalk-color-below-ring
## 16 veil-type
## 17 veil-color
## 18 ring-number
## 19 ring-type
## 20 spore-print-color
## 21 population
## 22 habitat
## values
## 1 bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## 2 fibrous=f,grooves=g,scaly=y,smooth=s
## 3 brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## 4 bruises=t,no=f
## 5 almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
## 6 attached=a,descending=d,free=f,notched=n
## 7 close=c,crowded=w,distant=d
## 8 broad=b,narrow=n
## 9 black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
## 10 enlarging=e,tapering=t
## 11 bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
## 12 fibrous=f,scaly=y,silky=k,smooth=s
## 13 fibrous=f,scaly=y,silky=k,smooth=s
## 14 brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 15 brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 16 partial=p,universal=u
## 17 brown=n,orange=o,white=w,yellow=y
## 18 none=n,one=o,two=t
## 19 cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
## 20 black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
## 21 abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
## 22 grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
# Name the columns: the first one is the edibility class, the remaining ones
# take the dictionary field names with "-" replaced by "_".
names(dataset) <- c(
  "edibility",
  gsub("-", "_", dictionary$field, fixed = TRUE)
)

# Turn the one-letter class codes into a labelled factor.
dataset$edibility <- factor(
  dataset$edibility,
  levels = c("e", "p"),
  labels = c("edible", "poisonous")
)
I couldn’t see how to build each column’s factor directly from the dictionary’s values column. Is there an “apply”-family way to do this?
# Recode every attribute column (all columns after the first) from its
# one-letter codes into a labelled factor.
#
# Row k of `dictionary` describes column k + 1 of `dataset`; its `values`
# entry is a comma-separated list of "label=code" pairs, for example
# "bell=b,conical=c".

# seq_len() yields an empty index when there is only one column, whereas
# 2:length(dataset) would count backwards (2, 1) in that case.
attribute_cols <- seq_len(ncol(dataset) - 1L) + 1L

# A dictionary that is too short would otherwise turn the uncovered columns
# into all-NA factors without any warning.
stopifnot(nrow(dictionary) >= length(attribute_cols))

if (length(attribute_cols) > 0L) {
  dataset[attribute_cols] <- Map(
    function(codes, spec) {
      # Two-column character matrix: column 1 = label, column 2 = code.
      pairs <- str_split_fixed(
        strsplit(spec, ",", fixed = TRUE)[[1]],
        "=",
        2
      )
      factor(codes, levels = pairs[, 2], labels = pairs[, 1])
    },
    dataset[attribute_cols],
    dictionary$values[seq_along(attribute_cols)]
  )
}

head(dataset)
## edibility cap_shape cap_surface cap_color bruises? odor
## 1 poisonous convex smooth brown bruises pungent
## 2 edible convex smooth yellow bruises almond
## 3 edible bell smooth white bruises anise
## 4 poisonous convex scaly white bruises pungent
## 5 edible convex smooth gray no none
## 6 edible convex scaly yellow bruises almond
## gill_attachment gill_spacing gill_size gill_color stalk_shape stalk_root
## 1 free close narrow black enlarging equal
## 2 free close broad black enlarging club
## 3 free close broad brown enlarging club
## 4 free close narrow brown enlarging equal
## 5 free crowded broad black tapering equal
## 6 free close broad brown enlarging club
## stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1 smooth smooth white
## 2 smooth smooth white
## 3 smooth smooth white
## 4 smooth smooth white
## 5 smooth smooth white
## 6 smooth smooth white
## stalk_color_below_ring veil_type veil_color ring_number ring_type
## 1 white partial white one pendant
## 2 white partial white one pendant
## 3 white partial white one pendant
## 4 white partial white one pendant
## 5 white partial white one evanescent
## 6 white partial white one pendant
## spore_print_color population habitat
## 1 black scattered urban
## 2 brown numerous grasses
## 3 brown numerous meadows
## 4 black scattered urban
## 5 brown abundant grasses
## 6 black numerous grasses
# Keep only the first four columns and preview the first six rows.
dataset <- dataset[seq_len(4L)]
head(dataset, n = 6L)
## edibility cap_shape cap_surface cap_color
## 1 poisonous convex smooth brown
## 2 edible convex smooth yellow
## 3 edible bell smooth white
## 4 poisonous convex scaly white
## 5 edible convex smooth gray
## 6 edible convex scaly yellow