“tidy data”


messy vs. tidy (molten)


reshaping data


reshape example

# Preview VADeaths while it is still in wide format:
# one row per age band, one column per location/sex combination.
head(VADeaths, n = 10L)
##       Rural Male Rural Female Urban Male Urban Female
## 50-54       11.7          8.7       15.4          8.4
## 55-59       18.1         11.7       24.3         13.6
## 60-64       26.9         20.3       37.0         19.3
## 65-69       41.0         30.9       54.6         35.1
## 70-74       66.0         54.3       71.1         50.0
library(reshape2)
# Open the reference page for the data-frame method of melt().
help("melt.data.frame")

reshape example (continued)

# Go from wide to long format.
# VADeaths (from the datasets package) is a matrix, so melt() dispatches to
# melt.array(): the row names and column names become the Var1 and Var2
# columns and every cell becomes one row of "value".
# id.vars is an argument of melt.data.frame() only (melt.array() would
# silently ignore it), so it is not passed here. When melting a data frame
# we would usually name one or more identifier columns in id.vars.
VADeaths <- melt(data = VADeaths)
head(VADeaths)
##    Var1         Var2 value
## 1 50-54   Rural Male  11.7
## 2 55-59   Rural Male  18.1
## 3 60-64   Rural Male  26.9
## 4 65-69   Rural Male  41.0
## 5 70-74   Rural Male  66.0
## 6 50-54 Rural Female   8.7
# Replace the generic Var1/Var2/value names with descriptive ones.
names(VADeaths) <- c("Age", "LocationSex", "Deaths")

reshape example (continued)

# LocationSex (formerly Var2) packs two variables into one label.
# First flag each component; grepl() is case-sensitive, so "Male" does not
# match the lowercase "male" inside "Female".
is.rural <- grepl("Rural", VADeaths[["LocationSex"]])
is.male <- grepl("Male", VADeaths[["LocationSex"]])

# Then spell each flag out as its own column.
VADeaths[["Location"]] <- ifelse(is.rural, "Rural", "Urban")
VADeaths[["Sex"]] <- ifelse(is.male, "Male", "Female")

head(VADeaths)
##     Age  LocationSex Deaths Location    Sex
## 1 50-54   Rural Male   11.7    Rural   Male
## 2 55-59   Rural Male   18.1    Rural   Male
## 3 60-64   Rural Male   26.9    Rural   Male
## 4 65-69   Rural Male   41.0    Rural   Male
## 5 70-74   Rural Male   66.0    Rural   Male
## 6 50-54 Rural Female    8.7    Rural Female
# The combined label is now redundant, so drop it.
VADeaths[["LocationSex"]] <- NULL

reshape example (continued)

# Alternative encoding: store the two flags as logical (TRUE/FALSE) columns.
VADeaths[["Rural"]] <- is.rural
VADeaths[["Male"]] <- is.male
head(VADeaths)
##     Age Deaths Location    Sex Rural  Male
## 1 50-54   11.7    Rural   Male  TRUE  TRUE
## 2 55-59   18.1    Rural   Male  TRUE  TRUE
## 3 60-64   26.9    Rural   Male  TRUE  TRUE
## 4 65-69   41.0    Rural   Male  TRUE  TRUE
## 5 70-74   66.0    Rural   Male  TRUE  TRUE
## 6 50-54    8.7    Rural Female  TRUE FALSE
# Remove the two demonstration columns again.
VADeaths[["Male"]] <- NULL
VADeaths[["Rural"]] <- NULL

reshape exercise


reshape exercise solution

# Spread Sex across columns: each Age/Location pair becomes a single row
# holding one Deaths.<Sex> column per value of Sex.
VADeaths.wide <- reshape(
  data = VADeaths,
  direction = "wide",
  idvar = c("Age", "Location"),
  timevar = "Sex"
)
head(VADeaths.wide)
##      Age Location Deaths.Male Deaths.Female
## 1  50-54    Rural        11.7           8.7
## 2  55-59    Rural        18.1          11.7
## 3  60-64    Rural        26.9          20.3
## 4  65-69    Rural        41.0          30.9
## 5  70-74    Rural        66.0          54.3
## 11 50-54    Urban        15.4           8.4

part 2: summarizing data


dplyr package


dplyr example

library(dplyr)
# Starting point for the dplyr examples: the long-format table
# with columns Age, Deaths, Location and Sex.
head(VADeaths, n = 6L)
##     Age Deaths Location    Sex
## 1 50-54   11.7    Rural   Male
## 2 55-59   18.1    Rural   Male
## 3 60-64   26.9    Rural   Male
## 4 65-69   41.0    Rural   Male
## 5 70-74   66.0    Rural   Male
## 6 50-54    8.7    Rural Female

dplyr example (continued)

# Group by Age and Location, then demonstrate two ways of picking columns.
# The grouping is kept on purpose: the following slides summarise by group.
VADeaths <- VADeaths %>%
  group_by(Age, Location) %>%
  select(Age:Location) %>%         # a contiguous range of columns
  select(Age, Location, Deaths)    # named columns, in the order listed
head(VADeaths)
## Source: local data frame [6 x 3]
## Groups: Age, Location [5]
## 
##      Age Location Deaths
##   <fctr>    <chr>  <dbl>
## 1  50-54    Rural   11.7
## 2  55-59    Rural   18.1
## 3  60-64    Rural   26.9
## 4  65-69    Rural   41.0
## 5  70-74    Rural   66.0
## 6  50-54    Rural    8.7

dplyr example (continued)

# For every Age/Location group: the mean of Deaths and the number of rows.
VADeaths %>%
  summarise(AvgDeaths = mean(Deaths), Count = n())
## Source: local data frame [10 x 4]
## Groups: Age [?]
## 
##       Age Location AvgDeaths Count
##    <fctr>    <chr>     <dbl> <int>
## 1   50-54    Rural     10.20     2
## 2   50-54    Urban     11.90     2
## 3   55-59    Rural     14.90     2
## 4   55-59    Urban     18.95     2
## 5   60-64    Rural     23.60     2
## 6   60-64    Urban     28.15     2
## 7   65-69    Rural     35.95     2
## 8   65-69    Urban     44.85     2
## 9   70-74    Rural     60.15     2
## 10  70-74    Urban     60.55     2

dplyr example (continued)

# Keep only the rows that satisfy both conditions
# (comma-separated conditions in filter() are combined with AND).
VADeaths %>%
  filter(Deaths >= 20, Location == "Urban")
## Source: local data frame [6 x 3]
## Groups: Age, Location [4]
## 
##      Age Location Deaths
##   <fctr>    <chr>  <dbl>
## 1  55-59    Urban   24.3
## 2  60-64    Urban   37.0
## 3  65-69    Urban   54.6
## 4  70-74    Urban   71.1
## 5  65-69    Urban   35.1
## 6  70-74    Urban   50.0

dplyr exercise!

Use CO2 dataset

  1. Compute average uptake and standard deviation by plant type and treatment
  2. Compute a 95% confidence interval for the mean uptake of each plant type and treatment
  3. Create a new “Uptake per Concentration” column and compute a 95% confidence interval for its mean by plant type and treatment

dplyr exercise solution

head(CO2)

# Define groups by Type and Treatment; every summarise() below works per group.
CO2 <- group_by(CO2, Type, Treatment)

# 1. Mean and standard deviation of uptake by Type and Treatment.
summarise(CO2, uptake.mean = mean(uptake), uptake.sd = sd(uptake))

# 2. 95% confidence interval for the mean uptake by Type and Treatment.
#    The interval is mean +/- 1.96 * sd / sqrt(n) (normal approximation),
#    so the group size is collected alongside the mean and sd.
CO2.CI <- summarise(CO2, uptake.mean = mean(uptake), uptake.sd = sd(uptake), count = n())
CO2.CI <- mutate(
  CO2.CI,
  CI.U = uptake.mean + 1.96 * uptake.sd / sqrt(count),
  CI.L = uptake.mean - 1.96 * uptake.sd / sqrt(count)
)

# 3. Create the new "uptake per concentration" column ...
CO2 <- mutate(CO2, uptake_per_conc = uptake / conc)

#    ... and compute the same kind of 95% confidence interval for its mean.
CO2.UPC.CI <- summarise(
  CO2,
  upc.mean = mean(uptake_per_conc),
  upc.sd = sd(uptake_per_conc),
  count = n()
)
CO2.UPC.CI <- mutate(
  CO2.UPC.CI,
  CI.U = upc.mean + 1.96 * upc.sd / sqrt(count),
  CI.L = upc.mean - 1.96 * upc.sd / sqrt(count)
)

part 3: visualizing data


ggplot2 package


ggplot example

library(ggplot2)
# Columns available for plotting: Plant, Type, Treatment, conc, uptake.
head(CO2, n = 6L)
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

ggplot example (continued)

# Quick-plot shorthand; with x and y given, points are the default geom.
qplot(conc, uptake, data = CO2)

# The same scatterplot written with the full ggplot() grammar.
ggplot(CO2, aes(conc, uptake)) +
  geom_point()

ggplot example (continued)

# Colour the points by plant Type.
qplot(conc, uptake, data = CO2, colour = Type)

# ggplot() equivalent: colour is one more aesthetic mapping.
ggplot(CO2, aes(conc, uptake, colour = Type)) +
  geom_point()

ggplot example (continued)

# Split the plot into one panel (column) per Treatment.
qplot(conc, uptake, data = CO2, colour = Type) +
  facet_grid(. ~ Treatment)

# ggplot() equivalent.
ggplot(CO2, aes(conc, uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment)

ggplot example (continued)

# One line per plant: group = Plant tells the line geom which observations
# belong together; label_both puts "Treatment: <value>" in the panel strips.
qplot(x = conc, y = uptake, data = CO2, colour = Type, group = Plant, geom = "line") +
  facet_grid(. ~ Treatment, labeller = "label_both")

# ggplot syntax
# Plant is a column of CO2, so it must be mapped inside aes(); writing
# geom_line(group = Plant) makes R look for a standalone object named Plant
# and fails with "object 'Plant' not found".
# Option 1: map the grouping for the line layer only.
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line(aes(group = Plant))
# Option 2: map the grouping once for the whole plot.
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type, group = Plant)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line()

ggplot exercise!