- Reshaping Data
reshape() function; reshape2 package and melt() function
- Summarizing Data
dplyr package
- Visualizing Data
ggplot2 package
Amanda Mejia
BST 753 Spring 2014
reshape() function
reshape2 package and melt() function
dplyr package
ggplot2 package
What is tidy data?
What is messy data?
Source: Tidy Data (Wickham 2011)
To learn more about tidy data, check out Hadley Wickham's paper and presentation. Both have lots of examples.

reshape2 package: melt() <- for wide to long
reshape2 package: dcast() <- for long to wide
stats package: reshape() <- for long to wide
head(VADeaths, 10) #wide format
## Rural Male Rural Female Urban Male Urban Female
## 50-54 11.7 8.7 15.4 8.4
## 55-59 18.1 11.7 24.3 13.6
## 60-64 26.9 20.3 37.0 19.3
## 65-69 41.0 30.9 54.6 35.1
## 70-74 66.0 54.3 71.1 50.0
library(reshape2)
# VADeaths is a matrix, so melt() dispatches to the array method, not the
# data-frame method; open the help page for the method that actually runs
help(melt.array)
# go from wide to long format: one row per (row name, column name) pair.
# The array method has no id.vars argument (it would be silently ignored),
# so none is passed; the result has the default columns Var1, Var2, value.
VADeaths <- melt(data = VADeaths)
head(VADeaths)
## Var1 Var2 value
## 1 50-54 Rural Male 11.7
## 2 55-59 Rural Male 18.1
## 3 60-64 Rural Male 26.9
## 4 65-69 Rural Male 41.0
## 5 70-74 Rural Male 66.0
## 6 50-54 Rural Female 8.7
# Give the first and third columns descriptive names; Var2 is handled below
colnames(VADeaths)[c(1, 3)] <- c("Age", "Deaths")
# Var2 packs two variables (location and sex) into a single label.
# The match is case-sensitive, so "Male" does not match inside "Female".
is.rural <- grepl("Rural", VADeaths[["Var2"]])
VADeaths[["Location"]] <- ifelse(is.rural, "Rural", "Urban")
is.male <- grepl("Male", VADeaths[["Var2"]])
VADeaths[["Sex"]] <- ifelse(is.male, "Male", "Female")
head(VADeaths)
## Age Var2 Deaths Location Sex
## 1 50-54 Rural Male 11.7 Rural Male
## 2 55-59 Rural Male 18.1 Rural Male
## 3 60-64 Rural Male 26.9 Rural Male
## 4 65-69 Rural Male 41.0 Rural Male
## 5 70-74 Rural Male 66.0 Rural Male
## 6 50-54 Rural Female 8.7 Rural Female
# Var2 is no longer needed once Location and Sex have been derived from it
VADeaths[["Var2"]] <- NULL
# Another way: store the same information as logical (TRUE/FALSE) columns
VADeaths[["Rural"]] <- is.rural
VADeaths[["Male"]] <- is.male
head(VADeaths)
## Age Deaths Location Sex Rural Male
## 1 50-54 11.7 Rural Male TRUE TRUE
## 2 55-59 18.1 Rural Male TRUE TRUE
## 3 60-64 26.9 Rural Male TRUE TRUE
## 4 65-69 41.0 Rural Male TRUE TRUE
## 5 70-74 66.0 Rural Male TRUE TRUE
## 6 50-54 8.7 Rural Female TRUE FALSE
reshape() or dcast()
filter() - select a subset of rows (faster than subset())
select() - select a subset of columns
arrange() - orders rows by variable(s)
mutate() - add new columns
summarise() - compute mean, median, count, and other summaries
group_by():
arrange() and summarise()
library(dplyr)
head(VADeaths)
## Age Deaths Location Sex Rural Male
## 1 50-54 11.7 Rural Male TRUE TRUE
## 2 55-59 18.1 Rural Male TRUE TRUE
## 3 60-64 26.9 Rural Male TRUE TRUE
## 4 65-69 41.0 Rural Male TRUE TRUE
## 5 70-74 66.0 Rural Male TRUE TRUE
## 6 50-54 8.7 Rural Female TRUE FALSE
# Remove the logical helper columns again
VADeaths[["Rural"]] <- NULL
VADeaths[["Male"]] <- NULL
# Define groups by Age and Location; later verbs operate per group
VADeaths <- group_by(VADeaths, Age, Location)
# Select a contiguous range of columns (Age through Location, dropping Sex)
VADeaths <- select(VADeaths, Age:Location)
# Select specific columns by name, which also sets their order
VADeaths <- select(VADeaths, Age, Location, Deaths)
VADeaths
## Source: local data frame [20 x 3]
## Groups: Age, Location
##
## Age Location Deaths
## 1 50-54 Rural 11.7
## 2 55-59 Rural 18.1
## 3 60-64 Rural 26.9
## 4 65-69 Rural 41.0
## 5 70-74 Rural 66.0
## 6 50-54 Rural 8.7
## 7 55-59 Rural 11.7
## 8 60-64 Rural 20.3
## 9 65-69 Rural 30.9
## 10 70-74 Rural 54.3
## 11 50-54 Urban 15.4
## 12 55-59 Urban 24.3
## 13 60-64 Urban 37.0
## 14 65-69 Urban 54.6
## 15 70-74 Urban 71.1
## 16 50-54 Urban 8.4
## 17 55-59 Urban 13.6
## 18 60-64 Urban 19.3
## 19 65-69 Urban 35.1
## 20 70-74 Urban 50.0
# calculate mean deaths and number of observations in each group
# (the column holds mean(Deaths), so it is named MeanDeaths, not TotalDeaths)
summarise(VADeaths, MeanDeaths = mean(Deaths), Count = n())
## Source: local data frame [10 x 4]
## Groups: Age
##
## Age Location MeanDeaths Count
## 1 70-74 Urban 60.55 2
## 2 65-69 Urban 44.85 2
## 3 60-64 Urban 28.15 2
## 4 55-59 Urban 18.95 2
## 5 50-54 Urban 11.90 2
## 6 70-74 Rural 60.15 2
## 7 65-69 Rural 35.95 2
## 8 60-64 Rural 23.60 2
## 9 55-59 Rural 14.90 2
## 10 50-54 Rural 10.20 2
# keep only rows where Deaths is at least 20 and Location is "Urban"
filter(VADeaths, Deaths >= 20 & Location == "Urban")
## Source: local data frame [6 x 3]
## Groups: Age, Location
##
## Age Location Deaths
## 1 55-59 Urban 24.3
## 2 60-64 Urban 37.0
## 3 65-69 Urban 54.6
## 4 70-74 Urban 71.1
## 5 65-69 Urban 35.1
## 6 70-74 Urban 50.0
Use the CO2 dataset
head(CO2)
# group the rows by Type and Treatment
CO2 <- group_by(CO2, Type, Treatment)
# per-group mean and standard deviation of uptake
summarise(CO2, uptake.mean = mean(uptake), uptake.sd = sd(uptake))
# per-group mean, standard deviation and group size, as needed for the
# confidence interval
CO2.CI <- summarise(
  CO2,
  uptake.mean = mean(uptake),
  uptake.sd = sd(uptake),
  count = n()
)
# upper and lower limits: mean +/- 1.96 * sd / sqrt(n)
# (1.96 is presumably the normal quantile for a 95% interval)
CO2.CI <- mutate(
  CO2.CI,
  CI.U = uptake.mean + 1.96 * uptake.sd / sqrt(count),
  CI.L = uptake.mean - 1.96 * uptake.sd / sqrt(count)
)
# add a new 'uptake per concentration' column
CO2 <- mutate(CO2, uptake_per_conc = uptake / conc)
qplot() - "quick" plot, similar syntax to base graphics
ggplot() - full functionality of ggplot2
qplot() and build up to ggplot()
library(ggplot2)
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# Scatterplot of uptake against conc (qplot's default geom = point)
qplot(x = conc, y = uptake, data = CO2)
# ggplot syntax: the point layer is added explicitly
ggplot(data = CO2, mapping = aes(x = conc, y = uptake)) +
  geom_point()
# Same scatterplot, with points coloured by Type
qplot(x = conc, y = uptake, data = CO2, colour = Type)
# ggplot syntax
ggplot(data = CO2, mapping = aes(x = conc, y = uptake, colour = Type)) +
  geom_point()
# Coloured scatterplot split into one panel per Treatment
qplot(x = conc, y = uptake, data = CO2, colour = Type) +
  facet_grid(. ~ Treatment)
# ggplot syntax
ggplot(data = CO2, mapping = aes(x = conc, y = uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment)
# One line per Plant, coloured by Type, one labelled panel per Treatment
qplot(
  x = conc, y = uptake, data = CO2,
  colour = Type, group = Plant, geom = "line"
) +
  facet_grid(. ~ Treatment, labeller = "label_both")
# ggplot syntax: group must be mapped inside aes() so that Plant is looked up
# as a column of CO2. Written as geom_line(group = Plant), Plant is evaluated
# as an ordinary object and the call fails because no such object exists.
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line(aes(group = Plant))
# Equivalent: map group once in the top-level aes() so every layer inherits it
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type, group = Plant)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line()