Each type of observational unit is a table (won’t talk much about this)
Variables stored in both rows and columns
Source: Tidy Data (Wickham 2011)
To learn more about tidy data, check out Hadley Wickham’s paper and presentation. Both have lots of examples.
reshape2 package: melt() <- for wide to long
reshape2 package: dcast() / acast() <- for long to wide
stats package: reshape() <- for long to wide
head(VADeaths, 10) #wide format
## Rural Male Rural Female Urban Male Urban Female
## 50-54 11.7 8.7 15.4 8.4
## 55-59 18.1 11.7 24.3 13.6
## 60-64 26.9 20.3 37.0 19.3
## 65-69 41.0 30.9 54.6 35.1
## 70-74 66.0 54.3 71.1 50.0
library(reshape2)
# VADeaths is a matrix, so melt() dispatches to the array/matrix method,
# not melt.data.frame()
help(melt.array)
# go from wide to long format
# the matrix method has no id.vars argument (it would be silently ignored):
# each dimension becomes a column (Var1 = row names, Var2 = column names)
# and the cell contents go into `value`
# when melting a data frame we'd usually specify one or more id variables
# in the id.vars argument of melt.data.frame()
# note: this assignment creates a long-format copy that masks the built-in dataset
VADeaths <- melt(data = VADeaths)
head(VADeaths)
## Var1 Var2 value
## 1 50-54 Rural Male 11.7
## 2 55-59 Rural Male 18.1
## 3 60-64 Rural Male 26.9
## 4 65-69 Rural Male 41.0
## 5 70-74 Rural Male 66.0
## 6 50-54 Rural Female 8.7
# Give the melted columns meaningful names
colnames(VADeaths) <- c("Age", "LocationSex", "Deaths")
# LocationSex (formerly Var2) packs 2 variables into one column:
# flag each row first, then derive one column per variable.
# grepl() is case-sensitive, so the pattern "Male" does not match "Female".
is.rural <- grepl("Rural", VADeaths[["LocationSex"]])
is.male <- grepl("Male", VADeaths[["LocationSex"]])
VADeaths[["Location"]] <- ifelse(is.rural, "Rural", "Urban")
VADeaths[["Sex"]] <- ifelse(is.male, "Male", "Female")
head(VADeaths)
## Age LocationSex Deaths Location Sex
## 1 50-54 Rural Male 11.7 Rural Male
## 2 55-59 Rural Male 18.1 Rural Male
## 3 60-64 Rural Male 26.9 Rural Male
## 4 65-69 Rural Male 41.0 Rural Male
## 5 70-74 Rural Male 66.0 Rural Male
## 6 50-54 Rural Female 8.7 Rural Female
# The combined column is now redundant, so drop it
VADeaths[["LocationSex"]] <- NULL
# Alternative encoding: store each two-level variable as a logical column
VADeaths[["Rural"]] <- is.rural
VADeaths[["Male"]] <- is.male
head(VADeaths)
## Age Deaths Location Sex Rural Male
## 1 50-54 11.7 Rural Male TRUE TRUE
## 2 55-59 18.1 Rural Male TRUE TRUE
## 3 60-64 26.9 Rural Male TRUE TRUE
## 4 65-69 41.0 Rural Male TRUE TRUE
## 5 70-74 66.0 Rural Male TRUE TRUE
## 6 50-54 8.7 Rural Female TRUE FALSE
# Remove both logical columns again; the character columns are kept
VADeaths[c("Rural", "Male")] <- NULL
Go from long to wide with reshape() or dcast()
# Spread the Sex levels into separate Deaths.* columns:
# one row per Age/Location combination
VADeaths.wide <- reshape(
  data = VADeaths,
  direction = "wide",
  idvar = c("Age", "Location"),
  timevar = "Sex"
)
head(VADeaths.wide)
## Age Location Deaths.Male Deaths.Female
## 1 50-54 Rural 11.7 8.7
## 2 55-59 Rural 18.1 11.7
## 3 60-64 Rural 26.9 20.3
## 4 65-69 Rural 41.0 30.9
## 5 70-74 Rural 66.0 54.3
## 11 50-54 Urban 15.4 8.4
filter() - select a subset of rows (faster than subset())
select() - select a subset of columns
arrange() - orders rows by variable(s)
mutate() - add new columns
summarise() - compute mean, median, count, and other summaries
group_by() - define groups of rows; useful together with arrange() and summarise()
library(dplyr)
head(VADeaths)
## Age Deaths Location Sex
## 1 50-54 11.7 Rural Male
## 2 55-59 18.1 Rural Male
## 3 60-64 26.9 Rural Male
## 4 65-69 41.0 Rural Male
## 5 70-74 66.0 Rural Male
## 6 50-54 8.7 Rural Female
# Group by Age and Location, then trim and reorder the columns in one chain
VADeaths <- VADeaths %>%
  group_by(Age, Location) %>%
  select(Age:Location) %>% # a range of adjacent columns (drops Sex)
  select(Age, Location, Deaths) # explicit columns, in the order listed
head(VADeaths)
## Source: local data frame [6 x 3]
## Groups: Age, Location [5]
##
## Age Location Deaths
## <fctr> <chr> <dbl>
## 1 50-54 Rural 11.7
## 2 55-59 Rural 18.1
## 3 60-64 Rural 26.9
## 4 65-69 Rural 41.0
## 5 70-74 Rural 66.0
## 6 50-54 Rural 8.7
# per Age/Location group: average deaths and how many rows were averaged
VADeaths %>%
  summarise(AvgDeaths = mean(Deaths), Count = n())
## Source: local data frame [10 x 4]
## Groups: Age [?]
##
## Age Location AvgDeaths Count
## <fctr> <chr> <dbl> <int>
## 1 50-54 Rural 10.20 2
## 2 50-54 Urban 11.90 2
## 3 55-59 Rural 14.90 2
## 4 55-59 Urban 18.95 2
## 5 60-64 Rural 23.60 2
## 6 60-64 Urban 28.15 2
## 7 65-69 Rural 35.95 2
## 8 65-69 Urban 44.85 2
## 9 70-74 Rural 60.15 2
## 10 70-74 Urban 60.55 2
# keep only urban rows with at least 20 deaths (both conditions must hold)
VADeaths %>%
  filter(Deaths >= 20 & Location == "Urban")
## Source: local data frame [6 x 3]
## Groups: Age, Location [4]
##
## Age Location Deaths
## <fctr> <chr> <dbl>
## 1 55-59 Urban 24.3
## 2 60-64 Urban 37.0
## 3 65-69 Urban 54.6
## 4 70-74 Urban 71.1
## 5 65-69 Urban 35.1
## 6 70-74 Urban 50.0
Use CO2 dataset
head(CO2)
# group the rows by Type and Treatment
# (the grouped copy masks the built-in CO2 dataset from here on)
CO2 <- CO2 %>% group_by(Type, Treatment)
# mean and standard deviation of uptake within each Type/Treatment group
CO2 %>%
  summarise(uptake.mean = mean(uptake), uptake.sd = sd(uptake))
# ingredients for a confidence interval per group (mean, sd, group size),
# then the upper/lower limits: mean +/- 1.96 * sd / sqrt(n)
CO2.CI <- CO2 %>%
  summarise(
    uptake.mean = mean(uptake),
    uptake.sd = sd(uptake),
    count = n()
  ) %>%
  mutate(
    CI.U = uptake.mean + 1.96 * uptake.sd / sqrt(count),
    CI.L = uptake.mean - 1.96 * uptake.sd / sqrt(count)
  )
# add an "uptake per concentration" column to the grouped data
CO2 <- CO2 %>% mutate(uptake_per_conc = uptake / conc)
qplot() - “quick” plot, similar syntax to base graphics
ggplot() - full functionality of ggplot2
Start with qplot() and build up to ggplot()
library(ggplot2)
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2

# Scatter plot of uptake against concentration (qplot's default geom is point)
qplot(conc, uptake, data = CO2)
# same plot, ggplot syntax
ggplot(CO2, aes(conc, uptake)) +
  geom_point()

# Colour the points by Type
qplot(conc, uptake, data = CO2, colour = Type)
# same plot, ggplot syntax
ggplot(CO2, aes(conc, uptake, colour = Type)) +
  geom_point()

# One panel per Treatment, laid out side by side
qplot(conc, uptake, data = CO2, colour = Type) +
  facet_grid(. ~ Treatment)
# same plot, ggplot syntax
ggplot(CO2, aes(conc, uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment)
# One line per Plant, coloured by Type, one panel per Treatment
# (label_both puts "Treatment: <level>" in the panel strips)
qplot(x = conc, y = uptake, data = CO2, colour = Type, group = Plant, geom = "line") +
  facet_grid(. ~ Treatment, labeller = "label_both")
#ggplot syntax
# group must be mapped inside aes() so that Plant is looked up in the data;
# geom_line(group = Plant) would look for an R object called Plant and
# fail with "object 'Plant' not found"
# variant 1: grouping applies to the line layer only
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line(aes(group = Plant))
# variant 2: grouping set in the plot-level aes() and inherited by every layer
ggplot(data = CO2, aes(x = conc, y = uptake, colour = Type, group = Plant)) +
  geom_point() +
  facet_grid(. ~ Treatment, labeller = "label_both") +
  geom_line()