http://cran.r-project.org/doc/contrib/Baggott-refcard-v2.pdf
http://cran.r-project.org/doc/contrib/Torfs+Brauer-Short-R-Intro.pdf
http://had.co.nz/stat480/r/
http://www.rstudio.com/wp-content/uploads/2015/01/data-wrangling-cheatsheet.pdf
load some required libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# build a small toy data frame from the first six rows of the iris dataset
df <- iris[seq_len(6), ]
print(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# show the structure of the data frame: dimensions, column types, first values
str(object = df)
## 'data.frame': 6 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1
# per-column summary: quartiles and mean for numeric columns, counts per level for factors
summary(object = df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.60 Min. :3.00 Min. :1.30 Min. :0.200
## 1st Qu.:4.75 1st Qu.:3.12 1st Qu.:1.40 1st Qu.:0.200
## Median :4.95 Median :3.35 Median :1.40 Median :0.200
## Mean :4.95 Mean :3.38 Mean :1.45 Mean :0.233
## 3rd Qu.:5.08 3rd Qu.:3.58 3rd Qu.:1.48 3rd Qu.:0.200
## Max. :5.40 Max. :3.90 Max. :1.70 Max. :0.400
## Species
## setosa :6
## versicolor:0
## virginica :0
##
##
##
# pull out the fifth row (all columns)
df[5L, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 5 5 3.6 1.4 0.2 setosa
# three equivalent ways to pull out the second column as a plain vector
# 1) by position
df[, 2L]
## [1] 3.5 3.0 3.2 3.1 3.6 3.9
# 2) by column name
df[, "Sepal.Width"]
## [1] 3.5 3.0 3.2 3.1 3.6 3.9
# 3) with the `$` extraction operator
df$Sepal.Width
## [1] 3.5 3.0 3.2 3.1 3.6 3.9
# flag the rows whose Sepal.Length is at least 5 (one logical value per row)
df$Sepal.Length >= 5
## [1] TRUE FALSE FALSE FALSE TRUE TRUE
# keep only the rows where that flag is TRUE
df[df$Sepal.Length >= 5, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# equivalent to selecting rows 1, 5, and 6 using an explicit logical vector
# (spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned)
df[c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# nicer syntax for the same thing
# note: the second argument of subset is a logical expression that is
# evaluated using the columns of df
subset(df, Sepal.Length >= 5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# given several conditions, filter keeps only the rows that satisfy all of them
df %>%
  filter(Sepal.Length >= 5, Petal.Length <= 1.4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.6 1.4 0.2 setosa
# add a column holding the sum of the sepal and petal widths
# verbose base-R form: stores the new column back into df
df[["total_width"]] <- df[["Sepal.Width"]] + df[["Petal.Width"]]
# tidier dplyr form
# note: mutate returns a data frame with the same number of rows as its input
df %>%
  mutate(total_width = Sepal.Width + Petal.Width)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species total_width
## 1 5.1 3.5 1.4 0.2 setosa 3.7
## 2 4.9 3.0 1.4 0.2 setosa 3.2
## 3 4.7 3.2 1.3 0.2 setosa 3.4
## 4 4.6 3.1 1.5 0.2 setosa 3.3
## 5 5.0 3.6 1.4 0.2 setosa 3.8
## 6 5.4 3.9 1.7 0.4 setosa 4.3
# average sepal length, computed directly on the column vector
mean(df[["Sepal.Length"]])
## [1] 4.95
# the same with dplyr; note: summarize collapses the input to a single row
df %>%
  summarize(mean_sepal_length = mean(Sepal.Length))
## mean_sepal_length
## 1 4.95
# note: from here on we use the full iris data frame instead of its first 6 rows
# use grep to find the indices of the column names that contain 'Length'
grep('Length', names(iris))
## [1] 1 3
# same, but return the matching names instead of their indices
# (value = TRUE spelled out: T is an ordinary variable that can be reassigned)
grep('Length', names(iris), value = TRUE)
## [1] "Sepal.Length" "Petal.Length"
# same, but return a logical vector with one TRUE/FALSE per column name
grepl('Length', names(iris))
## [1] TRUE FALSE TRUE FALSE FALSE
## [1] TRUE FALSE TRUE FALSE FALSE
# keep every row whose species name contains the pattern 'to'
iris %>%
  filter(grepl('to', Species))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
# same as above, but also require sepal length to be at least 5
# note: when combining conditions in one expression, always use the vectorized
# single '&' and '|' here, not the scalar double '&&' and '||'
# (passing the two conditions as separate, comma-separated arguments to filter
# gives the same rows)
filter(iris, grepl('to', Species) & Sepal.Length >= 5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.6 1.4 0.2 setosa
## 3 5.4 3.9 1.7 0.4 setosa
## 4 5.0 3.4 1.5 0.2 setosa
## 5 5.4 3.7 1.5 0.2 setosa
## 6 5.8 4.0 1.2 0.2 setosa
## 7 5.7 4.4 1.5 0.4 setosa
## 8 5.4 3.9 1.3 0.4 setosa
## 9 5.1 3.5 1.4 0.3 setosa
## 10 5.7 3.8 1.7 0.3 setosa
## 11 5.1 3.8 1.5 0.3 setosa
## 12 5.4 3.4 1.7 0.2 setosa
## 13 5.1 3.7 1.5 0.4 setosa
## 14 5.1 3.3 1.7 0.5 setosa
## 15 5.0 3.0 1.6 0.2 setosa
## 16 5.0 3.4 1.6 0.4 setosa
## 17 5.2 3.5 1.5 0.2 setosa
## 18 5.2 3.4 1.4 0.2 setosa
## 19 5.4 3.4 1.5 0.4 setosa
## 20 5.2 4.1 1.5 0.1 setosa
## 21 5.5 4.2 1.4 0.2 setosa
## 22 5.0 3.2 1.2 0.2 setosa
## 23 5.5 3.5 1.3 0.2 setosa
## 24 5.1 3.4 1.5 0.2 setosa
## 25 5.0 3.5 1.3 0.3 setosa
## 26 5.0 3.5 1.6 0.6 setosa
## 27 5.1 3.8 1.9 0.4 setosa
## 28 5.1 3.8 1.6 0.2 setosa
## 29 5.3 3.7 1.5 0.2 setosa
## 30 5.0 3.3 1.4 0.2 setosa
# list the distinct species values
unique(iris[["Species"]])
## [1] setosa versicolor virginica
## Levels: setosa versicolor virginica
# tally how many rows belong to each species
table(iris[["Species"]])
##
## setosa versicolor virginica
## 50 50 50
# the same per-species row count with dplyr, which yields a data frame rather than a table
iris %>%
  group_by(Species) %>%
  summarise(num_rows = n())
## Source: local data frame [3 x 2]
##
## Species num_rows
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
# per-species mean and standard deviation of sepal length, computed with dplyr
iris %>%
  group_by(Species) %>%
  summarise(
    mean_sepal_length = mean(Sepal.Length),
    sd_sepal_length = sd(Sepal.Length)
  )
## Source: local data frame [3 x 3]
##
## Species mean_sepal_length sd_sepal_length
## 1 setosa 5.006 0.3525
## 2 versicolor 5.936 0.5162
## 3 virginica 6.588 0.6359
# quick glimpse at the power of ggplot2
# look at the first rows of the built-in diamonds dataset
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# scatter plot of carat (y axis) against price (x axis), one panel per diamond color
# (written with ggplot() because qplot() is deprecated as of ggplot2 3.4.0)
ggplot(diamonds, aes(x = price, y = carat)) +
  geom_point() +
  facet_grid(. ~ color)
# see more plot examples here
# http://had.co.nz/stat480/r/graphics.html