cheatsheets and reference cards:

http://cran.r-project.org/doc/contrib/Baggott-refcard-v2.pdf
http://cran.r-project.org/doc/contrib/Torfs+Brauer-Short-R-Intro.pdf
http://had.co.nz/stat480/r/
http://www.rstudio.com/wp-content/uploads/2015/01/data-wrangling-cheatsheet.pdf

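load some required libraries: dplyr (data manipulation) and ggplot2 (plotting); the messages below are printed when dplyr is attached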

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# build a small toy data frame from the first six rows of the iris dataset
df <- head(iris, n = 6)
df

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# inspect the structure of the data frame:
# one line per column giving its type and its first few values
str(df)

## 'data.frame':    6 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1

# compute summary statistics for each column
# (min/quartiles/mean/max for numeric columns, per-level counts for the factor)
summary(df)

##   Sepal.Length   Sepal.Width    Petal.Length   Petal.Width   
##  Min.   :4.60   Min.   :3.00   Min.   :1.30   Min.   :0.200  
##  1st Qu.:4.75   1st Qu.:3.12   1st Qu.:1.40   1st Qu.:0.200  
##  Median :4.95   Median :3.35   Median :1.40   Median :0.200  
##  Mean   :4.95   Mean   :3.38   Mean   :1.45   Mean   :0.233  
##  3rd Qu.:5.08   3rd Qu.:3.58   3rd Qu.:1.48   3rd Qu.:0.200  
##  Max.   :5.40   Max.   :3.90   Max.   :1.70   Max.   :0.400  
##        Species 
##  setosa    :6  
##  versicolor:0  
##  virginica :0  
##                
##                
##

# look at the fifth row
# (the empty column slot after the comma keeps every column)

df[5, ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 5            5         3.6          1.4         0.2  setosa

# three different ways to look at the second column
# (each one prints the same plain numeric vector)
# by index (the empty row slot before the comma keeps every row)
df[ , 2]

## [1] 3.5 3.0 3.2 3.1 3.6 3.9

# by name (a quoted column name in the column slot)
df[ , "Sepal.Width"]

## [1] 3.5 3.0 3.2 3.1 3.6 3.9

# extracting the column vector with the $ operator
df$Sepal.Width

## [1] 3.5 3.0 3.2 3.1 3.6 3.9

# find rows with Sepal.Length at least 5
# (the comparison is applied to the whole column: one TRUE/FALSE per row)
df$Sepal.Length >= 5

## [1]  TRUE FALSE FALSE FALSE  TRUE  TRUE

# select all rows with Sepal.Length at least 5
# (a logical vector in the row slot keeps the rows marked TRUE)
df[df$Sepal.Length >= 5, ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# equivalent to selecting rows 1, 5, and 6 using a logical vector
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words)
df[c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE), ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# nicer syntax for the same thing
# note: the second argument of subset is an expression, written in terms of
# the columns of df, that yields a logical vector
subset(df, Sepal.Length >= 5)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# filter ANDs conditions when given multiple arguments
# note: this is dplyr's filter, which masks stats::filter (see the startup message)
# note: the row labels of the result are renumbered starting from 1
filter(df, Sepal.Length >= 5, Petal.Length <= 1.4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          5.0         3.6          1.4         0.2  setosa

# create a new column by adding the sepal and petal widths
# ugly: base R assignment, which stores the new column in df itself
df$total_width <- df$Sepal.Width + df$Petal.Width
# nicer
# note: mutate always gives back same number of rows
# note: the result is only printed here, not assigned back to df; because df
# already has total_width from the assignment above, mutate recomputes that
# column rather than adding a second one
mutate(df, total_width=Sepal.Width + Petal.Width)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species total_width
## 1          5.1         3.5          1.4         0.2  setosa         3.7
## 2          4.9         3.0          1.4         0.2  setosa         3.2
## 3          4.7         3.2          1.3         0.2  setosa         3.4
## 4          4.6         3.1          1.5         0.2  setosa         3.3
## 5          5.0         3.6          1.4         0.2  setosa         3.8
## 6          5.4         3.9          1.7         0.4  setosa         4.3

# compute the average sepal length
# (base R: the result is a single number)
mean(df$Sepal.Length)

## [1] 4.95

# note: summarize gives back one row
# (dplyr: the result is a one-row data frame whose column carries the given name)
summarize(df, mean_sepal_length=mean(Sepal.Length))

##   mean_sepal_length
## 1              4.95

# note: we're using the full iris dataframe (instead of the first 6 rows) from here on

# use grep to find indices of the column names that contain 'Length'
grep('Length', names(iris))

## [1] 1 3

# same, but return the values instead of indices
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
grep('Length', names(iris), value=TRUE)

## [1] "Sepal.Length" "Petal.Length"

# same, but return a logical vector marking which names match
grepl('Length', names(iris))

## [1]  TRUE FALSE  TRUE FALSE FALSE

# find all rows where the species matches the pattern 'to'
# (of the three species names, only 'setosa' contains 'to')
filter(iris, grepl('to', Species))

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.0          1.4         0.1  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa
## 18          5.1         3.5          1.4         0.3  setosa
## 19          5.7         3.8          1.7         0.3  setosa
## 20          5.1         3.8          1.5         0.3  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 22          5.1         3.7          1.5         0.4  setosa
## 23          4.6         3.6          1.0         0.2  setosa
## 24          5.1         3.3          1.7         0.5  setosa
## 25          4.8         3.4          1.9         0.2  setosa
## 26          5.0         3.0          1.6         0.2  setosa
## 27          5.0         3.4          1.6         0.4  setosa
## 28          5.2         3.5          1.5         0.2  setosa
## 29          5.2         3.4          1.4         0.2  setosa
## 30          4.7         3.2          1.6         0.2  setosa
## 31          4.8         3.1          1.6         0.2  setosa
## 32          5.4         3.4          1.5         0.4  setosa
## 33          5.2         4.1          1.5         0.1  setosa
## 34          5.5         4.2          1.4         0.2  setosa
## 35          4.9         3.1          1.5         0.2  setosa
## 36          5.0         3.2          1.2         0.2  setosa
## 37          5.5         3.5          1.3         0.2  setosa
## 38          4.9         3.6          1.4         0.1  setosa
## 39          4.4         3.0          1.3         0.2  setosa
## 40          5.1         3.4          1.5         0.2  setosa
## 41          5.0         3.5          1.3         0.3  setosa
## 42          4.5         2.3          1.3         0.3  setosa
## 43          4.4         3.2          1.3         0.2  setosa
## 44          5.0         3.5          1.6         0.6  setosa
## 45          5.1         3.8          1.9         0.4  setosa
## 46          4.8         3.0          1.4         0.3  setosa
## 47          5.1         3.8          1.6         0.2  setosa
## 48          4.6         3.2          1.4         0.2  setosa
## 49          5.3         3.7          1.5         0.2  setosa
## 50          5.0         3.3          1.4         0.2  setosa

# same as above, but also require sepal length to be at least 5
# note: when combining conditions in a single expression, always use the
# elementwise '&' and '|' here, not the double '&&' and '||'
filter(iris, grepl('to', Species) & Sepal.Length >= 5)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           5.0         3.6          1.4         0.2  setosa
## 3           5.4         3.9          1.7         0.4  setosa
## 4           5.0         3.4          1.5         0.2  setosa
## 5           5.4         3.7          1.5         0.2  setosa
## 6           5.8         4.0          1.2         0.2  setosa
## 7           5.7         4.4          1.5         0.4  setosa
## 8           5.4         3.9          1.3         0.4  setosa
## 9           5.1         3.5          1.4         0.3  setosa
## 10          5.7         3.8          1.7         0.3  setosa
## 11          5.1         3.8          1.5         0.3  setosa
## 12          5.4         3.4          1.7         0.2  setosa
## 13          5.1         3.7          1.5         0.4  setosa
## 14          5.1         3.3          1.7         0.5  setosa
## 15          5.0         3.0          1.6         0.2  setosa
## 16          5.0         3.4          1.6         0.4  setosa
## 17          5.2         3.5          1.5         0.2  setosa
## 18          5.2         3.4          1.4         0.2  setosa
## 19          5.4         3.4          1.5         0.4  setosa
## 20          5.2         4.1          1.5         0.1  setosa
## 21          5.5         4.2          1.4         0.2  setosa
## 22          5.0         3.2          1.2         0.2  setosa
## 23          5.5         3.5          1.3         0.2  setosa
## 24          5.1         3.4          1.5         0.2  setosa
## 25          5.0         3.5          1.3         0.3  setosa
## 26          5.0         3.5          1.6         0.6  setosa
## 27          5.1         3.8          1.9         0.4  setosa
## 28          5.1         3.8          1.6         0.2  setosa
## 29          5.3         3.7          1.5         0.2  setosa
## 30          5.0         3.3          1.4         0.2  setosa

# get a list of all unique species
# (Species is a factor, so the result is a factor and its levels are printed too)
unique(iris$Species)

## [1] setosa     versicolor virginica 
## Levels: setosa versicolor virginica

# count the number of rows for each species
# (the result is a table object, not a data frame)
table(iris$Species)

## 
##     setosa versicolor  virginica 
##         50         50         50

# the same per-species row counts with dplyr; the result is a data frame
# (instead of a table), with one row per group
iris %>%
  group_by(Species) %>%
  summarize(num_rows = n())

## Source: local data frame [3 x 2]
## 
##      Species num_rows
## 1     setosa       50
## 2 versicolor       50
## 3  virginica       50

# use dplyr to group by species, then compute the average and the standard
# deviation of sepal length within each group
iris %>%
  group_by(Species) %>%
  summarize(
    mean_sepal_length = mean(Sepal.Length),
    sd_sepal_length = sd(Sepal.Length)
  )

## Source: local data frame [3 x 3]
## 
##      Species mean_sepal_length sd_sepal_length
## 1     setosa             5.006          0.3525
## 2 versicolor             5.936          0.5162
## 3  virginica             6.588          0.6359

# quick glimpse at the power of ggplot2
# look at the diamonds dataset that ships with ggplot2
head(diamonds)

##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

# plot the number of carats vs. price, split by diamond color
# (one scatter-plot panel per color, laid out side by side)
# written with ggplot() rather than qplot(), which is deprecated in current
# ggplot2; qplot() picks points for x/y input and facet_grid for a
# two-sided facets formula, so the resulting plot is the same
ggplot(diamonds, aes(x = price, y = carat)) +
  geom_point() +
  facet_grid(. ~ color)

[figure: scatter plots of carat (y) vs. price (x), one panel per diamond color]

# see more plot examples here
# http://had.co.nz/stat480/r/graphics.html

A Brief Introduction to Data Manipulation in R

Jake Hofman

February 6, 2015

cheatsheets and reference cards: