Subsetting Data

Using Base R

df <- iris
df

Three ways of accessing values from a single column

df$Sepal.Length
##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
df[1]
df[,1]
##   [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4
##  [18] 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5
##  [35] 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0
##  [52] 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8
##  [69] 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4
##  [86] 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8
## [103] 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7
## [120] 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7
## [137] 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

Rows

df[1,]

We can get more than just one row or column

df[1:10,]
df[1:10, 1:2]

We can do negative selection

df[1:10, -1]
df[1:10, -(1:3)]

We can add columns

df$id <- 1:nrow(df)

df

Let’s try subsetting using a condition

which(df$Species == "virginica")
##  [1] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [18] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
## [35] 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150

If we just use the numbers, it looks like this

df[101:150,]

Replace it with the which() function we used before

df[which(df$Species == "virginica"), ]

Save it as a new data frame

test <- df[which(df$Species == "virginica"), ]

test

Try it numerically (name, petal width, and ID of all which have a Sepal Length less than 6)

test <- df[which(df$Sepal.Length < 6), 4:6]

test

Using dplyr

library(tidyverse)

Three useful functions for subsetting

filter()
select()
mutate()

A couple of other things from dplyr we’ll talk about later

group_by()
summarize()

Getting started with dplyr

filter() subsets observations (rows) based on a condition

setosas <- filter(df, Species == "setosa")

setosas

select() subsets variables (columns)

setosas_sepal <- select(setosas, Sepal.Length, Species, id)

setosas_sepal

Combining those together with regular R syntax looks like this:

setosas_sepal <- select(filter(df, Species == "setosa"), Sepal.Length, Species, id)

setosas_sepal

Using the Pipe

# Example from Hadley Wickham
foo_foo <- little_bunny()

bop(scoop_up(hop_through(foo_foo, forest), field_mouse), head)

foo_foo %>%
  hop_through(forest) %>%
  scoop_up(field_mouse) %>%
  bop(head)

We can apply this syntax to the iris data

setosas <- df %>%
  filter(Species == "setosa")

setosas
setosas <- df %>%
  filter(Species == "setosa") %>%
  select(Sepal.Length, Species, id)

setosas
df2 <- iris %>%
  mutate(id = row_number())

df2

Tidying data

Tidying the iris dataset with gather()

tidy <- df %>%
  gather(key = variable, value = value, Sepal.Length:Petal.Width)

tidy

Trying it with something a little messier

wide <- read_csv("http://dartgo.org/qbs_iris")
wide

Using gather() and separate() to clean the data

tidy <- wide %>%
  gather(key = bad_id, value = value, setosa_1:ncol(wide))

tidy
tidy <- wide %>%
  gather(key = bad_id, value = value, setosa_1:ncol(wide)) %>%
  separate(col = bad_id, into = c("species", "id"), sep = "_")

tidy

Getting back to our original dataset

df_normal <- wide %>%
  gather(key = bad_id, value = value, setosa_1:ncol(wide)) %>%
  separate(col = bad_id, into = c("species", "id"), sep = "_") %>%
  spread(key = variable, value = value)

df_normal

Manipulating/Aggregating data

Split - Apply - Combine
Using group_by() and summarize()

species_means <- df %>%
  group_by(Species) %>%
  summarize(pl = mean(Petal.Length), pw = mean(Petal.Width), sl = mean(Sepal.Length), sw = mean(Sepal.Width))

species_means

Using a Star Wars Example

starwars
sw_heights <- starwars %>%
  group_by(homeworld, species) %>%
  summarize(mean_height = mean(height))

sw_heights