Subsetting Data Frames

Sean Wen

15th January 2019

Prologue

Dataset

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Modify the dataset slightly for our session
df <- data.frame("Model"=row.names(mtcars), mtcars, stringsAsFactors=FALSE)

row.names(df) <- NULL

head(df)
##               Model  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6           Valiant 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Indexing

Indexing

# Select one rows (and all columns)
df.small <- df[1, ]
print(df.small)
##       Model mpg cyl disp  hp drat   wt  qsec vs am gear carb
## 1 Mazda RX4  21   6  160 110  3.9 2.62 16.46  0  1    4    4
# Select several contiguous rows (and all columns)
df.small <- df[1:2, ]
print(df.small)
##           Model mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1     Mazda RX4  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2 Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4
# Select several non-contiguous rows (and all columns)
df.small <- df[c(1,3, 6), ]
print(df.small)
##        Model  mpg cyl disp  hp drat   wt  qsec vs am gear carb
## 1  Mazda RX4 21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
## 3 Datsun 710 22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
## 6    Valiant 18.1   6  225 105 2.76 3.46 20.22  1  0    3    1

Indexing

# Select one column (and all rows)
df.small <- df[, 1, drop=FALSE]
head(df.small)
##               Model
## 1         Mazda RX4
## 2     Mazda RX4 Wag
## 3        Datsun 710
## 4    Hornet 4 Drive
## 5 Hornet Sportabout
## 6           Valiant
# Select several contiguous columns (and all rows)
df.small <- df[, 1:2]
head(df.small)
##               Model  mpg
## 1         Mazda RX4 21.0
## 2     Mazda RX4 Wag 21.0
## 3        Datsun 710 22.8
## 4    Hornet 4 Drive 21.4
## 5 Hornet Sportabout 18.7
## 6           Valiant 18.1
# Select several non-contiguous columns (and all rows)
df.small <- df[, c(1,3, 6)]
head(df.small)
##               Model cyl drat
## 1         Mazda RX4   6 3.90
## 2     Mazda RX4 Wag   6 3.90
## 3        Datsun 710   4 3.85
## 4    Hornet 4 Drive   6 3.08
## 5 Hornet Sportabout   8 3.15
## 6           Valiant   6 2.76

Indexing

# Select one rows and column
df.small <- df[1, 1]
df.small
## [1] "Mazda RX4"
# Select several contiguous rows and columns
df.small <- df[1:2, 1:2]
print(df.small)
##           Model mpg
## 1     Mazda RX4  21
## 2 Mazda RX4 Wag  21
# Select several non-contiguous rows and columns
df.small <- df[c(1,3, 6), c(1,3, 6)]
print(df.small)
##        Model cyl drat
## 1  Mazda RX4   6 3.90
## 3 Datsun 710   4 3.85
## 6    Valiant   6 2.76

Indexing

which() function

# Create vector of characters as an example
random <- sample(c("A", "B", "C"), size=10, replace=TRUE)
random
##  [1] "A" "B" "B" "B" "B" "C" "B" "C" "A" "C"
# Perform logical test. TRUE indicates the specified condition is met
random=="A"
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
# which() indicates the order number which corresponds to TRUE
which(random=="A")
## [1] 1 9
# Using the square bracket notation, we subset our desired character
random[which(random=="A")]
## [1] "A" "A"

which() function

df.small <- df[which(df$Model=="Datsun 710"), ]

which() function

# Using AND operator
df.small <- df[which(df$cyl==6 & df$disp <= 160), ]
dim(df.small)
## [1]  3 12
# Using OR operator
df.small <- df[which(df$cyl==6 | df$disp <= 160), ]
dim(df.small)
## [1] 18 12

which() function

# First specific your conditions
conditions <- c("Hornet 4 Drive", "Valiant", "Merc 280", "onda Civic", "Lotus Europa")

# Subset base on your specified conditions
df.small <- df[which(df$Model %in% conditions), ]
print(df.small)
##             Model  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 4  Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 6         Valiant 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 10       Merc 280 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 28   Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2

subset() function

df.small <- subset(df, subset=(Model=="Datsun 710"), )
print(df.small)
##        Model  mpg cyl disp hp drat   wt  qsec vs am gear carb
## 3 Datsun 710 22.8   4  108 93 3.85 2.32 18.61  1  1    4    1

subset() function

# Using AND operator
df.small <- subset(df, subset=(cyl==6 & disp <= 160), )
dim(df.small)
## [1]  3 12
# Using OR operator
df.small <- subset(df, subset=(cyl==6 | disp <= 160), )
dim(df.small)
## [1] 18 12

grep() function

# Subset every Madza model
df.small <- df[grep("Mazda", df$Model), ]
print(df.small)
##           Model mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1     Mazda RX4  21   6  160 110  3.9 2.620 16.46  0  1    4    4
## 2 Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4
# Subset every Merc model
df.small <- df[grep("Merc", df$Model), ]
print(df.small)
##          Model  mpg cyl  disp  hp drat   wt qsec vs am gear carb
## 8    Merc 240D 24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
## 9     Merc 230 22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
## 10    Merc 280 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
## 11   Merc 280C 17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
## 12  Merc 450SE 16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
## 13  Merc 450SL 17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
## 14 Merc 450SLC 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3

grep() function

# Create vector of characters as an example
random <- sample(c("Alice", "Allen", "Adam"), size=10, replace=TRUE)
random
##  [1] "Adam"  "Allen" "Alice" "Allen" "Alice" "Adam"  "Adam"  "Allen"
##  [9] "Adam"  "Alice"
# grep() indicates the order number which corresponds to TRUE
grep("Ad", random)
## [1] 1 6 7 9
# Using the square bracket notation, we subset our desired character
random[grep("Ad", random)]
## [1] "Adam" "Adam" "Adam" "Adam"

Subsetting columns

# Subsetting one column
df.small <- df[, "Model", drop=FALSE]
head(df.small)
##               Model
## 1         Mazda RX4
## 2     Mazda RX4 Wag
## 3        Datsun 710
## 4    Hornet 4 Drive
## 5 Hornet Sportabout
## 6           Valiant
# Subsetting multiple columns
df.small <- df[, c("Model", "wt")]
head(df.small)
##               Model    wt
## 1         Mazda RX4 2.620
## 2     Mazda RX4 Wag 2.875
## 3        Datsun 710 2.320
## 4    Hornet 4 Drive 3.215
## 5 Hornet Sportabout 3.440
## 6           Valiant 3.460

Subsetting columns

# First specific your conditions
conditions <- c("Model", "disp", "vs", "am", "gear")

# Subset base on your specified conditions
df.small <- df[, names(df.small) %in% conditions]
head(df.small)
##               Model cyl  hp    wt vs gear
## 1         Mazda RX4   6 110 2.620  0    4
## 2     Mazda RX4 Wag   6 110 2.875  0    4
## 3        Datsun 710   4  93 2.320  1    4
## 4    Hornet 4 Drive   6 110 3.215  1    3
## 5 Hornet Sportabout   8 175 3.440  0    3
## 6           Valiant   6 105 3.460  1    3

Subsetting columns

# Subset columns begining with c
df.small <- df[, grep("^c", names(df))]
head(df.small)
##   cyl carb
## 1   6    4
## 2   6    4
## 3   4    1
## 4   6    1
## 5   8    2
## 6   6    1

Subsetting both rows and columns at the same time

df.small <- df[grep("Merc", df$Model), c("Model", "wt")]
head(df.small)
##         Model   wt
## 8   Merc 240D 3.19
## 9    Merc 230 3.15
## 10   Merc 280 3.44
## 11  Merc 280C 3.44
## 12 Merc 450SE 4.07
## 13 Merc 450SL 3.73

Summary