Adapted from: “Getting and Cleaning Data”, week 3 - lesson 1 by Johns Hopkins University (Coursera)
Create a Data Frame
Setting up data frame…
set.seed(2777)
X <- data.frame("North"=(sample(1:9))*4,
"South"=(sample(1:9))*12,
"West"=(sample(1:9))*3,
"East"=(sample(1:9))*5)
X
Generating random NAs…
(credits: https://www.r-bloggers.com/function-to-generate-a-random-data-set/)
NAins <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
X <- NAins(X, .1)
X
Subsetting (Quick Review)
Subsetting column by number:
X[,1]
[1] 24 20 16 12 8 28 4 32 36
Subsetting column by name:
X[,"South"]
[1] 72 96 60 108 24 84 NA 36 12
X$South
[1] 72 96 60 108 24 84 NA 36 12
identical(X[,"South"], X$South)
[1] TRUE
Subsetting rows and columns
X[3:5, c("East", "West")]
Subset using logicals ands and ors
With and (&)
X[(X$East >= 30 & X$North <= 30),]
With or (|)
X[(X$West >= 30 | X$South <= 50),]
Dealing with missing values (NA)
If I’m dealing with NA values subsetting on NAs will not produce the actual rows, so you want to use the which() command. So if you use which(), it will return the indices where selected variable(s) is greater than 0 in this case.
So when it returns the indices where selected variable(s) is greater than 0, it doesn’t return the NAs, and so you can subset the data set.
X[which(c(X$North > 0 & X$South > 0 & X$West > 0 & X$East > 0)),]
Sorting
Sort low to high (increase)
sort(X$East)
[1] 5 10 15 20 25 30 35 40
Sort high to low (decrease)
sort(X$South, decreasing = T)
[1] 108 96 84 72 60 36 24 12
Sort decreasing with NA in last
sort(X$West, decreasing = T, na.last = T)
[1] 27 24 18 15 9 6 3 NA NA
Ordering
Ordering data by variable North
X[order(X$North),]
Decreasing order by North while subsetting columns South & West
X[order(X$North, decreasing = T),c("South", "West")]
Ordering by multiple values
X[order(X$North, X$South),]
Ordering with plyr
Load the package
library(plyr)
Increasing order with arrange() from plyr
arrange(X, West)
Decreasing order with arrange() from plyr
arrange(X, desc(East))
Adding rows and columns
Adding column
X$Southeast <- (X$South + X$East) / 2
X
Another way to add column
Y <- cbind(X, Northwest=(X$North + X$West)/2)
Y
Add row
Y <- rbind(X, c(40, 48, 12, 45, (48+40)/2))
Y
END
LS0tCnRpdGxlOiAiU3Vic2V0dGluZyBhbmQgU29ydGluZyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyBBZGFwdGVkIGZyb206ICJHZXR0aW5nIGFuZCBDbGVhbmluZyBEYXRhIiwgd2VlayAzIC0gbGVzc29uIDEgYnkgSm9obnMgSG9wa2lucyBVbml2ZXJzaXR5IChDb3Vyc2VyYSkKCjxiciAvPgo8YnIgLz4KCiMjIyBDcmVhdGUgYSBEYXRhIEZyYW1lCgo8YnIgLz4KClNldHRpbmcgdXAgZGF0YSBmcmFtZS4uLgoKYGBge3J9CnNldC5zZWVkKDI3NzcpClggPC0gZGF0YS5mcmFtZSgiTm9ydGgiPShzYW1wbGUoMTo5KSkqNCwKICAgICAgICAgICAgICAgICJTb3V0aCI9KHNhbXBsZSgxOjkpKSoxMiwKICAgICAgICAgICAgICAgICJXZXN0Ij0oc2FtcGxlKDE6OSkpKjMsCiAgICAgICAgICAgICAgICAiRWFzdCI9KHNhbXBsZSgxOjkpKSo1KQoKWApgYGAKCjxiciAvPgoKR2VuZXJhdGluZyByYW5kb20gTkFzLi4uCgooY3JlZGl0czogaHR0cHM6Ly93d3cuci1ibG9nZ2Vycy5jb20vZnVuY3Rpb24tdG8tZ2VuZXJhdGUtYS1yYW5kb20tZGF0YS1zZXQvKQpgYGB7cn0KTkFpbnMgPC0gZnVuY3Rpb24oZGYsIHByb3AgPSAuMSl7CiAgICBuIDwtIG5yb3coZGYpCiAgICBtIDwtIG5jb2woZGYpCiAgICBudW0udG8ubmEgPC0gY2VpbGluZyhwcm9wKm4qbSkKICAgIGlkIDwtIHNhbXBsZSgwOihtKm4tMSksIG51bS50by5uYSwgcmVwbGFjZSA9IEZBTFNFKQogICAgcm93cyA8LSBpZCAlLyUgbSArIDEKICAgIGNvbHMgPC0gaWQgJSUgbSArIDEKICAgIHNhcHBseShzZXEobnVtLnRvLm5hKSwgZnVuY3Rpb24oeCl7CiAgICAgICAgICAgIGRmW3Jvd3NbeF0sIGNvbHNbeF1dIDw8LSBOQQogICAgICAgIH0KICAgICkKICAgIHJldHVybihkZikKfQoKWCA8LSBOQWlucyhYLCAuMSkKWApgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBTdWJzZXR0aW5nIChRdWljayBSZXZpZXcpCgo8YnIgLz4KClN1YnNldHRpbmcgKmNvbHVtbiogYnkgKipudW1iZXIqKjoKYGBge3J9ClhbLDFdCmBgYAoKPGJyIFw+CgpTdWJzZXR0aW5nICpjb2x1bW4qIGJ5ICoqbmFtZSoqOgpgYGB7cn0KWFssIlNvdXRoIl0KWCRTb3V0aAppZGVudGljYWwoWFssIlNvdXRoIl0sIFgkU291dGgpCmBgYAoKPGJyIFw+ClN1YnNldHRpbmcgKnJvd3MqIGFuZCAqY29sdW1ucyoKYGBge3J9ClhbMzo1LCBjKCJFYXN0IiwgIldlc3QiKV0KYGBgCgo8YnIgLz4KCi0tLQoKPGJyIC8+CgojIyMgU3Vic2V0IHVzaW5nIGxvZ2ljYWxzIDx1PioqYW5kKio8L3U+cyBhbmQgPHU+KipvcioqPC91PnMKCjxiciAvPgpXaXRoICoqYW5kICgmKSoqCmBgYHtyfQpYWyhYJEVhc3QgPj0gMzAgJiBYJE5vcnRoIDw9IDMwKSxdCmBgYAoKPGJyIC8+CldpdGggKipvciAofCkqKgpgYGB7cn0KWFsoWCRXZXN0ID49IDMwIHwgWCRTb3V0aCA8PSA1MCksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBEZWFsaW5nIHdpdGggbWlzc2luZyB2YWx1ZXMgKE5BKQoKPGJyIC8+CklmIEknbSBkZWFsaW5nIHdpdGggTkEgdmFsdWVzIHN1YnNldHRpbmcgb24gTkFzIHdpbGwgbm90IHByb2R1Y2UgdGhlIGFjdHVhbCByb3dzLCBzbyB5b3Ugd2FudCB0byB1c2UgdGhlICoqd2hpY2goKSoqIGNvbW1hbmQuIFNvIGlmIHlvdSB1c2UgKip3aGljaCgpKiosIGl0IHdpbGwgcmV0dXJuIHRoZSBpbmRpY2VzIHdoZXJlIHNlbGVjdGVkIHZhcmlhYmxlKHMpIGlzIGdyZWF0ZXIgdGhhbiAwIGluIHRoaXMgY2FzZS4KClNvIHdoZW4gaXQgcmV0dXJucyB0aGUgaW5kaWNlcyB3aGVyZSBzZWxlY3RlZCB2YXJpYWJsZShzKSBpcyBncmVhdGVyIHRoYW4gMCwgaXQgZG9lc24ndCByZXR1cm4gdGhlIE5BcywgYW5kIHNvIHlvdSBjYW4gc3Vic2V0IHRoZSBkYXRhIHNldC4KCmBgYHtyfQpYW3doaWNoKGMoWCROb3J0aCA+IDAgJiBYJFNvdXRoID4gMCAmIFgkV2VzdCA+IDAgJiBYJEVhc3QgPiAwKSksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBTb3J0aW5nCgo8YnIgLz4KU29ydCBsb3cgdG8gaGlnaCAoaW5jcmVhc2UpCmBgYHtyfQpzb3J0KFgkRWFzdCkKYGBgCgo8YnIgLz4KU29ydCBoaWdoIHRvIGxvdyAoZGVjcmVhc2UpCmBgYHtyfQpzb3J0KFgkU291dGgsIGRlY3JlYXNpbmcgPSBUKQpgYGAKCjxiciAvPgpTb3J0IGRlY3JlYXNpbmcgd2l0aCBOQSBpbiBsYXN0CmBgYHtyfQpzb3J0KFgkV2VzdCwgZGVjcmVhc2luZyA9IFQsIG5hLmxhc3QgPSBUKQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBPcmRlcmluZwoKPGJyIC8+Ck9yZGVyaW5nIGRhdGEgYnkgdmFyaWFibGUgKk5vcnRoKgpgYGB7cn0KWFtvcmRlcihYJE5vcnRoKSxdCmBgYAoKPGJyIC8+CkRlY3JlYXNpbmcgb3JkZXIgYnkgKk5vcnRoKiB3aGlsZSBzdWJzZXR0aW5nIGNvbHVtbnMgKlNvdXRoKiAmICpXZXN0KgpgYGB7cn0KWFtvcmRlcihYJE5vcnRoLCBkZWNyZWFzaW5nID0gVCksYygiU291dGgiLCAiV2VzdCIpXQpgYGAKCjxiciAvPgpPcmRlcmluZyBieSBtdWx0aXBsZSB2YWx1ZXMKYGBge3J9Clhbb3JkZXIoWCROb3J0aCwgWCRTb3V0aCksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBPcmRlcmluZyB3aXRoICoqcGx5cioqCgo8YnIgLz4KCkxvYWQgdGhlIHBhY2thZ2UKYGBge3J9CmxpYnJhcnkocGx5cikKYGBgCgo8YnIgLz4KSW5jcmVhc2luZyBvcmRlciB3aXRoICoqYXJyYW5nZSgpKiogZnJvbSAqcGx5cioKYGBge3J9CmFycmFuZ2UoWCwgV2VzdCkKYGBgCgo8YnIgLz4KRGVjcmVhc2luZyBvcmRlciB3aXRoICoqYXJyYW5nZSgpKiogZnJvbSAqcGx5cioKYGBge3J9CmFycmFuZ2UoWCwgZGVzYyhFYXN0KSkKYGBgCgo8YnIgLz4KCi0tLQoKPGJyIC8+CgojIyMgQWRkaW5nIHJvd3MgYW5kIGNvbHVtbnMKCjxiciAvPgpBZGRpbmcgY29sdW1uCmBgYHtyfQpYJFNvdXRoZWFzdCA8LSAoWCRTb3V0aCArIFgkRWFzdCkgLyAyClgKYGBgCgo8YnIgLz4KQW5vdGhlciB3YXkgdG8gYWRkIGNvbHVtbgpgYGB7cn0KWSA8LSBjYmluZChYLCBOb3J0aHdlc3Q9KFgkTm9ydGggKyBYJFdlc3QpLzIpClkKYGBgCgo8YnIgLz4KQWRkIHJvdwpgYGB7cn0KWSA8LSByYmluZChYLCBjKDQwLCA0OCwgMTIsIDQ1LCAoNDgrNDApLzIpKQpZCmBgYAoKLS0tCgo8Y2VudGVyPkVORDwvY2VudGVyPgoKLS0t