Adapted from: “Getting and Cleaning Data”, week 3 - lesson 1 by Johns Hopkins University (Coursera)



Create a Data Frame


Setting up data frame…

set.seed(2777)
X <- data.frame("North"=(sample(1:9))*4,
                "South"=(sample(1:9))*12,
                "West"=(sample(1:9))*3,
                "East"=(sample(1:9))*5)
X


Generating random NAs…

(credits: https://www.r-bloggers.com/function-to-generate-a-random-data-set/)

NAins <- function(df, prop = .1){
    n <- nrow(df)
    m <- ncol(df)
    num.to.na <- ceiling(prop*n*m)
    id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
    rows <- id %/% m + 1
    cols <- id %% m + 1
    sapply(seq(num.to.na), function(x){
            df[rows[x], cols[x]] <<- NA
        }
    )
    return(df)
}
X <- NAins(X, .1)
X




Subsetting (Quick Review)


Subsetting column by number:

X[,1]
[1] 24 20 16 12  8 28  4 32 36


Subsetting column by name:

X[,"South"]
[1]  72  96  60 108  24  84  NA  36  12
X$South
[1]  72  96  60 108  24  84  NA  36  12
identical(X[,"South"], X$South)
[1] TRUE


Subsetting rows and columns

X[3:5, c("East", "West")]




Subset using logicals ands and ors


With and (&)

X[(X$East >= 30 & X$North <= 30),]


With or (|)

X[(X$West >= 30 | X$South <= 50),]




Dealing with missing values (NA)


If I’m dealing with NA values subsetting on NAs will not produce the actual rows, so you want to use the which() command. So if you use which(), it will return the indices where selected variable(s) is greater than 0 in this case.

So when it returns the indices where selected variable(s) is greater than 0, it doesn’t return the NAs, and so you can subset the data set.

X[which(c(X$North > 0 & X$South > 0 & X$West > 0 & X$East > 0)),]




Sorting


Sort low to high (increase)

sort(X$East)
[1]  5 10 15 20 25 30 35 40


Sort high to low (decrease)

sort(X$South, decreasing = T)
[1] 108  96  84  72  60  36  24  12


Sort decreasing with NA in last

sort(X$West, decreasing = T, na.last = T)
[1] 27 24 18 15  9  6  3 NA NA




Ordering


Ordering data by variable North

X[order(X$North),]


Decreasing order by North while subsetting columns South & West

X[order(X$North, decreasing = T),c("South", "West")]


Ordering by multiple values

X[order(X$North, X$South),]




Ordering with plyr


Load the package

library(plyr)


Increasing order with arrange() from plyr

arrange(X, West)


Decreasing order with arrange() from plyr

arrange(X, desc(East))




Adding rows and columns


Adding column

X$Southeast <- (X$South + X$East) / 2
X


Another way to add column

Y <- cbind(X, Northwest=(X$North + X$West)/2)
Y


Add row

Y <- rbind(X, c(40, 48, 12, 45, (48+40)/2))
Y

END

LS0tCnRpdGxlOiAiU3Vic2V0dGluZyBhbmQgU29ydGluZyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyBBZGFwdGVkIGZyb206ICJHZXR0aW5nIGFuZCBDbGVhbmluZyBEYXRhIiwgd2VlayAzIC0gbGVzc29uIDEgYnkgSm9obnMgSG9wa2lucyBVbml2ZXJzaXR5IChDb3Vyc2VyYSkKCjxiciAvPgo8YnIgLz4KCiMjIyBDcmVhdGUgYSBEYXRhIEZyYW1lCgo8YnIgLz4KClNldHRpbmcgdXAgZGF0YSBmcmFtZS4uLgoKYGBge3J9CnNldC5zZWVkKDI3NzcpClggPC0gZGF0YS5mcmFtZSgiTm9ydGgiPShzYW1wbGUoMTo5KSkqNCwKICAgICAgICAgICAgICAgICJTb3V0aCI9KHNhbXBsZSgxOjkpKSoxMiwKICAgICAgICAgICAgICAgICJXZXN0Ij0oc2FtcGxlKDE6OSkpKjMsCiAgICAgICAgICAgICAgICAiRWFzdCI9KHNhbXBsZSgxOjkpKSo1KQoKWApgYGAKCjxiciAvPgoKR2VuZXJhdGluZyByYW5kb20gTkFzLi4uCgooY3JlZGl0czogaHR0cHM6Ly93d3cuci1ibG9nZ2Vycy5jb20vZnVuY3Rpb24tdG8tZ2VuZXJhdGUtYS1yYW5kb20tZGF0YS1zZXQvKQpgYGB7cn0KTkFpbnMgPC0gZnVuY3Rpb24oZGYsIHByb3AgPSAuMSl7CiAgICBuIDwtIG5yb3coZGYpCiAgICBtIDwtIG5jb2woZGYpCiAgICBudW0udG8ubmEgPC0gY2VpbGluZyhwcm9wKm4qbSkKICAgIGlkIDwtIHNhbXBsZSgwOihtKm4tMSksIG51bS50by5uYSwgcmVwbGFjZSA9IEZBTFNFKQogICAgcm93cyA8LSBpZCAlLyUgbSArIDEKICAgIGNvbHMgPC0gaWQgJSUgbSArIDEKICAgIHNhcHBseShzZXEobnVtLnRvLm5hKSwgZnVuY3Rpb24oeCl7CiAgICAgICAgICAgIGRmW3Jvd3NbeF0sIGNvbHNbeF1dIDw8LSBOQQogICAgICAgIH0KICAgICkKICAgIHJldHVybihkZikKfQoKWCA8LSBOQWlucyhYLCAuMSkKWApgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBTdWJzZXR0aW5nIChRdWljayBSZXZpZXcpCgo8YnIgLz4KClN1YnNldHRpbmcgKmNvbHVtbiogYnkgKipudW1iZXIqKjoKYGBge3J9ClhbLDFdCmBgYAoKPGJyIFw+CgpTdWJzZXR0aW5nICpjb2x1bW4qIGJ5ICoqbmFtZSoqOgpgYGB7cn0KWFssIlNvdXRoIl0KWCRTb3V0aAppZGVudGljYWwoWFssIlNvdXRoIl0sIFgkU291dGgpCmBgYAoKPGJyIFw+ClN1YnNldHRpbmcgKnJvd3MqIGFuZCAqY29sdW1ucyoKYGBge3J9ClhbMzo1LCBjKCJFYXN0IiwgIldlc3QiKV0KYGBgCgo8YnIgLz4KCi0tLQoKPGJyIC8+CgojIyMgU3Vic2V0IHVzaW5nIGxvZ2ljYWxzIDx1PioqYW5kKio8L3U+cyBhbmQgPHU+KipvcioqPC91PnMKCjxiciAvPgpXaXRoICoqYW5kICgmKSoqCmBgYHtyfQpYWyhYJEVhc3QgPj0gMzAgJiBYJE5vcnRoIDw9IDMwKSxdCmBgYAoKPGJyIC8+CldpdGggKipvciAofCkqKgpgYGB7cn0KWFsoWCRXZXN0ID49IDMwIHwgWCRTb3V0aCA8PSA1MCksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBEZWFsaW5nIHdpdGggbWlzc2luZyB2YWx1ZXMgKE5BKQoKPGJyIC8+CklmIEknbSBkZWFsaW5nIHdpdGggTkEgdmFsdWVzIHN1YnNldHRpbmcgb24gTkFzIHdpbGwgbm90IHByb2R1Y2UgdGhlIGFjdHVhbCByb3dzLCBzbyB5b3Ugd2FudCB0byB1c2UgdGhlICoqd2hpY2goKSoqIGNvbW1hbmQuIFNvIGlmIHlvdSB1c2UgKip3aGljaCgpKiosIGl0IHdpbGwgcmV0dXJuIHRoZSBpbmRpY2VzIHdoZXJlIHNlbGVjdGVkIHZhcmlhYmxlKHMpIGlzIGdyZWF0ZXIgdGhhbiAwIGluIHRoaXMgY2FzZS4KClNvIHdoZW4gaXQgcmV0dXJucyB0aGUgaW5kaWNlcyB3aGVyZSBzZWxlY3RlZCB2YXJpYWJsZShzKSBpcyBncmVhdGVyIHRoYW4gMCwgaXQgZG9lc24ndCByZXR1cm4gdGhlIE5BcywgYW5kIHNvIHlvdSBjYW4gc3Vic2V0IHRoZSBkYXRhIHNldC4KCmBgYHtyfQpYW3doaWNoKGMoWCROb3J0aCA+IDAgJiBYJFNvdXRoID4gMCAmIFgkV2VzdCA+IDAgJiBYJEVhc3QgPiAwKSksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBTb3J0aW5nCgo8YnIgLz4KU29ydCBsb3cgdG8gaGlnaCAoaW5jcmVhc2UpCmBgYHtyfQpzb3J0KFgkRWFzdCkKYGBgCgo8YnIgLz4KU29ydCBoaWdoIHRvIGxvdyAoZGVjcmVhc2UpCmBgYHtyfQpzb3J0KFgkU291dGgsIGRlY3JlYXNpbmcgPSBUKQpgYGAKCjxiciAvPgpTb3J0IGRlY3JlYXNpbmcgd2l0aCBOQSBpbiBsYXN0CmBgYHtyfQpzb3J0KFgkV2VzdCwgZGVjcmVhc2luZyA9IFQsIG5hLmxhc3QgPSBUKQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBPcmRlcmluZwoKPGJyIC8+Ck9yZGVyaW5nIGRhdGEgYnkgdmFyaWFibGUgKk5vcnRoKgpgYGB7cn0KWFtvcmRlcihYJE5vcnRoKSxdCmBgYAoKPGJyIC8+CkRlY3JlYXNpbmcgb3JkZXIgYnkgKk5vcnRoKiB3aGlsZSBzdWJzZXR0aW5nIGNvbHVtbnMgKlNvdXRoKiAmICpXZXN0KgpgYGB7cn0KWFtvcmRlcihYJE5vcnRoLCBkZWNyZWFzaW5nID0gVCksYygiU291dGgiLCAiV2VzdCIpXQpgYGAKCjxiciAvPgpPcmRlcmluZyBieSBtdWx0aXBsZSB2YWx1ZXMKYGBge3J9Clhbb3JkZXIoWCROb3J0aCwgWCRTb3V0aCksXQpgYGAKCjxiciAvPgoKLS0tCgo8YnIgLz4KCiMjIyBPcmRlcmluZyB3aXRoICoqcGx5cioqCgo8YnIgLz4KCkxvYWQgdGhlIHBhY2thZ2UKYGBge3J9CmxpYnJhcnkocGx5cikKYGBgCgo8YnIgLz4KSW5jcmVhc2luZyBvcmRlciB3aXRoICoqYXJyYW5nZSgpKiogZnJvbSAqcGx5cioKYGBge3J9CmFycmFuZ2UoWCwgV2VzdCkKYGBgCgo8YnIgLz4KRGVjcmVhc2luZyBvcmRlciB3aXRoICoqYXJyYW5nZSgpKiogZnJvbSAqcGx5cioKYGBge3J9CmFycmFuZ2UoWCwgZGVzYyhFYXN0KSkKYGBgCgo8YnIgLz4KCi0tLQoKPGJyIC8+CgojIyMgQWRkaW5nIHJvd3MgYW5kIGNvbHVtbnMKCjxiciAvPgpBZGRpbmcgY29sdW1uCmBgYHtyfQpYJFNvdXRoZWFzdCA8LSAoWCRTb3V0aCArIFgkRWFzdCkgLyAyClgKYGBgCgo8YnIgLz4KQW5vdGhlciB3YXkgdG8gYWRkIGNvbHVtbgpgYGB7cn0KWSA8LSBjYmluZChYLCBOb3J0aHdlc3Q9KFgkTm9ydGggKyBYJFdlc3QpLzIpClkKYGBgCgo8YnIgLz4KQWRkIHJvdwpgYGB7cn0KWSA8LSByYmluZChYLCBjKDQwLCA0OCwgMTIsIDQ1LCAoNDgrNDApLzIpKQpZCmBgYAoKLS0tCgo8Y2VudGVyPkVORDwvY2VudGVyPgoKLS0t