# Load library
library(data.table)

Before we start, let’s take a look at the ‘iris’ dataset and the flower

iris<-data.table(iris)
head(iris)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          5.1         3.5          1.4         0.2  setosa
## 2:          4.9         3.0          1.4         0.2  setosa
## 3:          4.7         3.2          1.3         0.2  setosa
## 4:          4.6         3.1          1.5         0.2  setosa
## 5:          5.0         3.6          1.4         0.2  setosa
## 6:          5.4         3.9          1.7         0.4  setosa

1 Subset

1.1 Subset columns

# Method 1
temp<-subset(iris,select = c('Sepal.Length','Sepal.Width'))
head(temp)
##    Sepal.Length Sepal.Width
## 1:          5.1         3.5
## 2:          4.9         3.0
## 3:          4.7         3.2
## 4:          4.6         3.1
## 5:          5.0         3.6
## 6:          5.4         3.9
# Method 2
#subset(iris,select=c(1:2))

1.2 Subset rows

# Method 1
subset(iris,Sepal.Width < 2.4 & Species =="virginica")
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1:            6         2.2            5         1.5 virginica
# Method 2
#iris[ which(iris$Sepal.Width < 2.4 & iris$Species =="virginica"), ]

2 Column Manipulation

2.1 Combine columns

irisSub <- iris[1:2,1:2,with = FALSE]
irisSub
##    Sepal.Length Sepal.Width
## 1:          5.1         3.5
## 2:          4.9         3.0
colbind <- cbind(irisSub,irisSub)
colbind
##    Sepal.Length Sepal.Width Sepal.Length Sepal.Width
## 1:          5.1         3.5          5.1         3.5
## 2:          4.9         3.0          4.9         3.0

2.2 Add columns

# Method 1
iris$New1<-"Hello"
iris$New2 <- ifelse(iris$Sepal.Length >=5, "long", "short")
# Method 2
iris[ , New3 :=1:nrow(iris)]
iris[ , New4 :=shift(Petal.Width, 1, type='lag')]
head(iris)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  New1  New2
## 1:          5.1         3.5          1.4         0.2  setosa Hello  long
## 2:          4.9         3.0          1.4         0.2  setosa Hello short
## 3:          4.7         3.2          1.3         0.2  setosa Hello short
## 4:          4.6         3.1          1.5         0.2  setosa Hello short
## 5:          5.0         3.6          1.4         0.2  setosa Hello  long
## 6:          5.4         3.9          1.7         0.4  setosa Hello  long
##    New3 New4
## 1:    1   NA
## 2:    2  0.2
## 3:    3  0.2
## 4:    4  0.2
## 5:    5  0.2
## 6:    6  0.2

2.3 Drop columns

colnames(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
# Method 1: directly remove columns
iris$Species <- iris$Petal.Width <- NULL
head(iris) 
##    Sepal.Length Sepal.Width Petal.Length
## 1:          5.1         3.5          1.4
## 2:          4.9         3.0          1.4
## 3:          4.7         3.2          1.3
## 4:          4.6         3.1          1.5
## 5:          5.0         3.6          1.4
## 6:          5.4         3.9          1.7
# Method 2: remove columns if it matches certain strings
temp <- iris[, which(!grepl("^Species$", colnames(iris))), with=FALSE]
head(temp)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:          5.1         3.5          1.4         0.2
## 2:          4.9         3.0          1.4         0.2
## 3:          4.7         3.2          1.3         0.2
## 4:          4.6         3.1          1.5         0.2
## 5:          5.0         3.6          1.4         0.2
## 6:          5.4         3.9          1.7         0.4

2.4 Change column names

#cannot run with rmarkdown
setnames(iris, "Species", "NewSpecies")
head(iris)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width NewSpecies
## 1:          5.1         3.5          1.4         0.2     setosa
## 2:          4.9         3.0          1.4         0.2     setosa
## 3:          4.7         3.2          1.3         0.2     setosa
## 4:          4.6         3.1          1.5         0.2     setosa
## 5:          5.0         3.6          1.4         0.2     setosa
## 6:          5.4         3.9          1.7         0.4     setosa

2.5 Get Column Number

list<-c("Petal.Width",'Petal.Length')
grep(paste(list,collapse="|"),names(iris), ignore.case = FALSE)
## [1] 3 4

2.6 Access columns with strings

# Method 1: without column names
temp<-iris[["Sepal.Length"]]
head(temp)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
# Method 2: with column names
temp<-iris[,"Sepal.Length",with = FALSE]
head(temp)
##    Sepal.Length
## 1:          5.1
## 2:          4.9
## 3:          4.7
## 4:          4.6
## 5:          5.0
## 6:          5.4

3 Row Manipulation

3.1 Combine columns

irisSub <- iris[1:2,1:2,with = FALSE]
irisSub
##    Sepal.Length Sepal.Width
## 1:          5.1         3.5
## 2:          4.9         3.0
rowbind <- rbind(irisSub,irisSub)
rowbind
##    Sepal.Length Sepal.Width
## 1:          5.1         3.5
## 2:          4.9         3.0
## 3:          5.1         3.5
## 4:          4.9         3.0

3.2 Remove duplicated rows

# Method 1
Removed<-iris[!duplicated(iris), ]
c(nrow(iris),nrow(Removed))
## [1] 150 149
# Method 2
uni<-unique(iris,by=c("Sepal.Length","Sepal.Width"))
c(nrow(iris),nrow(uni))
## [1] 150 117

3.3 Get row number

iris[ , .I[Sepal.Length == 5]]
##  [1]  5  8 26 27 36 41 44 50 61 94
iris[ , .I[duplicated(iris)]]
## [1] 143

3.4 Remove rows with outliers

par(mar = rep(2, 4))
NoOutlier<-mtcars[ ! mtcars$qsec %in% boxplot(mtcars$qsec)$out, ]

boxplot(NoOutlier$qsec)

3.5 Remove na rows

# iris has no NA value, this is just an example
temp<-iris[!is.na(iris[["Species"]]),]

3.6 Sample rows

Sampled<-iris[sample(nrow(iris), 100), ]
head(Sampled)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1:          6.5         3.2          5.1         2.0  virginica
## 2:          4.5         2.3          1.3         0.3     setosa
## 3:          6.2         3.4          5.4         2.3  virginica
## 4:          5.8         2.7          4.1         1.0 versicolor
## 5:          6.1         2.9          4.7         1.4 versicolor
## 6:          5.1         3.4          1.5         0.2     setosa

3.7 Summation across row

temp<-iris[,Sum:=rowSums(iris[,1:4,with=FALSE])]
head(temp)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  Sum
## 1:          5.1         3.5          1.4         0.2  setosa 10.2
## 2:          4.9         3.0          1.4         0.2  setosa  9.5
## 3:          4.7         3.2          1.3         0.2  setosa  9.4
## 4:          4.6         3.1          1.5         0.2  setosa  9.4
## 5:          5.0         3.6          1.4         0.2  setosa 10.2
## 6:          5.4         3.9          1.7         0.4  setosa 11.4

4 Other functions

4.0.1 Round values in dt

temp<-format(round(iris[,1:4,with=FALSE],2),nsmall = 2)
head(temp)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] "5.10"       "3.50"      "1.40"       "0.20"     
## [2,] "4.90"       "3.00"      "1.40"       "0.20"     
## [3,] "4.70"       "3.20"      "1.30"       "0.20"     
## [4,] "4.60"       "3.10"      "1.50"       "0.20"     
## [5,] "5.00"       "3.60"      "1.40"       "0.20"     
## [6,] "5.40"       "3.90"      "1.70"       "0.40"

4.1 Sorting

# Method 1
temp<-iris[order(iris$Sepal.Length,decreasing=FALSE),]
head(temp)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          4.3         3.0          1.1         0.1  setosa
## 2:          4.4         2.9          1.4         0.2  setosa
## 3:          4.4         3.0          1.3         0.2  setosa
## 4:          4.4         3.2          1.3         0.2  setosa
## 5:          4.5         2.3          1.3         0.3  setosa
## 6:          4.6         3.1          1.5         0.2  setosa
# Method 2
temp<-iris[with(iris,order(-Sepal.Length,-Sepal.Width)), ]
head(temp)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1:          7.9         3.8          6.4         2.0 virginica
## 2:          7.7         3.8          6.7         2.2 virginica
## 3:          7.7         3.0          6.1         2.3 virginica
## 4:          7.7         2.8          6.7         2.0 virginica
## 5:          7.7         2.6          6.9         2.3 virginica
## 6:          7.6         3.0          6.6         2.1 virginica

4.2 Merge tables

dt1 <- data.table(A = letters[1:5], X = 1:5)
dt2 <- data.table(A = letters[4:7], Y = 1:4)
merge(dt1, dt2,by="A")
##    A X Y
## 1: d 4 1
## 2: e 5 2

5 Table Summary

5.1 Common summary

#=== Method 1: using R built in function 'aggregate'
aggregate(cbind(mpg.mean=mtcars[["mpg"]], disp.mean=mtcars[["disp"]]) ~ cyl+ gear, data=mtcars, mean)
##   cyl gear mpg.mean disp.mean
## 1   4    3   21.500  120.1000
## 2   6    3   19.750  241.5000
## 3   8    3   15.050  357.6167
## 4   4    4   26.925  102.6250
## 5   6    4   19.750  163.8000
## 6   4    5   28.200  107.7000
## 7   6    5   19.700  145.0000
## 8   8    5   15.400  326.0000
#=== Method 2: using 'data table' 
iris[,list(Sepal.Length.Mean=mean(Sepal.Length),Sepal.Length.Median=median(Sepal.Length),Sepal.Width.Mean=mean(Sepal.Width),Sepal.Width.Median=median(Sepal.Width)),by=list(Species)]
##       Species Sepal.Length.Mean Sepal.Length.Median Sepal.Width.Mean
## 1:     setosa             5.006                 5.0            3.428
## 2: versicolor             5.936                 5.9            2.770
## 3:  virginica             6.588                 6.5            2.974
##    Sepal.Width.Median
## 1:                3.4
## 2:                2.8
## 3:                3.0
#=== Method 3: using 'dplyr'
library(dplyr)
iris %>% group_by(Species) %>%
  summarise("MeanA" = round(mean(Sepal.Length),2),
            "MeanB" = round(mean(Sepal.Width),2))
## # A tibble: 3 <U+00D7> 3
##      Species MeanA MeanB
##       <fctr> <dbl> <dbl>
## 1     setosa  5.01  3.43
## 2 versicolor  5.94  2.77
## 3  virginica  6.59  2.97

5.2 Summary at one shot

Write a self-defined summary function and call the function to run.

#===== gain the name of parameters to be summerized
parameterList<-NULL
parameter<-names(iris)[1:4]

for ( i in 1:length(colnames(iris))){
  if(colnames(iris)[i] %in% parameter){  
    parameterList<-c(parameterList,i)
  }}

#===== do call SummaryFunction returns summrized parameters
SummaryFunction<-function(i){
  # i=1
  OneSummerized<-iris[,list(Averg=mean(get(colnames(iris)[i])),sd=sd(get(colnames(iris)[i]))),by=list(Species)]
  setnames(OneSummerized,"Averg",paste(colnames(iris)[i],'MEAN',sep="."))
  setnames(OneSummerized,"sd",paste(colnames(iris)[i],'SD',sep="."))
  return(OneSummerized)
}    
AllSum<-do.call('cbind',lapply(parameterList,SummaryFunction))

#===== remove duplicated columns
AllSummerized <- AllSum[, !duplicated(colnames(AllSum)),with=FALSE]
head(AllSummerized)
##       Species Sepal.Length.MEAN Sepal.Length.SD Sepal.Width.MEAN
## 1:     setosa             5.006       0.3524897            3.428
## 2: versicolor             5.936       0.5161711            2.770
## 3:  virginica             6.588       0.6358796            2.974
##    Sepal.Width.SD Petal.Length.MEAN Petal.Length.SD Petal.Width.MEAN
## 1:      0.3790644             1.462       0.1736640            0.246
## 2:      0.3137983             4.260       0.4699110            1.326
## 3:      0.3224966             5.552       0.5518947            2.026
##    Petal.Width.SD
## 1:      0.1053856
## 2:      0.1977527
## 3:      0.2746501