# Load library
library(data.table)Before we start, let’s take a look at the ‘iris’ dataset and the flower
iris<-data.table(iris)
head(iris)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 5.1 3.5 1.4 0.2 setosa
## 2: 4.9 3.0 1.4 0.2 setosa
## 3: 4.7 3.2 1.3 0.2 setosa
## 4: 4.6 3.1 1.5 0.2 setosa
## 5: 5.0 3.6 1.4 0.2 setosa
## 6: 5.4 3.9 1.7 0.4 setosa
# Method 1
temp<-subset(iris,select = c('Sepal.Length','Sepal.Width'))
head(temp)## Sepal.Length Sepal.Width
## 1: 5.1 3.5
## 2: 4.9 3.0
## 3: 4.7 3.2
## 4: 4.6 3.1
## 5: 5.0 3.6
## 6: 5.4 3.9
# Method 2
#subset(iris,select=c(1:2))# Method 1
subset(iris,Sepal.Width < 2.4 & Species =="virginica")## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 6 2.2 5 1.5 virginica
# Method 2
#iris[ which(iris$Sepal.Width < 2.4 & iris$Species =="virginica"), ]irisSub <- iris[1:2,1:2,with = FALSE]
irisSub## Sepal.Length Sepal.Width
## 1: 5.1 3.5
## 2: 4.9 3.0
colbind <- cbind(irisSub,irisSub)
colbind## Sepal.Length Sepal.Width Sepal.Length Sepal.Width
## 1: 5.1 3.5 5.1 3.5
## 2: 4.9 3.0 4.9 3.0
# Method 1
iris$New1<-"Hello"
iris$New2 <- ifelse(iris$Sepal.Length >=5, "long", "short")
# Method 2
iris[ , New3 :=1:nrow(iris)]
iris[ , New4 :=shift(Petal.Width, 1, type='lag')]
head(iris)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species New1 New2
## 1: 5.1 3.5 1.4 0.2 setosa Hello long
## 2: 4.9 3.0 1.4 0.2 setosa Hello short
## 3: 4.7 3.2 1.3 0.2 setosa Hello short
## 4: 4.6 3.1 1.5 0.2 setosa Hello short
## 5: 5.0 3.6 1.4 0.2 setosa Hello long
## 6: 5.4 3.9 1.7 0.4 setosa Hello long
## New3 New4
## 1: 1 NA
## 2: 2 0.2
## 3: 3 0.2
## 4: 4 0.2
## 5: 5 0.2
## 6: 6 0.2
colnames(iris)## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
# Method 1: directly remove columns
iris$Species <- iris$Petal.Width <- NULL
head(iris) ## Sepal.Length Sepal.Width Petal.Length
## 1: 5.1 3.5 1.4
## 2: 4.9 3.0 1.4
## 3: 4.7 3.2 1.3
## 4: 4.6 3.1 1.5
## 5: 5.0 3.6 1.4
## 6: 5.4 3.9 1.7
# Method 2: remove columns if it matches certain strings
temp <- iris[, which(!grepl("^Species$", colnames(iris))), with=FALSE]
head(temp)## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: 5.1 3.5 1.4 0.2
## 2: 4.9 3.0 1.4 0.2
## 3: 4.7 3.2 1.3 0.2
## 4: 4.6 3.1 1.5 0.2
## 5: 5.0 3.6 1.4 0.2
## 6: 5.4 3.9 1.7 0.4
#cannot run with rmarkdown
setnames(iris, "Species", "NewSpecies")
head(iris)## Sepal.Length Sepal.Width Petal.Length Petal.Width NewSpecies
## 1: 5.1 3.5 1.4 0.2 setosa
## 2: 4.9 3.0 1.4 0.2 setosa
## 3: 4.7 3.2 1.3 0.2 setosa
## 4: 4.6 3.1 1.5 0.2 setosa
## 5: 5.0 3.6 1.4 0.2 setosa
## 6: 5.4 3.9 1.7 0.4 setosa
list<-c("Petal.Width",'Petal.Length')
grep(paste(list,collapse="|"),names(iris), ignore.case = FALSE)## [1] 3 4
# Method 1: without column names
temp<-iris[["Sepal.Length"]]
head(temp)## [1] 5.1 4.9 4.7 4.6 5.0 5.4
# Method 2: with column names
temp<-iris[,"Sepal.Length",with = FALSE]
head(temp)## Sepal.Length
## 1: 5.1
## 2: 4.9
## 3: 4.7
## 4: 4.6
## 5: 5.0
## 6: 5.4
irisSub <- iris[1:2,1:2,with = FALSE]
irisSub## Sepal.Length Sepal.Width
## 1: 5.1 3.5
## 2: 4.9 3.0
rowbind <- rbind(irisSub,irisSub)
rowbind## Sepal.Length Sepal.Width
## 1: 5.1 3.5
## 2: 4.9 3.0
## 3: 5.1 3.5
## 4: 4.9 3.0
# Method 1
Removed<-iris[!duplicated(iris), ]
c(nrow(iris),nrow(Removed))## [1] 150 149
# Method 2
uni<-unique(iris,by=c("Sepal.Length","Sepal.Width"))
c(nrow(iris),nrow(uni))## [1] 150 117
iris[ , .I[Sepal.Length == 5]]## [1] 5 8 26 27 36 41 44 50 61 94
iris[ , .I[duplicated(iris)]]## [1] 143
par(mar = rep(2, 4))
NoOutlier<-mtcars[ ! mtcars$qsec %in% boxplot(mtcars$qsec)$out, ]boxplot(NoOutlier$qsec)# iris has no NA value, this is just an example
temp<-iris[!is.na(iris[["Species"]]),]Sampled<-iris[sample(nrow(iris), 100), ]
head(Sampled)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 6.5 3.2 5.1 2.0 virginica
## 2: 4.5 2.3 1.3 0.3 setosa
## 3: 6.2 3.4 5.4 2.3 virginica
## 4: 5.8 2.7 4.1 1.0 versicolor
## 5: 6.1 2.9 4.7 1.4 versicolor
## 6: 5.1 3.4 1.5 0.2 setosa
temp<-iris[,Sum:=rowSums(iris[,1:4,with=FALSE])]
head(temp)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sum
## 1: 5.1 3.5 1.4 0.2 setosa 10.2
## 2: 4.9 3.0 1.4 0.2 setosa 9.5
## 3: 4.7 3.2 1.3 0.2 setosa 9.4
## 4: 4.6 3.1 1.5 0.2 setosa 9.4
## 5: 5.0 3.6 1.4 0.2 setosa 10.2
## 6: 5.4 3.9 1.7 0.4 setosa 11.4
temp<-format(round(iris[,1:4,with=FALSE],2),nsmall = 2)
head(temp)## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] "5.10" "3.50" "1.40" "0.20"
## [2,] "4.90" "3.00" "1.40" "0.20"
## [3,] "4.70" "3.20" "1.30" "0.20"
## [4,] "4.60" "3.10" "1.50" "0.20"
## [5,] "5.00" "3.60" "1.40" "0.20"
## [6,] "5.40" "3.90" "1.70" "0.40"
# Method 1
temp<-iris[order(iris$Sepal.Length,decreasing=FALSE),]
head(temp)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 4.3 3.0 1.1 0.1 setosa
## 2: 4.4 2.9 1.4 0.2 setosa
## 3: 4.4 3.0 1.3 0.2 setosa
## 4: 4.4 3.2 1.3 0.2 setosa
## 5: 4.5 2.3 1.3 0.3 setosa
## 6: 4.6 3.1 1.5 0.2 setosa
# Method 2
temp<-iris[with(iris,order(-Sepal.Length,-Sepal.Width)), ]
head(temp)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 7.9 3.8 6.4 2.0 virginica
## 2: 7.7 3.8 6.7 2.2 virginica
## 3: 7.7 3.0 6.1 2.3 virginica
## 4: 7.7 2.8 6.7 2.0 virginica
## 5: 7.7 2.6 6.9 2.3 virginica
## 6: 7.6 3.0 6.6 2.1 virginica
dt1 <- data.table(A = letters[1:5], X = 1:5)
dt2 <- data.table(A = letters[4:7], Y = 1:4)
merge(dt1, dt2,by="A")## A X Y
## 1: d 4 1
## 2: e 5 2
#=== Method 1: using R built in function 'aggregate'
aggregate(cbind(mpg.mean=mtcars[["mpg"]], disp.mean=mtcars[["disp"]]) ~ cyl+ gear, data=mtcars, mean)## cyl gear mpg.mean disp.mean
## 1 4 3 21.500 120.1000
## 2 6 3 19.750 241.5000
## 3 8 3 15.050 357.6167
## 4 4 4 26.925 102.6250
## 5 6 4 19.750 163.8000
## 6 4 5 28.200 107.7000
## 7 6 5 19.700 145.0000
## 8 8 5 15.400 326.0000
#=== Method 2: using 'data table'
iris[,list(Sepal.Length.Mean=mean(Sepal.Length),Sepal.Length.Median=median(Sepal.Length),Sepal.Width.Mean=mean(Sepal.Width),Sepal.Width.Median=median(Sepal.Width)),by=list(Species)]## Species Sepal.Length.Mean Sepal.Length.Median Sepal.Width.Mean
## 1: setosa 5.006 5.0 3.428
## 2: versicolor 5.936 5.9 2.770
## 3: virginica 6.588 6.5 2.974
## Sepal.Width.Median
## 1: 3.4
## 2: 2.8
## 3: 3.0
#=== Method 3: using 'dplyr'
library(dplyr)
iris %>% group_by(Species) %>%
summarise("MeanA" = round(mean(Sepal.Length),2),
"MeanB" = round(mean(Sepal.Width),2))## # A tibble: 3 <U+00D7> 3
## Species MeanA MeanB
## <fctr> <dbl> <dbl>
## 1 setosa 5.01 3.43
## 2 versicolor 5.94 2.77
## 3 virginica 6.59 2.97
Write a self-defined summary function and call the function to run.
#===== gain the name of parameters to be summerized
parameterList<-NULL
parameter<-names(iris)[1:4]
for ( i in 1:length(colnames(iris))){
if(colnames(iris)[i] %in% parameter){
parameterList<-c(parameterList,i)
}}
#===== do call SummaryFunction returns summrized parameters
SummaryFunction<-function(i){
# i=1
OneSummerized<-iris[,list(Averg=mean(get(colnames(iris)[i])),sd=sd(get(colnames(iris)[i]))),by=list(Species)]
setnames(OneSummerized,"Averg",paste(colnames(iris)[i],'MEAN',sep="."))
setnames(OneSummerized,"sd",paste(colnames(iris)[i],'SD',sep="."))
return(OneSummerized)
}
AllSum<-do.call('cbind',lapply(parameterList,SummaryFunction))
#===== remove duplicated columns
AllSummerized <- AllSum[, !duplicated(colnames(AllSum)),with=FALSE]
head(AllSummerized)## Species Sepal.Length.MEAN Sepal.Length.SD Sepal.Width.MEAN
## 1: setosa 5.006 0.3524897 3.428
## 2: versicolor 5.936 0.5161711 2.770
## 3: virginica 6.588 0.6358796 2.974
## Sepal.Width.SD Petal.Length.MEAN Petal.Length.SD Petal.Width.MEAN
## 1: 0.3790644 1.462 0.1736640 0.246
## 2: 0.3137983 4.260 0.4699110 1.326
## 3: 0.3224966 5.552 0.5518947 2.026
## Petal.Width.SD
## 1: 0.1053856
## 2: 0.1977527
## 3: 0.2746501