R Veri Düzenleme - data.table Örnekleri

Dr. Kubra ATALAY KABASAKAL

11/28/2020

data.table Paketi

data.table Paketi

data.table Paketi

data.table Paketi

library(data.table)

data.table Paketi

# A base data.frame with a letter column (A) and an integer column (X)
X <- data.frame(
  A = letters[1:10],
  X = 1:10
)
# The same two columns, built directly as a data.table
Y <- data.table(
  A = letters[1:10],
  X = 1:10
)

data.table Paketi

# Build a base data.frame and convert it to a data.table in one expression
X <- as.data.table(
  data.frame(A = letters[1:10], X = 1:10)
)

data.table Paketi

data.table Paketi

# Example table: letters a..j in column A, integers 1..10 in column B
Y <- data.table(A = letters[1:10], B = 1:10)
# First way: a lone index inside [ ] selects rows
Y[1:3]
##    A B
## 1: a 1
## 2: b 2
## 3: c 3
# Second way: the same row selection with an explicit trailing comma
Y[1:3,]
##    A B
## 1: a 1
## 2: b 2
## 3: c 3

data.table Paketi

# Example table: letters a..j in column A, integers 1..10 in column B
Y <- data.table(A = letters[1:10], B = 1:10)
# First way: negative indices drop rows 4 through 10, leaving rows 1..3
Y[-(4:10)]
##    A B
## 1: a 1
## 2: b 2
## 3: c 3
# Second way: ! in front of the row indices gives the same result here
Y[!(4:10)]
##    A B
## 1: a 1
## 2: b 2
## 3: c 3

data.table Paketi

# Example table: letters a..j in column A, integers 1..10 in column B
Y <- data.table(A = letters[1:10], 
                B = 1:10)
# Print the last row of the object by indexing with nrow()
Y[nrow(Y)]
##    A  B
## 1: j 10
# Same result using the .N symbol inside [ ]
Y[.N]
##    A  B
## 1: j 10

data.table Paketi

# Example table: letters a..j in column A, integers 1..10 in column B
Y <- data.table(A = letters[1:10],
                B = 1:10)
# Filter rows with a condition on a column; the column is referenced by its
# bare name inside [ ]
Y[B==1]
##    A B
## 1: a 1
# The same filter with an explicit trailing comma
Y[B==1,]
##    A B
## 1: a 1

data.table Paketi

# Auto-indexing demo: run the same equality filter twice on a 10-million-row
# table and compare timings. sample() on letters already returns character
# values, so no as.character() conversion is needed.
dt <- data.table::data.table(
  X = sample(letters[1:10], 1e+07, replace = TRUE),
  Y = sample(1:100, 1e+07, replace = TRUE))
# X only contains lowercase letters, so the filter value must be "a";
# an upper-case "A" never matches and the timed subset would always be empty.
# The timings shown below are illustrative and machine-dependent.
system.time(dt[X=="a"])
##    user  system elapsed 
##    0.12    0.11    0.06
# After the first filter, the table carries an index on X
indices(dt)
## [1] "X"
# Second run of the same filter, now with the index on X available
system.time(dt[X=="a"])
##    user  system elapsed 
##       0       0       0

data.table Paketi

# Work on a data.table version of iris (this replaces the name `iris` in the
# current environment)
iris <- as.data.table(iris)
# Keep rows whose Species matches the pattern "setosa"; show the first two
head(iris[Species %like% "setosa"],2)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          5.1         3.5          1.4         0.2  setosa
## 2:          4.9         3.0          1.4         0.2  setosa
# Alternative filters left disabled by the author (pattern "^s" and grepl):
#head(iris[Species %like% "^s"],2)
#head(iris[grepl("^s",Species)],2)
# Related operators: %ilike% and %flike%

data.table Paketi

# Subset of iris whose Species matches "setosa"
setosa <- iris[Species %like% "setosa"]
# Rows with Sepal.Length in the range 4.3..4.4; the output shows that both
# end points are included
setosa[Sepal.Length %between% c(4.3,4.4)]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1:          4.4         2.9          1.4         0.2  setosa
## 2:          4.3         3.0          1.1         0.1  setosa
## 3:          4.4         3.0          1.3         0.2  setosa
## 4:          4.4         3.2          1.3         0.2  setosa
# The same filter can also be written with the & operator.

data.table Paketi

iris <- as.data.table(iris)
# Convert every factor column to character by reference with :=.
# This replaces the magrittr pipe + dplyr::mutate_if() call: %>% is not in
# scope when only data.table is loaded, mutate_if() is superseded, and the
# dplyr round-trip copies the table, which later makes := complain about an
# invalid .internal.selfref.
factor_cols <- names(iris)[vapply(iris, is.factor, logical(1))]
if (length(factor_cols) > 0) {
  # Guard: nothing to convert when the chunk is re-run on an already
  # converted table
  iris[, (factor_cols) := lapply(.SD, as.character), .SDcols = factor_cols]
}
# %chin% is used on the character column Species to keep two species
iris_mini <- iris[Species %chin% c("setosa","virginica")]
str(iris_mini) # the same filter can also be written with the %in% operator
## Classes 'data.table' and 'data.frame':   100 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : chr  "setosa" "setosa" "setosa" "setosa" ...
##  - attr(*, ".internal.selfref")=<externalptr>

data.table Paketi

data.table Paketi

data.table Paketi

# An expression in the second slot of [ ] is computed on the columns;
# here it returns a single number: the mean of Sepal.Length over all rows
iris[,mean(Sepal.Length)]
## [1] 5.843333
# Combine a row filter with the computation: mean Sepal.Length for setosa only
iris[Species=="setosa",mean(Sepal.Length)]
## [1] 5.006
# .N in the second slot gives the number of rows that passed the filter
iris[Species=="setosa",.N]
## [1] 50

data.table Paketi

# Several summaries at once: wrapping them in .() returns a one-row table.
# Unnamed entries get the default column names V1, V2, ...
iris[, .(mean(Sepal.Length), mean(Sepal.Width))]
##          V1       V2
## 1: 5.843333 3.057333
# Naming the entries inside .() sets the output column names
iris[
  ,
  .(
    sl_mean = mean(Sepal.Length),
    sw_mean = mean(Sepal.Width)
  )
]
##     sl_mean  sw_mean
## 1: 5.843333 3.057333

data.table Paketi

# Row filter plus several summaries: means for the setosa rows only,
# returned with the default column names V1 and V2
iris[Species == "setosa", .(mean(Sepal.Length), mean(Sepal.Width))]
##       V1    V2
## 1: 5.006 3.428
# The same summaries with explicit output column names
iris[
  Species == "setosa",
  .(
    sl_mean = mean(Sepal.Length),
    sw_mean = mean(Sepal.Width)
  )
]
##    sl_mean sw_mean
## 1:   5.006   3.428

data.table Paketi

# Number of observations in each Species group of iris.
# Grouping column given as a character string:
iris[,.N,by="Species"]
##       Species  N
## 1:     setosa 50
## 2: versicolor 50
## 3:  virginica 50
# Grouping column given as a bare name inside .(); same result
 iris[,.N,by=.(Species)]
##       Species  N
## 1:     setosa 50
## 2: versicolor 50
## 3:  virginica 50

data.table Paketi

# Mean of one variable within each group.
# The unnamed result column gets the default name V1.
iris[,mean(Sepal.Length),by="Species"]
##       Species    V1
## 1:     setosa 5.006
## 2: versicolor 5.936
## 3:  virginica 6.588
iris[,mean(Sepal.Length),by=.(Species)] # list and .() do the same job
##       Species    V1
## 1:     setosa 5.006
## 2: versicolor 5.936
## 3:  virginica 6.588

data.table Paketi

# Select the flowers whose sepal length is above the overall mean, then sort
# them by sepal length.
# Two-step version with intermediate objects:
dt1 <- iris[Sepal.Length > mean(Sepal.Length)]
dt2 <- dt1[order(Sepal.Length)]
head(dt2, 3)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1:          5.9         3.0          4.2         1.5 versicolor
## 2:          5.9         3.2          4.8         1.8 versicolor
## 3:          5.9         3.0          5.1         1.8  virginica
# Chained version: the second [ ] is applied to the result of the first
dt3 <- iris[Sepal.Length > mean(Sepal.Length)][
  order(Sepal.Length)
]
head(dt3, 3)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1:          5.9         3.0          4.2         1.5 versicolor
## 2:          5.9         3.2          4.8         1.8 versicolor
## 3:          5.9         3.0          5.1         1.8  virginica

data.table Paketi

# Mean sepal length per species, then sort the summary by that mean using a
# chained [ ]
iris[, .(mean_sl = mean(Sepal.Length)), by = "Species"][
  order(mean_sl)
]
##       Species mean_sl
## 1:     setosa   5.006
## 2: versicolor   5.936
## 3:  virginica   6.588

data.table Paketi

# Count of distinct Sepal.Length values within each species
# (unnamed result, so the column is called V1)
iris[,uniqueN(Sepal.Length),by="Species"]
##       Species V1
## 1:     setosa 15
## 2: versicolor 21
## 3:  virginica 21

data.table Paketi

# .SD stands for the rows of the current group; .SD[1] takes the first row
# of each species
iris[,.SD[1],by=Species]
##       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:     setosa          5.1         3.5          1.4         0.2
## 2: versicolor          7.0         3.2          4.7         1.4
## 3:  virginica          6.3         3.3          6.0         2.5
# .SD[.N] takes the last row of each species
iris[,.SD[.N],by=Species]
##       Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:     setosa          5.0         3.3          1.4         0.2
## 2: versicolor          5.7         2.8          4.1         1.3
## 3:  virginica          5.9         3.0          5.1         1.8

data.table Paketi

# Last row of each species again, with .SDcols limiting .SD to the
# Sepal.Length column
iris[,.SD[.N],by=Species,.SDcols=c("Sepal.Length")]
##       Species Sepal.Length
## 1:     setosa          5.0
## 2: versicolor          5.7
## 3:  virginica          5.9

data.table Paketi

# iris may have been copied by non-data.table code earlier in the session,
# which invalidates its internal self-reference and makes := warn
# ("Invalid .internal.selfref detected ...") when it adds a column.
# setDT() on an existing data.table re-allocates it so := works by reference
# again; it does nothing when the table is already valid.
setDT(iris)
# Delete a column by reference
iris[, Sepal.Length := NULL]
head(iris, 3)
##    Sepal.Width Petal.Length Petal.Width Species
## 1:         3.5          1.4         0.2  setosa
## 2:         3.0          1.4         0.2  setosa
## 3:         3.2          1.3         0.2  setosa
# Add a logical column by reference using the functional form of :=
(iris[, `:=`(new = Sepal.Width > 5)])
head(iris, 3)
##    Sepal.Width Petal.Length Petal.Width Species   new
## 1:         3.5          1.4         0.2  setosa FALSE
## 2:         3.0          1.4         0.2  setosa FALSE
## 3:         3.2          1.3         0.2  setosa FALSE

data.table Paketi

# Delete Sepal.Length by reference, but only if it is still present: this
# chunk repeats an earlier one, and deleting a column that is already gone
# raises "Column 'Sepal.Length' does not exist to remove".
if ("Sepal.Length" %in% names(iris)) {
  iris[, Sepal.Length := NULL]
}
head(iris, 3)
##    Sepal.Width Petal.Length Petal.Width Species   new
## 1:         3.5          1.4         0.2  setosa FALSE
## 2:         3.0          1.4         0.2  setosa FALSE
## 3:         3.2          1.3         0.2  setosa FALSE
# (Re)compute the logical column by reference using the functional form of :=
(iris[, `:=`(new = Sepal.Width > 5)])
head(iris, 3)
##    Sepal.Width Petal.Length Petal.Width Species   new
## 1:         3.5          1.4         0.2  setosa FALSE
## 2:         3.0          1.4         0.2  setosa FALSE
## 3:         3.2          1.3         0.2  setosa FALSE