# Load ggplot2, which provides the `mpg` data set used throughout.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Keep a working copy of `mpg` and print the five-number summary (plus mean)
# of its `hwy` column.
dat <- ggplot2::mpg
summary(dat[["hwy"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   18.00   24.00   23.44   27.00   44.00

Grubbs’s test

H0: Nilai tertinggi bukan merupakan outlier. H1: Nilai tertinggi merupakan outlier.

# Grubbs' test for a single outlier on `hwy`. With the default arguments the
# tested candidate is the highest value (see the alternative hypothesis in the
# echoed output below).
library(outliers)
test <- grubbs.test(x = dat$hwy)
print(test)
## 
##  Grubbs test for one outlier
## 
## data:  dat$hwy
## G = 3.45274, U = 0.94862, p-value = 0.05555
## alternative hypothesis: highest value 44 is an outlier

p-value = 0,056 > 5%, sehingga gagal tolak H0. Dengan tingkat signifikansi 5%, belum cukup bukti untuk menyatakan bahwa nilai tertinggi merupakan outlier.

H0: Nilai terendah bukan merupakan outlier. H1: Nilai terendah merupakan outlier.

# Repeat Grubbs' test on the other tail: `opposite = TRUE` switches the
# candidate to the lowest value (see the echoed alternative hypothesis).
library(outliers)
test <- grubbs.test(x = dat$hwy, opposite = TRUE)
print(test)
## 
##  Grubbs test for one outlier
## 
## data:  dat$hwy
## G = 1.92122, U = 0.98409, p-value = 1
## alternative hypothesis: lowest value 12 is an outlier

p-value = 1 > 5%, sehingga gagal tolak H0. Dengan tingkat signifikansi 5%, belum cukup bukti untuk menyatakan bahwa nilai terendah merupakan outlier.

Dixon’s test

H0: Nilai terendah bukan merupakan outlier. H1: Nilai terendah merupakan outlier.

# Dixon's test is run on a small subset: only the first 20 rows of `dat`
# (the outliers package documents dixon.test for samples of 3-30 values).
subdat <- head(dat, n = 20)
test <- dixon.test(x = subdat$hwy)
print(test)
## 
##  Dixon test for outliers
## 
## data:  subdat$hwy
## Q = 0.57143, p-value = 0.006508
## alternative hypothesis: lowest value 15 is an outlier

p-value = 0,0065 < 5%, sehingga tolak H0. Dengan tingkat signifikansi 5%, sudah cukup bukti untuk menyatakan bahwa nilai terendah (15) merupakan outlier.

H0: Nilai tertinggi bukan merupakan outlier. H1: Nilai tertinggi merupakan outlier.

# Dixon's test on the opposite tail of the 20-row subset; here that is the
# highest value (see the echoed alternative hypothesis).
test <- dixon.test(x = subdat$hwy, opposite = TRUE)
print(test)
## 
##  Dixon test for outliers
## 
## data:  subdat$hwy
## Q = 0.25, p-value = 0.8582
## alternative hypothesis: highest value 31 is an outlier

p-value = 0,8582 > 5%, sehingga gagal tolak H0. Dengan tingkat signifikansi 5%, belum cukup bukti untuk menyatakan bahwa nilai tertinggi (31) merupakan outlier.

# Position of the smallest `hwy` value in the subset (first one on ties),
# then a copy of the subset with that single row dropped.
remove_ind <- which.min(subdat$hwy)
subsubdat <- subdat[-remove_ind, ]

# Re-run Dixon's test now that the previous minimum has been removed.
test <- dixon.test(x = subsubdat$hwy)
print(test)
## 
##  Dixon test for outliers
## 
## data:  subsubdat$hwy
## Q = 0.44444, p-value = 0.1297
## alternative hypothesis: lowest value 20 is an outlier

p-value = 0,1297 > 5%, sehingga gagal tolak H0. Dengan tingkat signifikansi 5%, belum cukup bukti untuk menyatakan bahwa nilai terendah (20) merupakan outlier.

Rosner’s Test

# Rosner's test on the full `hwy` column, checking up to k = 3 suspected
# outliers in a single run.
library(EnvStats)
## Warning: package 'EnvStats' was built under R version 4.2.3
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
test <- rosnerTest(x = dat$hwy, k = 3)
print(test)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            dat$hwy
## 
## Sample Size:                     234
## 
## Test Statistics:                 R.1 = 3.452739
##                                  R.2 = 3.552586
##                                  R.3 = 3.131909
## 
## Test Statistic Parameter:        k = 3
## 
## Alternative Hypothesis:          Up to 3 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     0
## 
##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 23.44017 5.954643    44     213 3.452739   3.652091   FALSE
## 2 1 23.35193 5.812124    44     222 3.552586   3.650836   FALSE
## 3 2 23.26293 5.663340    41     223 3.131909   3.649575   FALSE
# Per-candidate statistics table (the same table printed at the end above).
test[["all.stats"]]

LOF (Local Outlier Factor): Proximity (density) Based Outlier Detection Technique

library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Take the mpg data set from the ggplot2 package
dat <- ggplot2::mpg
head(dat)
# Keep only the numeric columns. `select(where(...))` replaces the superseded
# `select_if()`, and the result is assigned first and printed separately
# instead of being assigned through print().
dat2 <- select(dat, where(is.numeric))
print(dat2)
## # A tibble: 234 × 5
##    displ  year   cyl   cty   hwy
##    <dbl> <int> <int> <int> <int>
##  1   1.8  1999     4    18    29
##  2   1.8  1999     4    21    29
##  3   2    2008     4    20    31
##  4   2    2008     4    21    30
##  5   2.8  1999     6    16    26
##  6   2.8  1999     6    18    26
##  7   3.1  2008     6    18    27
##  8   1.8  1999     4    18    26
##  9   1.8  1999     4    16    25
## 10   2    2008     4    20    28
## # ℹ 224 more rows
# Compute the LOF outlier score of every row, using k = 5 neighbours.
# NOTE(review): the columns are passed unscaled, so columns with a large
# numeric range presumably dominate the distances -- consider scale(dat2);
# left unchanged here so the results below stay reproducible.
outlier.scores <- lofactor(dat2, k = 5)

# Density plot of the outlier scores
plot(density(outlier.scores))

# Row indices of the five highest-scoring observations
outliers <- order(outlier.scores, decreasing = TRUE)[1:5]
print(outliers)
## [1] 157 149  92  91 103
# Visualise the outliers on a biplot of the first two principal components:
# outliers keep their row number as label, every other row is drawn as ".".
n <- nrow(dat2)
labels <- seq_len(n)
labels[-outliers] <- "."
biplot(prcomp(dat2), cex = 0.8, xlabs = labels)

# Visualise the outliers on a pairs plot: red "+" for outliers, black "."
# for every other row.
pch <- rep(".", n)
pch[outliers] <- "+"

col <- rep("black", n)
col[outliers] <- "red"

pairs(dat2, pch = pch, col = col)

# Visualise the outliers in 3D (displ, cty, hwy), reusing the same colours
library(rgl)
## Warning: package 'rgl' was built under R version 4.2.3
plot3d(dat2$displ, dat2$cty, dat2$hwy, type = "s", col = col)