outlier

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(outliers)

# Read the 2019 data set. The file is addressed by its full path instead of
# calling setwd(), so the working directory of the session is not changed as
# a side effect of running this script.
data_dir <- "D:/R CSV"
df <- read.csv(file.path(data_dir, "2019.csv"))

# Drop Overall.rank and Country.or.region; the plots, tests and k-means calls
# further down are applied to every column that remains in `df`.
df <- subset(df, select = -c(Overall.rank, Country.or.region))

# One boxplot per column, four panels (2 x 2) per page.
# seq_along(df) replaces the hard-coded 1:7 so the loop follows the actual
# number of columns instead of failing or skipping columns if it changes.
par(mfrow = c(2, 2))
for (i in seq_along(df)) {
  boxplot(df[[i]], main = names(df)[i])
}

# Grubbs test for a single outlier, applied to each column in turn; sapply()
# arranges the per-column results side by side.
# The lambda argument shadows the data frame name on purpose: inside the
# function `df` is one column, and the printed data.name shows that name.
sapply(df, function(df) {
  grubbs.test(df)
})

##             Score                              GDP.per.capita                
## statistic   numeric,2                          numeric,2                     
## alternative "lowest value 2.853 is an outlier" "lowest value 0 is an outlier"
## p.value     1                                  1                             
## method      "Grubbs test for one outlier"      "Grubbs test for one outlier" 
## data.name   "df"                               "df"                          
##             Social.support                 Healthy.life.expectancy       
## statistic   numeric,2                      numeric,2                     
## alternative "lowest value 0 is an outlier" "lowest value 0 is an outlier"
## p.value     0.00262529                     0.1873518                     
## method      "Grubbs test for one outlier"  "Grubbs test for one outlier" 
## data.name   "df"                           "df"                          
##             Freedom.to.make.life.choices   Generosity                         
## statistic   numeric,2                      numeric,2                          
## alternative "lowest value 0 is an outlier" "highest value 0.566 is an outlier"
## p.value     0.4379682                      0.003153117                        
## method      "Grubbs test for one outlier"  "Grubbs test for one outlier"      
## data.name   "df"                           "df"                               
##             Perceptions.of.corruption          
## statistic   numeric,2                          
## alternative "highest value 0.453 is an outlier"
## p.value     0.01703929                         
## method      "Grubbs test for one outlier"      
## data.name   "df"

# Histogram of every column, drawn on a 2 x 2 grid with reduced margins.
# Note: despite its name, col_num holds the column names, not a count.
col_num <- colnames(df)
par(mar = c(3, 3, 1, 1), mfrow = c(2, 2))
for (i in col_num) {
  hist(df[[i]], main = i, las = 2, col = "green")
}

# Plot the gap statistic to choose the number of k-means clusters.
# kmeans() starts from randomly chosen centres, so the seed is fixed to make
# the plot repeatable from run to run.
set.seed(123)
fviz_nbclust(df, kmeans, method = "gap_stat") +
  labs(subtitle = "Gap statistic method")

# k-means with 5 clusters on the data before outlier removal.
# The seed makes the partition repeatable; nstart = 25 runs 25 random starts
# and keeps the best one instead of relying on a single random initialisation.
set.seed(123)
km.res <- kmeans(df, centers = 5, nstart = 25)

# Cluster plot with normal-distribution ellipses around each cluster.
fviz_cluster(km.res, df, ellipse.type = "norm")

# Number of rows and columns before any outlier removal.
dim(df)

## [1] 156   7

# Attribute: Social.support
# Boxplot summary of the column; $out lists the values beyond the whiskers.
boxplot.stats(df[["Social.support"]])

## $stats
## [1] 0.5170 1.0555 1.2715 1.4530 1.6240
## 
## $n
## [1] 156
## 
## $conf
## [1] 1.221216 1.321784
## 
## $out
## [1] 0.437 0.447 0.378 0.000

# Trim Social.support outliers in four passes. Removing rows shifts the
# quartiles, so every pass re-derives the flagged values from the rows that
# are still present.
for (pass in seq_len(4)) {
  flagged <- boxplot.stats(df$Social.support)$out
  # A logical mask is used instead of -which(...): when nothing is flagged,
  # which() returns integer(0) and df[-integer(0), ] would drop every row.
  df <- df[!(df$Social.support %in% flagged), ]
}

# Attribute: Healthy.life.expectancy
# Boxplot summary of the column; $out lists the values beyond the whiskers.
boxplot.stats(df[["Healthy.life.expectancy"]])

## $stats
## [1] 0.1680 0.5775 0.8035 0.8920 1.1410
## 
## $n
## [1] 148
## 
## $conf
## [1] 0.7626542 0.8443458
## 
## $out
## [1] 0

# Remove the rows whose Healthy.life.expectancy is flagged by the boxplot
# rule. A logical mask replaces -which(...): with no flagged value, which()
# returns integer(0) and df[-integer(0), ] would drop every row.
flagged <- boxplot.stats(df$Healthy.life.expectancy)$out
df <- df[!(df$Healthy.life.expectancy %in% flagged), ]

# Attribute: Freedom.to.make.life.choices
# Boxplot summary of the column; $out lists the values beyond the whiskers.
boxplot.stats(df[["Freedom.to.make.life.choices"]])

## $stats
## [1] 0.0660 0.3175 0.4300 0.5080 0.6310
## 
## $n
## [1] 147
## 
## $conf
## [1] 0.4051748 0.4548252
## 
## $out
## [1] 0.026

# Remove the rows whose Freedom.to.make.life.choices is flagged by the
# boxplot rule. A logical mask replaces -which(...): with no flagged value,
# which() returns integer(0) and df[-integer(0), ] would drop every row.
flagged <- boxplot.stats(df$Freedom.to.make.life.choices)$out
df <- df[!(df$Freedom.to.make.life.choices %in% flagged), ]

# Attribute: Generosity
# Boxplot summary of the column; $out lists the values beyond the whiskers.
boxplot.stats(df[["Generosity"]])

## $stats
## [1] 0.0000 0.1070 0.1775 0.2520 0.3750
## 
## $n
## [1] 146
## 
## $conf
## [1] 0.1585395 0.1964605
## 
## $out
## [1] 0.498 0.566

# Remove the rows whose Generosity is flagged by the boxplot rule.
# A logical mask replaces -which(...): with no flagged value, which()
# returns integer(0) and df[-integer(0), ] would drop every row.
flagged <- boxplot.stats(df$Generosity)$out
df <- df[!(df$Generosity %in% flagged), ]

# Attribute: Perceptions.of.corruption
# Boxplot summary of the column; $out lists the values beyond the whiskers.
boxplot.stats(df[["Perceptions.of.corruption"]])

## $stats
## [1] 0.0000 0.0470 0.0835 0.1425 0.2780
## 
## $n
## [1] 144
## 
## $conf
## [1] 0.07092583 0.09607417
## 
## $out
##  [1] 0.393 0.410 0.341 0.298 0.343 0.373 0.380 0.308 0.290 0.316 0.310 0.453
## [13] 0.287 0.411

# Trim Perceptions.of.corruption outliers in three passes. Removing rows
# shifts the quartiles, so every pass re-derives the flagged values from the
# rows that are still present.
for (pass in seq_len(3)) {
  flagged <- boxplot.stats(df$Perceptions.of.corruption)$out
  # A logical mask is used instead of -which(...): when nothing is flagged,
  # which() returns integer(0) and df[-integer(0), ] would drop every row.
  df <- df[!(df$Perceptions.of.corruption %in% flagged), ]
}

# Outlier cleaning: number of rows and columns left after the removals above.
dim(df)

## [1] 124   7

# Histogram of every column of the cleaned data, drawn on a 2 x 2 grid with
# reduced margins.
# Note: despite its name, col_num holds the column names, not a count.
col_num <- colnames(df)
par(mar = c(3, 3, 1, 1), mfrow = c(2, 2))
for (i in col_num) {
  hist(df[[i]], main = i, las = 2, col = "green")
}

# One boxplot per column of the cleaned data, four panels (2 x 2) per page.
# seq_along(df) replaces the hard-coded 1:7 so the loop follows the actual
# number of columns instead of failing or skipping columns if it changes.
par(mfrow = c(2, 2))
for (i in seq_along(df)) {
  boxplot(df[[i]], main = names(df)[i])
}

# Grubbs test for a single outlier, repeated on each column of the cleaned
# data; sapply() arranges the per-column results side by side.
# The lambda argument shadows the data frame name on purpose: inside the
# function `df` is one column, and the printed data.name shows that name.
sapply(df, function(df) {
  grubbs.test(df)
})

##             Score                              
## statistic   numeric,2                          
## alternative "highest value 7.494 is an outlier"
## p.value     0.856768                           
## method      "Grubbs test for one outlier"      
## data.name   "df"                               
##             GDP.per.capita                    
## statistic   numeric,2                         
## alternative "lowest value 0.073 is an outlier"
## p.value     1                                 
## method      "Grubbs test for one outlier"     
## data.name   "df"                              
##             Social.support                    
## statistic   numeric,2                         
## alternative "lowest value 0.666 is an outlier"
## p.value     0.7477548                         
## method      "Grubbs test for one outlier"     
## data.name   "df"                              
##             Healthy.life.expectancy           
## statistic   numeric,2                         
## alternative "lowest value 0.168 is an outlier"
## p.value     0.4964903                         
## method      "Grubbs test for one outlier"     
## data.name   "df"                              
##             Freedom.to.make.life.choices      
## statistic   numeric,2                         
## alternative "lowest value 0.066 is an outlier"
## p.value     0.6943657                         
## method      "Grubbs test for one outlier"     
## data.name   "df"                              
##             Generosity                         
## statistic   numeric,2                          
## alternative "highest value 0.375 is an outlier"
## p.value     0.582165                           
## method      "Grubbs test for one outlier"      
## data.name   "df"                               
##             Perceptions.of.corruption          
## statistic   numeric,2                          
## alternative "highest value 0.183 is an outlier"
## p.value     1                                  
## method      "Grubbs test for one outlier"      
## data.name   "df"

# k-means with 5 clusters on the cleaned data.
# The seed makes the partition repeatable; nstart = 25 runs 25 random starts
# and keeps the best one instead of relying on a single random initialisation.
set.seed(123)
km.res <- kmeans(df, centers = 5, nstart = 25)

# Cluster plot with normal-distribution ellipses around each cluster.
fviz_cluster(km.res, df, ellipse.type = "norm")

outlier

CHANDY ANUGRA PRATAMA

2024-03-07