library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
# Read the exercise data set from the CSV file in the working directory.
# NOTE(review): the printed data further down shows the literal text "NaN" in
# `Kota` and `Pengeluaran..Rp.`, and summary() reports both columns as
# character, so is.na() does not flag those cells. Confirm whether they should
# be parsed as missing (e.g. via `na.strings`) and whether the "." thousands
# separators in `Pengeluaran..Rp.` need to be stripped before numeric use.
df_outlieer <- read.csv(file = "LATIHAN OUTLIER.csv")
# Cell-by-cell logical matrix: TRUE marks a value that R treats as missing.
df_outlieer |> is.na()
##          ID  Nama  Umur   IPK Pengeluaran..Rp.  Kota
##  [1,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [2,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [3,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [4,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [5,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [6,] FALSE FALSE  TRUE FALSE            FALSE FALSE
##  [7,] FALSE FALSE FALSE  TRUE            FALSE FALSE
##  [8,] FALSE FALSE FALSE FALSE            FALSE FALSE
##  [9,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [10,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [11,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [12,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [13,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [14,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [15,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [16,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [17,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [18,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [19,] FALSE FALSE FALSE  TRUE            FALSE FALSE
## [20,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [21,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [22,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [23,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [24,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [25,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [26,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [27,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [28,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [29,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [30,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [31,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [32,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [33,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [34,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [35,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [36,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [37,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [38,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [39,] FALSE FALSE FALSE FALSE            FALSE FALSE
## [40,] FALSE FALSE FALSE FALSE            FALSE FALSE
# Number of missing values per column (column-wise sum of the NA mask).
df_outlieer |> is.na() |> colSums()
##               ID             Nama             Umur              IPK 
##                0                0                1                2 
## Pengeluaran..Rp.             Kota 
##                0                0
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Plot the missing-value pattern of every column with VIM::aggr().
# `prop = FALSE` requests counts instead of proportions and `numbers = TRUE`
# prints the figures for each combination next to the plot.
aggr(df_outlieer, numbers = TRUE, prop = FALSE)

# Argument reference: run `?aggr` in an interactive session. The help lookup
# was removed from the script because it only starts the help server as a
# side effect and contributes nothing to the analysis.
# Median imputation: each missing IPK / Umur value is replaced by the median
# of the observed (non-missing) values of the same column.
df_outlieer$IPK <- replace(
  df_outlieer$IPK,
  is.na(df_outlieer$IPK),
  median(df_outlieer$IPK, na.rm = TRUE)
)
df_outlieer$Umur <- replace(
  df_outlieer$Umur,
  is.na(df_outlieer$Umur),
  median(df_outlieer$Umur, na.rm = TRUE)
)
# Show the full data frame after imputation.
print(df_outlieer)
##    ID   Nama Umur  IPK Pengeluaran..Rp.     Kota
## 1   1   Andi   20 3.50        1.000.000  Jakarta
## 2   2   Budi   21 2.80        1.200.000 Surabaya
## 3   3  Citra   19 3.90              800  Bandung
## 4   4   Deni   22 3.20        1.500.000    Medan
## 5   5   Deni   22 3.20        1.500.000    Medan
## 6   6    Eka   20 3.00              900    Jogja
## 7   7 Farhan   20 3.25        1.100.000 Makassar
## 8   8   Gita   21 3.70      150.000.000  Jakarta
## 9   9   Hadi   20 3.40        1.000.000      NaN
## 10 10  Indah   19 3.60              850 Semarang
## 11 11   Jaka   45 3.10        2.000.000  Bandung
## 12 12  Kevin   20 0.50              950 Surabaya
## 13 13  Laras   21 3.80              NaN  Jakarta
## 14 14   Maya   20 3.30        1.050.000    Medan
## 15 15   Niko   22 2.90        1.300.000    Jogja
## 16 16 Olivia   19 4.00              700  Bandung
## 17 17  Putri   20 3.50        1.000.000  Jakarta
## 18 18   Qori   21 3.20        1.150.000 Surabaya
## 19 19   Rian   22 3.25        1.400.000 Semarang
## 20 20   Sari   20 3.70              900 Makassar
## 21 21   Tono   20 3.50        1.000.000  Jakarta
## 22 22   Umar   21 3.00        1.200.000 Surabaya
## 23 23   Umar   21 3.00        1.200.000 Surabaya
## 24 24   Vina   19 3.90              800  Bandung
## 25 25  Wawan   22 3.20        1.500.000    Medan
## 26 26   Xena   20 3.00              900    Jogja
## 27 27   Yogi   20 1.20        1.100.000 Makassar
## 28 28   Zara   21 3.70              500  Jakarta
## 29 29   Andi   20 3.50        1.000.000  Jakarta
## 30 30   Budi   21 2.80        1.200.000 Surabaya
## 31 31  Citra   19 3.90              800  Bandung
## 32 32   Deni   22 3.20        1.500.000    Medan
## 33 33    Eka   20 3.00              900    Jogja
## 34 34 Farhan   20 3.10        1.100.000 Makassar
## 35 35   Gita   21 3.70        1.000.000  Jakarta
## 36 36   Hadi   20 3.40        1.000.000 Semarang
## 37 37  Indah   19 3.60              850 Semarang
## 38 38   Jaka   21 3.10        2.000.000  Bandung
## 39 39  Kevin   20 3.20              950 Surabaya
## 40 40  Laras   21 3.80        1.100.000  Jakarta
# Re-count missing values per column to confirm the imputation left none.
df_outlieer |> is.na() |> colSums()
##               ID             Nama             Umur              IPK 
##                0                0                0                0 
## Pengeluaran..Rp.             Kota 
##                0                0
# Flag IPK outliers with the 1.5 * IQR rule.
Q1 <- quantile(df_outlieer$IPK, 0.25)
Q3 <- quantile(df_outlieer$IPK, 0.75)
# Interquartile range. NOTE: this variable shares its name with stats::IQR();
# the name is kept so any later code that reads `IQR` keeps working.
IQR <- Q3 - Q1

# Fences: values outside [lower_bound, upper_bound] are treated as outliers.
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliersa <- df_outlieer$IPK < lower_bound  # below the lower fence
outliers <- df_outlieer$IPK > upper_bound   # above the upper fence
# Number of low-side outliers.
sum(outliersa)
## [1] 2
# Number of high-side outliers. Previously `outliersa` was summed a second
# time here, so the upper-fence mask was never reported. With the quartiles
# shown by summary() (3.075 and 3.625) the upper fence is 4.45, above the
# maximum IPK of 4.000, hence no high-side outliers.
sum(outliers)
## [1] 0
# Box plot of IPK to visualise the outliers. The title previously read
# "Boxplot Ozone", which does not match the plotted variable.
boxplot(df_outlieer$IPK, main = "Boxplot IPK", col = "red")

# Exact full-row duplicates. Every row carries its own ID, so records that
# repeat all other fields (e.g. IDs 4 and 5 in the printed data) are not
# caught by this check.
sum(duplicated(df_outlieer))
## [1] 0
# Repeated records: compare rows on every column except `ID`, so entries that
# differ only in their ID are counted as duplicates.
sum(duplicated(df_outlieer[setdiff(names(df_outlieer), "ID")]))
# Per-column overview: quartiles for numeric columns, length/class for
# character columns.
summary(df_outlieer)
##        ID            Nama                Umur            IPK       
##  Min.   : 1.00   Length:40          Min.   :19.00   Min.   :0.500  
##  1st Qu.:10.75   Class :character   1st Qu.:20.00   1st Qu.:3.075  
##  Median :20.50   Mode  :character   Median :20.00   Median :3.250  
##  Mean   :20.50                      Mean   :21.05   Mean   :3.240  
##  3rd Qu.:30.25                      3rd Qu.:21.00   3rd Qu.:3.625  
##  Max.   :40.00                      Max.   :45.00   Max.   :4.000  
##  Pengeluaran..Rp.       Kota          
##  Length:40          Length:40         
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##