1 Eksplorasi Data

df <- read.csv("D:/Kuliah/Mat/TSA Kominfo/Praktikum/2.ChurnDetection.csv", header = TRUE, sep = ",")

# Melihat struktur data
cat("STRUKTUR :", "\n\n")
## STRUKTUR :
str(df) 
## 'data.frame':    667 obs. of  20 variables:
##  $ State                 : chr  "LA" "IN" "NY" "SC" ...
##  $ Account.length        : int  117 65 161 111 49 36 65 119 10 68 ...
##  $ Area.code             : int  408 415 415 415 510 408 415 415 408 415 ...
##  $ International.plan    : chr  "No" "No" "No" "No" ...
##  $ Voice.mail.plan       : chr  "No" "No" "No" "No" ...
##  $ Number.vmail.messages : int  0 0 0 0 0 30 0 0 0 0 ...
##  $ Total.day.minutes     : num  184 129 333 110 119 ...
##  $ Total.day.calls       : int  97 137 67 103 117 128 120 114 112 70 ...
##  $ Total.day.charge      : num  31.4 21.9 56.6 18.8 20.3 ...
##  $ Total.eve.minutes     : num  352 228 318 137 215 ...
##  $ Total.eve.calls       : int  80 83 97 102 109 80 122 117 66 164 ...
##  $ Total.eve.charge      : num  29.9 19.4 27 11.7 18.3 ...
##  $ Total.night.minutes   : num  216 209 161 190 179 ...
##  $ Total.night.calls     : int  90 111 128 105 90 109 118 91 57 103 ...
##  $ Total.night.charge    : num  9.71 9.4 7.23 8.53 8.04 ...
##  $ Total.intl.minutes    : num  8.7 12.7 5.4 7.7 11.1 14.5 13.2 8.8 11.4 12.1 ...
##  $ Total.intl.calls      : int  4 6 9 6 1 6 5 3 6 3 ...
##  $ Total.intl.charge     : num  2.35 3.43 1.46 2.08 3 3.92 3.56 2.38 3.08 3.27 ...
##  $ Customer.service.calls: int  1 4 4 2 1 0 3 5 2 3 ...
##  $ Churn                 : chr  "False" "True" "True" "False" ...
# Show a statistical summary of every column
cat("\n\n", "SUMMARY :", "\n\n")
## 
## 
##  SUMMARY :
summary(df)
##     State           Account.length    Area.code     International.plan
##  Length:667         Min.   :  1.0   Min.   :408.0   Length:667        
##  Class :character   1st Qu.: 76.0   1st Qu.:408.0   Class :character  
##  Mode  :character   Median :102.0   Median :415.0   Mode  :character  
##                     Mean   :102.8   Mean   :436.2                     
##                     3rd Qu.:128.0   3rd Qu.:415.0                     
##                     Max.   :232.0   Max.   :510.0                     
##  Voice.mail.plan    Number.vmail.messages Total.day.minutes Total.day.calls
##  Length:667         Min.   : 0.000        Min.   : 25.9     Min.   : 30.0  
##  Class :character   1st Qu.: 0.000        1st Qu.:146.2     1st Qu.: 87.5  
##  Mode  :character   Median : 0.000        Median :178.3     Median :101.0  
##                     Mean   : 8.408        Mean   :180.9     Mean   :100.9  
##                     3rd Qu.:20.000        3rd Qu.:220.7     3rd Qu.:115.0  
##                     Max.   :51.000        Max.   :334.3     Max.   :165.0  
##  Total.day.charge Total.eve.minutes Total.eve.calls Total.eve.charge
##  Min.   : 4.40    Min.   : 48.1     Min.   : 37.0   Min.   : 4.09   
##  1st Qu.:24.86    1st Qu.:171.1     1st Qu.: 88.0   1st Qu.:14.54   
##  Median :30.31    Median :203.7     Median :101.0   Median :17.31   
##  Mean   :30.76    Mean   :203.4     Mean   :100.5   Mean   :17.29   
##  3rd Qu.:37.52    3rd Qu.:236.4     3rd Qu.:113.0   3rd Qu.:20.09   
##  Max.   :56.83    Max.   :361.8     Max.   :168.0   Max.   :30.75   
##  Total.night.minutes Total.night.calls Total.night.charge Total.intl.minutes
##  Min.   : 23.2       Min.   : 42.0     Min.   : 1.040     Min.   : 0.00     
##  1st Qu.:167.9       1st Qu.: 86.0     1st Qu.: 7.560     1st Qu.: 8.60     
##  Median :201.6       Median :100.0     Median : 9.070     Median :10.50     
##  Mean   :199.7       Mean   :100.1     Mean   : 8.986     Mean   :10.24     
##  3rd Qu.:231.5       3rd Qu.:113.5     3rd Qu.:10.420     3rd Qu.:12.05     
##  Max.   :367.7       Max.   :175.0     Max.   :16.550     Max.   :18.30     
##  Total.intl.calls Total.intl.charge Customer.service.calls    Churn          
##  Min.   : 0.000   Min.   :0.000     Min.   :0.000          Length:667        
##  1st Qu.: 3.000   1st Qu.:2.320     1st Qu.:1.000          Class :character  
##  Median : 4.000   Median :2.840     Median :1.000          Mode  :character  
##  Mean   : 4.528   Mean   :2.765     Mean   :1.564                            
##  3rd Qu.: 6.000   3rd Qu.:3.255     3rd Qu.:2.000                            
##  Max.   :18.000   Max.   :4.940     Max.   :8.000
summary(df$Total.day.calls)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    30.0    87.5   101.0   100.9   115.0   165.0

1.1 Histogram

# Histogram of the daytime call counts. The returned object keeps the bin
# breaks, counts and midpoints, which are printed below. The x-axis shows
# call counts, so it is labelled as such.
histogram <- hist(df$Total.day.calls, main="HISTOGRAM Total Calls (Day) ", xlab = "Total Calls")

cat("Breaks :", histogram$breaks, "\n")
## Breaks : 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170
cat("Counts :", histogram$counts, "\n")
## Counts : 3 1 11 30 56 99 125 127 96 75 24 18 0 2
cat("Mids :", histogram$mids, "\n")
## Mids : 35 45 55 65 75 85 95 105 115 125 135 145 155 165
# Same data on a density scale (freq = FALSE) instead of raw counts
hist(df$Total.day.calls, freq = FALSE, main="HISTOGRAM Total Calls", xlab = "Total Calls")

# Density-scale histogram asking for about 30 bins; the x-axis shows call
# counts, so it is labelled as such.
histo.2 <- hist(df$Total.day.calls, freq = FALSE, breaks = 30,
                main = "HISTOGRAM Total Calls", xlab = "Total Calls")

cat("BREAKS :", histo.2$breaks, "\n\n")
## BREAKS : 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165
n <- length(df$Total.day.calls) # number of observations
k1 <- sqrt(n) # square root of the number of observations

# Histogram with the bin count suggested by k1; the x-axis shows call counts
histo.3 <- hist(df$Total.day.calls, freq = FALSE, breaks = k1,
                main = "HISTOGRAM Total Calls", xlab = "Total Calls")

cat("BREAKS :", histo.3$breaks, "\n\n")
## BREAKS : 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165
length(histo.3$breaks)
## [1] 28
k2 <- log2(n) + 1  # formula proposed by H.A. Sturges

# Histogram with the bin count suggested by k2; the x-axis shows call counts
histo.4 <- hist(df$Total.day.calls, freq = FALSE, breaks = k2,
                main = "HISTOGRAM Total Calls", xlab = "Total Calls")

cat("BREAKS :", histo.4$breaks, "\n\n")
## BREAKS : 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170
k3 <- 2 * n^(1/3) # formula proposed by Rice University

# Histogram with the bin count suggested by k3; the x-axis shows call counts
histo.5 <- hist(df$Total.day.calls, freq = FALSE, breaks = k3,
                main = "HISTOGRAM Total Calls", xlab = "Total Calls")

cat("BREAKS :", histo.5$breaks, "\n\n")
## BREAKS : 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170

1.1.1 Histogram dengan kurva

# visualisasi histogram dengan kurva
hist(df$Total.day.calls, freq = FALSE, main = "Distribusi Total Panggilan",
     xlab = "Total Panggilan",  col = "lightgreen",
     xlim = c(20,180), ylim = c(0,0.02))
# Menambahkan kurva Normal
curve(dnorm(x, mean = mean(df$Total.day.calls), sd = sd(df$Total.day.calls)), 
      add = TRUE, col = "darkblue", lwd = 2)

1.1.2 Histogram (perbandingan sebaran)

# Two panels side by side: daytime call counts below 100 on the left,
# 100 and above on the right.
par(mfrow = c(1, 2))

hist(df$Total.day.calls[df$Total.day.calls < 100],
     xlab = "Total Panggilan dibawah 100", col = "green", main = "")

hist(df$Total.day.calls[df$Total.day.calls >= 100],
     xlab = "Total Panggilan diatas 100", col = "blue", main = "")

# Overlay the kernel density estimates of the day, evening and night call
# counts on one set of axes so the three distributions can be compared.
plot(density(df$Total.day.calls), main = "Perbandingan", ylim = c(0, 0.022))
lines(density(df$Total.eve.calls), col = "green")
lines(density(df$Total.night.calls), col = "red")

# Legend entries are listed in the same order as the curves are drawn above
legend("topleft", c("Day Calls", "Evening Calls", "Night Calls"),
       col = c("black", "green", "red"), lty = 1)

# Density-scale histogram of the number of international calls
hist(df$Total.intl.calls, freq = FALSE, breaks = 20,
     main = "Distribusi Total Panggilan Int",
     xlab = "Total Panggilan", col = "lightgreen")

# Density-scale histogram of the international call charge; the title and
# axis label name the charge rather than repeating the call-count labels.
hist(df$Total.intl.charge, freq = FALSE, breaks = 20,
     main = "Distribusi Total Biaya Panggilan Int",
     xlab = "Total Biaya", col = "lightgreen")

1.2 Boxplot

1.2.1 Visualisasi boxplot (vertical)

# Vertical boxplot (boxplot's default orientation), matching this section's
# heading; the values run along the y-axis, which ylab labels.
boxplot(df$Total.day.calls, main = "Distribusi Total Panggilan",
        ylab = "Total Panggilan")

1.2.2 Visualisasi boxplot (horizontal)

# Horizontal boxplot: the values run along the x-axis, so that is the axis
# that gets the label.
boxplot(df$Total.day.calls, horizontal = TRUE,
        main = "Distribusi Total Panggilan", xlab = "Total Panggilan")

1.2.3 Statistik lima Serangkai

# The minimum, computed directly ...
cat("Minimum  :", min(df$Total.day.calls), "\n")
## Minimum  : 30
# ... or as the first element (the 0% quantile) of quantile()
cat("Minimum  :", quantile(df$Total.day.calls)[1], "\n")
## Minimum  : 30
cat("Q1 (25%) :", quantile(df$Total.day.calls)[2], "\n")
## Q1 (25%) : 87.5
cat("Q2 (Med) :", quantile(df$Total.day.calls)[3], "\n")
## Q2 (Med) : 101
cat("Q3 (75%) :", quantile(df$Total.day.calls)[4], "\n")
## Q3 (75%) : 115
cat("Maximum  :", max(df$Total.day.calls), "\n") 
## Maximum  : 165
quantile(df$Total.day.calls)[5]
## 100% 
##  165
quantile(df$Total.day.calls)
##    0%   25%   50%   75%  100% 
##  30.0  87.5 101.0 115.0 165.0

1.2.4 Alternatif Kuantil (dan persentil)

1.2.4.1 Q1, Median (Q2), Q3

# Quartiles of the daytime call counts. names = FALSE drops the percentage
# label so that a plain number is stored.
Q1 <- quantile(df$Total.day.calls, probs = 0.25, names = FALSE)
Q2 <- quantile(df$Total.day.calls, probs = 0.5, names = FALSE)
Q3 <- quantile(df$Total.day.calls, probs = 0.75, names = FALSE)

cat("Kuartil 1:", Q1, "\n")
## Kuartil 1: 87.5
cat("Median:", Q2, "\n")
## Median: 101
cat("Kuartil 3:", Q3, "\n")
## Kuartil 3: 115

1.2.4.2 Persentil ke-5 dan ke-95

# 5th and 95th percentiles of the daytime call counts, stored without the
# percentage label.
P5 <- quantile(df$Total.day.calls, probs = 0.05, names = FALSE)
P95 <- quantile(df$Total.day.calls, probs = 0.95, names = FALSE)

cat("Persentil-5:", P5, "\n")
## Persentil-5: 68
cat("Persentil-95:", P95, "\n")
## Persentil-95: 134

1.2.4.3 Deteksi Amatan Pencilan

# First and third quartiles, kept with their "25%" / "75%" labels
Q1 <- quantile(df$Total.day.calls, probs = 0.25)
Q3 <- quantile(df$Total.day.calls, probs = 0.75)

# Fences placed 1.5 times the interquartile distance beyond the quartiles
BB <- Q1 - 1.5 * (Q3 - Q1) # lower bound
BA <- Q3 + 1.5 * (Q3 - Q1) # upper bound

cat("Batas Bawah :", BB, "\n")
## Batas Bawah : 46.25
cat("Batas Atas :", BA, "\n")
## Batas Atas : 156.25

Amatan dengan nilai kurang dari Batas Bawah atau lebih dari Batas Atas merupakan amatan pencilan (outlier). Berikut pengecekan amatan yang berada di bawah Batas Bawah dan di atas Batas Atas.

df[df$Total.day.calls < BB,]$Total.day.calls
## [1] 30 35 40
df[df$Total.day.calls > BA,]$Total.day.calls
## [1] 163 165

1.2.5 Boxplot By Group

boxplot(Total.day.calls ~ Churn, data = df,
        col = c("#FFE0B2", "#F57C00"), horizontal = TRUE)

boxplot(Total.intl.calls ~ Churn, data = df,
        col = c("#FFE0B2", "#F57C00"), horizontal = TRUE)

boxplot(Customer.service.calls ~ Churn, data = df,
        col = c("#FFE0B2", "#F57C00"), horizontal = TRUE)

1.2.6 Layout

Membuat Layout untuk Tampilan Histogram dan Boxplot

nf <- layout(mat = matrix(c(1,2), 2,1, byrow = TRUE), heights = c(3,1.5))

par(mar=c(3.1, 3.1, 2.1, 2.1))

1.2.6.1 Histogram

hist(df$Total.day.calls, col = "lightblue", main = "Sebaran Total Panggilan") 

1.2.6.2 Boxplot

# Horizontal boxplot drawn without the surrounding frame. TRUE/FALSE are
# spelled out because T and F are ordinary variables that can be reassigned.
boxplot(df$Total.day.calls, horizontal = TRUE, frame = FALSE, col = "lightgreen")

props.1 <- table(df[,c("Churn", "International.plan")])

props.1
##        International.plan
## Churn    No Yes
##   False 538  34
##   True   76  19
prop.table(props.1)
##        International.plan
## Churn           No        Yes
##   False 0.80659670 0.05097451
##   True  0.11394303 0.02848576
barplot(prop.table(props.1, 2),
        col = c("orange", "lightgreen"),  
        legend = c("False", "True"))

barplot(prop.table(props.1, 2), col = c("orange", "lightgreen"))

cat("Mean   :", mean(df$Total.day.calls), "\n")
## Mean   : 100.937
cat("Median :", median(df$Total.day.calls), "\n")
## Median : 101
# Mode (most frequent value) of a vector.
#
# v: an atomic vector.
# Returns the value of `v` that occurs most often; when several values share
# the highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  nilai_unik <- unique(v)
  # Position of each element within the distinct values, then a count per position
  frekuensi <- tabulate(match(v, nilai_unik))
  nilai_unik[which.max(frekuensi)]
}

cat("Mode   :", getmode(df$Total.day.calls))
## Mode   : 102
# Alternative mode finder: the most frequent value of `data`.
#
# data: an atomic vector (numeric, character, ...).
# Returns a length-one vector of the same type as `data` (previously the
# value came back as a character string, e.g. "102" instead of 102).
# Ties go to the smallest value, NA entries are ignored, and an empty input
# gives an empty vector.
getmode.alt <- function(data) {
  # Distinct values in ascending order; sort() drops NA
  nilai <- sort(unique(data))
  # Number of occurrences of each distinct value
  frek <- tabulate(match(data, nilai), nbins = length(nilai))
  # Value with the highest count
  nilai[which.max(frek)]
}

cat("Mode   :", getmode.alt(df$Total.day.calls))
## Mode   : 102

1.3 Penyebaran Data

Ukuran Penyebaran Data

ragam <- var(df$Total.day.calls)
sp.baku <- sd(df$Total.day.calls) # atau --> sqrt(var(df$Total.day.calls))
jangkauan <- range(df$Total.day.calls)
jak <- IQR(df$Total.day.calls)

cat("Ragam     :", ragam, "\n")
## Ragam     : 416.0291
cat("Sp. Baku  :", sp.baku, "\n")
## Sp. Baku  : 20.39679
cat("Jangkauan :", jangkauan, "\n")
## Jangkauan : 30 165
cat("JAK       :", jak, "\n")
## JAK       : 27.5

1.3.1 Coef. of Variation

cv <- sd(df$Total.day.calls)/mean(df$Total.day.calls) * 100

cat("Coefficient of Variation :", cv, "\n")
## Coefficient of Variation : 20.20744

1.3.2 Koefisien Gini

# install.packages("DescTools")
library("DescTools")
gini <- DescTools::Gini(df$Total.day.calls)
# Print the Gini coefficient; the label no longer carries the stray quote
# characters that the previous string emitted.
cat("Gini Coef :", gini, "\n")
## Gini Coef : 0.113967
plot(ineq::Lc(df$Total.day.calls),col="darkred",lwd=2)
## Registered S3 methods overwritten by 'ineq':
##   method   from     
##   plot.Lc  DescTools
##   lines.Lc DescTools

1.3.3 Bentuk Sebaran

moments::skewness(df$Total.day.calls) # Kemencengan
## [1] -0.05265871
moments::kurtosis(df$Total.day.calls) # Keruncingan
## [1] 3.061712

2 Pima Indians Diabetes

Akan dilakukan eksplorasi pada data Pima Indians Diabetes. Memuat Pustaka yang diperlukan.

library(ggplot2)
library(gridExtra)
# Read the Pima Indians Diabetes data by full path instead of changing the
# session's working directory as a side effect.
diabetes <- read.csv(
  file.path("D:/Kuliah/Mat/TSA Kominfo/Praktikum",
            "pima-indians-diabeter-database.csv"),
  sep = ","
)
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

2.1 Daftar variabel :

  • Pregnancies : Number of times pregnant

  • Glucose : Plasma glucose concentration at 2 hours in an oral glucose tolerance test

  • BloodPressure : Diastolic blood pressure (mm Hg)

  • SkinThickness : Triceps skin fold thickness (mm)

  • Insulin : 2-Hour serum insulin (mu U/ml)

  • BMI : Body mass index (weight in kg/(height in m)^2)

  • DiabetesPedigreeFunction : Diabetes pedigree function

  • Age : Age (years)

  • Outcome : Class variable (0 or 1); 268 of 768 are 1, the others are 0

diabetes$Outcome <- as.factor(diabetes$Outcome)
summary(diabetes)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  Outcome
##  0:500  
##  1:268  
##         
##         
##         
## 

2.2 Amatan

2.2.1 Amatan dengan missing value

print(colSums(is.na(diabetes)))
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

2.2.2 Amatan dengan nilai 0

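Mengapa? Data ini tidak memiliki nilai NA, tetapi nilai 0 pada peubah seperti Glucose, BloodPressure, SkinThickness, Insulin, dan BMI tidak masuk akal sebagai hasil pengukuran, sehingga kemungkinan besar menandai data yang hilang.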

# Count, per column, how many observations equal 0. Outcome is counted as
# well, although 0 there is one of its two class labels.
num.zero <- colSums(diabetes == 0)
print(data.frame(num.zero))
##                          num.zero
## Pregnancies                   111
## Glucose                         5
## BloodPressure                  35
## SkinThickness                 227
## Insulin                       374
## BMI                            11
## DiabetesPedigreeFunction        0
## Age                             0
## Outcome                       500

2.2.3 Persentase amatan dengan nilai 0

# Share of zero-valued observations per column, as a percentage with two
# decimals; nrow() gives the number of observations directly.
pct.zero <- round(colSums(diabetes == 0) / nrow(diabetes), 4) * 100

print(data.frame(pct.zero))
##                          pct.zero
## Pregnancies                 14.45
## Glucose                      0.65
## BloodPressure                4.56
## SkinThickness               29.56
## Insulin                     48.70
## BMI                          1.43
## DiabetesPedigreeFunction     0.00
## Age                          0.00
## Outcome                     65.10
# One density curve per numeric column, plus a bar chart of the Outcome classes
plot1 <- ggplot(diabetes, aes(x = Pregnancies)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("Pregnancies")
plot2 <- ggplot(diabetes, aes(x = Glucose)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("Glucose")
plot3 <- ggplot(diabetes, aes(x = BloodPressure)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("BloodPressure")
plot4 <- ggplot(diabetes, aes(x = SkinThickness)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("SkinThickness")
plot5 <- ggplot(diabetes, aes(x = Insulin)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("Insulin")
plot6 <- ggplot(diabetes, aes(x = BMI)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("BMI")
plot7 <- ggplot(diabetes, aes(x = DiabetesPedigreeFunction)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("DiabetesPedigreeFunction")
plot8 <- ggplot(diabetes, aes(x = Age)) +
  geom_density(color = "darkgreen", lwd = 1) +
  ggtitle("Age")
plot9 <- ggplot(diabetes, aes(x = Outcome)) +
  geom_bar(fill = "darkred") +
  ggtitle("Outcome")

# Arrange the nine panels in a grid with three columns
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, plot7, plot8, plot9,
             ncol = 3)

2.3 Kolom

2.3.1 Pregnancies

summary(diabetes$Pregnancies)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.845   6.000  17.000
cat("Jumlah nilai 0:", num.zero[1], "\n")
## Jumlah nilai 0: 111
cat("Persentase nilai 0:", pct.zero[1])
## Persentase nilai 0: 14.45
# Four views of Pregnancies: histogram, boxplot, and density and boxplot
# split by Outcome
plot1 <- ggplot(diabetes, aes(x = Pregnancies)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 10)
plot2 <- ggplot(diabetes, aes(x = Pregnancies)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = Pregnancies, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = Pregnancies)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.2 Glucose

summary(diabetes$Glucose)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   117.0   120.9   140.2   199.0
cat("Jumlah nilai 0:", num.zero[2], "\n")
## Jumlah nilai 0: 5
cat("Persentase nilai 0:", pct.zero[2])
## Persentase nilai 0: 0.65
# Four views of Glucose: histogram, boxplot, and density and boxplot split
# by Outcome
plot1 <- ggplot(diabetes, aes(x = Glucose)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = Glucose)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = Glucose, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = Glucose)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.3 Blood Pressure

summary(diabetes$BloodPressure)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   62.00   72.00   69.11   80.00  122.00
cat("Jumlah nilai 0:", num.zero[3], "\n")
## Jumlah nilai 0: 35
cat("Persentase nilai 0:", pct.zero[3])
## Persentase nilai 0: 4.56
# Four views of BloodPressure: histogram, boxplot, and density and boxplot
# split by Outcome
plot1 <- ggplot(diabetes, aes(x = BloodPressure)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = BloodPressure)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = BloodPressure, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = BloodPressure)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.4 Skin Thickness

summary(diabetes$SkinThickness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   23.00   20.54   32.00   99.00
cat("Jumlah nilai 0:", num.zero[4], "\n")
## Jumlah nilai 0: 227
cat("Persentase nilai 0:", pct.zero[4])
## Persentase nilai 0: 29.56
# Four views of SkinThickness: histogram, boxplot, and density and boxplot
# split by Outcome
plot1 <- ggplot(diabetes, aes(x = SkinThickness)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = SkinThickness)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = SkinThickness, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = SkinThickness)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.5 Insulin

summary(diabetes$Insulin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0    30.5    79.8   127.2   846.0
cat("Jumlah nilai 0:", num.zero[5], "\n")
## Jumlah nilai 0: 374
cat("Persentase nilai 0:", pct.zero[5])
## Persentase nilai 0: 48.7
# Four views of Insulin: histogram, boxplot, and density and boxplot split
# by Outcome
plot1 <- ggplot(diabetes, aes(x = Insulin)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = Insulin)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = Insulin, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = Insulin)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.6 Body Mass Index

summary(diabetes$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.30   32.00   31.99   36.60   67.10
cat("Jumlah nilai 0:", num.zero[6], "\n")
## Jumlah nilai 0: 11
cat("Persentase nilai 0:", pct.zero[6])
## Persentase nilai 0: 1.43
# Four views of BMI: histogram, boxplot, and density and boxplot split by
# Outcome
plot1 <- ggplot(diabetes, aes(x = BMI)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = BMI)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = BMI, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = BMI)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.7 Diabetes Pedigree Function

summary(diabetes$DiabetesPedigreeFunction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0780  0.2437  0.3725  0.4719  0.6262  2.4200
cat("Jumlah nilai 0:", num.zero[7], "\n")
## Jumlah nilai 0: 0
cat("Persentase nilai 0:", pct.zero[7])
## Persentase nilai 0: 0
# Four views of DiabetesPedigreeFunction: histogram, boxplot, and density
# and boxplot split by Outcome
plot1 <- ggplot(diabetes, aes(x = DiabetesPedigreeFunction)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = DiabetesPedigreeFunction)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = DiabetesPedigreeFunction, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = DiabetesPedigreeFunction)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

2.3.8 Age

summary(diabetes$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   24.00   29.00   33.24   41.00   81.00
cat("Jumlah nilai 0:", num.zero[8], "\n")
## Jumlah nilai 0: 0
cat("Persentase nilai 0:", pct.zero[8])
## Persentase nilai 0: 0
# Four views of Age: histogram, boxplot, and density and boxplot split by
# Outcome
plot1 <- ggplot(diabetes, aes(x = Age)) +
  geom_histogram(fill = "red", alpha = 0.7, bins = 20)
plot2 <- ggplot(diabetes, aes(x = Age)) +
  geom_boxplot(fill = "darkgreen", alpha = 0.7)
plot3 <- ggplot(diabetes, aes(x = Age, group = Outcome)) +
  geom_density(aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(diabetes, aes(x = Outcome, y = Age)) +
  geom_boxplot(aes(fill = Outcome))

# Arrange the four panels in two columns
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

3 Outlier Analysis

Data yang digunakan merupakan data “cars” yang tersedia di R.

head(cars)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
str(cars)
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...
cars <- cars[1:30, ] # data awal diambil 30 baris pertama saja

3.1 Plot data awal tanpa outlier.

plot(cars$speed, cars$dist, xlim=c(0, 28), ylim=c(0, 100), 
     main="Speed - Dist", xlab="speed", ylab="dist", 
     pch="*", col="red", cex=2)
abline(lm(dist ~ speed, data=cars), col="darkgreen", lwd=4, lty=2)

3.2 Penambahan outlier pada data

cars_outliers <- data.frame(speed=c(24,24,25,25,25),
                            dist=c(81, 80, 86, 88, 85)) # introduce outliers.
cars_mod1 <- rbind(cars, cars_outliers)  # data dengan outlier.
cars_mod1
##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    24   81
## 32    24   80
## 33    25   86
## 34    25   88
## 35    25   85

3.3 Plot data dengan outlier.

# Two panels side by side: the regression without and with the added points
par(mfrow = c(1, 2))

# Left panel: the data without the added points, and its fitted line
model_1 <- lm(dist ~ speed, data = cars)
plot(cars$speed, cars$dist, xlim = c(0, 28), ylim = c(0, 100),
     main = "Speed - Dist", xlab = "speed", ylab = "dist",
     pch = "*", col = "red", cex = 2)
abline(model_1, col = "darkgreen", lwd = 4, lty = 2)

# Right panel: the data with the added points, and its fitted line (lm = linear model)
# NOTE(review): this panel's y-range (0-230) differs from the left panel's
# (0-100) — confirm that is intended.
model_2 <- lm(dist ~ speed, data = cars_mod1)
plot(cars_mod1$speed, cars_mod1$dist, xlim = c(0, 28), ylim = c(0, 230),
     main = "With Outliers", xlab = "speed", ylab = "dist",
     pch = "*", col = "red", cex = 2)
abline(model_2, col = "blue", lwd = 3, lty = 2)

model_1
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Coefficients:
## (Intercept)        speed  
##      -6.845        2.973
model_2
## 
## Call:
## lm(formula = dist ~ speed, data = cars_mod1)
## 
## Coefficients:
## (Intercept)        speed  
##     -17.129        3.905
# Five extra points with high speed and long stopping distance
cars_outliers <- data.frame(speed = c(24, 24, 25, 25, 25),
                            dist = c(81, 80, 86, 88, 85))

# Append them to `cars`. At this point `cars` is the 30-row subset taken
# earlier, not the full built-in data set, so the result has 35 rows.
cars_mod2 <- rbind(cars, cars_outliers)
# View() opens a data viewer, so it is only called in an interactive session
if (interactive()) {
  View(cars_mod2)
}

length(cars_mod2$speed)
## [1] 35
# Fit and draw the regression line on the data with the added points
model_3 <- lm(dist ~ speed, data = cars_mod2)
plot(cars_mod2$speed, cars_mod2$dist, xlim = c(0, 28), ylim = c(0, 100),
     main = "Speed - Dist", xlab = "speed", ylab = "dist",
     pch = "*", col = "red", cex = 2)
abline(model_3, col = "darkgreen", lwd = 4, lty = 2)

summary(model_3)
## 
## Call:
## lm(formula = dist ~ speed, data = cars_mod2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.453  -8.425   0.358   4.993  42.453 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.1288     5.8048  -2.951  0.00579 ** 
## speed         3.9054     0.3928   9.943 1.87e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.63 on 33 degrees of freedom
## Multiple R-squared:  0.7498, Adjusted R-squared:  0.7422 
## F-statistic: 98.87 on 1 and 33 DF,  p-value: 1.866e-11
influence.measures(model_3)
## Influence measures of
##   lm(formula = dist ~ speed, data = cars_mod2) :
## 
##      dfb.1_  dfb.sped    dffit cov.r   cook.d    hat inf
## 1   0.10699 -0.094246  0.10793 1.203 5.99e-03 0.1203   *
## 2   0.35568 -0.313326  0.35881 1.141 6.45e-02 0.1203    
## 3  -0.13469  0.109816 -0.14107 1.129 1.02e-02 0.0725    
## 4   0.25846 -0.210730  0.27070 1.082 3.67e-02 0.0725    
## 5   0.03571 -0.027935  0.03847 1.130 7.63e-04 0.0604    
## 6  -0.13203  0.097683 -0.14860 1.091 1.12e-02 0.0503    
## 7  -0.05451  0.037223 -0.06565 1.103 2.22e-03 0.0421    
## 8   0.05658 -0.038642  0.06815 1.103 2.39e-03 0.0421    
## 9   0.16988 -0.116012  0.20460 1.047 2.10e-02 0.0421    
## 10 -0.10178  0.061347 -0.13621 1.069 9.42e-03 0.0358    
## 11  0.02482 -0.014960  0.03321 1.101 5.68e-04 0.0358    
## 12 -0.14615  0.070347 -0.23046 0.994 2.61e-02 0.0315    
## 13 -0.08903  0.042856 -0.14040 1.058 9.97e-03 0.0315    
## 14 -0.05213  0.025094 -0.08221 1.084 3.46e-03 0.0315    
## 15 -0.01573  0.007573 -0.02481 1.097 3.17e-04 0.0315    
## 16 -0.05162  0.014249 -0.10526 1.070 5.65e-03 0.0291    
## 17  0.00241 -0.000664  0.00491 1.095 1.24e-05 0.0291    
## 18  0.00241 -0.000664  0.00491 1.095 1.24e-05 0.0291    
## 19  0.08427 -0.023262  0.17184 1.031 1.48e-02 0.0291    
## 20 -0.05140 -0.007505 -0.15888 1.039 1.27e-02 0.0286    
## 21 -0.00680 -0.000993 -0.02101 1.094 2.28e-04 0.0286    
## 22  0.10389  0.015169  0.32110 0.890 4.79e-02 0.0286    
## 23  0.23173  0.033835  0.71623 0.459 1.71e-01 0.0286   *
## 24 -0.04671 -0.070634 -0.31355 0.908 4.61e-02 0.0301    
## 25 -0.03287 -0.049708 -0.22066 0.996 2.39e-02 0.0301    
## 26  0.02647  0.040030  0.17770 1.030 1.58e-02 0.0301    
## 27  0.00336 -0.076952 -0.20070 1.025 2.00e-02 0.0335    
## 28  0.00133 -0.030406 -0.07930 1.088 3.22e-03 0.0335    
## 29  0.04614 -0.146120 -0.28434 0.980 3.92e-02 0.0388    
## 30  0.02422 -0.076717 -0.14928 1.069 1.13e-02 0.0388    
## 31 -0.09257  0.125965  0.14257 1.212 1.04e-02 0.1303   *
## 32 -0.07146  0.097245  0.11006 1.217 6.23e-03 0.1303   *
## 33 -0.13321  0.177100  0.19668 1.236 1.98e-02 0.1510   *
## 34 -0.18225  0.242289  0.26907 1.221 3.69e-02 0.1510   *
## 35 -0.10884  0.144698  0.16069 1.241 1.33e-02 0.1510   *
summary(model_1)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.751  -8.818  -1.993   2.885  45.222 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.8446     8.7420  -0.783 0.440223    
## speed         2.9730     0.7046   4.219 0.000233 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 28 degrees of freedom
## Multiple R-squared:  0.3887, Adjusted R-squared:  0.3669 
## F-statistic:  17.8 on 1 and 28 DF,  p-value: 0.0002327

4 Outlier: Data BPR

setwd("D:/Kuliah/Mat/TSA Kominfo/Praktikum")  # set working directory
bpr <- read.csv("2. eksplorasi01.csv")
head(bpr)
##   Kode.Bank Non.Performing.Loan Growth.Laba.Rugi.Berjalan Spread.Margin
## 1         1            10.49833                 11.538745     11.611141
## 2         2            15.27000                -11.188068     11.691391
## 3         3             1.63000                  9.325274      3.620513
## 4         4            27.65167                  8.167480     12.871722
## 5         5            51.32000                 26.825480     14.429154
## 6         6             9.03000                 13.969876      3.836760
##   Cash.Ratio Modal.inti.thdp.Aset      BOPO Ratio.Kredit.thdp.DPK
## 1   21.76331           0.09385150 0.9067301             0.9671143
## 2   14.10585           0.09061287 1.4005322             1.4584551
## 3   18.23885           0.08618314 0.7979953             0.9339539
## 4   66.29771           0.05432665 1.6785487             1.2375042
## 5   29.92377           0.28228712 1.3184079             0.8857992
## 6   50.76537           0.14190734 0.8538458             1.5697280
##   Rasio.biaya.tenaga.thd.pendapatan.operasional
## 1                                     0.3536502
## 2                                     0.7167977
## 3                                     0.3055795
## 4                                     0.2392941
## 5                                     0.5862838
## 6                                     0.2692363
##   Rasio.Pendapatan.Operasional.thd.Aset.Produktif
## 1                                       0.2591038
## 2                                       0.1616119
## 3                                       0.1904516
## 4                                       0.2774885
## 5                                       0.3958143
## 6                                       0.2129047
str(bpr)
## 'data.frame':    147 obs. of  10 variables:
##  $ Kode.Bank                                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Non.Performing.Loan                            : num  10.5 15.27 1.63 27.65 51.32 ...
##  $ Growth.Laba.Rugi.Berjalan                      : num  11.54 -11.19 9.33 8.17 26.83 ...
##  $ Spread.Margin                                  : num  11.61 11.69 3.62 12.87 14.43 ...
##  $ Cash.Ratio                                     : num  21.8 14.1 18.2 66.3 29.9 ...
##  $ Modal.inti.thdp.Aset                           : num  0.0939 0.0906 0.0862 0.0543 0.2823 ...
##  $ BOPO                                           : num  0.907 1.401 0.798 1.679 1.318 ...
##  $ Ratio.Kredit.thdp.DPK                          : num  0.967 1.458 0.934 1.238 0.886 ...
##  $ Rasio.biaya.tenaga.thd.pendapatan.operasional  : num  0.354 0.717 0.306 0.239 0.586 ...
##  $ Rasio.Pendapatan.Operasional.thd.Aset.Produktif: num  0.259 0.162 0.19 0.277 0.396 ...
summary(bpr)
##    Kode.Bank     Non.Performing.Loan Growth.Laba.Rugi.Berjalan
##  Min.   :  1.0   Min.   : 0.3433     Min.   :-679.401         
##  1st Qu.: 37.5   1st Qu.: 4.9733     1st Qu.:   9.668         
##  Median : 74.0   Median : 9.5683     Median :  13.085         
##  Mean   : 74.0   Mean   :14.1751     Mean   :  90.354         
##  3rd Qu.:110.5   3rd Qu.:19.4375     3rd Qu.:  20.382         
##  Max.   :147.0   Max.   :61.6100     Max.   :8406.827         
##                                                               
##  Spread.Margin       Cash.Ratio      Modal.inti.thdp.Aset      BOPO       
##  Min.   :-47.267   Min.   :  4.904   Min.   :-0.40167     Min.   :0.6342  
##  1st Qu.:  8.627   1st Qu.: 18.125   1st Qu.: 0.09245     1st Qu.:0.8926  
##  Median : 11.219   Median : 25.903   Median : 0.12070     Median :0.9734  
##  Mean   : 10.841   Mean   : 37.481   Mean   : 0.15401     Mean   :1.0727  
##  3rd Qu.: 13.804   3rd Qu.: 41.177   3rd Qu.: 0.19269     3rd Qu.:1.1209  
##  Max.   : 50.109   Max.   :458.130   Max.   : 0.71460     Max.   :2.4607  
##  NA's   :15                                                               
##  Ratio.Kredit.thdp.DPK Rasio.biaya.tenaga.thd.pendapatan.operasional
##  Min.   :0.5961        Min.   :0.1562                               
##  1st Qu.:0.9678        1st Qu.:0.2773                               
##  Median :1.1626        Median :0.3502                               
##  Mean   :1.3924        Mean   :0.3882                               
##  3rd Qu.:1.5659        3rd Qu.:0.4707                               
##  Max.   :4.1955        Max.   :1.1470                               
##                                                                     
##  Rasio.Pendapatan.Operasional.thd.Aset.Produktif
##  Min.   :0.1346                                 
##  1st Qu.:0.1954                                 
##  Median :0.2200                                 
##  Mean   :0.2292                                 
##  3rd Qu.:0.2517                                 
##  Max.   :0.4427                                 
## 
# Horizontal boxplots of columns 2 to 10 on a 3 x 3 grid. par0 holds the
# previous graphics settings so they can be restored afterwards.
par0 <- par(mfrow = c(3, 3))
for (i in 2:10) {
  boxplot(bpr[, i], main = names(bpr)[i], horizontal = TRUE)
}
par(par0)

# Same grid and columns, now as histograms on the density scale.
par0 <- par(mfrow = c(3, 3))
for (i in 2:10) {
  hist(bpr[, i], main = names(bpr)[i], freq = FALSE)
}
par(par0)
# Non-performing loan: horizontal boxplot and a histogram with about 20
# breaks requested.
boxplot(bpr$Non.Performing.Loan, main = "Non Performing Loan", horizontal = TRUE)

hist(bpr$Non.Performing.Loan, main = "Non Performing Loan", breaks = 20)

# Normal Q-Q plot with its reference line.
qqnorm(bpr$Non.Performing.Loan)
qqline(bpr$Non.Performing.Loan)

4.1 Fungsi Winsorized

Fungsi Winsorized digunakan untuk mengganti nilai-nilai outlier: nilai di ekor kiri diganti dengan nilai “batas bawah” dan nilai di ekor kanan diganti dengan nilai “batas atas”.

# Winsorize a numeric vector.
#
# The floor(tr * n) smallest values are raised to the next order statistic
# (the lower cut-off) and the floor(tr * n) largest values are lowered to the
# mirror-image order statistic (the upper cut-off). Length and ordering of
# the input are preserved.
#
# Args:
#   x:  numeric vector. Missing values are ignored when the cut-offs are
#       located and come back as NA in their original positions.
#   tr: proportion winsorized in each tail; a single number, 0 <= tr < 0.5.
#
# Returns:
#   A vector as long as x with both tails pulled in to the cut-off values.
#   An input without any non-missing value is returned unchanged.
winval <- function(x, tr = 0.2) {
  stopifnot(is.numeric(tr), length(tr) == 1L, !is.na(tr), tr >= 0, tr < 0.5)

  # sort() drops NA, so the ranks must be based on the number of values that
  # were actually sorted; length(x) would shift the upper cut-off whenever x
  # contains missing values.
  y <- sort(x)
  n <- length(y)
  if (n == 0L) {
    return(x)
  }

  ibot <- floor(tr * n) + 1  # rank of the lower cut-off
  itop <- n - ibot + 1       # rank of the upper cut-off
  xbot <- y[ibot]
  xtop <- y[itop]

  # Clamp every value into [xbot, xtop]; NA entries of x stay NA.
  pmin(pmax(x, xbot), xtop)
}


# Winsorized mean: the arithmetic mean of x after winval() has replaced the
# values in both tails by the cut-off values (tr = share per tail).
winmean <- function(x, tr = 0.2) {
  mean(winval(x, tr))
}

# Winsorized variance: the sample variance of x after winval() has replaced
# the values in both tails by the cut-off values (tr = share per tail).
winvar <- function(x, tr = 0.2) {
  var(winval(x, tr))
}

# Standard error used with the winsorized mean:
#   (n - 1) * sqrt(winsorized variance) / ((h - 1) * sqrt(n)),
# where n is the length of x and h = n - 2 * floor(tr * n) is the number of
# observations left after setting aside floor(tr * n) in each tail.
winse <- function(x, tr = 0.2) {
  n <- length(x)
  h <- n - 2 * floor(tr * n)
  win_sd <- sqrt(winvar(x, tr = tr))
  ((n - 1) * win_sd) / ((h - 1) * sqrt(n))
}
# Location estimates for the non-performing-loan column.
x <- bpr[["Non.Performing.Loan"]]

x.bar <- mean(x)      # sample mean
x.med <- median(x)    # sample median

# Trimmed means; trimming 50% per tail leaves the median.
x.bar.tr.50 <- mean(x, trim = 0.5)
x.bar.tr.20 <- mean(x, trim = 0.2)
x.bar.tr.10 <- mean(x, trim = 0.1)

# Winsorized means, 20% and 10% per tail.
x.winmean <- winmean(x, tr = 0.2)
x.winmean.10 <- winmean(x, tr = 0.1)

cat("Mean   :", x.bar, "\n")
## Mean   : 14.17514
cat("Median :", x.med, "\n\n")
## Median : 9.568333
cat("Tr. mean (50%)  :", x.bar.tr.50, "\n")
## Tr. mean (50%)  : 9.568333
cat("Tr. mean (20%)  :", x.bar.tr.20, "\n")
## Tr. mean (20%)  : 10.71661
cat("Tr. mean (10%)  :", x.bar.tr.10, "\n")
## Tr. mean (10%)  : 11.88604
cat("win. mean (20%) :", x.winmean, "\n")
## win. mean (20%) : 11.86249
cat("win. mean (10%) :", x.winmean.10, "\n\n")
## win. mean (10%) : 13.00679
cat("Quatile :", quantile(x))
## Quatile : 0.3433333 4.973333 9.568333 19.4375 61.61
IQR(x)
## [1] 14.46417
mad(x)
## [1] 7.894845
winvar(x)            # winsorized variance .20
## [1] 52.02483
winvar(x, tr=0.1)    # winsorized variance .10
## [1] 91.66518

4.2 Selang Kepercayaan

# Standard error used with the trimmed mean: the winsorized standard
# deviation divided by (1 - 2 * tr) * sqrt(n), with n the length of x and
# tr the share trimmed from each tail.
trimse <- function(x, tr = 0.2) {
  win_sd <- sqrt(winvar(x, tr))
  kept_share <- 1 - 2 * tr
  win_sd / (kept_share * sqrt(length(x)))
}

# Standard error of the median, with the MAD as the scale estimate.
se.med <- sqrt((pi / 2) * (mad(x)^2 / length(x)))

# 95% CI for the mean: the t interval from t.test(); the confidence level is
# carried as an attribute of that interval.
ci.mean <- t.test(x)$conf.int
ci.level <- attr(ci.mean, "conf.level")

# 95% CI for the 20% trimmed mean (normal quantile, trimse() as s.e.).
ci.tr.mean <- mean(x, trim = 0.2) + c(-1, 1) * (qnorm(0.975) * trimse(x))

# 95% CI for the median.
ci.med <- median(x) + c(-1, 1) * (qnorm(0.975) * se.med)

# 95% CI for the 20% winsorized mean (normal quantile, winse() as s.e.).
ci.win.mean <- winmean(x) + c(-1, 1) * (qnorm(0.975) * winse(x))

cat("Selang Kepercayaan     : ", ci.level, "\n\n")
## Selang Kepercayaan     :  0.95
cat("Mean      : ", ci.mean, "\n")
## Mean      :  12.0626 16.28767
cat("Tr. Mean  : ", ci.tr.mean, "\n")
## Tr. Mean  :  8.773294 12.65993
cat("Median    : ", ci.med, "\n")
## Median    :  7.968801 11.16787
cat("Win. Mean : ", ci.win.mean, "\n utk 20% (default di fungsi kita)")
## Win. Mean :  9.928011 13.79698 
##  utk 20% (default di fungsi kita)
library(MASS)  # hubers() used below comes from this package
# M-estimator: hubers() returns a location estimate (mu) and a scale
# estimate (s) for the non-performing-loan column.
x <- bpr$Non.Performing.Loan

# Call with all defaults; the fits that follow vary k, mu and s.
hub.1 <- hubers(x)
hub.1$mu
## [1] 12.27906
hub.1$s
## [1] 9.795196
hub.2 <- hubers(x, k=1.345)
hub.2$mu
## [1] 11.98978
hub.2$s
## [1] 9.686614
hub.3 <- hubers(x,mu=median(x))
hub.3$mu
## [1] 9.568333
hub.3$s
## [1] 8.719008
hub.4 <- hubers(x,s=1)
hub.4$mu
## [1] 9.54636
hub.4$s
## [1] 1

5 Outlier: Data Handphone

data(phones)
str(phones)
## List of 2
##  $ year : num [1:24] 50 51 52 53 54 55 56 57 58 59 ...
##  $ calls: num [1:24] 4.4 4.7 4.7 5.9 6.6 7.3 8.1 8.8 10.6 12 ...
data.frame(phones)
##    year calls
## 1    50   4.4
## 2    51   4.7
## 3    52   4.7
## 4    53   5.9
## 5    54   6.6
## 6    55   7.3
## 7    56   8.1
## 8    57   8.8
## 9    58  10.6
## 10   59  12.0
## 11   60  13.5
## 12   61  14.9
## 13   62  16.1
## 14   63  21.2
## 15   64 119.0
## 16   65 124.0
## 17   66 142.0
## 18   67 159.0
## 19   68 182.0
## 20   69 212.0
## 21   70  43.0
## 22   71  24.0
## 23   72  27.0
## 24   73  29.0
# Scatter plot of calls against year. with() evaluates the call inside
# `phones`, so nothing has to be attached to the search path; the earlier
# attach() was never paired with a detach() and could mask other objects
# named `year` or `calls`.
with(phones, plot(year, calls))

# Same data drawn with larger, filled, dark-red points.
plot(phones$year, phones$calls, pch = 19, cex = 2, col = "darkred")

# Ordinary least squares (OLS) fit: calls is the response (y) and year the
# predictor (x).
fit.ols <- lm(calls ~ year, data = phones)
summary(fit.ols)
## 
## Call:
## lm(formula = calls ~ year, data = phones)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78.97 -33.52 -12.04  23.38 124.20 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -260.059    102.607  -2.535   0.0189 * 
## year           5.041      1.658   3.041   0.0060 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.22 on 22 degrees of freedom
## Multiple R-squared:  0.2959, Adjusted R-squared:  0.2639 
## F-statistic: 9.247 on 1 and 22 DF,  p-value: 0.005998
# Diagnostic panels 1, 2 and 4 of plot() for the OLS fit, drawn in a 1 x 4
# layout (per ?plot.lm these are residuals vs fitted, normal Q-Q and Cook's
# distance).
par(mfrow=c(1,4))
plot(fit.ols,1:2)
plot(fit.ols,4)
# NOTE(review): abline() draws on the panel produced by the previous call,
# not on a year-vs-calls scatter plot; confirm whether a scatter plot was
# meant to precede it.
abline(fit.ols$coef)

# Leverages of the OLS fit, one value per observation.
hmat.p <- hat(model.matrix(fit.ols))
# hat() already returns the leverages. Passing them through hat() a second
# time would give the leverages of a regression on the leverage values
# themselves, not those of the fitted model, so reuse them as they are.
h.phone <- hmat.p
cook.d <- cooks.distance(fit.ols)
# Cook's distance against the leverage ratio h / (1 - h).
plot(h.phone / (1 - h.phone), cook.d, xlab = "h/(1-h)", ylab = "Cook distance")

# Robust linear fit of calls on year with rlm(), iteration limit set to 50;
# the summary is requested without the coefficient correlations.
fit.hub <- rlm(calls ~ year, data = phones, maxit = 50)
summary(fit.hub, correlation = FALSE)
## 
## Call: rlm(formula = calls ~ year, data = phones, maxit = 50)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.314  -5.953  -1.681  26.460 173.769 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept) -102.6222   26.6082    -3.8568
## year           2.0414    0.4299     4.7480
## 
## Residual standard error: 9.032 on 22 degrees of freedom
# Diagnostic panels 1 and 2 of plot() for the robust fit, in a 1 x 3 layout.
par(mfrow=c(1,3))
plot(fit.hub,1)
plot(fit.hub,2)
# NOTE(review): abline() draws on the panel produced by the previous call,
# not on a year-vs-calls scatter plot; the third slot of the layout suggests
# a scatter plot may have been intended before this line. Confirm.
abline(fit.hub$coef)

# Second robust fit of calls on year, this time with the "proposal 2" scale
# estimate; the summary again omits the coefficient correlations.
fit.hub2 <- rlm(calls ~ year, data = phones, scale.est = "proposal 2")
summary(fit.hub2, correlation = FALSE)
## 
## Call: rlm(formula = calls ~ year, data = phones, scale.est = "proposal 2")
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68.15 -29.46 -11.52  22.74 132.67 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept) -227.9250  101.8740    -2.2373
## year           4.4530    1.6461     2.7052
## 
## Residual standard error: 57.25 on 22 degrees of freedom
# Resistant regression of calls on year with lqs(), method "lms"; print the fit.
fit.lms <- lqs(calls ~ year, data = phones, method = "lms")

fit.lms
## Call:
## lqs.formula(formula = calls ~ year, data = phones, method = "lms")
## 
## Coefficients:
## (Intercept)         year  
##     -55.947        1.155  
## 
## Scale estimates 0.9377 0.9095
# Resistant regression of calls on year with lqs(), method "lts"; print the fit.
fit.lts <- lqs(calls ~ year, data = phones, method = "lts")
fit.lts
## Call:
## lqs.formula(formula = calls ~ year, data = phones, method = "lts")
## 
## Coefficients:
## (Intercept)         year  
##     -56.162        1.159  
## 
## Scale estimates 1.249 1.131
# Side-by-side plots of the $res component against the $fit component for
# the LMS and LTS fits.
# NOTE(review): $fit and $res presumably resolve by partial matching of the
# component names; confirm against the lqs object. The expressions are kept
# verbatim because they also become the axis labels.
par(mfrow = c(1, 2))
plot(fit.lms$fit, fit.lms$res, main = "lms")
plot(fit.lts$fit, fit.lts$res, main = "lts")

# Back to a single-panel layout.
par(mfrow = c(1, 1))
data.frame(diabetes)
##     Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1             6     148            72            35       0 33.6
## 2             1      85            66            29       0 26.6
## 3             8     183            64             0       0 23.3
## 4             1      89            66            23      94 28.1
## 5             0     137            40            35     168 43.1
## 6             5     116            74             0       0 25.6
## 7             3      78            50            32      88 31.0
## 8            10     115             0             0       0 35.3
## 9             2     197            70            45     543 30.5
## 10            8     125            96             0       0  0.0
## 11            4     110            92             0       0 37.6
## 12           10     168            74             0       0 38.0
## 13           10     139            80             0       0 27.1
## 14            1     189            60            23     846 30.1
## 15            5     166            72            19     175 25.8
## 16            7     100             0             0       0 30.0
## 17            0     118            84            47     230 45.8
## 18            7     107            74             0       0 29.6
## 19            1     103            30            38      83 43.3
## 20            1     115            70            30      96 34.6
## 21            3     126            88            41     235 39.3
## 22            8      99            84             0       0 35.4
## 23            7     196            90             0       0 39.8
## 24            9     119            80            35       0 29.0
## 25           11     143            94            33     146 36.6
## 26           10     125            70            26     115 31.1
## 27            7     147            76             0       0 39.4
## 28            1      97            66            15     140 23.2
## 29           13     145            82            19     110 22.2
## 30            5     117            92             0       0 34.1
## 31            5     109            75            26       0 36.0
## 32            3     158            76            36     245 31.6
## 33            3      88            58            11      54 24.8
## 34            6      92            92             0       0 19.9
## 35           10     122            78            31       0 27.6
## 36            4     103            60            33     192 24.0
## 37           11     138            76             0       0 33.2
## 38            9     102            76            37       0 32.9
## 39            2      90            68            42       0 38.2
## 40            4     111            72            47     207 37.1
## 41            3     180            64            25      70 34.0
## 42            7     133            84             0       0 40.2
## 43            7     106            92            18       0 22.7
## 44            9     171           110            24     240 45.4
## 45            7     159            64             0       0 27.4
## 46            0     180            66            39       0 42.0
## 47            1     146            56             0       0 29.7
## 48            2      71            70            27       0 28.0
## 49            7     103            66            32       0 39.1
## 50            7     105             0             0       0  0.0
## 51            1     103            80            11      82 19.4
## 52            1     101            50            15      36 24.2
## 53            5      88            66            21      23 24.4
## 54            8     176            90            34     300 33.7
## 55            7     150            66            42     342 34.7
## 56            1      73            50            10       0 23.0
## 57            7     187            68            39     304 37.7
## 58            0     100            88            60     110 46.8
## 59            0     146            82             0       0 40.5
## 60            0     105            64            41     142 41.5
## 61            2      84             0             0       0  0.0
## 62            8     133            72             0       0 32.9
## 63            5      44            62             0       0 25.0
## 64            2     141            58            34     128 25.4
## 65            7     114            66             0       0 32.8
## 66            5      99            74            27       0 29.0
## 67            0     109            88            30       0 32.5
## 68            2     109            92             0       0 42.7
## 69            1      95            66            13      38 19.6
## 70            4     146            85            27     100 28.9
## 71            2     100            66            20      90 32.9
## 72            5     139            64            35     140 28.6
## 73           13     126            90             0       0 43.4
## 74            4     129            86            20     270 35.1
## 75            1      79            75            30       0 32.0
## 76            1       0            48            20       0 24.7
## 77            7      62            78             0       0 32.6
## 78            5      95            72            33       0 37.7
## 79            0     131             0             0       0 43.2
## 80            2     112            66            22       0 25.0
## 81            3     113            44            13       0 22.4
## 82            2      74             0             0       0  0.0
## 83            7      83            78            26      71 29.3
## 84            0     101            65            28       0 24.6
## 85            5     137           108             0       0 48.8
## 86            2     110            74            29     125 32.4
## 87           13     106            72            54       0 36.6
## 88            2     100            68            25      71 38.5
## 89           15     136            70            32     110 37.1
## 90            1     107            68            19       0 26.5
## 91            1      80            55             0       0 19.1
## 92            4     123            80            15     176 32.0
## 93            7      81            78            40      48 46.7
## 94            4     134            72             0       0 23.8
## 95            2     142            82            18      64 24.7
## 96            6     144            72            27     228 33.9
## 97            2      92            62            28       0 31.6
## 98            1      71            48            18      76 20.4
## 99            6      93            50            30      64 28.7
## 100           1     122            90            51     220 49.7
## 101           1     163            72             0       0 39.0
## 102           1     151            60             0       0 26.1
## 103           0     125            96             0       0 22.5
## 104           1      81            72            18      40 26.6
## 105           2      85            65             0       0 39.6
## 106           1     126            56            29     152 28.7
## 107           1      96           122             0       0 22.4
## 108           4     144            58            28     140 29.5
## 109           3      83            58            31      18 34.3
## 110           0      95            85            25      36 37.4
## 111           3     171            72            33     135 33.3
## 112           8     155            62            26     495 34.0
## 113           1      89            76            34      37 31.2
## 114           4      76            62             0       0 34.0
## 115           7     160            54            32     175 30.5
## 116           4     146            92             0       0 31.2
## 117           5     124            74             0       0 34.0
## 118           5      78            48             0       0 33.7
## 119           4      97            60            23       0 28.2
## 120           4      99            76            15      51 23.2
## 121           0     162            76            56     100 53.2
## 122           6     111            64            39       0 34.2
## 123           2     107            74            30     100 33.6
## 124           5     132            80             0       0 26.8
## 125           0     113            76             0       0 33.3
## 126           1      88            30            42      99 55.0
## 127           3     120            70            30     135 42.9
## 128           1     118            58            36      94 33.3
## 129           1     117            88            24     145 34.5
## 130           0     105            84             0       0 27.9
## 131           4     173            70            14     168 29.7
## 132           9     122            56             0       0 33.3
## 133           3     170            64            37     225 34.5
## 134           8      84            74            31       0 38.3
## 135           2      96            68            13      49 21.1
## 136           2     125            60            20     140 33.8
## 137           0     100            70            26      50 30.8
## 138           0      93            60            25      92 28.7
## 139           0     129            80             0       0 31.2
## 140           5     105            72            29     325 36.9
## 141           3     128            78             0       0 21.1
## 142           5     106            82            30       0 39.5
## 143           2     108            52            26      63 32.5
## 144          10     108            66             0       0 32.4
## 145           4     154            62            31     284 32.8
## 146           0     102            75            23       0  0.0
## 147           9      57            80            37       0 32.8
## 148           2     106            64            35     119 30.5
## 149           5     147            78             0       0 33.7
## 150           2      90            70            17       0 27.3
## 151           1     136            74            50     204 37.4
## 152           4     114            65             0       0 21.9
## 153           9     156            86            28     155 34.3
## 154           1     153            82            42     485 40.6
## 155           8     188            78             0       0 47.9
## 156           7     152            88            44       0 50.0
## 157           2      99            52            15      94 24.6
## 158           1     109            56            21     135 25.2
## 159           2      88            74            19      53 29.0
## 160          17     163            72            41     114 40.9
## 161           4     151            90            38       0 29.7
## 162           7     102            74            40     105 37.2
## 163           0     114            80            34     285 44.2
## 164           2     100            64            23       0 29.7
## 165           0     131            88             0       0 31.6
## 166           6     104            74            18     156 29.9
## 167           3     148            66            25       0 32.5
## 168           4     120            68             0       0 29.6
## 169           4     110            66             0       0 31.9
## 170           3     111            90            12      78 28.4
## 171           6     102            82             0       0 30.8
## 172           6     134            70            23     130 35.4
## 173           2      87             0            23       0 28.9
## 174           1      79            60            42      48 43.5
## 175           2      75            64            24      55 29.7
## 176           8     179            72            42     130 32.7
## 177           6      85            78             0       0 31.2
## 178           0     129           110            46     130 67.1
## 179           5     143            78             0       0 45.0
## 180           5     130            82             0       0 39.1
## 181           6      87            80             0       0 23.2
## 182           0     119            64            18      92 34.9
## 183           1       0            74            20      23 27.7
## 184           5      73            60             0       0 26.8
## 185           4     141            74             0       0 27.6
## 186           7     194            68            28       0 35.9
## 187           8     181            68            36     495 30.1
## 188           1     128            98            41      58 32.0
## 189           8     109            76            39     114 27.9
## 190           5     139            80            35     160 31.6
## 191           3     111            62             0       0 22.6
## 192           9     123            70            44      94 33.1
## 193           7     159            66             0       0 30.4
## 194          11     135             0             0       0 52.3
## 195           8      85            55            20       0 24.4
## 196           5     158            84            41     210 39.4
## 197           1     105            58             0       0 24.3
## 198           3     107            62            13      48 22.9
## 199           4     109            64            44      99 34.8
## 200           4     148            60            27     318 30.9
## 201           0     113            80            16       0 31.0
## 202           1     138            82             0       0 40.1
## 203           0     108            68            20       0 27.3
## 204           2      99            70            16      44 20.4
## 205           6     103            72            32     190 37.7
## 206           5     111            72            28       0 23.9
## 207           8     196            76            29     280 37.5
## 208           5     162           104             0       0 37.7
## 209           1      96            64            27      87 33.2
## 210           7     184            84            33       0 35.5
## 211           2      81            60            22       0 27.7
## 212           0     147            85            54       0 42.8
## 213           7     179            95            31       0 34.2
## 214           0     140            65            26     130 42.6
## 215           9     112            82            32     175 34.2
## 216          12     151            70            40     271 41.8
## 217           5     109            62            41     129 35.8
## 218           6     125            68            30     120 30.0
## 219           5      85            74            22       0 29.0
## 220           5     112            66             0       0 37.8
## 221           0     177            60            29     478 34.6
## 222           2     158            90             0       0 31.6
## 223           7     119             0             0       0 25.2
## 224           7     142            60            33     190 28.8
## 225           1     100            66            15      56 23.6
## 226           1      87            78            27      32 34.6
## 227           0     101            76             0       0 35.7
## 228           3     162            52            38       0 37.2
## 229           4     197            70            39     744 36.7
## 230           0     117            80            31      53 45.2
## 231           4     142            86             0       0 44.0
## 232           6     134            80            37     370 46.2
## 233           1      79            80            25      37 25.4
## 234           4     122            68             0       0 35.0
## 235           3      74            68            28      45 29.7
## 236           4     171            72             0       0 43.6
## 237           7     181            84            21     192 35.9
## 238           0     179            90            27       0 44.1
## 239           9     164            84            21       0 30.8
## 240           0     104            76             0       0 18.4
## 241           1      91            64            24       0 29.2
## 242           4      91            70            32      88 33.1
## 243           3     139            54             0       0 25.6
## 244           6     119            50            22     176 27.1
## 245           2     146            76            35     194 38.2
## 246           9     184            85            15       0 30.0
## 247          10     122            68             0       0 31.2
## 248           0     165            90            33     680 52.3
## 249           9     124            70            33     402 35.4
## 250           1     111            86            19       0 30.1
## 251           9     106            52             0       0 31.2
## 252           2     129            84             0       0 28.0
## 253           2      90            80            14      55 24.4
## 254           0      86            68            32       0 35.8
## 255          12      92            62             7     258 27.6
## 256           1     113            64            35       0 33.6
## 257           3     111            56            39       0 30.1
## 258           2     114            68            22       0 28.7
## 259           1     193            50            16     375 25.9
## 260          11     155            76            28     150 33.3
## 261           3     191            68            15     130 30.9
## 262           3     141             0             0       0 30.0
## 263           4      95            70            32       0 32.1
## 264           3     142            80            15       0 32.4
## 265           4     123            62             0       0 32.0
## 266           5      96            74            18      67 33.6
## 267           0     138             0             0       0 36.3
## 268           2     128            64            42       0 40.0
## 269           0     102            52             0       0 25.1
## 270           2     146             0             0       0 27.5
## 271          10     101            86            37       0 45.6
## 272           2     108            62            32      56 25.2
## 273           3     122            78             0       0 23.0
## 274           1      71            78            50      45 33.2
## 275          13     106            70             0       0 34.2
## 276           2     100            70            52      57 40.5
## 277           7     106            60            24       0 26.5
## 278           0     104            64            23     116 27.8
## 279           5     114            74             0       0 24.9
## 280           2     108            62            10     278 25.3
## 281           0     146            70             0       0 37.9
## 282          10     129            76            28     122 35.9
## 283           7     133            88            15     155 32.4
## 284           7     161            86             0       0 30.4
## 285           2     108            80             0       0 27.0
## 286           7     136            74            26     135 26.0
## 287           5     155            84            44     545 38.7
## 288           1     119            86            39     220 45.6
## 289           4      96            56            17      49 20.8
## 290           5     108            72            43      75 36.1
## 291           0      78            88            29      40 36.9
## 292           0     107            62            30      74 36.6
## 293           2     128            78            37     182 43.3
## 294           1     128            48            45     194 40.5
## 295           0     161            50             0       0 21.9
## 296           6     151            62            31     120 35.5
## 297           2     146            70            38     360 28.0
## 298           0     126            84            29     215 30.7
## 299          14     100            78            25     184 36.6
## 300           8     112            72             0       0 23.6
## 301           0     167             0             0       0 32.3
## 302           2     144            58            33     135 31.6
## 303           5      77            82            41      42 35.8
## 304           5     115            98             0       0 52.9
## 305           3     150            76             0       0 21.0
## 306           2     120            76            37     105 39.7
## 307          10     161            68            23     132 25.5
## 308           0     137            68            14     148 24.8
## 309           0     128            68            19     180 30.5
## 310           2     124            68            28     205 32.9
## 311           6      80            66            30       0 26.2
## 312           0     106            70            37     148 39.4
## 313           2     155            74            17      96 26.6
## 314           3     113            50            10      85 29.5
## 315           7     109            80            31       0 35.9
## 316           2     112            68            22      94 34.1
## 317           3      99            80            11      64 19.3
## 318           3     182            74             0       0 30.5
## 319           3     115            66            39     140 38.1
## 320           6     194            78             0       0 23.5
## 321           4     129            60            12     231 27.5
## 322           3     112            74            30       0 31.6
## 323           0     124            70            20       0 27.4
## 324          13     152            90            33      29 26.8
## 325           2     112            75            32       0 35.7
## 326           1     157            72            21     168 25.6
## 327           1     122            64            32     156 35.1
## 328          10     179            70             0       0 35.1
## 329           2     102            86            36     120 45.5
## 330           6     105            70            32      68 30.8
## 331           8     118            72            19       0 23.1
## 332           2      87            58            16      52 32.7
## 333           1     180             0             0       0 43.3
## 334          12     106            80             0       0 23.6
## 335           1      95            60            18      58 23.9
## 336           0     165            76            43     255 47.9
## 337           0     117             0             0       0 33.8
## 338           5     115            76             0       0 31.2
## 339           9     152            78            34     171 34.2
## 340           7     178            84             0       0 39.9
## 341           1     130            70            13     105 25.9
## 342           1      95            74            21      73 25.9
## 343           1       0            68            35       0 32.0
## 344           5     122            86             0       0 34.7
## 345           8      95            72             0       0 36.8
## 346           8     126            88            36     108 38.5
## 347           1     139            46            19      83 28.7
## 348           3     116             0             0       0 23.5
## 349           3      99            62            19      74 21.8
## 350           5       0            80            32       0 41.0
## 351           4      92            80             0       0 42.2
## 352           4     137            84             0       0 31.2
## 353           3      61            82            28       0 34.4
## 354           1      90            62            12      43 27.2
## 355           3      90            78             0       0 42.7
## 356           9     165            88             0       0 30.4
## 357           1     125            50            40     167 33.3
## 358          13     129             0            30       0 39.9
## 359          12      88            74            40      54 35.3
## 360           1     196            76            36     249 36.5
## 361           5     189            64            33     325 31.2
## 362           5     158            70             0       0 29.8
## 363           5     103           108            37       0 39.2
## 364           4     146            78             0       0 38.5
## 365           4     147            74            25     293 34.9
## 366           5      99            54            28      83 34.0
## 367           6     124            72             0       0 27.6
## 368           0     101            64            17       0 21.0
## 369           3      81            86            16      66 27.5
## 370           1     133           102            28     140 32.8
## 371           3     173            82            48     465 38.4
## 372           0     118            64            23      89  0.0
## 373           0      84            64            22      66 35.8
## 374           2     105            58            40      94 34.9
## 375           2     122            52            43     158 36.2
## 376          12     140            82            43     325 39.2
## 377           0      98            82            15      84 25.2
## 378           1      87            60            37      75 37.2
## 379           4     156            75             0       0 48.3
## 380           0      93           100            39      72 43.4
## 381           1     107            72            30      82 30.8
## 382           0     105            68            22       0 20.0
## 383           1     109            60             8     182 25.4
## 384           1      90            62            18      59 25.1
## 385           1     125            70            24     110 24.3
## 386           1     119            54            13      50 22.3
## 387           5     116            74            29       0 32.3
## 388           8     105           100            36       0 43.3
## 389           5     144            82            26     285 32.0
## 390           3     100            68            23      81 31.6
## 391           1     100            66            29     196 32.0
## 392           5     166            76             0       0 45.7
## 393           1     131            64            14     415 23.7
## 394           4     116            72            12      87 22.1
## 395           4     158            78             0       0 32.9
## 396           2     127            58            24     275 27.7
## 397           3      96            56            34     115 24.7
## 398           0     131            66            40       0 34.3
## 399           3      82            70             0       0 21.1
## 400           3     193            70            31       0 34.9
## 401           4      95            64             0       0 32.0
## 402           6     137            61             0       0 24.2
## 403           5     136            84            41      88 35.0
## 404           9      72            78            25       0 31.6
## 405           5     168            64             0       0 32.9
## 406           2     123            48            32     165 42.1
## 407           4     115            72             0       0 28.9
## 408           0     101            62             0       0 21.9
## 409           8     197            74             0       0 25.9
## 410           1     172            68            49     579 42.4
## 411           6     102            90            39       0 35.7
## 412           1     112            72            30     176 34.4
## 413           1     143            84            23     310 42.4
## 414           1     143            74            22      61 26.2
## 415           0     138            60            35     167 34.6
## 416           3     173            84            33     474 35.7
## 417           1      97            68            21       0 27.2
## 418           4     144            82            32       0 38.5
## 419           1      83            68             0       0 18.2
## 420           3     129            64            29     115 26.4
## 421           1     119            88            41     170 45.3
## 422           2      94            68            18      76 26.0
## 423           0     102            64            46      78 40.6
## 424           2     115            64            22       0 30.8
## 425           8     151            78            32     210 42.9
## 426           4     184            78            39     277 37.0
## 427           0      94             0             0       0  0.0
## 428           1     181            64            30     180 34.1
## 429           0     135            94            46     145 40.6
## 430           1      95            82            25     180 35.0
## 431           2      99             0             0       0 22.2
## 432           3      89            74            16      85 30.4
## 433           1      80            74            11      60 30.0
## 434           2     139            75             0       0 25.6
## 435           1      90            68             8       0 24.5
## 436           0     141             0             0       0 42.4
## 437          12     140            85            33       0 37.4
## 438           5     147            75             0       0 29.9
## 439           1      97            70            15       0 18.2
## 440           6     107            88             0       0 36.8
## 441           0     189           104            25       0 34.3
## 442           2      83            66            23      50 32.2
## 443           4     117            64            27     120 33.2
## 444           8     108            70             0       0 30.5
## 445           4     117            62            12       0 29.7
## 446           0     180            78            63      14 59.4
## 447           1     100            72            12      70 25.3
## 448           0      95            80            45      92 36.5
## 449           0     104            64            37      64 33.6
## 450           0     120            74            18      63 30.5
## 451           1      82            64            13      95 21.2
## 452           2     134            70             0       0 28.9
## 453           0      91            68            32     210 39.9
## 454           2     119             0             0       0 19.6
## 455           2     100            54            28     105 37.8
## 456          14     175            62            30       0 33.6
## 457           1     135            54             0       0 26.7
## 458           5      86            68            28      71 30.2
## 459          10     148            84            48     237 37.6
## 460           9     134            74            33      60 25.9
## 461           9     120            72            22      56 20.8
## 462           1      71            62             0       0 21.8
## 463           8      74            70            40      49 35.3
## 464           5      88            78            30       0 27.6
## 465          10     115            98             0       0 24.0
## 466           0     124            56            13     105 21.8
## 467           0      74            52            10      36 27.8
## 468           0      97            64            36     100 36.8
## 469           8     120             0             0       0 30.0
## 470           6     154            78            41     140 46.1
## 471           1     144            82            40       0 41.3
## 472           0     137            70            38       0 33.2
## 473           0     119            66            27       0 38.8
## 474           7     136            90             0       0 29.9
## 475           4     114            64             0       0 28.9
## 476           0     137            84            27       0 27.3
## 477           2     105            80            45     191 33.7
## 478           7     114            76            17     110 23.8
## 479           8     126            74            38      75 25.9
## 480           4     132            86            31       0 28.0
## 481           3     158            70            30     328 35.5
## 482           0     123            88            37       0 35.2
## 483           4      85            58            22      49 27.8
## 484           0      84            82            31     125 38.2
## 485           0     145             0             0       0 44.2
## 486           0     135            68            42     250 42.3
## 487           1     139            62            41     480 40.7
## 488           0     173            78            32     265 46.5
## 489           4      99            72            17       0 25.6
## 490           8     194            80             0       0 26.1
## 491           2      83            65            28      66 36.8
## 492           2      89            90            30       0 33.5
## 493           4      99            68            38       0 32.8
## 494           4     125            70            18     122 28.9
## 495           3      80             0             0       0  0.0
## 496           6     166            74             0       0 26.6
## 497           5     110            68             0       0 26.0
## 498           2      81            72            15      76 30.1
## 499           7     195            70            33     145 25.1
## 500           6     154            74            32     193 29.3
## 501           2     117            90            19      71 25.2
## 502           3      84            72            32       0 37.2
## 503           6       0            68            41       0 39.0
## 504           7      94            64            25      79 33.3
## 505           3      96            78            39       0 37.3
## 506          10      75            82             0       0 33.3
## 507           0     180            90            26      90 36.5
## 508           1     130            60            23     170 28.6
## 509           2      84            50            23      76 30.4
## 510           8     120            78             0       0 25.0
## 511          12      84            72            31       0 29.7
## 512           0     139            62            17     210 22.1
## 513           9      91            68             0       0 24.2
## 514           2      91            62             0       0 27.3
## 515           3      99            54            19      86 25.6
## 516           3     163            70            18     105 31.6
## 517           9     145            88            34     165 30.3
## 518           7     125            86             0       0 37.6
## 519          13      76            60             0       0 32.8
## 520           6     129            90             7     326 19.6
## 521           2      68            70            32      66 25.0
## 522           3     124            80            33     130 33.2
## 523           6     114             0             0       0  0.0
## 524           9     130            70             0       0 34.2
## 525           3     125            58             0       0 31.6
## 526           3      87            60            18       0 21.8
## 527           1      97            64            19      82 18.2
## 528           3     116            74            15     105 26.3
## 529           0     117            66            31     188 30.8
## 530           0     111            65             0       0 24.6
## 531           2     122            60            18     106 29.8
## 532           0     107            76             0       0 45.3
## 533           1      86            66            52      65 41.3
## 534           6      91             0             0       0 29.8
## 535           1      77            56            30      56 33.3
## 536           4     132             0             0       0 32.9
## 537           0     105            90             0       0 29.6
## 538           0      57            60             0       0 21.7
## 539           0     127            80            37     210 36.3
## 540           3     129            92            49     155 36.4
## 541           8     100            74            40     215 39.4
## 542           3     128            72            25     190 32.4
## 543          10      90            85            32       0 34.9
## 544           4      84            90            23      56 39.5
## 545           1      88            78            29      76 32.0
## 546           8     186            90            35     225 34.5
## 547           5     187            76            27     207 43.6
## 548           4     131            68            21     166 33.1
## 549           1     164            82            43      67 32.8
## 550           4     189           110            31       0 28.5
## 551           1     116            70            28       0 27.4
## 552           3      84            68            30     106 31.9
## 553           6     114            88             0       0 27.8
## 554           1      88            62            24      44 29.9
## 555           1      84            64            23     115 36.9
## 556           7     124            70            33     215 25.5
## 557           1      97            70            40       0 38.1
## 558           8     110            76             0       0 27.8
## 559          11     103            68            40       0 46.2
## 560          11      85            74             0       0 30.1
## 561           6     125            76             0       0 33.8
## 562           0     198            66            32     274 41.3
## 563           1      87            68            34      77 37.6
## 564           6      99            60            19      54 26.9
## 565           0      91            80             0       0 32.4
## 566           2      95            54            14      88 26.1
## 567           1      99            72            30      18 38.6
## 568           6      92            62            32     126 32.0
## 569           4     154            72            29     126 31.3
## 570           0     121            66            30     165 34.3
## 571           3      78            70             0       0 32.5
## 572           2     130            96             0       0 22.6
## 573           3     111            58            31      44 29.5
## 574           2      98            60            17     120 34.7
## 575           1     143            86            30     330 30.1
## 576           1     119            44            47      63 35.5
## 577           6     108            44            20     130 24.0
## 578           2     118            80             0       0 42.9
## 579          10     133            68             0       0 27.0
## 580           2     197            70            99       0 34.7
## 581           0     151            90            46       0 42.1
## 582           6     109            60            27       0 25.0
## 583          12     121            78            17       0 26.5
## 584           8     100            76             0       0 38.7
## 585           8     124            76            24     600 28.7
## 586           1      93            56            11       0 22.5
## 587           8     143            66             0       0 34.9
## 588           6     103            66             0       0 24.3
## 589           3     176            86            27     156 33.3
## 590           0      73             0             0       0 21.1
## 591          11     111            84            40       0 46.8
## 592           2     112            78            50     140 39.4
## 593           3     132            80             0       0 34.4
## 594           2      82            52            22     115 28.5
## 595           6     123            72            45     230 33.6
## 596           0     188            82            14     185 32.0
## 597           0      67            76             0       0 45.3
## 598           1      89            24            19      25 27.8
## 599           1     173            74             0       0 36.8
## 600           1     109            38            18     120 23.1
## 601           1     108            88            19       0 27.1
## 602           6      96             0             0       0 23.7
## 603           1     124            74            36       0 27.8
## 604           7     150            78            29     126 35.2
## 605           4     183             0             0       0 28.4
## 606           1     124            60            32       0 35.8
## 607           1     181            78            42     293 40.0
## 608           1      92            62            25      41 19.5
## 609           0     152            82            39     272 41.5
## 610           1     111            62            13     182 24.0
## 611           3     106            54            21     158 30.9
## 612           3     174            58            22     194 32.9
## 613           7     168            88            42     321 38.2
## 614           6     105            80            28       0 32.5
## 615          11     138            74            26     144 36.1
## 616           3     106            72             0       0 25.8
## 617           6     117            96             0       0 28.7
## 618           2      68            62            13      15 20.1
## 619           9     112            82            24       0 28.2
## 620           0     119             0             0       0 32.4
## 621           2     112            86            42     160 38.4
## 622           2      92            76            20       0 24.2
## 623           6     183            94             0       0 40.8
## 624           0      94            70            27     115 43.5
## 625           2     108            64             0       0 30.8
## 626           4      90            88            47      54 37.7
## 627           0     125            68             0       0 24.7
## 628           0     132            78             0       0 32.4
## 629           5     128            80             0       0 34.6
## 630           4      94            65            22       0 24.7
## 631           7     114            64             0       0 27.4
## 632           0     102            78            40      90 34.5
## 633           2     111            60             0       0 26.2
## 634           1     128            82            17     183 27.5
## 635          10      92            62             0       0 25.9
## 636          13     104            72             0       0 31.2
## 637           5     104            74             0       0 28.8
## 638           2      94            76            18      66 31.6
## 639           7      97            76            32      91 40.9
## 640           1     100            74            12      46 19.5
## 641           0     102            86            17     105 29.3
## 642           4     128            70             0       0 34.3
## 643           6     147            80             0       0 29.5
## 644           4      90             0             0       0 28.0
## 645           3     103            72            30     152 27.6
## 646           2     157            74            35     440 39.4
## 647           1     167            74            17     144 23.4
## 648           0     179            50            36     159 37.8
## 649          11     136            84            35     130 28.3
## 650           0     107            60            25       0 26.4
## 651           1      91            54            25     100 25.2
## 652           1     117            60            23     106 33.8
## 653           5     123            74            40      77 34.1
## 654           2     120            54             0       0 26.8
## 655           1     106            70            28     135 34.2
## 656           2     155            52            27     540 38.7
## 657           2     101            58            35      90 21.8
## 658           1     120            80            48     200 38.9
## 659          11     127           106             0       0 39.0
## 660           3      80            82            31      70 34.2
## 661          10     162            84             0       0 27.7
## 662           1     199            76            43       0 42.9
## 663           8     167           106            46     231 37.6
## 664           9     145            80            46     130 37.9
## 665           6     115            60            39       0 33.7
## 666           1     112            80            45     132 34.8
## 667           4     145            82            18       0 32.5
## 668          10     111            70            27       0 27.5
## 669           6      98            58            33     190 34.0
## 670           9     154            78            30     100 30.9
## 671           6     165            68            26     168 33.6
## 672           1      99            58            10       0 25.4
## 673          10      68           106            23      49 35.5
## 674           3     123           100            35     240 57.3
## 675           8      91            82             0       0 35.6
## 676           6     195            70             0       0 30.9
## 677           9     156            86             0       0 24.8
## 678           0      93            60             0       0 35.3
## 679           3     121            52             0       0 36.0
## 680           2     101            58            17     265 24.2
## 681           2      56            56            28      45 24.2
## 682           0     162            76            36       0 49.6
## 683           0      95            64            39     105 44.6
## 684           4     125            80             0       0 32.3
## 685           5     136            82             0       0  0.0
## 686           2     129            74            26     205 33.2
## 687           3     130            64             0       0 23.1
## 688           1     107            50            19       0 28.3
## 689           1     140            74            26     180 24.1
## 690           1     144            82            46     180 46.1
## 691           8     107            80             0       0 24.6
## 692          13     158           114             0       0 42.3
## 693           2     121            70            32      95 39.1
## 694           7     129            68            49     125 38.5
## 695           2      90            60             0       0 23.5
## 696           7     142            90            24     480 30.4
## 697           3     169            74            19     125 29.9
## 698           0      99             0             0       0 25.0
## 699           4     127            88            11     155 34.5
## 700           4     118            70             0       0 44.5
## 701           2     122            76            27     200 35.9
## 702           6     125            78            31       0 27.6
## 703           1     168            88            29       0 35.0
## 704           2     129             0             0       0 38.5
## 705           4     110            76            20     100 28.4
## 706           6      80            80            36       0 39.8
## 707          10     115             0             0       0  0.0
## 708           2     127            46            21     335 34.4
## 709           9     164            78             0       0 32.8
## 710           2      93            64            32     160 38.0
## 711           3     158            64            13     387 31.2
## 712           5     126            78            27      22 29.6
## 713          10     129            62            36       0 41.2
## 714           0     134            58            20     291 26.4
## 715           3     102            74             0       0 29.5
## 716           7     187            50            33     392 33.9
## 717           3     173            78            39     185 33.8
## 718          10      94            72            18       0 23.1
## 719           1     108            60            46     178 35.5
## 720           5      97            76            27       0 35.6
## 721           4      83            86            19       0 29.3
## 722           1     114            66            36     200 38.1
## 723           1     149            68            29     127 29.3
## 724           5     117            86            30     105 39.1
## 725           1     111            94             0       0 32.8
## 726           4     112            78            40       0 39.4
## 727           1     116            78            29     180 36.1
## 728           0     141            84            26       0 32.4
## 729           2     175            88             0       0 22.9
## 730           2      92            52             0       0 30.1
## 731           3     130            78            23      79 28.4
## 732           8     120            86             0       0 28.4
## 733           2     174            88            37     120 44.5
## 734           2     106            56            27     165 29.0
## 735           2     105            75             0       0 23.3
## 736           4      95            60            32       0 35.4
## 737           0     126            86            27     120 27.4
## 738           8      65            72            23       0 32.0
## 739           2      99            60            17     160 36.6
## 740           1     102            74             0       0 39.5
## 741          11     120            80            37     150 42.3
## 742           3     102            44            20      94 30.8
## 743           1     109            58            18     116 28.5
## 744           9     140            94             0       0 32.7
## 745          13     153            88            37     140 40.6
## 746          12     100            84            33     105 30.0
## 747           1     147            94            41       0 49.3
## 748           1      81            74            41      57 46.3
## 749           3     187            70            22     200 36.4
## 750           6     162            62             0       0 24.3
## 751           4     136            70             0       0 31.2
## 752           1     121            78            39      74 39.0
## 753           3     108            62            24       0 26.0
## 754           0     181            88            44     510 43.3
## 755           8     154            78            32       0 32.4
## 756           1     128            88            39     110 36.5
## 757           7     137            90            41       0 32.0
## 758           0     123            72             0       0 36.3
## 759           1     106            76             0       0 37.5
## 760           6     190            92             0       0 35.5
## 761           2      88            58            26      16 28.4
## 762           9     170            74            31       0 44.0
## 763           9      89            62             0       0 22.5
## 764          10     101            76            48     180 32.9
## 765           2     122            70            27       0 36.8
## 766           5     121            72            23     112 26.2
## 767           1     126            60             0       0 30.1
## 768           1      93            70            31       0 30.4
##     DiabetesPedigreeFunction Age Outcome
## 1                      0.627  50       1
## 2                      0.351  31       0
## 3                      0.672  32       1
## 4                      0.167  21       0
## 5                      2.288  33       1
## 6                      0.201  30       0
## 7                      0.248  26       1
## 8                      0.134  29       0
## 9                      0.158  53       1
## 10                     0.232  54       1
## 11                     0.191  30       0
## 12                     0.537  34       1
## 13                     1.441  57       0
## 14                     0.398  59       1
## 15                     0.587  51       1
## 16                     0.484  32       1
## 17                     0.551  31       1
## 18                     0.254  31       1
## 19                     0.183  33       0
## 20                     0.529  32       1
## 21                     0.704  27       0
## 22                     0.388  50       0
## 23                     0.451  41       1
## 24                     0.263  29       1
## 25                     0.254  51       1
## 26                     0.205  41       1
## 27                     0.257  43       1
## 28                     0.487  22       0
## 29                     0.245  57       0
## 30                     0.337  38       0
## 31                     0.546  60       0
## 32                     0.851  28       1
## 33                     0.267  22       0
## 34                     0.188  28       0
## 35                     0.512  45       0
## 36                     0.966  33       0
## 37                     0.420  35       0
## 38                     0.665  46       1
## 39                     0.503  27       1
## 40                     1.390  56       1
## 41                     0.271  26       0
## 42                     0.696  37       0
## 43                     0.235  48       0
## 44                     0.721  54       1
## 45                     0.294  40       0
## 46                     1.893  25       1
## 47                     0.564  29       0
## 48                     0.586  22       0
## 49                     0.344  31       1
## 50                     0.305  24       0
## 51                     0.491  22       0
## 52                     0.526  26       0
## 53                     0.342  30       0
## 54                     0.467  58       1
## 55                     0.718  42       0
## 56                     0.248  21       0
## 57                     0.254  41       1
## 58                     0.962  31       0
## 59                     1.781  44       0
## 60                     0.173  22       0
## 61                     0.304  21       0
## 62                     0.270  39       1
## 63                     0.587  36       0
## 64                     0.699  24       0
## 65                     0.258  42       1
## 66                     0.203  32       0
## 67                     0.855  38       1
## 68                     0.845  54       0
## 69                     0.334  25       0
## 70                     0.189  27       0
## 71                     0.867  28       1
## 72                     0.411  26       0
## 73                     0.583  42       1
## 74                     0.231  23       0
## 75                     0.396  22       0
## 76                     0.140  22       0
## 77                     0.391  41       0
## 78                     0.370  27       0
## 79                     0.270  26       1
## 80                     0.307  24       0
## 81                     0.140  22       0
## 82                     0.102  22       0
## 83                     0.767  36       0
## 84                     0.237  22       0
## 85                     0.227  37       1
## 86                     0.698  27       0
## 87                     0.178  45       0
## 88                     0.324  26       0
## 89                     0.153  43       1
## 90                     0.165  24       0
## 91                     0.258  21       0
## 92                     0.443  34       0
## 93                     0.261  42       0
## 94                     0.277  60       1
## 95                     0.761  21       0
## 96                     0.255  40       0
## 97                     0.130  24       0
## 98                     0.323  22       0
## 99                     0.356  23       0
## 100                    0.325  31       1
## 101                    1.222  33       1
## 102                    0.179  22       0
## 103                    0.262  21       0
## 104                    0.283  24       0
## 105                    0.930  27       0
## 106                    0.801  21       0
## 107                    0.207  27       0
## 108                    0.287  37       0
## 109                    0.336  25       0
## 110                    0.247  24       1
## 111                    0.199  24       1
## 112                    0.543  46       1
## 113                    0.192  23       0
## 114                    0.391  25       0
## 115                    0.588  39       1
## 116                    0.539  61       1
## 117                    0.220  38       1
## 118                    0.654  25       0
## 119                    0.443  22       0
## 120                    0.223  21       0
## 121                    0.759  25       1
## 122                    0.260  24       0
## 123                    0.404  23       0
## 124                    0.186  69       0
## 125                    0.278  23       1
## 126                    0.496  26       1
## 127                    0.452  30       0
## 128                    0.261  23       0
## 129                    0.403  40       1
## 130                    0.741  62       1
## 131                    0.361  33       1
## 132                    1.114  33       1
## 133                    0.356  30       1
## 134                    0.457  39       0
## 135                    0.647  26       0
## 136                    0.088  31       0
## 137                    0.597  21       0
## 138                    0.532  22       0
## 139                    0.703  29       0
## 140                    0.159  28       0
## 141                    0.268  55       0
## 142                    0.286  38       0
## 143                    0.318  22       0
## 144                    0.272  42       1
## 145                    0.237  23       0
## 146                    0.572  21       0
## 147                    0.096  41       0
## 148                    1.400  34       0
## 149                    0.218  65       0
## 150                    0.085  22       0
## 151                    0.399  24       0
## 152                    0.432  37       0
## 153                    1.189  42       1
## 154                    0.687  23       0
## 155                    0.137  43       1
## 156                    0.337  36       1
## 157                    0.637  21       0
## 158                    0.833  23       0
## 159                    0.229  22       0
## 160                    0.817  47       1
## 161                    0.294  36       0
## 162                    0.204  45       0
## 163                    0.167  27       0
## 164                    0.368  21       0
## 165                    0.743  32       1
## 166                    0.722  41       1
## 167                    0.256  22       0
## 168                    0.709  34       0
## 169                    0.471  29       0
## 170                    0.495  29       0
## 171                    0.180  36       1
## 172                    0.542  29       1
## 173                    0.773  25       0
## 174                    0.678  23       0
## 175                    0.370  33       0
## 176                    0.719  36       1
## 177                    0.382  42       0
## 178                    0.319  26       1
## 179                    0.190  47       0
## 180                    0.956  37       1
## 181                    0.084  32       0
## 182                    0.725  23       0
## 183                    0.299  21       0
## 184                    0.268  27       0
## 185                    0.244  40       0
## 186                    0.745  41       1
## 187                    0.615  60       1
## 188                    1.321  33       1
## 189                    0.640  31       1
## 190                    0.361  25       1
## 191                    0.142  21       0
## 192                    0.374  40       0
## 193                    0.383  36       1
## 194                    0.578  40       1
## 195                    0.136  42       0
## 196                    0.395  29       1
## 197                    0.187  21       0
## 198                    0.678  23       1
## 199                    0.905  26       1
## 200                    0.150  29       1
## 201                    0.874  21       0
## 202                    0.236  28       0
## 203                    0.787  32       0
## 204                    0.235  27       0
## 205                    0.324  55       0
## 206                    0.407  27       0
## 207                    0.605  57       1
## 208                    0.151  52       1
## 209                    0.289  21       0
## 210                    0.355  41       1
## 211                    0.290  25       0
## 212                    0.375  24       0
## 213                    0.164  60       0
## 214                    0.431  24       1
## 215                    0.260  36       1
## 216                    0.742  38       1
## 217                    0.514  25       1
## 218                    0.464  32       0
## 219                    1.224  32       1
## 220                    0.261  41       1
## 221                    1.072  21       1
## 222                    0.805  66       1
## 223                    0.209  37       0
## 224                    0.687  61       0
## 225                    0.666  26       0
## 226                    0.101  22       0
## 227                    0.198  26       0
## 228                    0.652  24       1
## 229                    2.329  31       0
## 230                    0.089  24       0
## 231                    0.645  22       1
## 232                    0.238  46       1
## 233                    0.583  22       0
## 234                    0.394  29       0
## 235                    0.293  23       0
## 236                    0.479  26       1
## 237                    0.586  51       1
## 238                    0.686  23       1
## 239                    0.831  32       1
## 240                    0.582  27       0
## 241                    0.192  21       0
## 242                    0.446  22       0
## 243                    0.402  22       1
## 244                    1.318  33       1
## 245                    0.329  29       0
## 246                    1.213  49       1
## 247                    0.258  41       0
## 248                    0.427  23       0
## 249                    0.282  34       0
## 250                    0.143  23       0
## 251                    0.380  42       0
## 252                    0.284  27       0
## 253                    0.249  24       0
## 254                    0.238  25       0
## 255                    0.926  44       1
## 256                    0.543  21       1
## 257                    0.557  30       0
## 258                    0.092  25       0
## 259                    0.655  24       0
## 260                    1.353  51       1
## 261                    0.299  34       0
## 262                    0.761  27       1
## 263                    0.612  24       0
## 264                    0.200  63       0
## 265                    0.226  35       1
## 266                    0.997  43       0
## 267                    0.933  25       1
## 268                    1.101  24       0
## 269                    0.078  21       0
## 270                    0.240  28       1
## 271                    1.136  38       1
## 272                    0.128  21       0
## 273                    0.254  40       0
## 274                    0.422  21       0
## 275                    0.251  52       0
## 276                    0.677  25       0
## 277                    0.296  29       1
## 278                    0.454  23       0
## 279                    0.744  57       0
## 280                    0.881  22       0
## 281                    0.334  28       1
## 282                    0.280  39       0
## 283                    0.262  37       0
## 284                    0.165  47       1
## 285                    0.259  52       1
## 286                    0.647  51       0
## 287                    0.619  34       0
## 288                    0.808  29       1
## 289                    0.340  26       0
## 290                    0.263  33       0
## 291                    0.434  21       0
## 292                    0.757  25       1
## 293                    1.224  31       1
## 294                    0.613  24       1
## 295                    0.254  65       0
## 296                    0.692  28       0
## 297                    0.337  29       1
## 298                    0.520  24       0
## 299                    0.412  46       1
## 300                    0.840  58       0
## 301                    0.839  30       1
## 302                    0.422  25       1
## 303                    0.156  35       0
## 304                    0.209  28       1
## 305                    0.207  37       0
## 306                    0.215  29       0
## 307                    0.326  47       1
## 308                    0.143  21       0
## 309                    1.391  25       1
## 310                    0.875  30       1
## 311                    0.313  41       0
## 312                    0.605  22       0
## 313                    0.433  27       1
## 314                    0.626  25       0
## 315                    1.127  43       1
## 316                    0.315  26       0
## 317                    0.284  30       0
## 318                    0.345  29       1
## 319                    0.150  28       0
## 320                    0.129  59       1
## 321                    0.527  31       0
## 322                    0.197  25       1
## 323                    0.254  36       1
## 324                    0.731  43       1
## 325                    0.148  21       0
## 326                    0.123  24       0
## 327                    0.692  30       1
## 328                    0.200  37       0
## 329                    0.127  23       1
## 330                    0.122  37       0
## 331                    1.476  46       0
## 332                    0.166  25       0
## 333                    0.282  41       1
## 334                    0.137  44       0
## 335                    0.260  22       0
## 336                    0.259  26       0
## 337                    0.932  44       0
## 338                    0.343  44       1
## 339                    0.893  33       1
## 340                    0.331  41       1
## 341                    0.472  22       0
## 342                    0.673  36       0
## 343                    0.389  22       0
## 344                    0.290  33       0
## 345                    0.485  57       0
## 346                    0.349  49       0
## 347                    0.654  22       0
## 348                    0.187  23       0
## 349                    0.279  26       0
## 350                    0.346  37       1
## 351                    0.237  29       0
## 352                    0.252  30       0
## 353                    0.243  46       0
## 354                    0.580  24       0
## 355                    0.559  21       0
## 356                    0.302  49       1
## 357                    0.962  28       1
## 358                    0.569  44       1
## 359                    0.378  48       0
## 360                    0.875  29       1
## 361                    0.583  29       1
## 362                    0.207  63       0
## 363                    0.305  65       0
## 364                    0.520  67       1
## 365                    0.385  30       0
## 366                    0.499  30       0
## 367                    0.368  29       1
## 368                    0.252  21       0
## 369                    0.306  22       0
## 370                    0.234  45       1
## 371                    2.137  25       1
## 372                    1.731  21       0
## 373                    0.545  21       0
## 374                    0.225  25       0
## 375                    0.816  28       0
## 376                    0.528  58       1
## 377                    0.299  22       0
## 378                    0.509  22       0
## 379                    0.238  32       1
## 380                    1.021  35       0
## 381                    0.821  24       0
## 382                    0.236  22       0
## 383                    0.947  21       0
## 384                    1.268  25       0
## 385                    0.221  25       0
## 386                    0.205  24       0
## 387                    0.660  35       1
## 388                    0.239  45       1
## 389                    0.452  58       1
## 390                    0.949  28       0
## 391                    0.444  42       0
## 392                    0.340  27       1
## 393                    0.389  21       0
## 394                    0.463  37       0
## 395                    0.803  31       1
## 396                    1.600  25       0
## 397                    0.944  39       0
## 398                    0.196  22       1
## 399                    0.389  25       0
## 400                    0.241  25       1
## 401                    0.161  31       1
## 402                    0.151  55       0
## 403                    0.286  35       1
## 404                    0.280  38       0
## 405                    0.135  41       1
## 406                    0.520  26       0
## 407                    0.376  46       1
## 408                    0.336  25       0
## 409                    1.191  39       1
## 410                    0.702  28       1
## 411                    0.674  28       0
## 412                    0.528  25       0
## 413                    1.076  22       0
## 414                    0.256  21       0
## 415                    0.534  21       1
## 416                    0.258  22       1
## 417                    1.095  22       0
## 418                    0.554  37       1
## 419                    0.624  27       0
## 420                    0.219  28       1
## 421                    0.507  26       0
## 422                    0.561  21       0
## 423                    0.496  21       0
## 424                    0.421  21       0
## 425                    0.516  36       1
## 426                    0.264  31       1
## 427                    0.256  25       0
## 428                    0.328  38       1
## 429                    0.284  26       0
## 430                    0.233  43       1
## 431                    0.108  23       0
## 432                    0.551  38       0
## 433                    0.527  22       0
## 434                    0.167  29       0
## 435                    1.138  36       0
## 436                    0.205  29       1
## 437                    0.244  41       0
## 438                    0.434  28       0
## 439                    0.147  21       0
## 440                    0.727  31       0
## 441                    0.435  41       1
## 442                    0.497  22       0
## 443                    0.230  24       0
## 444                    0.955  33       1
## 445                    0.380  30       1
## 446                    2.420  25       1
## 447                    0.658  28       0
## 448                    0.330  26       0
## 449                    0.510  22       1
## 450                    0.285  26       0
## 451                    0.415  23       0
## 452                    0.542  23       1
## 453                    0.381  25       0
## 454                    0.832  72       0
## 455                    0.498  24       0
## 456                    0.212  38       1
## 457                    0.687  62       0
## 458                    0.364  24       0
## 459                    1.001  51       1
## 460                    0.460  81       0
## 461                    0.733  48       0
## 462                    0.416  26       0
## 463                    0.705  39       0
## 464                    0.258  37       0
## 465                    1.022  34       0
## 466                    0.452  21       0
## 467                    0.269  22       0
## 468                    0.600  25       0
## 469                    0.183  38       1
## 470                    0.571  27       0
## 471                    0.607  28       0
## 472                    0.170  22       0
## 473                    0.259  22       0
## 474                    0.210  50       0
## 475                    0.126  24       0
## 476                    0.231  59       0
## 477                    0.711  29       1
## 478                    0.466  31       0
## 479                    0.162  39       0
## 480                    0.419  63       0
## 481                    0.344  35       1
## 482                    0.197  29       0
## 483                    0.306  28       0
## 484                    0.233  23       0
## 485                    0.630  31       1
## 486                    0.365  24       1
## 487                    0.536  21       0
## 488                    1.159  58       0
## 489                    0.294  28       0
## 490                    0.551  67       0
## 491                    0.629  24       0
## 492                    0.292  42       0
## 493                    0.145  33       0
## 494                    1.144  45       1
## 495                    0.174  22       0
## 496                    0.304  66       0
## 497                    0.292  30       0
## 498                    0.547  25       0
## 499                    0.163  55       1
## 500                    0.839  39       0
## 501                    0.313  21       0
## 502                    0.267  28       0
## 503                    0.727  41       1
## 504                    0.738  41       0
## 505                    0.238  40       0
## 506                    0.263  38       0
## 507                    0.314  35       1
## 508                    0.692  21       0
## 509                    0.968  21       0
## 510                    0.409  64       0
## 511                    0.297  46       1
## 512                    0.207  21       0
## 513                    0.200  58       0
## 514                    0.525  22       0
## 515                    0.154  24       0
## 516                    0.268  28       1
## 517                    0.771  53       1
## 518                    0.304  51       0
## 519                    0.180  41       0
## 520                    0.582  60       0
## 521                    0.187  25       0
## 522                    0.305  26       0
## 523                    0.189  26       0
## 524                    0.652  45       1
## 525                    0.151  24       0
## 526                    0.444  21       0
## 527                    0.299  21       0
## 528                    0.107  24       0
## 529                    0.493  22       0
## 530                    0.660  31       0
## 531                    0.717  22       0
## 532                    0.686  24       0
## 533                    0.917  29       0
## 534                    0.501  31       0
## 535                    1.251  24       0
## 536                    0.302  23       1
## 537                    0.197  46       0
## 538                    0.735  67       0
## 539                    0.804  23       0
## 540                    0.968  32       1
## 541                    0.661  43       1
## 542                    0.549  27       1
## 543                    0.825  56       1
## 544                    0.159  25       0
## 545                    0.365  29       0
## 546                    0.423  37       1
## 547                    1.034  53       1
## 548                    0.160  28       0
## 549                    0.341  50       0
## 550                    0.680  37       0
## 551                    0.204  21       0
## 552                    0.591  25       0
## 553                    0.247  66       0
## 554                    0.422  23       0
## 555                    0.471  28       0
## 556                    0.161  37       0
## 557                    0.218  30       0
## 558                    0.237  58       0
## 559                    0.126  42       0
## 560                    0.300  35       0
## 561                    0.121  54       1
## 562                    0.502  28       1
## 563                    0.401  24       0
## 564                    0.497  32       0
## 565                    0.601  27       0
## 566                    0.748  22       0
## 567                    0.412  21       0
## 568                    0.085  46       0
## 569                    0.338  37       0
## 570                    0.203  33       1
## 571                    0.270  39       0
## 572                    0.268  21       0
## 573                    0.430  22       0
## 574                    0.198  22       0
## 575                    0.892  23       0
## 576                    0.280  25       0
## 577                    0.813  35       0
## 578                    0.693  21       1
## 579                    0.245  36       0
## 580                    0.575  62       1
## 581                    0.371  21       1
## 582                    0.206  27       0
## 583                    0.259  62       0
## 584                    0.190  42       0
## 585                    0.687  52       1
## 586                    0.417  22       0
## 587                    0.129  41       1
## 588                    0.249  29       0
## 589                    1.154  52       1
## 590                    0.342  25       0
## 591                    0.925  45       1
## 592                    0.175  24       0
## 593                    0.402  44       1
## 594                    1.699  25       0
## 595                    0.733  34       0
## 596                    0.682  22       1
## 597                    0.194  46       0
## 598                    0.559  21       0
## 599                    0.088  38       1
## 600                    0.407  26       0
## 601                    0.400  24       0
## 602                    0.190  28       0
## 603                    0.100  30       0
## 604                    0.692  54       1
## 605                    0.212  36       1
## 606                    0.514  21       0
## 607                    1.258  22       1
## 608                    0.482  25       0
## 609                    0.270  27       0
## 610                    0.138  23       0
## 611                    0.292  24       0
## 612                    0.593  36       1
## 613                    0.787  40       1
## 614                    0.878  26       0
## 615                    0.557  50       1
## 616                    0.207  27       0
## 617                    0.157  30       0
## 618                    0.257  23       0
## 619                    1.282  50       1
## 620                    0.141  24       1
## 621                    0.246  28       0
## 622                    1.698  28       0
## 623                    1.461  45       0
## 624                    0.347  21       0
## 625                    0.158  21       0
## 626                    0.362  29       0
## 627                    0.206  21       0
## 628                    0.393  21       0
## 629                    0.144  45       0
## 630                    0.148  21       0
## 631                    0.732  34       1
## 632                    0.238  24       0
## 633                    0.343  23       0
## 634                    0.115  22       0
## 635                    0.167  31       0
## 636                    0.465  38       1
## 637                    0.153  48       0
## 638                    0.649  23       0
## 639                    0.871  32       1
## 640                    0.149  28       0
## 641                    0.695  27       0
## 642                    0.303  24       0
## 643                    0.178  50       1
## 644                    0.610  31       0
## 645                    0.730  27       0
## 646                    0.134  30       0
## 647                    0.447  33       1
## 648                    0.455  22       1
## 649                    0.260  42       1
## 650                    0.133  23       0
## 651                    0.234  23       0
## 652                    0.466  27       0
## 653                    0.269  28       0
## 654                    0.455  27       0
## 655                    0.142  22       0
## 656                    0.240  25       1
## 657                    0.155  22       0
## 658                    1.162  41       0
## 659                    0.190  51       0
## 660                    1.292  27       1
## 661                    0.182  54       0
## 662                    1.394  22       1
## 663                    0.165  43       1
## 664                    0.637  40       1
## 665                    0.245  40       1
## 666                    0.217  24       0
## 667                    0.235  70       1
## 668                    0.141  40       1
## 669                    0.430  43       0
## 670                    0.164  45       0
## 671                    0.631  49       0
## 672                    0.551  21       0
## 673                    0.285  47       0
## 674                    0.880  22       0
## 675                    0.587  68       0
## 676                    0.328  31       1
## 677                    0.230  53       1
## 678                    0.263  25       0
## 679                    0.127  25       1
## 680                    0.614  23       0
## 681                    0.332  22       0
## 682                    0.364  26       1
## 683                    0.366  22       0
## 684                    0.536  27       1
## 685                    0.640  69       0
## 686                    0.591  25       0
## 687                    0.314  22       0
## 688                    0.181  29       0
## 689                    0.828  23       0
## 690                    0.335  46       1
## 691                    0.856  34       0
## 692                    0.257  44       1
## 693                    0.886  23       0
## 694                    0.439  43       1
## 695                    0.191  25       0
## 696                    0.128  43       1
## 697                    0.268  31       1
## 698                    0.253  22       0
## 699                    0.598  28       0
## 700                    0.904  26       0
## 701                    0.483  26       0
## 702                    0.565  49       1
## 703                    0.905  52       1
## 704                    0.304  41       0
## 705                    0.118  27       0
## 706                    0.177  28       0
## 707                    0.261  30       1
## 708                    0.176  22       0
## 709                    0.148  45       1
## 710                    0.674  23       1
## 711                    0.295  24       0
## 712                    0.439  40       0
## 713                    0.441  38       1
## 714                    0.352  21       0
## 715                    0.121  32       0
## 716                    0.826  34       1
## 717                    0.970  31       1
## 718                    0.595  56       0
## 719                    0.415  24       0
## 720                    0.378  52       1
## 721                    0.317  34       0
## 722                    0.289  21       0
## 723                    0.349  42       1
## 724                    0.251  42       0
## 725                    0.265  45       0
## 726                    0.236  38       0
## 727                    0.496  25       0
## 728                    0.433  22       0
## 729                    0.326  22       0
## 730                    0.141  22       0
## 731                    0.323  34       1
## 732                    0.259  22       1
## 733                    0.646  24       1
## 734                    0.426  22       0
## 735                    0.560  53       0
## 736                    0.284  28       0
## 737                    0.515  21       0
## 738                    0.600  42       0
## 739                    0.453  21       0
## 740                    0.293  42       1
## 741                    0.785  48       1
## 742                    0.400  26       0
## 743                    0.219  22       0
## 744                    0.734  45       1
## 745                    1.174  39       0
## 746                    0.488  46       0
## 747                    0.358  27       1
## 748                    1.096  32       0
## 749                    0.408  36       1
## 750                    0.178  50       1
## 751                    1.182  22       1
## 752                    0.261  28       0
## 753                    0.223  25       0
## 754                    0.222  26       1
## 755                    0.443  45       1
## 756                    1.057  37       1
## 757                    0.391  39       0
## 758                    0.258  52       1
## 759                    0.197  26       0
## 760                    0.278  66       1
## 761                    0.766  22       0
## 762                    0.403  43       1
## 763                    0.142  33       0
## 764                    0.171  63       0
## 765                    0.340  27       0
## 766                    0.245  30       0
## 767                    0.349  47       1
## 768                    0.315  23       0

6 Outlier: Data Pima Indian Diabetes

6.1 Pemeriksaan Data

# Switch the working directory to the practicum folder so the CSV below can be
# read by its bare file name.
setwd("D:/Kuliah/Mat/TSA Kominfo/Praktikum")
# Load the comma-separated diabetes data into `diabetes`.
diabetes <- read.csv("pima-indians-diabeter-database.csv", sep = ',')
library(ggplot2)
library(mlbench) # Pima Indian dataset
library(cowplot) # show plots laid out on a grid
library(caret) # feature scaling
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
str(diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

6.2 Ringkasan dan Sebaran Data

dim(diabetes)
## [1] 768   9
summary(diabetes)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

6.2.1 Jumlah Nilai NA

  dari summary di bawah, terlihat tidak terdapat data NA
data.frame(colSums(is.na(diabetes)))
##                          colSums.is.na.diabetes..
## Pregnancies                                     0
## Glucose                                         0
## BloodPressure                                   0
## SkinThickness                                   0
## Insulin                                         0
## BMI                                             0
## DiabetesPedigreeFunction                        0
## Age                                             0
## Outcome                                         0

6.2.2 Jumlah Nilai 0

terlihat banyak kolom memiliki nilai 0 yang seharusnya tidak mungkin 0. Seperti glucose yang menunjukkan kadar gula atau pressure yang menunjukkan tekanan darah, serta kolom lainnya. Kondisi ini perlu ditangani karena merupakan informasi yang keliru.

data.frame(colSums(diabetes==0))
##                          colSums.diabetes....0.
## Pregnancies                                 111
## Glucose                                       5
## BloodPressure                                35
## SkinThickness                               227
## Insulin                                     374
## BMI                                          11
## DiabetesPedigreeFunction                      0
## Age                                           0
## Outcome                                     500

6.2.3 Persentase Jumlah Nilai 0 di setiap kolom

data.frame(round(colSums(diabetes==0)/nrow(diabetes), 4)*100)
##                          round.colSums.diabetes....0..nrow.diabetes...4....100
## Pregnancies                                                              14.45
## Glucose                                                                   0.65
## BloodPressure                                                             4.56
## SkinThickness                                                            29.56
## Insulin                                                                  48.70
## BMI                                                                       1.43
## DiabetesPedigreeFunction                                                  0.00
## Age                                                                       0.00
## Outcome                                                                  65.10
library(gridExtra)
# One panel per column: a density curve for each of the eight predictors and
# a bar chart for Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = Pregnancies)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("Pregnancies")
plot2 <- ggplot(data = diabetes, mapping = aes(x = Glucose)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("Glucose")
plot3 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("BloodPressure")
plot4 <- ggplot(data = diabetes, mapping = aes(x = SkinThickness)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("SkinThickness")
plot5 <- ggplot(data = diabetes, mapping = aes(x = Insulin)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("Insulin")
plot6 <- ggplot(data = diabetes, mapping = aes(x = BMI)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("BMI")
plot7 <- ggplot(data = diabetes, mapping = aes(x = DiabetesPedigreeFunction)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("DiabetesPedigreeFunction")
plot8 <- ggplot(data = diabetes, mapping = aes(x = Age)) +
  geom_density(lwd = 1, color = "darkgreen") +
  ggtitle("Age")
plot9 <- ggplot(data = diabetes, mapping = aes(x = Outcome)) +
  geom_bar(fill = "darkred") +
  ggtitle("Outcome")

# Lay the nine panels out on a three-column grid.
grid.arrange(
  plot1, plot2, plot3, plot4, plot5, plot6, plot7, plot8, plot9,
  ncol = 3
)

# Build one plot per column of `diabetes`: a density curve for the first eight
# columns and a bar chart for Outcome. The result is a list of ggplot objects.
plots <- lapply(names(diabetes), function(var_x) {
  # `aes_string()` is deprecated; `.data[[var_x]]` maps the column whose name
  # is held in `var_x`. The x label is set explicitly so it always shows the
  # column name.
  p <- ggplot(diabetes, aes(x = .data[[var_x]])) + xlab(var_x)

  if (var_x %in% names(diabetes)[1:8]) {
    p <- p + geom_density(lwd = 1, color = "darkgreen")
  } else if (var_x == "Outcome") {
    p <- p + geom_bar(fill = "darkred")
  }

  # Return the plot explicitly rather than relying on the invisible value of
  # the last assignment.
  p
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_grid(plotlist = plots)

6.3 Kolom Pregnancies

library(ggplot2)
library(gridExtra)
# Four views of Pregnancies: overall histogram and box plot, then density and
# box plot grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = Pregnancies)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = Pregnancies)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = Pregnancies, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = Pregnancies, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

summary(diabetes$Pregnancies)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.845   6.000  17.000

Sebaran nilai Pregnancies cenderung menjulur jauh ke kanan.

Menurut visualisasi boxplot, terdapat beberapa data outlier namun secara umum masih dalam batas yang wajar. Apalagi jika melihat boxplot antara Pregnancies dan Outcome sebaran data relatif cukup baik.

Namun jika ingin dilakukan penanganan maka terdapat beberapa alternatif:

  • Menghapus data Outlier

  • Mengganti data dengan mean/ median data yang tidak outlier

  • Flooring & Capping: Mengganti data dengan nilai max teoritis (Misal Q1 - 1.5xIQR dan Q3 + 1.5xIQR)

6.3.1 Menghapus Data Outlier

Melihat dari boxplot tunggal, terdapat 3 titik data outlier, sedangkan jika melihat berdasarkan kelompok diabetes atau tidak maka terdapat dua outlier pada kelompok tidak diabetes. Misalkan kita menggunakan indikator dari boxplot tunggal, maka untuk menghapus data tersebut:

# Upper outlier fence for Pregnancies: Q3 + 1.5 * (Q3 - Q1).
# `probs` is spelled out in full (no partial argument matching) and `FALSE`
# replaces the reassignable shorthand `F`.
# NOTE: `IQR` and `max` shadow the functions of the same name. The names are
# kept because later code reads `max` as this threshold.
q1 <- quantile(diabetes$Pregnancies, probs = 0.25, names = FALSE)
q3 <- quantile(diabetes$Pregnancies, probs = 0.75, names = FALSE)
IQR <- q3 - q1
max <- q3 + 1.5 * IQR

# Rows whose Pregnancies value lies above the fence. They are only displayed
# here; `diabetes` itself is not modified by this step.

# Base R
(diabetes[diabetes$Pregnancies > max, ])
##     Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 89           15     136            70            32     110 37.1
## 160          17     163            72            41     114 40.9
## 299          14     100            78            25     184 36.6
## 456          14     175            62            30       0 33.6
##     DiabetesPedigreeFunction Age Outcome
## 89                     0.153  43       1
## 160                    0.817  47       1
## 299                    0.412  46       1
## 456                    0.212  38       1
# dplyr
#library(dplyr)
#(diabetes %>% filter(Pregnancies > max))

6.3.2 Mengganti dengan Mean/Median dari data yang tidak Outlier

# Pregnancies values at or below the upper fence, and their mean and median.
non.outlier <- diabetes$Pregnancies[diabetes$Pregnancies <= max]
mean.non <- mean(non.outlier)
med.non <- median(non.outlier)

# Overwrite every value above the fence with the median of the non-outliers
# (`mean.non` is available as the alternative replacement).

# Base R
diabetes$Pregnancies <- with(
  diabetes,
  ifelse(Pregnancies > max, med.non, Pregnancies)
)

# dplyr equivalent:
# diabetes <- diabetes %>%
#   mutate(Pregnancies = ifelse(Pregnancies > max, med.non, Pregnancies))

6.3.3 Mengganti dengan Nilai Max dari data yang tidak Outlier

Sama seperti sebelumnya, cukup mengganti nilai outlier dengan nilai minimum/maksimum

# Update outlier data menggunakan nilai max

# R base
# diabetes$Pregnancies <- ifelse(diabetes$Pregnancies > max, max,
#                              diabetes$Pregnancies)

# dplyr
# diabetes <- diabetes %>% mutate(pregnant = ifelse(pregnant > max, max, pregnant))

6.3.4 Plot Setelah Modifikasi

# Horizontal box plot of Pregnancies after the outlier replacement.
pl2 <- ggplot(data = diabetes, mapping = aes(y = Pregnancies))
pl2 + geom_boxplot(fill = "red", alpha = 0.6) + coord_flip() + theme_classic()

6.4 Kolom Glucose

summary(diabetes$Glucose)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   117.0   120.9   140.2   199.0
# Four views of Glucose: overall histogram and box plot, then density and box
# plot grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = Glucose)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = Glucose)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = Glucose, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = Glucose, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

Pada kolom ini sebaran data relatif simetris, dan secara umum tidak terdapat data outlier kecuali pada data bernilai 0.

Nilai Glucose menunjukkan kadar gula dalam darah, sehingga tidak mungkin bernilai 0. Perlu dicek juga apakah terdapat nilai-nilai lainnya yang tidak masuk akal (misalkan menurut Dokter). Namun pada contoh ini kita hanya akan fokus pada nilai 0 saja, kita anggap nilai lainnya masih masuk dalam rentang nilai yang memungkinkan.

Cek jumlah data bernilai 0:

# Number of rows whose Glucose reading is 0.
sum(diabetes$Glucose == 0, na.rm = TRUE)
## [1] 5

Terdapat 5 data yang memiliki nilai Glucose = 0 yang perlu ditangani. Jumlah ini relatif sedikit, jadi dapat saja kita hapus 5 baris ini, atau sebagai alternatif kita imputasi, misalnya dengan mean atau median.

6.4.1 Menghapus Data

# R base
# diabetes <- diabetes[diabetes$Glucose > 0,] 

# dplyr
#diabetes <- diabetes %>% filter(Glucose > 0)

6.4.2 Mengganti dengan Nilai Mean atau Median

Perlu diperhatikan, karena data bernilai 0 (bukan NA) maka perlu berhati-hati khususnya dalam menghitung mean.

Jika data bernilai NA, penghitungan mean dapat langsung dilakukan pada seluruh data, karena nilai NA akan diabaikan. Namun jika data 0 maka nilai tersebut akan dimasukkan dalam penghitungan rata-rata. Oleh karena itu, perlu difilter terlebih dahulu data yang tidak bernilai 0.

Pemilihan mean biasanya untuk data yang memiliki sebaran simetris, sementara untuk data dengan sebaran yang menjulur dapat menggunakan nilai median.

Melihat dari boxplot, tampak terdapat perbedaan sebaran nilai Glucose antara yang terkena diabetes dan yang tidak. Penderita Diabetes cenderung memiliki nilai Glucose yang lebih tinggi dibandingkan yang bukan penderita diabetes.

Opsi yang cocok adalah menggunakan mean atau median per kelompok. Data Glucose bernilai 0 mengacu pada kelompok diabetes diganti dengan mean atau median pada kelompok tersebut begitu pula sebaliknya.

# Non-zero Glucose readings, taken separately for Outcome "1" and Outcome "0".
glucose.pos <- diabetes$Glucose[diabetes$Glucose != 0 & diabetes$Outcome == "1"]
glucose.neg <- diabetes$Glucose[diabetes$Glucose != 0 & diabetes$Outcome == "0"]

# Median (not mean) of the non-zero readings in each group.
med.gluc.pos <- median(glucose.pos)
med.gluc.neg <- median(glucose.neg)

# Replace each zero reading with the median of that row's Outcome group;
# non-zero readings are left untouched.

# Base R
diabetes$Glucose <- with(
  diabetes,
  ifelse(
    Glucose == 0,
    ifelse(Outcome == "1", med.gluc.pos, med.gluc.neg),
    Glucose
  )
)

# dplyr equivalent:
# diabetes <- diabetes %>%
#   mutate(Glucose = ifelse(Glucose == 0,
#                           ifelse(Outcome == "1", med.gluc.pos, med.gluc.neg),
#                           Glucose))
# Same four Glucose views as before, redrawn after the zero readings were
# replaced.
plot1 <- ggplot(data = diabetes, mapping = aes(x = Glucose)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = Glucose)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = Glucose, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = Glucose, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

summary(diabetes$Glucose)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   44.00   99.75  117.00  121.68  140.25  199.00

6.5 Kolom Blood Pressure

Kolom BloodPressure berisi data tekanan darah yang memuat nilai 0 (tidak mungkin terjadi)

Selain itu terdapat nilai-nilai outlier di kedua sisinya.

summary(diabetes$BloodPressure)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   62.00   72.00   69.11   80.00  122.00
# Four views of BloodPressure: overall histogram and box plot, then density
# and box plot grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

6.5.1 Data bernilai 0

# Number of rows whose BloodPressure reading is 0.
sum(diabetes$BloodPressure == 0, na.rm = TRUE)
## [1] 35
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count the rows with a zero BloodPressure reading within each Outcome group.
zero.BloodPressure <- summarise(
  group_by(filter(diabetes, BloodPressure == 0), Outcome),
  count = n()
)
zero.BloodPressure
## # A tibble: 2 × 2
##   Outcome count
##     <int> <int>
## 1       0    19
## 2       1    16

Terdapat 35 data dengan nilai BloodPressure == 0. Untuk kelompok penderita diabetes terdapat 16 data bernilai 0 sedangkan untuk kelompok bukan penderita diabetes terdapat 19 data bernilai 0. Melihat dari boxplot kelompok, tampaknya tidak terdapat perbedaan yang berarti antara sebaran nilai BloodPressure pada kelompok diabetes maupun tidak. Oleh karena itu, kita mungkin cukup menggunakan mean data keseluruhan (yang tidak bernilai 0) untuk imputasi data 0.

Pada data BloodPressure terindikasi terdapat beberapa amatan outlier, dapat pula dilakukan penanganan seperti pada bagian sebelumnya (pregnant). Namun di sini tidak akan dilakukan penanganan.

6.5.2 Mengganti nilai 0 dengan Nilai Mean

# BloodPressure readings that are not 0 (a 0 is treated as a missing
# measurement, not as a real value).
BloodPressure <- diabetes[diabetes$BloodPressure != 0, "BloodPressure"]

# Mean of the non-zero readings only. Taking the mean of the whole column
# would count the zeros as real measurements and pull the imputed value down.
mean.BloodPressure <- mean(BloodPressure)

# Replace every zero reading with that mean; other readings are unchanged.

# Base R
diabetes$BloodPressure <- ifelse(
  diabetes$BloodPressure == 0,
  mean.BloodPressure,
  diabetes$BloodPressure
)

# dplyr equivalent:
# diabetes <- diabetes %>%
#   mutate(BloodPressure = ifelse(BloodPressure == 0,
#                                 mean.BloodPressure, BloodPressure))

6.5.3 Plot Setelah Imputasi

# Same four BloodPressure views, redrawn after the zero readings were imputed.
plot1 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = BloodPressure, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

6.6 Kolom Skin Thickness

Kolom SkinThickness menunjukkan ketebalan lipatan kulit(mm).

Data ini juga seharusnya tidak dapat bernilai 0.

summary(diabetes$SkinThickness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   23.00   20.54   32.00   99.00
# Four views of SkinThickness: overall histogram and box plot, then density
# and box plot grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = SkinThickness)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = SkinThickness)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = SkinThickness, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = SkinThickness, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

# Number of zero SkinThickness readings ...
sum(diabetes$SkinThickness == 0, na.rm = TRUE)
## [1] 227
# ... and their share of all rows.
sum(diabetes$SkinThickness == 0, na.rm = TRUE) / length(diabetes$SkinThickness)
## [1] 0.2955729

Jumlah data bernilai 0 untuk kolom SkinThickness cukup besar yaitu sekitar 30%.

Dapat dilakukan imputasi seperti kolom sebelumnya. Namun, jika dirasa terlalu banyak yang diimputasi mungkin sebaiknya kolom ini dapat dibuang dari model.

6.6.1 Menghapus Kolom SkinThickness

diabetes$SkinThickness <- NULL

6.7 Kolom Insulin

Sama seperti kolom SkinThickness, kolom Insulin juga tidak mungkin bernilai 0. Namun pada data terdapat nilai 0 yang sangat banyak bahkan hampir 50%.

Oleh karena itu, penanganan yang paling tepat adalah membuang kolom tersebut dari model.

summary(diabetes$Insulin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0    30.5    79.8   127.2   846.0
# Number of zero Insulin readings ...
sum(diabetes$Insulin == 0, na.rm = TRUE)
## [1] 374
# ... and their share of all rows.
sum(diabetes$Insulin == 0, na.rm = TRUE) / length(diabetes$Insulin)
## [1] 0.4869792

6.7.1 Menghapus kolom Insulin

diabetes$Insulin <- NULL

6.8 Kolom BMI

summary(diabetes$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.30   32.00   31.99   36.60   67.10
# Four views of BMI: overall histogram and box plot, then density and box plot
# grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = BMI)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = BMI)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = BMI, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = BMI, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

# Number of zero BMI values ...
sum(diabetes$BMI == 0, na.rm = TRUE)
## [1] 11
# ... and their share of all rows.
sum(diabetes$BMI == 0, na.rm = TRUE) / length(diabetes$BMI)
## [1] 0.01432292

6.8.1 Mengganti Nilai 0 dengan Nilai Mean

# BMI values that are not 0 (a 0 is treated as a missing measurement).
BMI <- diabetes[diabetes$BMI != 0, "BMI"]

# Mean of the non-zero values only. Taking the mean of the whole column would
# count the zeros as real measurements and pull the imputed value down.
mean.BMI <- mean(BMI)

# Replace every zero with that mean; other values are unchanged.

# Base R
diabetes$BMI <- ifelse(diabetes$BMI == 0, mean.BMI, diabetes$BMI)

# dplyr equivalent:
# diabetes <- diabetes %>% mutate(BMI = ifelse(BMI == 0, mean.BMI, BMI))

6.8.2 Mengganti Nilai Data Outlier

Dalam kolom BMI ini, dianggap nilai BMI di atas (Q3 +1.5 IQR) merupakan nilai yang tidak masuk akal.

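Maka amatan dengan BMI di atas batas tersebut (sekitar 50) dapat dihapus atau diganti dengan nilai batas (Q3 + 1.5 IQR); pada kode di bawah ini dilakukan penggantian dengan nilai batas.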

# Upper outlier fence for BMI: Q3 + 1.5 * (Q3 - Q1).
# `FALSE` replaces the reassignable shorthand `F`.
# NOTE: `IQR` and `max` shadow the functions of the same name. The names are
# kept as in the Pregnancies step, which uses the same variables.
q1 <- quantile(diabetes$BMI, probs = 0.25, names = FALSE)
q3 <- quantile(diabetes$BMI, probs = 0.75, names = FALSE)
IQR <- q3 - q1
max <- q3 + 1.5 * IQR

# Cap every BMI above the fence at the fence value; lower values are unchanged.

# Base R
diabetes$BMI <- ifelse(diabetes$BMI > max, max, diabetes$BMI)

# dplyr equivalent:
# diabetes <- diabetes %>% mutate(BMI = ifelse(BMI > max, max, BMI))

6.9 Setelah melakukan Imputasi Pada Kolom BMI

# Same four BMI views, redrawn after imputing the zeros and capping the high
# values.
plot1 <- ggplot(data = diabetes, mapping = aes(x = BMI)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = BMI)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = BMI, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = BMI, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

6.10 Kolom Diabetes Pedigree Function

summary(diabetes$DiabetesPedigreeFunction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0780  0.2437  0.3725  0.4719  0.6262  2.4200
# Four views of DiabetesPedigreeFunction: overall histogram and box plot, then
# density and box plot grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = DiabetesPedigreeFunction)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = DiabetesPedigreeFunction)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(
  data = diabetes,
  mapping = aes(x = DiabetesPedigreeFunction, group = Outcome)
) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(
  data = diabetes,
  mapping = aes(x = DiabetesPedigreeFunction, group = Outcome)
) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

length(which(diabetes$DiabetesPedigreeFunction == 0))
## [1] 0

Untuk melakukan penanganan outlier dapat menggunakan cara seperti sebelumnya. Namun pada kolom Diabetes Pedigree Function ini akan dibiarkan, karena nilai-nilai masih masuk dalam rentang nilai yang memungkinkan.

6.11 Kolom Age

summary(diabetes$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   24.00   29.00   33.24   41.00   81.00
# Four views of Age: overall histogram and box plot, then density and box plot
# grouped by Outcome.
plot1 <- ggplot(data = diabetes, mapping = aes(x = Age)) +
  geom_histogram(bins = 10, fill = "red", alpha = 0.7)
plot2 <- ggplot(data = diabetes, mapping = aes(x = Age)) +
  geom_boxplot(fill = "red", alpha = 0.6) +
  theme_classic()
plot3 <- ggplot(data = diabetes, mapping = aes(x = Age, group = Outcome)) +
  geom_density(mapping = aes(fill = Outcome), lwd = 0.1, alpha = 0.5)
plot4 <- ggplot(data = diabetes, mapping = aes(x = Age, group = Outcome)) +
  geom_boxplot(mapping = aes(fill = Outcome)) +
  theme_classic()

# Show the four panels on a two-column grid.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

Tidak terdapat data yang aneh pada kolom Age sehingga akan dibiarkan apa adanya.

6.12 Features Scalling

Features Scalling adalah proses pengubahan skala pada peubah-peubah sehingga memiliki skala yang sama. Banyak algoritma machine learning, terutama yang berbasis jarak ataupun Neural Network akan bekerja lebih optimal jika skala setiap fitur sama.

Selain itu, dalam kasus model regresi, pada skala data yang sama, semakin besar nilai koefisiennya (mutlak) maka semakin besar pengaruhnya terhadap model.

Features Scalling tidak merubah pola hubungan antara peubah tersebut dengan peubah respon.

Contoh features scalling: 1. Standardize: mengubah setiap fitur menjadi distribusi dengan mean 0 dan variansi 1 dengan formula $x_i^{\text{scaled}} = (x_i - \bar{x}) / s$. 2. Normalize: mengubah setiap fitur sehingga memiliki nilai pada rentang [0,1] dengan formula $x_i^{\text{scaled}} = (x_i - \min(x)) / (\max(x) - \min(x))$.

6.12.1 Standardized

summary(diabetes[,1:6])
##   Pregnancies        Glucose       BloodPressure         BMI       
##  Min.   : 0.000   Min.   : 44.00   Min.   : 24.00   Min.   :18.20  
##  1st Qu.: 1.000   1st Qu.: 99.75   1st Qu.: 64.00   1st Qu.:27.50  
##  Median : 3.000   Median :117.00   Median : 72.00   Median :32.00  
##  Mean   : 3.783   Mean   :121.68   Mean   : 72.25   Mean   :32.39  
##  3rd Qu.: 6.000   3rd Qu.:140.25   3rd Qu.: 80.00   3rd Qu.:36.60  
##  Max.   :13.000   Max.   :199.00   Max.   :122.00   Max.   :50.25  
##  DiabetesPedigreeFunction      Age       
##  Min.   :0.0780           Min.   :21.00  
##  1st Qu.:0.2437           1st Qu.:24.00  
##  Median :0.3725           Median :29.00  
##  Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :2.4200           Max.   :81.00
# Standard deviation of each of the six predictor columns (named numeric vector).
vapply(diabetes[, 1:6], sd, numeric(1))
##              Pregnancies                  Glucose            BloodPressure 
##                3.2706442               30.4641606               12.1159316 
##                      BMI DiabetesPedigreeFunction                      Age 
##                6.6676330                0.3313286               11.7602315

6.12.2 Standarisasi Fitur

# Fit centering and scaling parameters on the six predictor columns, then
# apply them to the same columns.
preprocessParams <- preProcess(x = diabetes[, 1:6], method = c("center", "scale"))
standardized <- predict(object = preprocessParams, newdata = diabetes[, 1:6])

summary(standardized)
##   Pregnancies         Glucose        BloodPressure           BMI          
##  Min.   :-1.1565   Min.   :-2.5498   Min.   :-3.98276   Min.   :-2.12804  
##  1st Qu.:-0.8508   1st Qu.:-0.7198   1st Qu.:-0.68132   1st Qu.:-0.73324  
##  Median :-0.2393   Median :-0.1535   Median :-0.02103   Median :-0.05833  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.6780   3rd Qu.: 0.6097   3rd Qu.: 0.63926   3rd Qu.: 0.63157  
##  Max.   : 2.8182   Max.   : 2.5382   Max.   : 4.10577   Max.   : 2.67877  
##  DiabetesPedigreeFunction      Age         
##  Min.   :-1.1888          Min.   :-1.0409  
##  1st Qu.:-0.6885          1st Qu.:-0.7858  
##  Median :-0.2999          Median :-0.3606  
##  Mean   : 0.0000          Mean   : 0.0000  
##  3rd Qu.: 0.4659          3rd Qu.: 0.6598  
##  Max.   : 5.8797          Max.   : 4.0611
# Standard deviation of each standardized column (named numeric vector).
vapply(standardized, sd, numeric(1))
##              Pregnancies                  Glucose            BloodPressure 
##                        1                        1                        1 
##                      BMI DiabetesPedigreeFunction                      Age 
##                        1                        1                        1

6.12.3 Normalize

# Feature normalization: fit the "range" transformation on the six predictor
# columns, then apply it to the same columns.
preprocessParams <- preProcess(x = diabetes[, 1:6], method = "range")
normalized <- predict(object = preprocessParams, newdata = diabetes[, 1:6])

summary(normalized)
##   Pregnancies         Glucose       BloodPressure         BMI        
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.07692   1st Qu.:0.3597   1st Qu.:0.4082   1st Qu.:0.2902  
##  Median :0.23077   Median :0.4710   Median :0.4898   Median :0.4306  
##  Mean   :0.29097   Mean   :0.5011   Mean   :0.4924   Mean   :0.4427  
##  3rd Qu.:0.46154   3rd Qu.:0.6210   3rd Qu.:0.5714   3rd Qu.:0.5741  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  DiabetesPedigreeFunction      Age        
##  Min.   :0.00000          Min.   :0.0000  
##  1st Qu.:0.07077          1st Qu.:0.0500  
##  Median :0.12575          Median :0.1333  
##  Mean   :0.16818          Mean   :0.2040  
##  3rd Qu.:0.23409          3rd Qu.:0.3333  
##  Max.   :1.00000          Max.   :1.0000

6.12.4 Plot Sebaran Asal

# One plot per predictor on its original scale: a density curve for numeric
# columns and a bar chart for anything else.
plots <- lapply(names(diabetes[, 1:6]), function(var_x) {
  # `aes_string()` is deprecated; `.data[[var_x]]` maps the column whose name
  # is held in `var_x`. The x label is set explicitly so it always shows the
  # column name.
  p <- ggplot(diabetes, aes(x = .data[[var_x]])) + xlab(var_x)

  if (is.numeric(diabetes[[var_x]])) {
    p + geom_density(lwd = 1, color = "darkgreen")
  } else {
    p + geom_bar(fill = "darkred")
  }
})

# Show all plots together on one grid.
plot_grid(plotlist = plots)

6.12.5 Hasil Standarisasi

Plot Sebaran Hasil Standarisasi

# Density curve of every standardized column.
plots <- lapply(names(standardized), function(var_x) {
  # `aes_string()` is deprecated; `.data[[var_x]]` maps the column whose name
  # is held in `var_x`. The x label is set explicitly so it always shows the
  # column name.
  ggplot(standardized, aes(x = .data[[var_x]])) +
    xlab(var_x) +
    geom_density(lwd = 1, color = "darkgreen")
})

# Show all plots together on one grid.
plot_grid(plotlist = plots)

6.12.6 Hasil Normalisasi

Plot Sebaran Hasil Normalisasi

# Density curve of every normalized column.
plots <- lapply(names(normalized), function(var_x) {
  # `aes_string()` is deprecated; `.data[[var_x]]` maps the column whose name
  # is held in `var_x`. The x label is set explicitly so it always shows the
  # column name.
  ggplot(normalized, aes(x = .data[[var_x]])) +
    xlab(var_x) +
    geom_density(lwd = 1, color = "darkgreen")
})

# Show all plots together on one grid.
plot_grid(plotlist = plots)

6.13 Transformasi Box-Cox

Transformasi umumnya digunakan untuk merubah sebaran suatu feature ketika feature tersebut tidak normal sehingga menjadi mendekati sebaran normal.

Box-Cox Transformation biasanya dilakukan pada variabel dependen (Y) dalam analisis regresi atau analisis varian (ANOVA). Ini dilakukan untuk memastikan bahwa variabel dependen memiliki distribusi yang lebih dekat dengan normal sehingga asumsi distribusi normal dalam model regresi atau ANOVA terpenuhi.

Transformasi ini juga dapat digunakan pada variabel independen (X) jika diperlukan, tergantung pada distribusi data dan tujuan analisis. Namun, transformasi pada variabel independen lebih jarang dilakukan karena memiliki implikasi terhadap interpretasi model akhir.

# Estimate Box-Cox parameters for the six predictor columns and print them.
# NOTE(review): the printed result reports 5 variables, not 6 - presumably a
# column containing zeros is skipped; confirm against the caret documentation.
preprocessParams <- preProcess(x = diabetes[, 1:6], method = "BoxCox")
print(preprocessParams)
## Created from 768 samples and 5 variables
## 
## Pre-processing:
##   - Box-Cox transformation (5)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## 0.1, 0.9, 0.3, -0.1, -1.1
# Apply the fitted Box-Cox transformation to the six predictor columns and
# summarize the result.
transformed <- predict(object = preprocessParams, newdata = diabetes[, 1:6])
summary(transformed)
##   Pregnancies        Glucose      BloodPressure         BMI       
##  Min.   : 0.000   Min.   :3.784   Min.   : 24.00   Min.   :4.626  
##  1st Qu.: 1.000   1st Qu.:4.603   1st Qu.: 64.00   1st Qu.:5.676  
##  Median : 3.000   Median :4.762   Median : 72.00   Median :6.095  
##  Mean   : 3.783   Mean   :4.770   Mean   : 72.25   Mean   :6.087  
##  3rd Qu.: 6.000   3rd Qu.:4.943   3rd Qu.: 80.00   3rd Qu.:6.482  
##  Max.   :13.000   Max.   :5.293   Max.   :122.00   Max.   :7.462  
##  DiabetesPedigreeFunction      Age        
##  Min.   :-2.5510          Min.   :0.8772  
##  1st Qu.:-1.4116          1st Qu.:0.8815  
##  Median :-0.9875          Median :0.8867  
##  Mean   :-0.9599          Mean   :0.8874  
##  3rd Qu.:-0.4680          3rd Qu.:0.8938  
##  Max.   : 0.8838          Max.   :0.9019
# Density curve of every column after the Box-Cox step.
plots <- lapply(names(transformed), function(var_x) {
  # `aes_string()` is deprecated; `.data[[var_x]]` maps the column whose name
  # is held in `var_x`. The x label is set explicitly so it always shows the
  # column name.
  ggplot(transformed, aes(x = .data[[var_x]])) +
    xlab(var_x) +
    geom_density(lwd = 1, color = "darkgreen")
})

# Show all plots together on one grid.
plot_grid(plotlist = plots)