HAFTA 3 - OGRENME GUNLUGU: UC DEGERLER (OUTLIERS)

R PAKETLERI

library(tidyverse)
library(stevemisc)
library(knitr)
library(haven)
library(summarytools)
library(outliers)
library(ggplot2)
library(plotly)
library(ggpmisc)
library(psych)
library(sur)
library(moments)
library(corrplot)
library(olsrr)
library(dplyr)
library(knitr)

VERI SETI

  • Bu calisma gunlugu icin PISA 2022 USA ogrenci anketi veri setinde yer alan bazi degiskenler secilerek 7, 8,9, 95, 97, 99 gibi degerler kayip veri gibi davrandigi kabul edilerek → derste islenen SCREEN datasina benzer bir veri seti elde edilmistir.
  • Veri seti yukleme ve veri setindeki degiskenlerin “etiketlerini” kaldirmak icin → “expss” paketinde yer alan → “drop_var_labs()” fonksiyonu kullanma:
# Import the raw USA student file (SPSS format) and strip the variable
# labels in one step, then preview the first rows.
orj_USA <- expss::drop_var_labs(
  read_sav("C:/Users/User/Desktop/USA.SAV")
)
head(orj_USA)
## # A tibble: 6 × 34
##   CNT   CNTSTUID   AGE GRADE IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD EXERPRAC
##   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>    <dbl>   <dbl>    <dbl>
## 1 USA   84000002  15.4     0     1   313      0      0        0       0       10
## 2 USA   84000003  15.5     0     1   313      0      0        1       0       10
## 3 USA   84000004  16.1     0     1   313      0      0        0       0        5
## 4 USA   84000005  15.6     0     1   313      0      0        1       0        0
## 5 USA   84000006  16.3     1     1   313      0      0        0       2        5
## 6 USA   84000008  15.6     0     2   313      0      1        0       0        4
## # ℹ 23 more variables: STUDYHMW <dbl>, WORKPAY <dbl>, WORKHOME <dbl>,
## #   EXPECEDU <dbl>, MATHPREF <dbl>, MATHEASE <dbl>, MATHMOT <dbl>,
## #   DURECEC <dbl>, SISCO <dbl>, MISCED <dbl>, FISCED <dbl>, HISCED <dbl>,
## #   PAREDINT <dbl>, BMMJ1 <dbl>, BFMJ2 <dbl>, HISEI <dbl>, ICTRES <dbl>,
## #   HOMEPOS <dbl>, ESCS <dbl>, FCFMLRTY <dbl>, ICTAVSCH <dbl>, ICTHOME <dbl>,
## #   ICTAVHOM <dbl>
  • Excel ile benim tarafimdan temizlenmeye calisilmis USA datasi “.csv” formatinda yuklenerek →calisma icin en uygun olacak sekilde umarim kullanilacak hale getirilmesi:
# Load the manually cleaned copy of the data (comma-separated, with a header
# row) and preview the first rows.
USA <- read.csv(
  file   = "C:/Users/User/Desktop/USA_temiz.csv",
  header = TRUE,
  sep    = ","
)
head(USA)
##   CNT CNTSTUID   AGE GRADE IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD EXERPRAC
## 1 USA 84000002 15.42     0     1   313      0      0        0       0       10
## 2 USA 84000003 15.50     0     1   313      0      0        1       0       10
## 3 USA 84000004 16.08     0     1   313      0      0        0       0        5
## 4 USA 84000005 15.58     0     1   313      0      0        1       0        0
## 5 USA 84000006 16.33     1     1   313      0      0        0       2        5
## 6 USA 84000008 15.58     0     2   313      0      1        0       0        4
##   STUDYHMW WORKPAY WORKHOME EXPECEDU MATHPREF MATHEASE MATHMOT DURECEC SISCO
## 1        2       0        2        9        0        0       0       2     1
## 2        6       0        0        4        0        0       0      NA     0
## 3        9       0        5        8        1        0       0       2     1
## 4        3      10       10        4        0        0       0       3     1
## 5        1       5        0        9        0        0       0       2     1
## 6       10       0        0        9       NA        0      NA       4     0
##   MISCED FISCED HISCED PAREDINT BMMJ1 BFMJ2 HISEI  ICTRES HOMEPOS    ESCS
## 1      7      9      9       16    NA 79.05 79.05  0.5500  1.1179  1.2582
## 2      6      6      6       12 73.91 59.89 73.91  0.4946  0.7300  0.3488
## 3      9      9      9       16 67.94 82.41 82.41  1.0020  1.1761  1.3463
## 4      5      3      5       12 24.98    NA 24.98 -0.7480 -0.9389 -1.3108
## 5      5      5      5       12    NA 16.50 16.50  1.7606  0.2333 -0.4500
## 6     10      3     10       16 85.85 51.92 85.85  1.4757  0.4713  1.1127
##   FCFMLRTY ICTAVSCH ICTHOME ICTAVHOM
## 1       11        7  0.3346        6
## 2        9        7  0.3346        6
## 3       15        7  0.3346        6
## 4       10        7  0.3346        6
## 5       10        7  0.3346        6
## 6        0        3 -1.5118        5
  • Eksik veri duzenlemesi icin:
# Replace missing IMMIG values with the column mean, then drop every row that
# still contains an NA in any column.
# NOTE(review): IMMIG is coded 1/2/3, so mean imputation produces non-integer
# values (1.284064 shows up among the rows listed later) - consider whether
# imputing the most frequent category would be more appropriate.
USA <- na.omit(
  mutate(
    USA,
    IMMIG = ifelse(is.na(IMMIG), mean(IMMIG, na.rm = TRUE), IMMIG)
  )
)
summary(USA)
##      CNT               CNTSTUID             AGE            GRADE        
##  Length:1571        Min.   :84000004   Min.   :15.33   Min.   :-2.0000  
##  Class :character   1st Qu.:84002126   1st Qu.:15.58   1st Qu.: 0.0000  
##  Mode  :character   Median :84004153   Median :15.83   Median : 0.0000  
##                     Mean   :84004109   Mean   :15.83   Mean   : 0.1337  
##                     3rd Qu.:84006116   3rd Qu.:16.08   3rd Qu.: 0.0000  
##                     Max.   :84008157   Max.   :16.33   Max.   : 2.0000  
##      IMMIG           LANGN           REPEAT           MISSSC       
##  Min.   :1.000   Min.   :156.0   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:1.000   1st Qu.:313.0   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :1.000   Median :313.0   Median :0.0000   Median :0.00000  
##  Mean   :1.239   Mean   :316.2   Mean   :0.0471   Mean   :0.04583  
##  3rd Qu.:1.000   3rd Qu.:313.0   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :3.000   Max.   :859.0   Max.   :1.0000   Max.   :1.00000  
##     SKIPPING         TARDYSD          EXERPRAC         STUDYHMW     
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.: 3.000  
##  Median :0.0000   Median :0.0000   Median : 5.000   Median : 5.000  
##  Mean   :0.3514   Mean   :0.5086   Mean   : 4.745   Mean   : 4.959  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 8.000   3rd Qu.: 7.000  
##  Max.   :1.0000   Max.   :2.0000   Max.   :10.000   Max.   :10.000  
##     WORKPAY          WORKHOME         EXPECEDU        MATHPREF     
##  Min.   : 0.000   Min.   : 0.000   Min.   :2.000   Min.   :0.0000  
##  1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.:6.000   1st Qu.:0.0000  
##  Median : 0.000   Median : 4.000   Median :7.000   Median :0.0000  
##  Mean   : 1.282   Mean   : 4.122   Mean   :7.073   Mean   :0.1693  
##  3rd Qu.: 2.000   3rd Qu.: 7.000   3rd Qu.:8.000   3rd Qu.:0.0000  
##  Max.   :10.000   Max.   :10.000   Max.   :9.000   Max.   :1.0000  
##     MATHEASE         MATHMOT          DURECEC          SISCO       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :2.000   Median :1.0000  
##  Mean   :0.1254   Mean   :0.0261   Mean   :2.266   Mean   :0.8351  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :6.000   Max.   :1.0000  
##      MISCED          FISCED           HISCED          PAREDINT    
##  Min.   : 1.00   Min.   : 1.000   Min.   : 1.000   Min.   : 3.00  
##  1st Qu.: 5.00   1st Qu.: 5.000   1st Qu.: 6.000   1st Qu.:12.00  
##  Median : 8.00   Median : 7.000   Median : 8.000   Median :16.00  
##  Mean   : 7.17   Mean   : 6.751   Mean   : 7.622   Mean   :14.62  
##  3rd Qu.: 9.00   3rd Qu.: 9.000   3rd Qu.: 9.000   3rd Qu.:16.00  
##  Max.   :10.00   Max.   :10.000   Max.   :10.000   Max.   :16.00  
##      BMMJ1           BFMJ2           HISEI           ICTRES       
##  Min.   :11.56   Min.   :11.01   Min.   :11.56   Min.   :-2.5500  
##  1st Qu.:30.90   1st Qu.:25.95   1st Qu.:47.83   1st Qu.:-0.2720  
##  Median :59.18   Median :51.50   Median :68.70   Median : 0.3344  
##  Mean   :53.03   Mean   :48.52   Mean   :60.87   Mean   : 0.3359  
##  3rd Qu.:70.50   3rd Qu.:70.57   3rd Qu.:76.49   3rd Qu.: 0.8511  
##  Max.   :88.70   Max.   :88.96   Max.   :88.96   Max.   : 4.1742  
##     HOMEPOS             ESCS            FCFMLRTY         ICTAVSCH    
##  Min.   :-2.5965   Min.   :-3.3280   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:-0.2490   1st Qu.:-0.1888   1st Qu.: 4.000   1st Qu.:7.000  
##  Median : 0.3100   Median : 0.5272   Median : 7.000   Median :7.000  
##  Mean   : 0.3315   Mean   : 0.3416   Mean   : 7.181   Mean   :6.683  
##  3rd Qu.: 0.8680   3rd Qu.: 0.9649   3rd Qu.:10.000   3rd Qu.:7.000  
##  Max.   : 4.9111   Max.   : 2.6262   Max.   :16.000   Max.   :7.000  
##     ICTHOME           ICTAVHOM    
##  Min.   :-6.3292   Min.   :0.000  
##  1st Qu.: 0.3346   1st Qu.:6.000  
##  Median : 0.3346   Median :6.000  
##  Mean   : 0.0949   Mean   :5.796  
##  3rd Qu.: 0.3346   3rd Qu.:6.000  
##  Max.   : 0.3456   Max.   :6.000

UC DEGERLER (OUTLIERS)

  • Veri setinde outliers bulunma nedenleri:
    1. Veri dosyasina isleme hatasi
    2. Yanlis kodlama
    3. Evren-orneklem uyusmazligi
    4. Orneklemin evrenden cekilmis olmasina ragmen → NORMAL DAGILIMa uygun olmayan → uc/aykiri/asiri degerlere sahip olmasi
  • Tek Degiskenli vs. Cok Degiskenli Uc Deger → farkli

UC DEGERLERIN BELIRLENMESI

Kategorik Degiskenlerde Uc Deger

  • MISSSC DEGISKENI:
# Frequency table of MISSSC (NA row suppressed), printed in R Markdown style.
freq(
  USA[["MISSSC"]],
  round.digits = 3,
  report.nas   = FALSE,
  style        = "rmarkdown"
)
## ### Frequencies  
## #### USA$MISSSC  
## **Type:** Integer  
## 
## |    &nbsp; | Freq |       % |  % Cum. |
## |----------:|-----:|--------:|--------:|
## |     **0** | 1499 |  95.417 |  95.417 |
## |     **1** |   72 |   4.583 | 100.000 |
## | **Total** | 1571 | 100.000 | 100.000 |
  • Tabloda USA veri setindeki “MISSSC” degiskenine ait frekans dagilimi hesaplanarak → Markdown formatinda raporlandi.
    • MISSSC degiskeni 2 kategoriye sahip → 0 ve 1
    • 1499 gozlem (%95.417) → 0 kategorisinde → BASKIN kategori
    • 72 gozlem (%4.583) → 1 kategorisinde
    • Buna gore → MISSSC degiskeninde bir taraf yaklasik %95 iken diger taraf yaklasik %5 oldugu icin → iyi temsil EDILMIYOR!
  • SKIPPING DEGISKENI:
library(kableExtra)
# Frequency table of SKIPPING passed through kable(), then given striped
# styling and a white header row (row 0) with black text.
skipping_freq <- freq(USA$SKIPPING, report.nas = FALSE)
skipping_kable <- kable(
  skipping_freq,
  format  = 'markdown',
  caption = "Frekans Tablosu",
  digits  = 3
)
skipping_styled <- kable_styling(
  skipping_kable,
  full_width        = TRUE,
  font_size         = 14,
  bootstrap_options = "striped"
)
row_spec(skipping_styled, 0, background = "white", color = "black")
Frekans Tablosu
Freq % Valid % Valid Cum. % Total % Total Cum.
0 1019 64.863 64.863 64.863 64.863
1 552 35.137 100.000 35.137 100.000
<NA> 0 NA NA 0.000 100.000
Total 1571 100.000 100.000 100.000 100.000
  • SKIPPING degiskenine ait dagilim incelendiginde:
    • 0 degeri %65 iken 1 degeri %35 oldugu icin (65/35 orani) → KABUL edilebilir temsil durumu vardir.

Surekli Degiskenlerde Uc Deger

Tek Degiskenli Uc Deger Belirleme

  • Degiskene ait butun degerlerin ORTALAMA 0 ve STANDART SAPMA 1 olacak sekilde → standartlastirmak icin z puanlarina donusturme
    • TEK degiskenli uc degerler → COK BUYUK z puanlarina sahiptir.
  • outliers paketinde yer alan → scores() fonksiyonu ile → z degerleri hesaplama:
# z-scores (rounded to 2 decimals) for the three occupational-status columns.
# The columns are selected by name instead of by position (25:27), so the
# selection keeps pointing at the same variables if the column order of USA
# ever changes.
z.scores_USA <- USA %>%
  select(BMMJ1, BFMJ2, HISEI) %>%
  scores(type = "z") %>%
  round(2)
head(z.scores_USA)
##   BMMJ1 BFMJ2 HISEI
## 1  0.67  1.46  1.08
## 2  0.78 -0.90  0.48
## 3  0.94  1.09  0.65
## 4  0.77  0.96  0.50
## 5 -1.26 -0.63 -1.36
## 6 -0.98 -0.57 -1.28
  • BMMJ1, BFMJ2 ve HISEI degiskenleri (25.-27. sutunlar) secilerek bu degiskenler incelenmistir.

  • summarytools paketinde yer alan → descr() fonksiyonu ile z degerlerinin → en dusuk ve en yuksek degerlerini hesaplama:

# Minimum and maximum z-score of each variable, one variable per row,
# printed without the heading block.
descr(
  z.scores_USA,
  stats     = c("min", "max"),
  transpose = TRUE,
  headings  = FALSE
)
## 
##                 Min    Max
## ----------- ------- ------
##       BFMJ2   -1.61   1.74
##       BMMJ1   -1.86   1.60
##       HISEI   -2.47   1.41
  • Elde edilen degerlere gore uc deger bulunmamaktadir → cunku → en DUSUK ve en YUKSEK z degerleri -3 ile +3 araligindadir.

NOTE: Veri setinde bir sorun oldugunu fark ederek USA veri setinin icerisine farkli surekli degiskenler eklenmesi gerektigini anliyorum :(

Veri Setine Surekli Degisken Ekleme
# Simulate a uniform variable on [0, 200] with one value per row of USA, then
# overwrite two randomly chosen rows with the extreme values -150 and 450.
set.seed(123)
n <- nrow(USA)
USA[["NEW_VAR"]] <- runif(n, min = 0, max = 200)
outliers <- c(-150, 450)
USA[["NEW_VAR"]][sample(seq_len(n), 2)] <- outliers
summary(USA[["NEW_VAR"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -150.00   49.97   97.82   99.21  147.98  450.00
# Simulate a second uniform variable on [0, 300] and overwrite two randomly
# chosen rows with the extreme values -200 and 550.
# NOTE(review): the seed is reset to the same value (123) used for NEW_VAR, so
# the same random stream is replayed - NEW_VAR_2 comes out as 1.5 x NEW_VAR
# and the extremes land in the same two rows. Confirm this is intended.
set.seed(123)
n <- nrow(USA)
USA[["NEW_VAR_2"]] <- runif(n, min = 0, max = 300)
outliers <- c(-200, 550)
USA[["NEW_VAR_2"]][sample(seq_len(n), 2)] <- outliers
summary(USA[["NEW_VAR_2"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -200.00   74.95  146.73  148.76  221.98  550.00
# Preview the first rows of USA, which now also carry NEW_VAR and NEW_VAR_2.
head(USA)
##    CNT CNTSTUID   AGE GRADE IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD EXERPRAC
## 3  USA 84000004 16.08     0     1   313      0      0        0       0        5
## 8  USA 84000011 16.08     0     3   313      0      0        0       0        8
## 9  USA 84000014 16.25     1     1   313      0      0        0       1       10
## 10 USA 84000015 15.83     0     1   313      0      0        1       1        8
## 12 USA 84000019 16.25     0     1   313      1      0        0       0        5
## 13 USA 84000021 15.83     0     1   313      0      1        1       1        0
##    STUDYHMW WORKPAY WORKHOME EXPECEDU MATHPREF MATHEASE MATHMOT DURECEC SISCO
## 3         9       0        5        8        1        0       0       2     1
## 8        10       0        5        9        0        0       0       1     1
## 9        10      10       10        8        0        0       0       3     1
## 10        1       2        0        7        0        0       0       1     0
## 12        6       4        6        4        0        0       0       1     1
## 13        0       3        6        6        0        0       0       3     1
##    MISCED FISCED HISCED PAREDINT BMMJ1 BFMJ2 HISEI  ICTRES HOMEPOS    ESCS
## 3       9      9      9     16.0 67.94 82.41 82.41  1.0020  1.1761  1.3463
## 8       9      8      9     16.0 70.50 27.52 70.50  0.2867  0.2929  0.7473
## 9      10     10     10     16.0 73.91 73.91 73.91  2.2545  1.1559  1.1772
## 10      8      9      9     16.0 70.10 70.89 70.89 -0.3588  0.6069  0.8877
## 12      5      5      5     12.0 25.04 33.76 33.76  0.7541  0.4548 -0.5548
## 13      6      7      7     14.5 31.08 35.34 35.34  0.8701  0.0469 -0.2746
##    FCFMLRTY ICTAVSCH ICTHOME ICTAVHOM   NEW_VAR NEW_VAR_2
## 3        15        7  0.3346        6  57.51550  86.27326
## 8        12        7  0.3346        6 157.66103 236.49154
## 9        11        7  0.3346        6  81.79538 122.69308
## 10        5        7 -0.3881        5 176.60348 264.90522
## 12        8        7  0.3346        6 188.09346 282.14019
## 13        7        7  0.3346        6   9.11130  13.66695
# z-scores (rounded to 2 decimals) for the two simulated variables.
# Selected by name rather than by position (35:36), so the result does not
# depend on how many columns precede them in USA.
z.scores_USA_2 <- USA %>%
  select(NEW_VAR, NEW_VAR_2) %>%
  scores(type = "z") %>%
  round(2)
head(z.scores_USA_2)
##   NEW_VAR NEW_VAR_2
## 1   -0.72     -0.72
## 2    1.01      1.01
## 3   -0.30     -0.30
## 4    1.33      1.34
## 5    1.53      1.54
## 6   -1.55     -1.56
# Minimum and maximum z-score of NEW_VAR and NEW_VAR_2, one variable per row,
# printed without the heading block.
descr(
  z.scores_USA_2,
  stats     = c("min", "max"),
  transpose = TRUE,
  headings  = FALSE
)
## 
##                     Min    Max
## --------------- ------- ------
##         NEW_VAR   -4.29   6.04
##       NEW_VAR_2   -4.03   4.63
  • NEW_VAR ve NEW_VAR_2 sayesinde artik uc degerlere sahip surekli degisken elde ettim.

Frekans Tablosu

  • DT paketinde yer alan → datatable() fonksiyonu:
# Interactive table of the z-scores: five rows per page, horizontal
# scrolling and the search box enabled, automatic column widths disabled.
z_table_options <- list(
  pageLength = 5,
  scrollX    = TRUE,
  searching  = TRUE,
  autoWidth  = FALSE
)
DT::datatable(z.scores_USA_2, options = z_table_options)
  • Yukaridaki tablo ile → 2 surekli degiskene ait → z puanlarini gorebiliyorum.

Histogram Grafigi

  • ggplot2 paketi ile HISTOGRAM GRAFIGI elde etme:
# Histogram of NEW_VAR shown over its full observed range.
# The range is applied with coord_cartesian() rather than xlim(): xlim() sets
# scale limits, and bars whose edges reach past those limits are removed from
# the plot - which would hide exactly the two bins that hold the minimum and
# the maximum, i.e. the outliers this plot is meant to reveal.
ggplot2::ggplot(USA, aes(x = NEW_VAR)) +
  geom_histogram(bins = 30L, fill = "red", color = "black") +
  theme_minimal() +
  coord_cartesian(xlim = range(USA$NEW_VAR), expand = TRUE) +
  labs(title = "NEW_VAR Histogramı", x = "NEW_VAR Degerleri", y = "Frekans") +
  theme(plot.title = element_text(hjust = 0.5, size = 14))

  • NEW_VAR degiskeninin → ortalamasini elde ederek → grafige eklenmesi icin:
# Mean of NEW_VAR, ignoring missing values.
mean(USA[["NEW_VAR"]], na.rm = TRUE)  # 99.21278
## [1] 99.21278
  • Grafik ile ORTALAMANIN gosterilmesi:
# Histogram of NEW_VAR with its mean marked by a dashed red vertical line.
# The mean is computed from the data once instead of being typed in as the
# literal 99.21278, so the line, the label and the subtitle stay in sync with
# the data if USA changes.
new_var_mean <- mean(USA$NEW_VAR, na.rm = TRUE)
# 7 significant digits reproduces the previously hard-coded text "99.21278".
new_var_mean_lab <- format(new_var_mean, digits = 7)
ggplot2::ggplot(USA, aes(x = NEW_VAR)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black", alpha = 0.7) +
  geom_vline(xintercept = new_var_mean, color = "red",
             linetype = "dashed", size = 1) +
  # The label is placed at the height of the fullest of 30 equal-width bins.
  annotate("text", label = paste("Ort =", new_var_mean_lab), x = 12,
           y = max(table(cut(USA$NEW_VAR, breaks = 30))),
           color = "black", size = 5, fontface = "bold") +
  theme_minimal() +
  labs(title = "NEW_VAR Degiskeninin Histogram Grafigi",
       subtitle = paste("Ortalama", new_var_mean_lab, "olarak isaretlendi"),
       x = "NEW_VAR Degerleri",
       y = "Frekans") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 13))

Yogunluk Grafigi

# Density-scaled histogram of NEW_VAR with a kernel density curve on top and
# the mean marked by a dashed blue vertical line plus a text label.
# after_stat(density) replaces the deprecated `..density..` notation; the
# mean is computed once and reused instead of being recomputed three times.
new_var_mean <- mean(USA$NEW_VAR, na.rm = TRUE)
ggplot2::ggplot(USA, aes(x = NEW_VAR)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue",
                 color = "black", alpha = 0.6) +
  geom_density(alpha = 0.5, fill = "red", color = "darkred", size = 1.2) +
  geom_vline(xintercept = new_var_mean, color = "blue",
             linetype = "dashed", size = 1) +
  # Label sits slightly right of the mean line, at 90% of the curve's peak.
  annotate("text", label = paste("Ortalama =", round(new_var_mean, 2)),
           x = new_var_mean + 5,
           y = max(density(USA$NEW_VAR, na.rm = TRUE)$y) * 0.9,
           color = "blue", size = 5, fontface = "bold") +
  labs(title = "NEW_VAR Degiskeninin Histogrami ve Yogunluk Grafigi",
       subtitle = "Histogram (lightblue) ve Yogunluk Egrisi (red)",
       x = "NEW_VAR Degerleri",
       y = "Yogunluk") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 13))

Interaktif Grafik

# Interactive histogram of NEW_VAR on a probability scale (30 bins, red bars
# with black outlines), drawn on a white background without grid lines.
new_var_hist <- plot_ly(
  x = USA$NEW_VAR,
  type = "histogram",
  histnorm = "probability",
  nbinsx = 30,
  marker = list(color = "red", line = list(color = "black", width = 1.2))
)
layout(
  new_var_hist,
  title = "NEW_VAR Degiskeninin Histogrami",
  xaxis = list(title = "NEW_VAR Degerleri", showgrid = FALSE),
  yaxis = list(title = "Olasilik", showgrid = FALSE),
  plot_bgcolor = "white"
)

Kutu Grafigi

# Five-number summary (plus mean) of NEW_VAR.
summary(USA[["NEW_VAR"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -150.00   49.97   97.82   99.21  147.98  450.00
# Box plot of NEW_VAR: outliers are drawn as large red dots and the mean is
# overlaid as a blue diamond.
new_var_box_theme <- theme(
  plot.title    = element_text(hjust = 0.5, size = 14, face = "bold"),
  plot.subtitle = element_text(hjust = 0.5, size = 12),
  axis.text     = element_text(size = 12),
  axis.title    = element_text(size = 13)
)
ggplot(data = USA, mapping = aes(x = "", y = NEW_VAR)) +
  geom_boxplot(
    fill = "lightblue", color = "black",
    outlier.color = "red", outlier.shape = 16, outlier.size = 3,
    na.rm = TRUE
  ) +
  stat_summary(
    fun = mean, geom = "point", shape = 18, size = 4,
    color = "blue", fill = "blue", na.rm = TRUE
  ) +
  theme_minimal() +
  labs(
    title = "NEW_VAR Degiskeninin Boxplot Grafigi",
    subtitle = "Kutu grafigi ile dagilimin gorsellestirilmesi",
    y = "NEW_VAR Degerleri"
  ) +
  new_var_box_theme

  • boxplot.stats() fonksiyonunun out bileseni → UC DEGERLERI verir:
# Values of NEW_VAR reported in the `out` component of boxplot.stats().
out <- boxplot.stats(USA[["NEW_VAR"]])[["out"]]
out
## [1] -150  450
# Row positions in USA whose NEW_VAR equals one of those values.
outliers_indices <- which(is.element(USA[["NEW_VAR"]], out))
outliers_indices
## [1] 403 978
# Full records of the flagged observations.
USA[outliers_indices, ]
##      CNT CNTSTUID   AGE GRADE IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD
## 1182 USA 84002165 15.67     0     1   313      0      0        0       0
## 2858 USA 84005114 15.50     0     2   859      0      0        0       0
##      EXERPRAC STUDYHMW WORKPAY WORKHOME EXPECEDU MATHPREF MATHEASE MATHMOT
## 1182        5        2       0        2        9        0        0       0
## 2858        3        5       0        5        7        0        0       0
##      DURECEC SISCO MISCED FISCED HISCED PAREDINT BMMJ1 BFMJ2 HISEI  ICTRES
## 1182       1     1      8      8      8       16 70.34 62.39 70.34 -0.6000
## 2858       2     1      8      7      8       16 47.83 57.37 57.37  0.7374
##      HOMEPOS   ESCS FCFMLRTY ICTAVSCH ICTHOME ICTAVHOM NEW_VAR NEW_VAR_2
## 1182  0.2592 0.7300        6        7  0.3346        6    -150      -200
## 2858  0.7674 0.7004        8        7  0.3346        6     450       550
  • USA veri setinde NEW_VAR degiskeni icin 2 adet uc deger var:
    • Bu uc degerler sirasi ile → veri setinin 403. ve 978. siralarinda (satir adlari 1182 ve 2858) yer almaktadir.
    • CNTSTUID (Ogrenci_Kimligi) → 84002165 ve 84005114
    • AGE (Yas) → 15.67 ve 15.50
    • IMMIG (Gocmenlik Durumu) → 1 ve 2
    • LANGN (Dil Kodu) → 313 ve 859
  • Bu 2 uc degerlerin NEW_VAR degiskeninde hangi gozlemlerde/bireylerde oldugunu belirleme:
# Row positions of the observations whose NEW_VAR is one of the values in `out`.
out_ind <- which(is.element(USA$NEW_VAR, out))
out_ind    # observations 403 and 978 are the outliers
## [1] 403 978

Interaktif Kutu Grafigi

# Interactive box plot of NEW_VAR with every observation drawn as a jittered
# red point beside the box.
new_var_box <- plot_ly(
  y = ~USA$NEW_VAR,
  type = 'box',
  boxpoints = "all",
  jitter = 0.3,
  pointpos = -1.8,
  marker = list(color = 'red', size = 5, opacity = 0.6),
  line = list(color = 'black')
)
layout(
  new_var_box,
  title = "NEW_VAR Degiskeninin Boxplot Grafigi",
  yaxis = list(title = "NEW_VAR Degerleri"),
  plot_bgcolor = "white"
)
  • Interaktif kutu grafiginde gorulen uc degerlerin IDlerinin isaretlenmesi:
# Interactive box plot of NEW_VAR in which every outlier is labelled with its
# row position in USA.
# The outlier values are taken from boxplot.stats() on NEW_VAR itself. The
# global `outliers` vector must not be used here: by this point it holds the
# values that were planted into NEW_VAR_2 (-200, 550), which do not occur in
# NEW_VAR, so matching against it finds no rows and the labels would read
# "NA" at heights where NEW_VAR has no points.
new_var_out <- boxplot.stats(USA$NEW_VAR)$out
out_ind <- which(USA$NEW_VAR %in% new_var_out)
plot_ly(y = ~USA$NEW_VAR, type = 'box',
        boxpoints = "all", jitter = 0.3, pointpos = -1.5,
        marker = list(color = "red", size = 6, opacity = 0.6),
        line = list(color = "black")) %>%
  layout(title = "NEW_VAR Degiskeninin Boxplot Grafigi",
         yaxis = list(title = "NEW_VAR Degerleri"),
         plot_bgcolor = "white",
         # One annotation per flagged row; iterating over the row positions
         # keeps each label paired with its own value and yields an empty
         # list (no annotations) when nothing is flagged.
         annotations = lapply(out_ind, function(idx) {
           list(
             x = -0.2,
             y = USA$NEW_VAR[idx],
             text = paste("İndeks:", idx),
             showarrow = FALSE,
             xanchor = "right",
             font = list(size = 10, color = "blue")
           )
         })
  )

NEW_VAR Degiskeninin NEW_VAR_2 Degiskenine gore Incelenmesi

# Box plots of NEW_VAR within groups of NEW_VAR_2.
# NEW_VAR_2 is a continuous variable, so factor(NEW_VAR_2) would create one
# level per distinct value - a separate "box" for practically every
# observation and far more fill levels than the Set2 palette provides.
# It is therefore binned into quartile groups first. The grouping column is
# added to a copy so the column layout of USA itself is left untouched.
new_var_2_breaks <- unique(
  quantile(USA$NEW_VAR_2, probs = seq(0, 1, by = 0.25), na.rm = TRUE)
)
USA_grouped <- USA
USA_grouped$NEW_VAR_2_GRP <- cut(
  USA$NEW_VAR_2,
  breaks = new_var_2_breaks,
  include.lowest = TRUE  # keep the minimum inside the first group
)
ggplot(USA_grouped, aes(x = NEW_VAR_2_GRP,
                        y = NEW_VAR,
                        fill = NEW_VAR_2_GRP)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, outlier.size = 3) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "NEW_VAR Degiskeninin Kategorik Boxplot Grafigi",
       subtitle = "NEW_VAR_2 degiskenine gore gruplanmis",
       x = "NEW_VAR_2 Kategorileri",
       y = "NEW_VAR Degerleri",
       fill = "NEW_VAR_2") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, size = 12),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 13),
        legend.position = "top")

NEW_VAR Degiskeninin MISCED Degiskenine gore Incelenmesi

# Mean of MISCED, ignoring missing values.
mean(USA[["MISCED"]], na.rm = TRUE)   # 7.169955
## [1] 7.169955
# Histogram of MISCED (10 bins) with the mean marked by a dashed red line and
# a text label placed one unit to the right of it.
misced_mean <- mean(USA$MISCED, na.rm = TRUE)
misced_theme <- theme(
  plot.title    = element_text(hjust = 0.5, size = 14, face = "bold"),
  plot.subtitle = element_text(hjust = 0.5, size = 12),
  axis.text     = element_text(size = 12),
  axis.title    = element_text(size = 13)
)
ggplot(data = USA, mapping = aes(x = MISCED)) +
  geom_histogram(bins = 10, fill = "#0c4c8a", color = "white", alpha = 0.8) +
  geom_vline(
    aes(xintercept = mean(MISCED, na.rm = TRUE)),
    color = "red", linetype = "dashed", size = 1
  ) +
  annotate(
    "text",
    x = misced_mean + 1, y = 10,
    label = paste("Ortalama:", round(misced_mean, 2)),
    color = "red", size = 5, fontface = "bold"
  ) +
  theme_minimal() +
  labs(
    title = "MISCED Degiskeninin Histogrami",
    subtitle = "Bireylerin anne egitim seviyesinin dagilimi",
    x = "MISCED (Anne Egitim Seviyesi)",
    y = "Frekans"
  ) +
  misced_theme

MAHALANOBIS UZAKLIGI

Mahalanobis Uzakligi Hesaplama:

library(psych)
# Mahalanobis distance of every observation in columns 3 to 13 of USA,
# measured from the column means with the sample covariance matrix.
veri <- USA[, 3:13]
veri_center <- colMeans(veri)
veri_cov <- cov(veri)
md <- mahalanobis(veri, center = veri_center, cov = veri_cov)
head(md, 20)
##         3         8         9        10        12        13        14        23 
##  5.207553 18.846258 20.993176  5.879155 24.881852 25.627941  1.669207  8.206659 
##        30        31        32        33        36        39        40        43 
##  6.919917  3.857980  3.342804  4.997115 28.018506 10.388325  9.631875  4.049845 
##        48        51        52        54 
## 13.647818  6.397026  1.283293  7.905329

Mahalanobis Uzakligi Kritik Deger Belirleme

# Chi-square critical value at alpha = .001, with as many degrees of freedom
# as there are columns in `veri`.
alpha <- .001
cutoff <- qchisq(1 - alpha, df = ncol(veri))
cutoff    # 31.26413
## [1] 31.26413
  • Mahalanobis uzakligi degerleri ki-kare dagilimi ile degerlendirilir.
    • Serbestlik derecesi BAGIMSIZ degisken sayisina ESIT
    • 31.26413 kritik degerinden BUYUK olan degerler 0.001 alfa duzeyinde istatistiksel olarak MANIDAR seklinde degerlendirilmektedir.
# Positions of the observations whose distance exceeds the critical value,
# followed by their records.
ucdegerler <- which(cutoff < md)
veri[ucdegerler, ]
##        AGE GRADE    IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD EXERPRAC
## 129  16.17     0 1.000000   313      1      0        0       2       10
## 206  15.58     0 3.000000   859      0      0        1       0        7
## 218  15.83    -1 1.000000   313      1      1        1       2        0
## 226  15.50     0 2.000000   156      1      1        1       2        4
## 249  16.25     1 3.000000   859      0      0        0       1        5
## 336  15.42    -1 1.000000   859      0      1        1       1        0
## 400  15.50    -1 1.000000   313      0      1        1       0       10
## 426  15.75     0 2.000000   156      0      1        0       1        1
## 537  15.50     0 2.000000   156      0      1        0       1        0
## 659  15.42    -2 1.000000   313      1      0        1       2       10
## 669  15.42     0 3.000000   859      0      0        1       0        3
## 830  15.50     0 3.000000   156      0      0        1       2        4
## 908  15.92     0 2.000000   156      1      0        0       1        4
## 967  15.58    -1 1.000000   313      1      0        0       2       10
## 1010 15.42     0 3.000000   859      0      0        0       0        0
## 1013 15.58     0 3.000000   859      0      1        1       0        4
## 1065 16.17     0 1.000000   156      0      1        0       2        2
## 1366 16.17     0 1.000000   313      1      0        0       0       10
## 1470 15.50    -1 3.000000   156      1      0        1       1        8
## 1576 15.42    -1 1.000000   313      1      1        0       1        3
## 1641 16.33     1 2.000000   156      0      1        0       0        5
## 1724 16.17     0 1.000000   156      0      1        1       2       10
## 1749 16.17    -1 1.000000   313      1      0        0       2        5
## 1756 16.08     0 1.284064   859      0      0        0       0       10
## 1764 16.08     1 3.000000   156      0      0        1       0        4
## 1844 15.67     0 1.000000   156      1      1        0       0       10
## 1970 16.00     0 2.000000   859      0      1        1       2       10
## 1987 16.08     0 3.000000   859      0      0        1       2        5
## 2004 16.00     0 2.000000   156      1      1        1       2       10
## 2084 15.42    -1 1.000000   313      1      0        0       0       10
## 2156 15.83     1 3.000000   859      0      0        0       0        5
## 2196 15.42    -1 1.000000   313      1      0        0       2        2
## 2241 16.08     0 2.000000   156      0      1        0       0        0
## 2287 15.67     1 2.000000   313      1      0        0       0        2
## 2371 16.25     1 1.284064   859      0      0        1       0        0
## 2423 15.67    -1 1.000000   313      1      1        1       1        0
## 2439 16.33     0 2.000000   156      1      0        0       0        4
## 2448 16.08     0 2.000000   859      0      0        1       2        4
## 2490 16.25     0 2.000000   156      1      0        0       1        0
## 2581 16.08    -1 1.000000   313      0      0        0       1       10
## 2583 16.33     0 1.000000   313      0      1        0       2       10
## 2599 15.92    -1 1.000000   313      1      0        1       0       10
## 2613 16.17     0 1.000000   313      1      0        0       0        0
## 2631 15.92    -1 1.000000   313      1      1        0       1       10
## 2665 15.42     0 1.000000   313      0      1        0       2        0
## 2689 15.33     0 3.000000   859      0      0        0       2        0
## 2704 15.42    -1 1.000000   313      0      1        1       1        0
## 2730 16.08     0 1.000000   313      0      1        0       1       10
## 2843 16.25     0 1.000000   313      1      0        0       0        0
## 2879 15.50    -1 1.000000   313      1      0        1       2        0
## 2885 16.17     0 1.000000   313      0      1        1       1        0
## 2914 16.08     0 2.000000   859      0      0        0       2       10
## 2998 15.50    -1 3.000000   859      1      0        0       0        5
## 3117 15.50    -1 1.000000   859      0      0        0       2        2
## 3124 16.08     0 3.000000   859      0      0        1       1        0
## 3131 15.75     0 1.000000   859      1      0        1       1       10
## 3272 15.92     0 1.284064   313      1      1        0       0       10
## 3299 16.17     1 2.000000   156      0      1        1       1       10
## 3415 15.67     0 3.000000   859      0      0        0       0        5
## 3440 16.33     0 3.000000   156      1      0        1       0        6
## 3453 16.25    -1 1.000000   313      0      0        0       2       10
## 3488 15.58     0 1.000000   313      0      1        0       1        0
## 3507 15.50    -1 1.000000   313      1      0        0       2        8
## 3517 15.58     0 1.000000   313      1      0        1       0       10
## 3776 15.92    -1 1.000000   313      1      1        1       2        1
## 3799 16.00     1 3.000000   859      0      0        0       1       10
## 3887 15.83     1 3.000000   156      0      1        0       1        8
## 3986 16.17     0 1.000000   313      1      1        0       2        0
## 4001 16.17     0 1.000000   313      0      1        1       2       10
## 4054 16.00     0 1.000000   859      0      0        0       1        0
## 4067 15.92    -1 1.000000   313      1      0        0       1       10
## 4083 16.08     0 2.000000   313      1      1        1       1        8
## 4088 16.33     1 1.000000   313      1      0        1       0        1
## 4147 16.33     0 1.000000   313      0      1        1       2        2
## 4197 15.50     0 1.000000   313      1      1        0       2        8
## 4460 15.92     0 3.000000   859      0      0        0       0        2
## 4492 16.08     0 3.000000   859      0      0        0       1        5
## 4504 16.00     0 1.000000   313      1      1        1       0        1
## 4522 15.58    -1 3.000000   156      0      1        0       2        4
##      STUDYHMW WORKPAY
## 129         3       0
## 206         8       0
## 218         0       0
## 226         2       3
## 249         5       0
## 336         2       0
## 400        10       0
## 426        10       0
## 537         5       7
## 659        10       7
## 669         5       0
## 830         9       9
## 908        10       0
## 967         8       0
## 1010        4       0
## 1013       10       0
## 1065       10       5
## 1366        6      10
## 1470        2       0
## 1576        4       0
## 1641        8       1
## 1724        8       0
## 1749        4       2
## 1756        8      10
## 1764       10      10
## 1844       10      10
## 1970       10      10
## 1987        5       0
## 2004        3      10
## 2084       10      10
## 2156       10       0
## 2196        5       0
## 2241        3       0
## 2287        6       0
## 2371        6       6
## 2423        7       0
## 2439       10       0
## 2448        2       8
## 2490        4       4
## 2581       10      10
## 2583        6       2
## 2599        0       0
## 2613       10       0
## 2631       10       0
## 2665        4       0
## 2689        3       0
## 2704        0       0
## 2730        0       0
## 2843        0       8
## 2879        6       0
## 2885        7       9
## 2914        3       0
## 2998        5       0
## 3117        4       3
## 3124        8       0
## 3131        8       4
## 3272        4       8
## 3299        6       0
## 3415        6       0
## 3440       10       0
## 3453       10       0
## 3488        0      10
## 3507        8       6
## 3517        0      10
## 3776        0      10
## 3799       10       6
## 3887        4       4
## 3986        6       3
## 4001        6       6
## 4054        0      10
## 4067       10      10
## 4083        4       0
## 4088        3       2
## 4147        2       0
## 4197        3       2
## 4460        4       0
## 4492        0       0
## 4504        1       1
## 4522        4       0
# Build the cleaned data set (flagged rows removed) and print the flagged rows.
# NOTE(review): assumes `ucdegerler` holds integer row positions of `veri`
# (e.g. the result of which()) -- confirm where it is created.
# `veri[-ucdegerler, ]` returns ZERO rows when `ucdegerler` is empty, because
# `-integer(0)` is still an empty index. Selecting the complement explicitly
# keeps every row in that case and gives the same result otherwise.
data_temiz <- veri[setdiff(seq_len(nrow(veri)), ucdegerler), ]
# Display the rows that were flagged, for visual inspection.
veri[ucdegerler, ]
##        AGE GRADE    IMMIG LANGN REPEAT MISSSC SKIPPING TARDYSD EXERPRAC
## 129  16.17     0 1.000000   313      1      0        0       2       10
## 206  15.58     0 3.000000   859      0      0        1       0        7
## 218  15.83    -1 1.000000   313      1      1        1       2        0
## 226  15.50     0 2.000000   156      1      1        1       2        4
## 249  16.25     1 3.000000   859      0      0        0       1        5
## 336  15.42    -1 1.000000   859      0      1        1       1        0
## 400  15.50    -1 1.000000   313      0      1        1       0       10
## 426  15.75     0 2.000000   156      0      1        0       1        1
## 537  15.50     0 2.000000   156      0      1        0       1        0
## 659  15.42    -2 1.000000   313      1      0        1       2       10
## 669  15.42     0 3.000000   859      0      0        1       0        3
## 830  15.50     0 3.000000   156      0      0        1       2        4
## 908  15.92     0 2.000000   156      1      0        0       1        4
## 967  15.58    -1 1.000000   313      1      0        0       2       10
## 1010 15.42     0 3.000000   859      0      0        0       0        0
## 1013 15.58     0 3.000000   859      0      1        1       0        4
## 1065 16.17     0 1.000000   156      0      1        0       2        2
## 1366 16.17     0 1.000000   313      1      0        0       0       10
## 1470 15.50    -1 3.000000   156      1      0        1       1        8
## 1576 15.42    -1 1.000000   313      1      1        0       1        3
## 1641 16.33     1 2.000000   156      0      1        0       0        5
## 1724 16.17     0 1.000000   156      0      1        1       2       10
## 1749 16.17    -1 1.000000   313      1      0        0       2        5
## 1756 16.08     0 1.284064   859      0      0        0       0       10
## 1764 16.08     1 3.000000   156      0      0        1       0        4
## 1844 15.67     0 1.000000   156      1      1        0       0       10
## 1970 16.00     0 2.000000   859      0      1        1       2       10
## 1987 16.08     0 3.000000   859      0      0        1       2        5
## 2004 16.00     0 2.000000   156      1      1        1       2       10
## 2084 15.42    -1 1.000000   313      1      0        0       0       10
## 2156 15.83     1 3.000000   859      0      0        0       0        5
## 2196 15.42    -1 1.000000   313      1      0        0       2        2
## 2241 16.08     0 2.000000   156      0      1        0       0        0
## 2287 15.67     1 2.000000   313      1      0        0       0        2
## 2371 16.25     1 1.284064   859      0      0        1       0        0
## 2423 15.67    -1 1.000000   313      1      1        1       1        0
## 2439 16.33     0 2.000000   156      1      0        0       0        4
## 2448 16.08     0 2.000000   859      0      0        1       2        4
## 2490 16.25     0 2.000000   156      1      0        0       1        0
## 2581 16.08    -1 1.000000   313      0      0        0       1       10
## 2583 16.33     0 1.000000   313      0      1        0       2       10
## 2599 15.92    -1 1.000000   313      1      0        1       0       10
## 2613 16.17     0 1.000000   313      1      0        0       0        0
## 2631 15.92    -1 1.000000   313      1      1        0       1       10
## 2665 15.42     0 1.000000   313      0      1        0       2        0
## 2689 15.33     0 3.000000   859      0      0        0       2        0
## 2704 15.42    -1 1.000000   313      0      1        1       1        0
## 2730 16.08     0 1.000000   313      0      1        0       1       10
## 2843 16.25     0 1.000000   313      1      0        0       0        0
## 2879 15.50    -1 1.000000   313      1      0        1       2        0
## 2885 16.17     0 1.000000   313      0      1        1       1        0
## 2914 16.08     0 2.000000   859      0      0        0       2       10
## 2998 15.50    -1 3.000000   859      1      0        0       0        5
## 3117 15.50    -1 1.000000   859      0      0        0       2        2
## 3124 16.08     0 3.000000   859      0      0        1       1        0
## 3131 15.75     0 1.000000   859      1      0        1       1       10
## 3272 15.92     0 1.284064   313      1      1        0       0       10
## 3299 16.17     1 2.000000   156      0      1        1       1       10
## 3415 15.67     0 3.000000   859      0      0        0       0        5
## 3440 16.33     0 3.000000   156      1      0        1       0        6
## 3453 16.25    -1 1.000000   313      0      0        0       2       10
## 3488 15.58     0 1.000000   313      0      1        0       1        0
## 3507 15.50    -1 1.000000   313      1      0        0       2        8
## 3517 15.58     0 1.000000   313      1      0        1       0       10
## 3776 15.92    -1 1.000000   313      1      1        1       2        1
## 3799 16.00     1 3.000000   859      0      0        0       1       10
## 3887 15.83     1 3.000000   156      0      1        0       1        8
## 3986 16.17     0 1.000000   313      1      1        0       2        0
## 4001 16.17     0 1.000000   313      0      1        1       2       10
## 4054 16.00     0 1.000000   859      0      0        0       1        0
## 4067 15.92    -1 1.000000   313      1      0        0       1       10
## 4083 16.08     0 2.000000   313      1      1        1       1        8
## 4088 16.33     1 1.000000   313      1      0        1       0        1
## 4147 16.33     0 1.000000   313      0      1        1       2        2
## 4197 15.50     0 1.000000   313      1      1        0       2        8
## 4460 15.92     0 3.000000   859      0      0        0       0        2
## 4492 16.08     0 3.000000   859      0      0        0       1        5
## 4504 16.00     0 1.000000   313      1      1        1       0        1
## 4522 15.58    -1 3.000000   156      0      1        0       2        4
##      STUDYHMW WORKPAY
## 129         3       0
## 206         8       0
## 218         0       0
## 226         2       3
## 249         5       0
## 336         2       0
## 400        10       0
## 426        10       0
## 537         5       7
## 659        10       7
## 669         5       0
## 830         9       9
## 908        10       0
## 967         8       0
## 1010        4       0
## 1013       10       0
## 1065       10       5
## 1366        6      10
## 1470        2       0
## 1576        4       0
## 1641        8       1
## 1724        8       0
## 1749        4       2
## 1756        8      10
## 1764       10      10
## 1844       10      10
## 1970       10      10
## 1987        5       0
## 2004        3      10
## 2084       10      10
## 2156       10       0
## 2196        5       0
## 2241        3       0
## 2287        6       0
## 2371        6       6
## 2423        7       0
## 2439       10       0
## 2448        2       8
## 2490        4       4
## 2581       10      10
## 2583        6       2
## 2599        0       0
## 2613       10       0
## 2631       10       0
## 2665        4       0
## 2689        3       0
## 2704        0       0
## 2730        0       0
## 2843        0       8
## 2879        6       0
## 2885        7       9
## 2914        3       0
## 2998        5       0
## 3117        4       3
## 3124        8       0
## 3131        8       4
## 3272        4       8
## 3299        6       0
## 3415        6       0
## 3440       10       0
## 3453       10       0
## 3488        0      10
## 3507        8       6
## 3517        0      10
## 3776        0      10
## 3799       10       6
## 3887        4       4
## 3986        6       3
## 4001        6       6
## 4054        0      10
## 4067       10      10
## 4083        4       0
## 4088        3       2
## 4147        2       0
## 4197        3       2
## 4460        4       0
## 4492        0       0
## 4504        1       1
## 4522        4       0