Veri Temizleme-2

Bu çalışmada, TIMSS 2023 öğrenci anketindeki değişkenlerin bir bölümünden oluşan veri seti kullanılmıştır.

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
## Warning: package 'haven' was built under R version 4.4.3
# Import the SPSS file and drop its variable labels in a single pipeline, then
# preview the first rows.
ogrenci <- read_sav("bsgturm8.sav") %>%
  expss::drop_var_labs()
head(ogrenci)
## # A tibble: 6 × 19
##   BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D BSBM19E BSBM19F BSBM19G BSBM19H BSBM19I
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1      2       1       2       2       1       1       1       1       1       1
## 2      1       2       1       1       2       4       4       4       4       4
## 3      2       2       3       1       2       3       4       2       4       3
## 4      1       2       1       1       2       4       4       4       4       4
## 5      2       1       2       4       1       1       1       1       2       1
## 6      1       2       1       1       3       4       4       4       4       4
## # ℹ 9 more variables: BSBM22A <dbl>, BSBM22B <dbl>, BSBM22C <dbl>,
## #   BSBM22D <dbl>, BSBM22E <dbl>, BSBM22F <dbl>, BSBM22G <dbl>, BSBM22H <dbl>,
## #   BSMMAT01 <dbl>

Normallik

Değişkenlerin normalliği istatistiksel veya grafiksel yöntemlerle değerlendirilir. Normalliğin iki unsuru çarpıklık ve basıklıktır.

Eksik Verilerin Düzenlenmesi

# Fill missing BSBM22F responses with the mean of the observed responses, then
# drop every row that still has an NA in any column before summarising.
ogrenci <- ogrenci %>%
  mutate(
    BSBM22F = ifelse(
      is.na(BSBM22F),
      mean(BSBM22F, na.rm = TRUE),
      BSBM22F
    )
  ) %>%
  na.omit()
summary(ogrenci)
##      BSBG01         BSBM19A         BSBM19B         BSBM19C     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :2.000   Median :2.000   Median :2.000  
##  Mean   :1.498   Mean   :2.187   Mean   :2.434   Mean   :2.462  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :2.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##     BSBM19D         BSBM19E         BSBM19F         BSBM19G     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :2.000   Median :2.000  
##  Mean   :2.074   Mean   :2.304   Mean   :2.425   Mean   :2.425  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##     BSBM19H         BSBM19I         BSBM22A         BSBM22B     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :2.000   Median :2.000  
##  Mean   :2.924   Mean   :2.675   Mean   :2.404   Mean   :2.513  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##     BSBM22C         BSBM22D         BSBM22E         BSBM22F     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :3.000   Median :3.000   Median :3.000  
##  Mean   :2.408   Mean   :2.687   Mean   :2.848   Mean   :2.649  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##     BSBM22G         BSBM22H         BSMMAT01    
##  Min.   :1.000   Min.   :1.000   Min.   :221.6  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:425.4  
##  Median :2.000   Median :2.000   Median :507.9  
##  Mean   :2.284   Mean   :2.197   Mean   :508.8  
##  3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:590.2  
##  Max.   :4.000   Max.   :4.000   Max.   :851.1
# Toy example of mean imputation: the NA is replaced by the mean of the
# observed values, (1 + 2 + 3 + 5) / 4 = 2.75.
x <- c(1, 2, 3, NA, 5)
replace(x, is.na(x), mean(x, na.rm = TRUE))
## [1] 1.00 2.00 3.00 2.75 5.00

Kategorik Değişken İçin Uç Değerler

library(summarytools)
## Warning: package 'summarytools' was built under R version 4.4.3
# Frequency table for item BSBM19A in "rmarkdown" style, rounded to two
# decimals, with missing-value reporting switched off.
freq(
  ogrenci$BSBM19A,
  round.digits = 2,
  report.nas   = FALSE,
  style        = "rmarkdown"
)
## setting plain.ascii to FALSE
## ### Frequencies  
## #### ogrenci$BSBM19A  
## **Type:** Numeric  
## 
## |    &nbsp; | Freq |      % | % Cum. |
## |----------:|-----:|-------:|-------:|
## |     **1** | 1414 |  32.39 |  32.39 |
## |     **2** | 1481 |  33.92 |  66.31 |
## |     **3** |  711 |  16.28 |  82.59 |
## |     **4** |  760 |  17.41 | 100.00 |
## | **Total** | 4366 | 100.00 | 100.00 |
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
# Frequency table for item BSBM19B, passed on to kable() for markdown output
# with a caption and two-decimal rounding.
freq(ogrenci$BSBM19B, report.nas = FALSE) %>%
  kable(
    format  = "markdown",
    caption = "Frekans Tablosu",
    digits  = 2
  )
Frekans Tablosu
Freq % Valid % Valid Cum. % Total % Total Cum.
1 1395 31.95 31.95 31.95 31.95
2 935 21.42 53.37 21.42 53.37
3 780 17.87 71.23 17.87 71.23
4 1256 28.77 100.00 28.77 100.00
0 NA NA 0.00 100.00
Total 4366 100.00 100.00 100.00 100.00

Sürekli Değişkenlerde Uç Değerler

library(outliers)
# Convert the first six columns to z-scores with outliers::scores(), round
# them to two decimals, and preview the first rows.
z.scores <- ogrenci %>%
  select(1:6) %>%
  scores(type = "z") %>%
  round(digits = 2)
head(z.scores)
##   BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D BSBM19E
## 1      1   -1.11   -0.36   -0.40   -0.99   -1.14
## 2     -1   -0.17   -1.19   -1.28   -0.07    1.49
## 3      1   -0.17    0.47   -1.28   -0.07    0.61
## 4      1   -1.11   -0.36    1.34   -0.99   -1.14
## 5     -1   -0.17   -1.19   -1.28    0.86    1.49
## 6      1   -1.11    1.30    1.34   -0.99   -1.14
# Minimum and maximum z-score of each variable, transposed so that every
# variable occupies one row, printed without the heading section.
summarytools::descr(
  z.scores,
  stats = c("min", "max"),
  transpose = TRUE,
  headings = FALSE
)
## 
##                   Min    Max
## ------------- ------- ------
##        BSBG01   -1.00   1.00
##       BSBM19A   -1.11   1.69
##       BSBM19B   -1.19   1.30
##       BSBM19C   -1.28   1.34
##       BSBM19D   -0.99   1.78
##       BSBM19E   -1.14   1.49
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
# Render the rounded z-score table with DT::datatable().
DT::datatable(z.scores)

BSBM22F Maddesinin İncelenmesi

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Histogram of item BSBM22F drawn with 30 bins.
ggplot(data = ogrenci, mapping = aes(x = BSBM22F)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L)

# library(ggpmisc)
# Histogram of BSBM22F with a dashed red reference line at the item mean.
# The mean is computed from the data instead of being hard-coded, so the line
# and its label always agree with each other and with the plotted variable,
# and the label sits next to the line rather than far outside the data range.
bsbm22f_ort <- mean(ogrenci$BSBM22F, na.rm = TRUE)
ggplot(ogrenci, aes(x = BSBM22F)) +
  geom_histogram() +
  geom_vline(xintercept = bsbm22f_ort, color = "red", linetype = "dashed") +
  annotate(
    "text",
    label = paste("Ort =", round(bsbm22f_ort, 3)),
    x = bsbm22f_ort, y = 100,
    hjust = -0.1, # shift the label just to the right of the dashed line
    color = "black"
  )
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Yoğunluk Grafiği

# Density-scaled histogram of BSBM22F with a kernel density curve drawn on
# top. after_stat(density) replaces the `..density..` notation, which has been
# deprecated since ggplot2 3.4.0.
ggplot(ogrenci, aes(x = BSBM22F)) +
  geom_histogram(aes(y = after_stat(density))) +
  geom_density(alpha = .5, fill = "#0c4c8a") +
  theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

İnteraktif Grafik

library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Plotly histogram of BSBM22F with bar heights normalised to probabilities.
plot_ly(
  x        = ogrenci$BSBM22F,
  type     = "histogram",
  histnorm = "probability"
)

Kutu Grafiği

# Box plot of BSBM22F on the vertical axis.
ggplot(data = ogrenci, mapping = aes(y = BSBM22F)) +
  geom_boxplot()

Uç Değerler

# Values of BSMMAT01 that boxplot.stats() reports as outliers.
out <- boxplot.stats(ogrenci$BSMMAT01)$out
out
## [1] 851.0999 837.9938
# Row positions at which those values occur in the data.
out_ind <- which(ogrenci$BSMMAT01 %in% out)
out_ind
## [1]  463 1076

İnteraktif Kutu Grafiği

# Plain box plot of BSMMAT01.
plot_ly(y = ogrenci$BSMMAT01, type = "box")
# The same plot with a title and text annotations placed at the outlier
# values; the annotation text is the row positions stored in out_ind.
plot_ly(y = ogrenci$BSMMAT01, type = "box") %>%
  layout(
    title = "Box Plot",
    annotations = list(
      x = -0.01,
      y = boxplot.stats(ogrenci$BSMMAT01)$out,
      text = paste(out_ind),
      showarrow = FALSE,
      xanchor = "right"
    )
  )

Mahalanobis Uzaklığı

Mahalanobis uzaklığı, çoklu regresyon modelinde kullanılan bağımsız değişkenlerin oluşturduğu uzayda tek bir veri noktasının merkeze, yani örneklem ortalamalarına olan uzaklığını ölçen bir istatistiktir. Tek değişkenli uzaklık belirleme teknikleriyle yakından ilişkilidir. Çok değişkenli normal dağılım gösteren verilerde ve geniş örneklemlerde, uzaklığın karesi serbestlik derecesi değişken sayısına eşit olan ki-kare dağılımına uyar; örneklem büyüdükçe bu yaklaşım daha isabetli hale gelir (Johnson ve Wichern, 2002).

library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:outliers':
## 
##     outlier
# Squared Mahalanobis distance of every row from the column means, computed on
# the first five columns with their sample covariance matrix.
veri <- ogrenci[, 1:5]
md <- mahalanobis(x = veri, center = colMeans(veri), cov = cov(veri))
head(md, n = 20)
##  [1] 4.112136 4.100392 6.174371 7.312989 5.048895 3.137728 3.388253 3.137728
##  [9] 3.750581 3.281783 4.251374 4.237090 4.510729 4.112136 3.664009 4.031108
## [17] 3.452970 4.433120 1.757408 3.144806
library(psych)
# Chi-square critical value at the 0.001 significance level, with one degree
# of freedom per column of veri.
alpha <- 0.001
cutoff <- qchisq(p = 1 - alpha, df = ncol(veri))
cutoff
## [1] 20.51501
# Positions of the rows whose distance exceeds the cutoff, followed by a
# listing of those rows.
ucdegerler <- which(md > cutoff)
veri[ucdegerler, ]
## # A tibble: 10 × 5
##    BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1      1       4       1       4       1
##  2      2       4       4       1       1
##  3      1       4       4       1       1
##  4      2       4       4       1       1
##  5      2       4       4       1       1
##  6      2       4       4       1       1
##  7      2       1       1       4       4
##  8      2       4       4       1       1
##  9      2       4       4       1       1
## 10      2       4       4       1       1
# Drop the flagged rows. A logical mask is used instead of veri[-ucdegerler, ]
# because negative indexing with an empty index vector selects zero rows,
# which would discard the whole data set whenever no outlier is found.
data_temiz <- veri[!(seq_len(nrow(veri)) %in% ucdegerler), ]
# Show the rows that were removed.
veri[ucdegerler, ]
## # A tibble: 10 × 5
##    BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1      1       4       1       4       1
##  2      2       4       4       1       1
##  3      1       4       4       1       1
##  4      2       4       4       1       1
##  5      2       4       4       1       1
##  6      2       4       4       1       1
##  7      2       1       1       4       4
##  8      2       4       4       1       1
##  9      2       4       4       1       1
## 10      2       4       4       1       1

Çok Değişkenli Normallik

library(sur)
## Warning: package 'sur' was built under R version 4.4.3
## 
## Attaching package: 'sur'
## The following object is masked from 'package:psych':
## 
##     skew
# NOTE(review): attach() is kept so that any code relying on unqualified
# column names keeps working; the calls below name the column explicitly and
# therefore do not depend on the search path.
attach(ogrenci)

# Skewness of BSBM22F, its standard error, and the ratio of the two.
skew(ogrenci$BSBM22F)
## [1] -0.141198
se.skew(ogrenci$BSBM22F)
## [1] 0.03705823
skew.ratio(ogrenci$BSBM22F)
## [1] -3.810165
# The same ratio computed by hand from the two quantities above.
skew(ogrenci$BSBM22F) / se.skew(ogrenci$BSBM22F)
## [1] -3.810165

Jarque-Bera Testi

library(moments)
library(labelled)
## Warning: package 'labelled' was built under R version 4.4.3
# Jarque-Bera normality test on BSBM22F. The column is taken from ogrenci
# explicitly instead of relying on attach() having put it on the search path.
jarque.test(remove_labels(ogrenci$BSBM22F))
## 
##  Jarque-Bera Normality Test
## 
## data:  remove_labels(BSBM22F)
## JB = 347.02, p-value < 2.2e-16
## alternative hypothesis: greater
# Simulate one normal and one exponential (right-skewed) sample of size 200
# with a fixed seed so the figures are reproducible.
set.seed(0)
normal <- rnorm(200)
non_normal <- rexp(200, rate = 3)
# Draw the two histograms side by side, then restore the previous graphics
# settings so the 1 x 2 layout does not leak into later plots.
eski_par <- par(mfrow = c(1, 2))
hist(normal, col = "steelblue", main = "Normal")
hist(non_normal, col = "steelblue", main = "Non-normal")
par(eski_par)

# Normal Q-Q plots of both simulated samples side by side. The previous
# graphics settings are restored afterwards so the 1 x 2 layout does not leak
# into later plots.
eski_par <- par(mfrow = c(1, 2))
qqnorm(normal, main = "Normal")
qqline(normal)
qqnorm(non_normal, main = "Non-normal")
qqline(non_normal)
par(eski_par)

# Normal Q-Q plot of BSBM22F together with its reference line.
ggplot(ogrenci, mapping = aes(sample = BSBM22F)) +
  geom_qq() +
  geom_qq_line()