Veri Temizleme-2
TIMSS 2023 öğrenci anketinden elde edilen değişkenlerden bazılarından oluşan veri seti kullanılmıştır.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
## Warning: package 'haven' was built under R version 4.4.3
# Read the SPSS student file and strip its variable labels in one step,
# then preview the first rows.
ogrenci <- expss::drop_var_labs(
  read_sav(file = "bsgturm8.sav")
)
head(ogrenci, n = 6L)
## # A tibble: 6 × 19
## BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D BSBM19E BSBM19F BSBM19G BSBM19H BSBM19I
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1 2 2 1 1 1 1 1 1
## 2 1 2 1 1 2 4 4 4 4 4
## 3 2 2 3 1 2 3 4 2 4 3
## 4 1 2 1 1 2 4 4 4 4 4
## 5 2 1 2 4 1 1 1 1 2 1
## 6 1 2 1 1 3 4 4 4 4 4
## # ℹ 9 more variables: BSBM22A <dbl>, BSBM22B <dbl>, BSBM22C <dbl>,
## # BSBM22D <dbl>, BSBM22E <dbl>, BSBM22F <dbl>, BSBM22G <dbl>, BSBM22H <dbl>,
## # BSMMAT01 <dbl>
Normallik
Değişkenlerin normalliği istatistiksel veya grafiksel yöntemlerle değerlendirilir. Normalliğin iki unsuru çarpıklık ve basıklıktır.
Eksik Verilerin Düzenlenmesi
# Replace missing BSBM22F values with the mean of the observed values,
# then drop every row that still has a missing value in any other column.
ogrenci$BSBM22F <- ifelse(
  is.na(ogrenci$BSBM22F),
  mean(ogrenci$BSBM22F, na.rm = TRUE),
  ogrenci$BSBM22F
)
ogrenci <- na.omit(ogrenci)
summary(ogrenci)
## BSBG01 BSBM19A BSBM19B BSBM19C
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.498 Mean :2.187 Mean :2.434 Mean :2.462
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :2.000 Max. :4.000 Max. :4.000 Max. :4.000
## BSBM19D BSBM19E BSBM19F BSBM19G
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :2.074 Mean :2.304 Mean :2.425 Mean :2.425
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## BSBM19H BSBM19I BSBM22A BSBM22B
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :2.000 Median :2.000
## Mean :2.924 Mean :2.675 Mean :2.404 Mean :2.513
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## BSBM22C BSBM22D BSBM22E BSBM22F
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :3.000 Median :3.000 Median :3.000
## Mean :2.408 Mean :2.687 Mean :2.848 Mean :2.649
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## BSBM22G BSBM22H BSMMAT01
## Min. :1.000 Min. :1.000 Min. :221.6
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:425.4
## Median :2.000 Median :2.000 Median :507.9
## Mean :2.284 Mean :2.197 Mean :508.8
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:590.2
## Max. :4.000 Max. :4.000 Max. :851.1
# Small worked example of mean imputation: the NA is replaced by the mean
# of the four observed values.
x <- c(1, 2, 3, NA, 5)
ifelse(test = is.na(x), yes = mean(x, na.rm = TRUE), no = x)
## [1] 1.00 2.00 3.00 2.75 5.00
Kategorik Değişken İçin Uç Değerler
library(summarytools)
## Warning: package 'summarytools' was built under R version 4.4.3
# Frequency table for BSBM19A, rendered in rmarkdown style with
# percentages rounded to two digits and the NA row suppressed.
summarytools::freq(
  ogrenci$BSBM19A,
  style = "rmarkdown",
  report.nas = FALSE,
  round.digits = 2
)
## setting plain.ascii to FALSE
## ### Frequencies
## #### ogrenci$BSBM19A
## **Type:** Numeric
##
## | | Freq | % | % Cum. |
## |----------:|-----:|-------:|-------:|
## | **1** | 1414 | 32.39 | 32.39 |
## | **2** | 1481 | 33.92 | 66.31 |
## | **3** | 711 | 16.28 | 82.59 |
## | **4** | 760 | 17.41 | 100.00 |
## | **Total** | 4366 | 100.00 | 100.00 |
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
# Frequency table for BSBM19B, formatted as a markdown table via kable().
kable(
  freq(ogrenci$BSBM19B, report.nas = FALSE),
  format = "markdown",
  caption = "Frekans Tablosu",
  digits = 2
)
| | Freq | % Valid | % Valid Cum. | % Total | % Total Cum. |
|---|---|---|---|---|---|
| 1 | 1395 | 31.95 | 31.95 | 31.95 | 31.95 |
| 2 | 935 | 21.42 | 53.37 | 21.42 | 53.37 |
| 3 | 780 | 17.87 | 71.23 | 17.87 | 71.23 |
| 4 | 1256 | 28.77 | 100.00 | 28.77 | 100.00 |
| <NA> | 0 | NA | NA | 0.00 | 100.00 |
| Total | 4366 | 100.00 | 100.00 | 100.00 | 100.00 |
Sürekli Değişkenlerde Uç Değerler
library(outliers)
# Standardise the first six columns to z-scores and round to two decimals.
# NOTE(review): per the printed header these are BSBG01 and BSBM19A-BSBM19E;
# the achievement score BSMMAT01 is not among them - confirm this is intended.
z.scores <- round(
  scores(select(ogrenci, 1:6), type = "z"),
  digits = 2
)
head(z.scores)
## BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D BSBM19E
## 1 1 -1.11 -0.36 -0.40 -0.99 -1.14
## 2 -1 -0.17 -1.19 -1.28 -0.07 1.49
## 3 1 -0.17 0.47 -1.28 -0.07 0.61
## 4 1 -1.11 -0.36 1.34 -0.99 -1.14
## 5 -1 -0.17 -1.19 -1.28 0.86 1.49
## 6 1 -1.11 1.30 1.34 -0.99 -1.14
# Minimum and maximum z-score per variable, one variable per row,
# printed without the summarytools heading block.
summarytools::descr(
  x = z.scores,
  headings = FALSE,
  transpose = TRUE,
  stats = c("min", "max")
)
##
## Min Max
## ------------- ------- ------
## BSBG01 -1.00 1.00
## BSBM19A -1.11 1.69
## BSBM19B -1.19 1.30
## BSBM19C -1.28 1.34
## BSBM19D -0.99 1.78
## BSBM19E -1.14 1.49
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
# Interactive, sortable table of the z-scores.
z.scores %>% DT::datatable()
BSBM22F Maddesinin İncelenmesi
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Histogram of BSBM22F with 30 bins and a dark-blue fill.
ggplot(data = ogrenci, mapping = aes(x = BSBM22F)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L)
# library(ggpmisc)
# Histogram of BSBM22F with a dashed red line at the sample mean.
# The mean is computed from the data instead of being hard-coded, so the
# line and its label always agree with each other and with the plotted
# variable (the previous constants 7.914 / 7.913 lay outside BSBM22F's range).
bsbm22f_ort <- mean(ogrenci$BSBM22F, na.rm = TRUE)
ggplot(ogrenci, aes(x = BSBM22F)) +
  geom_histogram() +
  geom_vline(xintercept = bsbm22f_ort, color = "red", linetype = "dashed") +
  # Inf anchors the label to the top-right corner of the panel, so the
  # annotation no longer stretches the axes beyond the data.
  annotate("text", label = sprintf("Ort = %.3f", bsbm22f_ort),
           x = Inf, y = Inf, hjust = 1.1, vjust = 1.5, color = "black")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
Yoğunluk Grafiği
# Density-scaled histogram of BSBM22F with a kernel density curve on top.
# after_stat(density) replaces the deprecated ..density.. dot-dot notation.
ggplot(ogrenci, aes(x = BSBM22F)) +
  geom_histogram(aes(y = after_stat(density))) +
  geom_density(alpha = 0.5, fill = "#0c4c8a") +
  theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
İnteraktif Grafik
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive histogram of BSBM22F; bar heights are normalised to
# probabilities instead of raw counts.
plot_ly(
  type = "histogram",
  histnorm = "probability",
  x = ogrenci$BSBM22F
)
Kutu Grafiği
# Vertical box plot of BSBM22F.
ogrenci %>%
  ggplot(mapping = aes(y = BSBM22F)) +
  geom_boxplot()
Uç Değerler
# Values of BSMMAT01 that boxplot.stats() reports as outliers.
out <- boxplot.stats(x = ogrenci$BSMMAT01)[["out"]]
out
## [1] 851.0999 837.9938
# Row positions of those outlying values within ogrenci.
out_ind <- which(is.element(ogrenci$BSMMAT01, out))
out_ind
## [1] 463 1076
İnteraktif Kutu Grafiği
# Plain interactive box plot of BSMMAT01.
plot_ly(type = "box", y = ogrenci$BSMMAT01)

# Same box plot, with each outlier labelled by its row index (out_ind)
# just to the left of the point.
plotly::layout(
  plot_ly(type = "box", y = ogrenci$BSMMAT01),
  title = "Box Plot",
  annotations = list(
    x = -0.01,
    y = boxplot.stats(ogrenci$BSMMAT01)$out,
    text = as.character(out_ind),
    showarrow = FALSE,
    xanchor = "right"
  )
)
Mahalanobis Uzaklığı
Çoklu regresyon modelinde kullanılan bağımsız değişkenler uzayındaki merkezden veya örneklem ortalamasından, tek bir veri noktasının uzaklığını ölçen istatistiktir. Tek değişkenli uzaklık belirleme teknikleriyle yakın ilişkili olmasının yanı sıra, çoklu normal verilerde ve geniş örneklemlerde, değişken sayısına bağlı olarak elde edilen serbestlik derecesi ile ki-kare dağılımına uyar ve örneklem sayısı arttıkça da bu kestirim daha güçlü hale gelir (Johnson ve Wichern, 2002).
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:outliers':
##
## outlier
# Squared Mahalanobis distance of every row from the column means,
# using the first five columns of ogrenci and their sample covariance.
veri <- ogrenci[, seq_len(5)]
md <- mahalanobis(
  x = veri,
  center = colMeans(veri),
  cov = cov(veri)
)
head(md, n = 20)
## [1] 4.112136 4.100392 6.174371 7.312989 5.048895 3.137728 3.388253 3.137728
## [9] 3.750581 3.281783 4.251374 4.237090 4.510729 4.112136 3.664009 4.031108
## [17] 3.452970 4.433120 1.757408 3.144806
library(psych)
# Chi-square critical value at the 1 - alpha quantile, with one degree of
# freedom per column of veri.
alpha <- 0.001
cutoff <- qchisq(p = 1 - alpha, df = ncol(veri))
cutoff
## [1] 20.51501
# Rows whose Mahalanobis distance exceeds the critical value.
ucdegerler <- which(cutoff < md)
veri[ucdegerler, ]
## # A tibble: 10 × 5
## BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4 1 4 1
## 2 2 4 4 1 1
## 3 1 4 4 1 1
## 4 2 4 4 1 1
## 5 2 4 4 1 1
## 6 2 4 4 1 1
## 7 2 1 1 4 4
## 8 2 4 4 1 1
## 9 2 4 4 1 1
## 10 2 4 4 1 1
# Keep every row that is NOT flagged as a multivariate outlier.
# A logical mask is used instead of veri[-ucdegerler, ]: when no row is
# flagged, ucdegerler is integer(0) and negative indexing with it would
# return zero rows instead of the full data set.
data_temiz <- veri[!(seq_len(nrow(veri)) %in% ucdegerler), ]
# Show the flagged rows again for reference.
veri[ucdegerler, ]
## # A tibble: 10 × 5
## BSBG01 BSBM19A BSBM19B BSBM19C BSBM19D
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4 1 4 1
## 2 2 4 4 1 1
## 3 1 4 4 1 1
## 4 2 4 4 1 1
## 5 2 4 4 1 1
## 6 2 4 4 1 1
## 7 2 1 1 4 4
## 8 2 4 4 1 1
## 9 2 4 4 1 1
## 10 2 4 4 1 1
Çok Değişkenli Normallik
library(sur)
## Warning: package 'sur' was built under R version 4.4.3
##
## Attaching package: 'sur'
## The following object is masked from 'package:psych':
##
## skew
# Pull the column out explicitly instead of attach(ogrenci): attach() puts
# every column on the search path, where names can be silently masked and
# go stale if ogrenci changes. The bare name BSBM22F stays available for
# the calls below and for later code that refers to it.
BSBM22F <- ogrenci$BSBM22F
# Skewness of BSBM22F.
skew(BSBM22F)
## [1] -0.141198
# Standard error of the skewness.
se.skew(BSBM22F)
## [1] 0.03705823
# Skewness divided by its standard error ...
skew.ratio(BSBM22F)
## [1] -3.810165
# ... which is the same ratio computed by hand.
skew(BSBM22F) / se.skew(BSBM22F)
## [1] -3.810165
Jarque-Bera Testi
library(moments)
library(labelled)
## Warning: package 'labelled' was built under R version 4.4.3
# Jarque-Bera normality test on BSBM22F after passing it through
# remove_labels(). BSBM22F is referenced as a bare name, so it must already
# be visible from the global environment or the search path at this point.
jarque.test(remove_labels(BSBM22F))
##
## Jarque-Bera Normality Test
##
## data: remove_labels(BSBM22F)
## JB = 347.02, p-value < 2.2e-16
## alternative hypothesis: greater
# Reproducible toy samples: 200 standard-normal draws followed by
# 200 exponential draws (rate 3). The draw order matters for the seed.
set.seed(0)
normal <- rnorm(n = 200)
non_normal <- rexp(n = 200, rate = 3)

# Side-by-side histograms of the two samples.
par(mfrow = c(1, 2))
hist(x = normal, main = "Normal", col = "steelblue")
hist(x = non_normal, main = "Non-normal", col = "steelblue")

# Side-by-side normal Q-Q plots with reference lines.
par(mfrow = c(1, 2))
qqnorm(y = normal, main = "Normal")
qqline(y = normal)
qqnorm(y = non_normal, main = "Non-normal")
qqline(y = non_normal)
# Normal Q-Q plot of BSBM22F with its reference line.
ggplot(ogrenci, mapping = aes(sample = BSBM22F)) +
  stat_qq() +
  stat_qq_line()