Bu ödevde, TIMSS 2015 uygulamasına ait bir kitapçığın Türkiye ve Amerika verilerini kullanacaksınız.
# Load the TIMSS 2015 booklet data (Turkey and USA) from the local RDS file.
TRUSA <- readRDS(file = "C:/R/Rders2/TRUSA.RDS")
# Report whether the raw data contain any missing value at all.
anyNA(TRUSA)
## [1] FALSE
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Total score per row: row-wise sum over every column whose name starts
# with "M0". The result is appended as a new column called `sum`.
TRUSA_1 <- mutate(TRUSA, sum = rowSums(across(starts_with("M0"))))

# Descriptive statistics of the total score, one output row per value of CNT.
betimsel_istatistik <- summarise(
  group_by(TRUSA_1, CNT),
  Gozlem_Sayisi = n(),
  Ortalama = mean(sum, na.rm = TRUE),
  Standart_Sapma = sd(sum, na.rm = TRUE),
  Min = min(sum, na.rm = TRUE),
  Medyan = median(sum, na.rm = TRUE),
  Maks = max(sum, na.rm = TRUE)
)
print(betimsel_istatistik)
## # A tibble: 2 × 7
## CNT Gozlem_Sayisi Ortalama Standart_Sapma Min Medyan Maks
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TUR 435 13.5 7.57 2 11 32
## 2 USA 716 17.0 7.53 1 17 34
# Total scores of the complete data, split by country code.
turkiye <- pull(filter(TRUSA_1, CNT == "TUR"), sum)
abd <- pull(filter(TRUSA_1, CNT == "USA"), sum)
# Two-sample t-test with t.test() defaults (Welch, two-sided).
t.test(turkiye, abd)
##
## Welch Two Sample t-test
##
## data: turkiye and abd
## t = -7.8242, df = 912.31, p-value = 1.41e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.494510 -2.691921
## sample estimates:
## mean of x mean of y
## 13.45287 17.04609
#' Randomly replace a percentage of a data frame's cells with NA
#'
#' Cells are drawn uniformly from all columns of `data`. Cell positions are
#' sampled WITHOUT replacement, so exactly
#' `round(nrow(data) * ncol(data) * percent / 100)` distinct cells become NA.
#' (Sampling row and column indices independently with replacement can pick
#' the same cell more than once and then yields fewer NAs than requested.)
#'
#' @param data A data frame (or tibble).
#' @param percent Percentage of cells to set to NA; a single number in
#'   the range 0 to 100.
#' @return `data` with the selected cells replaced by NA; dimensions and
#'   column names are unchanged.
make_missing <- function(data, percent) {
  stopifnot(
    is.numeric(percent), length(percent) == 1L,
    !is.na(percent), percent >= 0, percent <= 100
  )

  # Total number of cells in the data frame
  n_rows <- nrow(data)
  n <- n_rows * ncol(data)
  # Number of cells that will become missing
  num_missing <- round(n * percent / 100)

  # Draw distinct linear (column-major) cell positions, then translate each
  # position back into its row and column index
  cells <- sample.int(n, size = num_missing)
  row_indices <- (cells - 1) %% n_rows + 1
  col_indices <- (cells - 1) %/% n_rows + 1

  # Assign NA once per affected column instead of once per cell
  rows_by_col <- split(row_indices, col_indices)
  for (col in names(rows_by_col)) {
    data[[as.integer(col)]][rows_by_col[[col]]] <- NA
  }
  data
}
# Fix the RNG seed so the randomly generated missing-data sets, and every
# result derived from them below, are reproducible from run to run.
set.seed(2015)
# Copies of the scored data with 5 %, 10 % and 15 % of the cells set to NA.
M1 <- make_missing(TRUSA_1, 5)
M2 <- make_missing(TRUSA_1, 10)
M3 <- make_missing(TRUSA_1, 15)
library(knitr)
# Number of non-missing cells in `data` (note: observed cells, not all cells).
toplam_hucre_sayisi <- function(data) {
  sum(!is.na(data))
}

# One row per missing-data condition: observed cell count and NA cell count.
eksik_veri_tablosu <- data.frame(
  Veri_Durumu = c("%5 Eksik Veri", "%10 Eksik Veri", "%15 Eksik Veri"),
  Toplam_Veriler = vapply(list(M1, M2, M3), toplam_hucre_sayisi, integer(1)),
  Eksik_Veri_Sayisi = vapply(
    list(M1, M2, M3),
    function(d) sum(is.na(d)),
    integer(1)
  )
)
kable(eksik_veri_tablosu, digits = 0, caption = "Eksik Veri Tablosu")
| Veri_Durumu | Toplam_Veriler | Eksik_Veri_Sayisi |
|---|---|---|
| %5 Eksik Veri | 42720 | 2169 |
| %10 Eksik Veri | 40621 | 4268 |
| %15 Eksik Veri | 38618 | 6271 |
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.2
# Drop the first three columns (by position) before running the MCAR test.
# NOTE(review): `sum` was appended as the last column by mutate(), so it is
# presumably still part of the tested data -- confirm this is intended.
M1_1 <- M1[, -(1:3)]
M2_1 <- M2[, -(1:3)]
M3_1 <- M3[, -(1:3)]

# naniar::mcar_test() (Little's MCAR test) for each missing-data condition.
mcar5 <- mcar_test(M1_1)
## Warning in norm::prelim.norm(data): NAs introduced by coercion to integer range
mcar10 <- mcar_test(M2_1)
## Warning in norm::prelim.norm(data): NAs introduced by coercion to integer range
mcar15 <- mcar_test(M3_1)
## Warning in norm::prelim.norm(data): NAs introduced by coercion to integer range

# Collect the p-value and the number of missing-data patterns of each test.
mcar_sonuclari <- data.frame(
  Eksiklik = c("%5 Eksik", "%10 Eksik", "%15 Eksik"),
  p_degeri = c(
    mcar5[["p.value"]],
    mcar10[["p.value"]],
    mcar15[["p.value"]]
  ),
  Missing_patterns = c(
    mcar5[["missing.patterns"]],
    mcar10[["missing.patterns"]],
    mcar15[["missing.patterns"]]
  )
)
kable(mcar_sonuclari, digits = 2, caption = "MCAR Test Sonuçları")
| Eksiklik | p_degeri | Missing_patterns |
|---|---|---|
| %5 Eksik | 1 | 558 |
| %10 Eksik | 1 | 1002 |
| %15 Eksik | 1 | 1132 |
# Listwise deletion: keep only the rows that have no NA in any column.
M1_temiz <- na.omit(object = M1)
M2_temiz <- na.omit(object = M2)
M3_temiz <- na.omit(object = M3)

# Country comparison of the total score on the 5 % listwise-deleted data.
turkiye_1 <- pull(filter(M1_temiz, CNT == "TUR"), sum)
abd_1 <- pull(filter(M1_temiz, CNT == "USA"), sum)
t.test(turkiye_1, abd_1)
##
## Welch Two Sample t-test
##
## data: turkiye_1 and abd_1
## t = -1.5922, df = 134.51, p-value = 0.1137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.4360540 0.4790648
## sample estimates:
## mean of x mean of y
## 14.00000 15.97849
# Country comparison of the total score on the 10 % listwise-deleted data.
turkiye_2 <- pull(filter(M2_temiz, CNT == "TUR"), sum)
abd_2 <- pull(filter(M2_temiz, CNT == "USA"), sum)
t.test(turkiye_2, abd_2)
##
## Welch Two Sample t-test
##
## data: turkiye_2 and abd_2
## t = -1.0356, df = 8.11, p-value = 0.3303
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.992489 4.167928
## sample estimates:
## mean of x mean of y
## 13.16667 16.57895
# Total scores per country on the 15 % listwise-deleted data.
turkiye_3 <- pull(filter(M3_temiz, CNT == "TUR"), sum)
abd_3 <- pull(filter(M3_temiz, CNT == "USA"), sum)
# t.test(turkiye_3, abd_3) is left disabled: only USA rows remain after
# listwise deletion, so the t-test cannot be applied.
# Mean imputation: every NA in a numeric column is replaced by the mean of
# that column's observed values. Each imputed data set must be derived from
# its OWN missing-data set (M1 -> 5 %, M2 -> 10 %, M3 -> 15 %); building all
# three from M1 would make the three comparisons identical.
# NOTE(review): `sum` is itself a numeric column, so its NAs are filled with
# the mean of `sum` rather than recomputed from the imputed items -- confirm
# this is the intended design.
M1_imputed <- M1 %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
M2_imputed <- M2 %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
M3_imputed <- M3 %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
# Reference comparison on the complete data; result kept for later use.
turkiye <- pull(filter(TRUSA_1, CNT == "TUR"), sum)
abd <- pull(filter(TRUSA_1, CNT == "USA"), sum)
t_test_veri <- t.test(turkiye, abd)

# Same comparison on the mean-imputed 5 % data set.
turkiye_1_imputed <- pull(filter(M1_imputed, CNT == "TUR"), sum)
abd_1_imputed <- pull(filter(M1_imputed, CNT == "USA"), sum)
t_test_M1 <- t.test(turkiye_1_imputed, abd_1_imputed)

# Same comparison on the mean-imputed 10 % data set.
turkiye_2_imputed <- pull(filter(M2_imputed, CNT == "TUR"), sum)
abd_2_imputed <- pull(filter(M2_imputed, CNT == "USA"), sum)
t_test_M2 <- t.test(turkiye_2_imputed, abd_2_imputed)

# Same comparison on the mean-imputed 15 % data set.
turkiye_3_imputed <- pull(filter(M3_imputed, CNT == "TUR"), sum)
abd_3_imputed <- pull(filter(M3_imputed, CNT == "USA"), sum)
t_test_M3 <- t.test(turkiye_3_imputed, abd_3_imputed)
Süre: 2 saat 10 dakika