library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
#install.packages("readr")
library(readr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyr)
library(haven)
library(CTT)
##
## Attaching package: 'CTT'
## The following objects are masked from 'package:psych':
##
## polyserial, reliability
library(readxl)
veri <- readRDS("~/hacettepe/Rders/data/TRUSA.rds")
any(is.na(veri))
## [1] FALSE
sorular <- veri %>%select(starts_with("M0"))
veri <- veri %>% mutate(toplam = rowSums(sorular, na.rm = FALSE))
##d) Toplam puanın her iki ülkeye göre betimsel istatistiklerini hesaplayınız.
grup <- veri %>%
group_by(CNT) %>%
summarise(ortalama = round(mean(toplam),2),
standart_sapma = round(sd(toplam),2),
minimum = min(toplam),
maksimum = max(toplam),
frekans = frequency(toplam),
varyans = var(toplam)
)
kableExtra::kable(grup)
CNT | ortalama | standart_sapma | minimum | maksimum | frekans | varyans |
---|---|---|---|---|---|---|
TUR | 13.45 | 7.57 | 2 | 32 | 1 | 57.30826 |
USA | 17.05 | 7.53 | 1 | 34 | 1 | 56.67899 |
##e) Toplam puanın, Türkiye ve ABD örneklemlerinde farklılaşıp farklılaşmadığını t testi ile test ediniz.
t_test_tam <- t.test(toplam ~ CNT, data = veri, var.equal = TRUE)
##f) Veri setinde %5, %10 ve %15 oranında eksik veriler oluşturunuz.
# %5 Eksik Veri
# %5 Eksik Veri
set.seed(123) # Sonuçların tekrarlanabilir olması için
veri_5 <- veri
n_cells <- nrow(veri_5) * ncol(veri_5)
# Eksik veri oranını belirle
missing_5 <- sample(1:n_cells, size = 0.05 * n_cells, replace = FALSE)
# İndeksleri satır ve sütun şeklinde ayır
rows <- rep(1:nrow(veri_5), times = ncol(veri_5))
cols <- rep(1:ncol(veri_5), each = nrow(veri_5))
# Eksik verileri yerleştirme
veri_5[cbind(rows[missing_5], cols[missing_5])] <- NA
# %5 Eksik Veri
set.seed(123) # Sonuçların tekrarlanabilir olması için
veri_10 <- veri
n_cells <- nrow(veri_10) * ncol(veri_10)
# Eksik veri oranını belirle
missing_10 <- sample(1:n_cells, size = 0.05 * n_cells, replace = FALSE)
# İndeksleri satır ve sütun şeklinde ayır
rows <- rep(1:nrow(veri_10), times = ncol(veri_10))
cols <- rep(1:ncol(veri_10), each = nrow(veri_10))
# Eksik verileri yerleştirme
veri_10[cbind(rows[missing_10], cols[missing_10])] <- NA
# %5 Eksik Veri
set.seed(123) # Sonuçların tekrarlanabilir olması için
veri_15 <- veri
n_cells <- nrow(veri_15) * ncol(veri_15)
# Eksik veri oranını belirle
missing_15 <- sample(1:n_cells, size = 0.05 * n_cells, replace = FALSE)
# İndeksleri satır ve sütun şeklinde ayır
rows <- rep(1:nrow(veri_15), times = ncol(veri_15))
cols <- rep(1:ncol(veri_15), each = nrow(veri_15))
# Eksik verileri yerleştirme
veri_5[cbind(rows[missing_5], cols[missing_5])] <- NA
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
mcar_test_5 <- md.pattern(veri_5)
mcar_test_10 <- md.pattern(veri_10)
mcar_test_15 <- md.pattern(veri_15)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
veri_sil_5 <- na.omit(veri_5)
veri_sil_10 <- na.omit(veri_10)
veri_sil_15 <- na.omit(veri_15)
t_test_5 <- t.test(toplam ~ CNT, data = veri_sil_5, var.equal = TRUE)
t_test_10 <- t.test(toplam ~ CNT, data = veri_sil_10, var.equal = TRUE)
t_test_15 <- t.test(toplam ~ CNT, data = veri_sil_15, var.equal = TRUE)
t_test_5
##
## Two Sample t-test
##
## data: toplam by CNT
## t = -2.4339, df = 159, p-value = 0.01604
## alternative hypothesis: true difference in means between group TUR and group USA is not equal to 0
## 95 percent confidence interval:
## -5.8501494 -0.6089578
## sample estimates:
## mean in group TUR mean in group USA
## 14.62903 17.85859
t_test_10
##
## Two Sample t-test
##
## data: toplam by CNT
## t = -2.4339, df = 159, p-value = 0.01604
## alternative hypothesis: true difference in means between group TUR and group USA is not equal to 0
## 95 percent confidence interval:
## -5.8501494 -0.6089578
## sample estimates:
## mean in group TUR mean in group USA
## 14.62903 17.85859
t_test_15
##
## Two Sample t-test
##
## data: toplam by CNT
## t = -7.8348, df = 1149, p-value = 1.064e-14
## alternative hypothesis: true difference in means between group TUR and group USA is not equal to 0
## 95 percent confidence interval:
## -4.493049 -2.693382
## sample estimates:
## mean in group TUR mean in group USA
## 13.45287 17.04609
p_tam <- round(t_test_tam$p.value,2)
p_5 <- round(t_test_5$p.value,2)
p_10 <- round(t_test_10$p.value,2)
p_15 <- round(t_test_15$p.value,2)
p_degerler <- data.frame(p_tam, p_5, p_10, p_15)
kableExtra::kable(p_degerler)
p_tam | p_5 | p_10 | p_15 |
---|---|---|---|
0 | 0.02 | 0.02 | 0 |
sadece %5lik veri için yaptım
veri_5_ekle_regression <- mice(veri_5, method = 'norm.predict', m = 1)
##
## iter imp variable
## 1 1 IDSTUD M042182 M042081 M042049 M042052 M042076 M042302A M042302B M042302C M042100 M042202 M042240 M042093 M042271 M042268 M042159 M042164 M042167 M062208 M062208A M062208B M062208C M062208D M062153 M062111A M062111B M062237 M062314 M062074 M062183 M062202 M062246 M062286 M062325 M062106 M062124 toplam
## 2 1 IDSTUD M042182 M042081 M042049 M042052 M042076 M042302A M042302B M042302C M042100 M042202 M042240 M042093 M042271 M042268 M042159 M042164 M042167 M062208 M062208A M062208B M062208C M062208D M062153 M062111A M062111B M062237 M062314 M062074 M062183 M062202 M062246 M062286 M062325 M062106 M062124 toplam
## 3 1 IDSTUD M042182 M042081 M042049 M042052 M042076 M042302A M042302B M042302C M042100 M042202 M042240 M042093 M042271 M042268 M042159 M042164 M042167 M062208 M062208A M062208B M062208C M062208D M062153 M062111A M062111B M062237 M062314 M062074 M062183 M062202 M062246 M062286 M062325 M062106 M062124 toplam
## 4 1 IDSTUD M042182 M042081 M042049 M042052 M042076 M042302A M042302B M042302C M042100 M042202 M042240 M042093 M042271 M042268 M042159 M042164 M042167 M062208 M062208A M062208B M062208C M062208D M062153 M062111A M062111B M062237 M062314 M062074 M062183 M062202 M062246 M062286 M062325 M062106 M062124 toplam
## 5 1 IDSTUD M042182 M042081 M042049 M042052 M042076 M042302A M042302B M042302C M042100 M042202 M042240 M042093 M042271 M042268 M042159 M042164 M042167 M062208 M062208A M062208B M062208C M062208D M062153 M062111A M062111B M062237 M062314 M062074 M062183 M062202 M062246 M062286 M062325 M062106 M062124 toplam
# Doldurulmuş veri setini almak
veri_5_ekle_regression <- complete(veri_5_ekle_regression, 1)
t_test_reg <- t.test(toplam ~ CNT, data = veri_5_ekle_regression, var.equal = TRUE)
p_tam <- round(t_test_tam$p.value,2)
p_ekle_reg <- round(t_test_reg$p.value,2)
t_ekle_reg <- round(t_test_reg$statistic,2)
t_tam <- round(t_test_tam$statistic,2)
degerler <- data.frame(p_tam,
p_ekle_reg,
t_ekle_reg,
t_tam
)
kableExtra::kable(degerler)
p_tam | p_ekle_reg | t_ekle_reg | t_tam | |
---|---|---|---|---|
t | 0 | 0 | -7.73 | -7.83 |