library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(stevemisc)
##
## Attaching package: 'stevemisc'
##
## The following object is masked from 'package:lubridate':
##
## dst
##
## The following object is masked from 'package:dplyr':
##
## tbl_df
library(knitr)
library(kableExtra)
library(naniar)
library(haven)
library(summarytools)
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
library(outliers)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggpmisc)
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Attaching package: 'ggpp'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:outliers':
##
## outlier
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(sur)
##
## Attaching package: 'sur'
##
## The following object is masked from 'package:psych':
##
## skew
library(moments)
library(corrplot)
## corrplot 0.95 loaded
library(olsrr)
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:datasets':
##
## rivers
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(readr)
# Load the Canadian data extract from its CSV file on the local D: drive.
scr_canada <- read_csv(file = "D:/OLC_733/scr_canada.csv")
## Rows: 13879 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): idbid
## dbl (28): IDCNTRY, IDPOP, IDGRADER, IDGRADE, IDSCHOOL, IDCLASS, IDSTUD, ITSE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first rows of the data.
head(scr_canada)

# Per-variable missing-data summary, rendered as a full-width, centred table.
scr_canada %>%
  miss_var_summary() %>%
  kable(caption = "Kayıp Veri Özeti") %>%
  kable_styling(full_width = TRUE, position = "center")
| variable | n_miss | pct_miss |
|---|---|---|
| ASBR08A | 194 | 1.40 |
| ASBR08B | 194 | 1.40 |
| ASBR08C | 194 | 1.40 |
| ASBR08D | 194 | 1.40 |
| ASBR08E | 194 | 1.40 |
| ASBR08F | 194 | 1.40 |
| IDCNTRY | 0 | 0 |
| IDPOP | 0 | 0 |
| IDGRADER | 0 | 0 |
| IDGRADE | 0 | 0 |
| IDSCHOOL | 0 | 0 |
| IDCLASS | 0 | 0 |
| IDSTUD | 0 | 0 |
| ITSEX | 0 | 0 |
| ITADMINI | 0 | 0 |
| ITLANG_SA | 0 | 0 |
| ITLANG_SQ | 0 | 0 |
| IDBOOK | 0 | 0 |
| HOUWGT | 0 | 0 |
| TOTWGT | 0 | 0 |
| SENWGT | 0 | 0 |
| JKREP | 0 | 0 |
| JKZONE | 0 | 0 |
| ASRREA01 | 0 | 0 |
| ASRREA02 | 0 | 0 |
| ASRREA03 | 0 | 0 |
| ASRREA04 | 0 | 0 |
| ASRREA05 | 0 | 0 |
| idbid | 0 | 0 |
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Visualise missingness across the whole data set with vis_miss().
scr_canada %>%
  vis_miss() +
  labs(
    title = "Eksik Veri Haritası (vis_miss)",
    x = "Değişkenler",
    y = "Gözlemler"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Number of missing values per variable.
scr_canada %>%
  gg_miss_var() +
  labs(
    title = "Değişken Bazında Eksik Veri Sayısı (gg_miss_var)",
    x = "Değişkenler",
    y = "Eksik Değer Sayısı"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print a heading for a missing-data pattern table.
cat("\n\nEksik Veri Örüntüsü Tablosu (Seçili Değişkenler ile):\n\n")
##
##
## Eksik Veri Örüntüsü Tablosu (Seçili Değişkenler ile):
# Missing-data pattern table restricted to the country/sex columns and the
# six ASBR08 items.
md.pattern(
  select(
    scr_canada,
    IDCNTRY, ITSEX,
    ASBR08A, ASBR08B, ASBR08C, ASBR08D, ASBR08E, ASBR08F
  )
)
## IDCNTRY ITSEX ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 13685 1 1 1 1 1 1 1 1 0
## 194 1 1 0 0 0 0 0 0 6
## 0 0 194 194 194 194 194 194 1164
# Aggregated missingness plot for all variables, sorted by their share of
# missing values.
aggr(
  x = scr_canada,
  numbers = TRUE,
  sortVars = TRUE,
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Eksik Veri Oranı", "Örüntü")
)
##
## Variables sorted by number of missings:
## Variable Count
## ASBR08A 0.01397795
## ASBR08B 0.01397795
## ASBR08C 0.01397795
## ASBR08D 0.01397795
## ASBR08E 0.01397795
## ASBR08F 0.01397795
## IDCNTRY 0.00000000
## IDPOP 0.00000000
## IDGRADER 0.00000000
## IDGRADE 0.00000000
## IDSCHOOL 0.00000000
## IDCLASS 0.00000000
## IDSTUD 0.00000000
## ITSEX 0.00000000
## ITADMINI 0.00000000
## ITLANG_SA 0.00000000
## ITLANG_SQ 0.00000000
## IDBOOK 0.00000000
## HOUWGT 0.00000000
## TOTWGT 0.00000000
## SENWGT 0.00000000
## JKREP 0.00000000
## JKZONE 0.00000000
## ASRREA01 0.00000000
## ASRREA02 0.00000000
## ASRREA03 0.00000000
## ASRREA04 0.00000000
## ASRREA05 0.00000000
## idbid 0.00000000
# Examine whether the missingness is completely at random with an MCAR test.
veri_mcar <- scr_canada %>%
  select(ASBR08A, ASBR08B, ASBR08C, ASBR08D, ASBR08E, ASBR08F)
mcar_test(veri_mcar)

# The test above came out with 0 degrees of freedom, so the MCAR analysis is
# repeated with additional variables (IDCNTRY and ITSEX) next to the six items.
# Each column is named exactly once in the selection.
# NOTE(review): IDCNTRY is presumably constant in a single-country file —
# confirm that it actually adds information to the test.
veri_mcar2 <- scr_canada %>%
  select(
    IDCNTRY, ITSEX,
    ASBR08A, ASBR08B, ASBR08C, ASBR08D, ASBR08E, ASBR08F
  )
mcar_test(veri_mcar2)
# Number of missing values per case (respondent) in the reduced data set.
veri_mcar2 %>%
  gg_miss_case() +
  labs(title = "Birey Bazında Eksik Veri Sayısı") +
  theme_minimal()

# Missingness map for the reduced data set.
veri_mcar2 %>%
  vis_miss() +
  labs(title = "Eksik Veri Haritası") +
  theme_minimal()
# Break the per-variable missing counts down by sex, one panel per ITSEX value.
# gg_miss_var() expects the bare name of the faceting column in `facet` and
# performs the grouping itself, so no separate group_by() step is needed.
gg_miss_var(veri_mcar2, facet = ITSEX) +
  labs(title = "Cinsiyete Göre Eksik Veri Dağılımı") +
  theme_minimal()
* Çoklu atama yönteminin bu veri yapısı için uygun olabileceğini düşündüm.
# Multiple-imputation model: five imputed data sets using the "pmm" method,
# with a fixed seed.
mice_model <- mice(
  data = veri_mcar2,
  m = 5,
  method = "pmm",
  seed = 123
)
##
## iter imp variable
## 1 1 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 1 2 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 1 3 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 1 4 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 1 5 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 2 1 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 2 2 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 2 3 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 2 4 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 2 5 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 3 1 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 3 2 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 3 3 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 3 4 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 3 5 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 4 1 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 4 2 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 4 3 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 4 4 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 4 5 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 5 1 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 5 2 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 5 3 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 5 4 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 5 5 ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
# Show the imputation methods and predictor matrix of the fitted model.
summary(object = mice_model)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## IDCNTRY ITSEX ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## "" "" "pmm" "pmm" "pmm" "pmm" "pmm" "pmm"
## PredictorMatrix:
## IDCNTRY ITSEX ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## IDCNTRY 0 1 1 1 1 1 1 1
## ITSEX 1 0 1 1 1 1 1 1
## ASBR08A 1 1 0 1 1 1 1 1
## ASBR08B 1 1 1 0 1 1 1 1
## ASBR08C 1 1 1 1 0 1 1 1
## ASBR08D 1 1 1 1 1 0 1 1
# Extract the first of the imputed data sets as a completed data frame.
tamamlanmis_veri <- mice::complete(mice_model, action = 1)

# Count the remaining missing values in each column.
colSums(is.na(tamamlanmis_veri))
## IDCNTRY ITSEX ASBR08A ASBR08B ASBR08C ASBR08D ASBR08E ASBR08F
## 0 0 0 0 0 0 0 0
# Keep the six ASBR08 items from the completed data. They were read as numeric
# columns, so the explicit selection needs no additional type filter.
veri_numeric <- tamamlanmis_veri %>%
  select(ASBR08A, ASBR08B, ASBR08C, ASBR08D, ASBR08E, ASBR08F)

# Reshape to long format and draw one box plot per item; outliers are drawn
# in red with point shape 8, and the visible y-range is limited to 0-10.
veri_numeric %>%
  pivot_longer(
    cols = everything(),
    names_to = "Degisken",
    values_to = "Deger"
  ) %>%
  ggplot(aes(x = Degisken, y = Deger)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8) +
  labs(
    title = "Tamamlanmış Veri Setinde ASBR08A-F Değişkenlerine Göre Kutu Grafikleri",
    x = "Değişkenler",
    y = "Değerler"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_cartesian(ylim = c(0, 10))