VERİ TEMİZLEME

getwd()
## [1] "/Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 2.Yarıyıl/OLC731_R ile İleri İstatistik Uygulamaları_Doç. Dr. Kübra Atalay Kabasakal /1.Hafta 16.02.2026"
# Load the objects stored in midiPISA.rda (path is relative to the working
# directory printed above); the `midiPISA` data set used below is presumably
# one of them.
load("midiPISA.rda")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)

library(haven)

# Descriptive statistics for every column except the first one (the student
# ID), using psych::describe().
psych::describe(midiPISA[, -1])
##             vars    n   mean    sd median trimmed   mad    min    max  range
## SINIF          1 6890   9.84  0.46  10.00    9.89  0.00   7.00  12.00   5.00
## CINSIYET       2 6890   1.51  0.50   2.00    1.51  0.00   1.00   2.00   1.00
## Anne_Egitim    3 6835   2.66  1.95   2.00    2.57  1.48   0.00   6.00   6.00
## Baba_Egitim    4 6833   3.12  1.94   2.00    3.08  1.48   0.00   6.00   6.00
## OKUMA_ZEVK     5 6821   0.68  0.98   0.64    0.65  0.88  -2.73   2.66   5.39
## ST097Q01TA     6 6826   3.01  0.74   3.00    3.07  0.00   1.00   4.00   3.00
## ST097Q02TA     7 6807   2.92  0.82   3.00    2.99  0.00   1.00   4.00   3.00
## ST097Q03TA     8 6779   2.94  0.89   3.00    3.05  1.48   1.00   4.00   3.00
## ST097Q04TA     9 6809   2.70  0.86   3.00    2.75  1.48   1.00   4.00   3.00
## ST097Q05TA    10 6821   2.69  0.91   3.00    2.74  1.48   1.00   4.00   3.00
## ODOKUMA1      11 6890 464.23 87.78 463.40  463.90 91.11 175.61 771.51 595.90
## ODOKUMA2      12 6890 464.42 87.70 465.92  464.57 90.33 166.62 729.88 563.26
## ODOKUMA3      13 6890 464.71 87.08 464.62  464.81 91.06 171.84 748.15 576.32
## ODOKUMA4      14 6890 464.61 87.40 464.89  464.48 90.43 184.83 739.18 554.36
## ODOKUMA5      15 6890 464.20 87.21 464.83  464.36 91.02 168.89 747.02 578.13
##              skew kurtosis   se
## SINIF       -0.81     2.14 0.01
## CINSIYET    -0.03    -2.00 0.01
## Anne_Egitim  0.45    -1.13 0.02
## Baba_Egitim  0.26    -1.36 0.02
## OKUMA_ZEVK   0.12     0.22 0.01
## ST097Q01TA  -0.73     0.77 0.01
## ST097Q02TA  -0.65     0.11 0.01
## ST097Q03TA  -0.66    -0.20 0.01
## ST097Q04TA  -0.41    -0.41 0.01
## ST097Q05TA  -0.35    -0.63 0.01
## ODOKUMA1     0.04    -0.30 1.06
## ODOKUMA2    -0.02    -0.33 1.06
## ODOKUMA3     0.00    -0.31 1.05
## ODOKUMA4     0.01    -0.34 1.05
## ODOKUMA5    -0.01    -0.34 1.05
# Import the SPSS file with read_sav(); labelled SPSS variables arrive as
# <dbl+lbl> columns (see EMPLMNT in the head() output below).
screen <- read_sav("SCREEN.SAV")

head(screen)
## # A tibble: 6 × 8
##   SUBNO TIMEDRS ATTDRUG ATTHOUSE INCOME EMPLMNT      MSTATUS  RACE
##   <dbl>   <dbl>   <dbl>    <dbl>  <dbl> <dbl+lbl>      <dbl> <dbl>
## 1     1       1       8       27      5 1 [HOUSEWFE]       2     1
## 2     2       3       7       20      6 0 [PAIDWORK]       2     1
## 3     3       0       8       23      3 0 [PAIDWORK]       2     1
## 4     4      13       9       28      8 1 [HOUSEWFE]       2     1
## 5     5      15       7       24      1 1 [HOUSEWFE]       2     1
## 6     6       3       8       25      4 0 [PAIDWORK]       2     1
summary(screen)
##      SUBNO          TIMEDRS          ATTDRUG          ATTHOUSE    
##  Min.   :  1.0   Min.   : 0.000   Min.   : 5.000   Min.   : 2.00  
##  1st Qu.:137.0   1st Qu.: 2.000   1st Qu.: 7.000   1st Qu.:21.00  
##  Median :314.0   Median : 4.000   Median : 8.000   Median :24.00  
##  Mean   :317.4   Mean   : 7.901   Mean   : 7.686   Mean   :23.54  
##  3rd Qu.:483.0   3rd Qu.:10.000   3rd Qu.: 9.000   3rd Qu.:27.00  
##  Max.   :758.0   Max.   :81.000   Max.   :10.000   Max.   :35.00  
##                                                    NA's   :1      
##      INCOME         EMPLMNT         MSTATUS           RACE      
##  Min.   : 1.00   Min.   :0.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 2.50   1st Qu.:0.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 4.00   Median :0.000   Median :2.000   Median :1.000  
##  Mean   : 4.21   Mean   :0.471   Mean   :1.778   Mean   :1.088  
##  3rd Qu.: 6.00   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :10.00   Max.   :1.000   Max.   :2.000   Max.   :2.000  
##  NA's   :26
str(screen)
## tibble [465 × 8] (S3: tbl_df/tbl/data.frame)
##  $ SUBNO   : num [1:465] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "label")= chr "Subject number"
##   ..- attr(*, "format.spss")= chr "F3.0"
##  $ TIMEDRS : num [1:465] 1 3 0 13 15 3 2 0 7 4 ...
##   ..- attr(*, "label")= chr "Visits to health professionals"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ ATTDRUG : num [1:465] 8 7 8 9 7 8 7 7 7 8 ...
##   ..- attr(*, "label")= chr "Attitudes toward medication"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ ATTHOUSE: num [1:465] 27 20 23 28 24 25 30 24 20 30 ...
##   ..- attr(*, "label")= chr "Attitudes toward housework"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ INCOME  : num [1:465] 5 6 3 8 1 4 6 6 2 8 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ EMPLMNT : dbl+lbl [1:465] 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, ...
##    ..@ label      : chr "Whether currently employed"
##    ..@ format.spss: chr "F1.0"
##    ..@ labels     : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "PAIDWORK" "HOUSEWFE"
##  $ MSTATUS : num [1:465] 2 2 2 2 2 2 2 2 2 1 ...
##   ..- attr(*, "label")= chr "Whether currently married"
##   ..- attr(*, "format.spss")= chr "F1.0"
##  $ RACE    : num [1:465] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
describe(screen)
##          vars   n   mean     sd median trimmed    mad min max range  skew
## SUBNO       1 465 317.38 194.16    314  313.26 256.49   1 758   757  0.14
## TIMEDRS     2 465   7.90  10.95      4    5.61   4.45   0  81    81  3.23
## ATTDRUG     3 465   7.69   1.16      8    7.71   1.48   5  10     5 -0.12
## ATTHOUSE    4 464  23.54   4.48     24   23.62   4.45   2  35    33 -0.45
## INCOME      5 439   4.21   2.42      4    4.01   2.97   1  10     9  0.58
## EMPLMNT     6 465   0.47   0.50      0    0.46   0.00   0   1     1  0.12
## MSTATUS     7 465   1.78   0.42      2    1.85   0.00   1   2     1 -1.34
## RACE        8 465   1.09   0.28      1    1.00   0.00   1   2     1  2.90
##          kurtosis   se
## SUBNO       -0.99 9.00
## TIMEDRS     12.88 0.51
## ATTDRUG     -0.47 0.05
## ATTHOUSE     1.51 0.21
## INCOME      -0.38 0.12
## EMPLMNT     -1.99 0.02
## MSTATUS     -0.21 0.02
## RACE         6.40 0.01
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
# Summary table for TIMEDRS..EMPLMNT: min/max for continuous variables, n (%)
# for categorical ones, and the number of missing values always shown.
# EMPLMNT is imported as a haven_labelled vector, which tbl_summary() flags as
# an intermediate structure not meant for analysis; convert it to a factor
# first so the table uses the value labels and the warning goes away.
# NOTE(review): re-knit to refresh the rendered table after this change.
screen %>%
  select(TIMEDRS:EMPLMNT) %>%
  mutate(EMPLMNT = haven::as_factor(EMPLMNT)) %>%
  tbl_summary(
    statistic = all_continuous() ~ "{min},{max}",
    missing = "always"
  )
## ! Column(s) "EMPLMNT" are class "haven_labelled".
## ℹ This is an intermediate data structure not meant for analysis.
## ℹ Convert columns with `haven::as_factor()`, `labelled::to_factor()`,
##   `labelled::unlabelled()`, and `unclass()`. Failure to convert may have
##   unintended consequences or result in error.
## <https://haven.tidyverse.org/articles/semantics.html>
## <https://larmarange.github.io/labelled/articles/intro_labelled.html#unlabelled>
Characteristic N = 4651
Visits to health professionals 0,81
    Unknown 0
Attitudes toward medication
    5 13 (2.8%)
    6 60 (13%)
    7 126 (27%)
    8 149 (32%)
    9 95 (20%)
    10 22 (4.7%)
    Unknown 0
Attitudes toward housework 2.0,35.0
    Unknown 1
INCOME 1.00,10.00
    Unknown 26
Whether currently employed
    0 246 (53%)
    1 219 (47%)
    Unknown 0
1 Min,Max; n (%)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(vtable)

# Non-missing count, minimum and maximum for each variable.
# NOTE(review): EMPLMNT does not appear in the table printed below, presumably
# because it is still a haven_labelled column at this point; confirm.
sumtable(screen, summ = c('notNA(x)', 'min(x)', 'max(x)'))
Summary Statistics
Variable NotNA Min Max
SUBNO 465 1 758
TIMEDRS 465 0 81
ATTDRUG 465 5 10
ATTHOUSE 464 2 35
INCOME 439 1 10
MSTATUS 465 1 2
RACE 465 1 2
# Non-missing count, minimum and maximum per variable via st(), with Turkish
# column headers supplied through summ.names.
st(
  screen,
  summ = c("notNA(x)", "min(x)", "max(x)"),
  summ.names = c("Frekans", "Minimum", "Maximum")
)
Summary Statistics
Variable Frekans Minimum Maximum
SUBNO 465 1 758
TIMEDRS 465 0 81
ATTDRUG 465 5 10
ATTHOUSE 464 2 35
INCOME 439 1 10
MSTATUS 465 1 2
RACE 465 1 2
# Render the describe() output for all columns except the first (SUBNO) as a
# markdown table rounded to three digits.
# Caption typo fixed: "istatisikler" -> "istatistikler".
kable(
  describe(screen[, -1]),
  format = "markdown",
  caption = "Betimsel istatistikler",
  digits = 3
)
Betimsel istatisikler
vars n mean sd median trimmed mad min max range skew kurtosis se
TIMEDRS 1 465 7.901 10.948 4 5.606 4.448 0 81 81 3.227 12.879 0.508
ATTDRUG 2 465 7.686 1.156 8 7.708 1.483 5 10 5 -0.122 -0.466 0.054
ATTHOUSE 3 464 23.541 4.484 24 23.624 4.448 2 35 33 -0.454 1.507 0.208
INCOME 4 439 4.210 2.419 4 4.014 2.965 1 10 9 0.578 -0.381 0.115
EMPLMNT 5 465 0.471 0.500 0 0.464 0.000 0 1 1 0.116 -1.991 0.023
MSTATUS 6 465 1.778 0.416 2 1.847 0.000 1 2 1 -1.337 -0.213 0.019
RACE 7 465 1.088 0.284 1 1.000 0.000 1 2 1 2.895 6.398 0.013
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.2
skim(screen)
Data summary
Name screen
Number of rows 465
Number of columns 8
_______________________
Column type frequency:
numeric 8
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SUBNO 0 1.00 317.38 194.16 1 137.0 314 483 758 ▇▆▆▇▁
TIMEDRS 0 1.00 7.90 10.95 0 2.0 4 10 81 ▇▁▁▁▁
ATTDRUG 0 1.00 7.69 1.16 5 7.0 8 9 10 ▃▇▇▅▁
ATTHOUSE 1 1.00 23.54 4.48 2 21.0 24 27 35 ▁▁▅▇▂
INCOME 26 0.94 4.21 2.42 1 2.5 4 6 10 ▆▇▅▃▂
EMPLMNT 0 1.00 0.47 0.50 0 0.0 0 1 1 ▇▁▁▁▇
MSTATUS 0 1.00 1.78 0.42 1 2.0 2 2 2 ▂▁▁▁▇
RACE 0 1.00 1.09 0.28 1 1.0 1 1 2 ▇▁▁▁▁
skim(midiPISA)
Data summary
Name midiPISA
Number of rows 6890
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
OGRENCIID 0 1.00 79203623.28 2086.54 79200001.00 79201814.25 79203625.50 79205426.75 79207242.00 ▇▇▇▇▇
SINIF 0 1.00 9.84 0.46 7.00 10.00 10.00 10.00 12.00 ▁▂▇▁▁
CINSIYET 0 1.00 1.51 0.50 1.00 1.00 2.00 2.00 2.00 ▇▁▁▁▇
Anne_Egitim 55 0.99 2.66 1.95 0.00 1.00 2.00 4.00 6.00 ▇▅▂▂▅
Baba_Egitim 57 0.99 3.12 1.94 0.00 1.00 2.00 5.00 6.00 ▆▆▂▂▇
OKUMA_ZEVK 69 0.99 0.68 0.98 -2.73 0.01 0.64 1.23 2.66 ▁▁▇▇▃
ST097Q01TA 64 0.99 3.01 0.74 1.00 3.00 3.00 3.00 4.00 ▁▂▁▇▃
ST097Q02TA 83 0.99 2.92 0.82 1.00 3.00 3.00 3.00 4.00 ▁▂▁▇▃
ST097Q03TA 111 0.98 2.94 0.89 1.00 3.00 3.00 4.00 4.00 ▂▂▁▇▅
ST097Q04TA 81 0.99 2.70 0.86 1.00 2.00 3.00 3.00 4.00 ▂▃▁▇▂
ST097Q05TA 69 0.99 2.69 0.91 1.00 2.00 3.00 3.00 4.00 ▂▅▁▇▃
ODOKUMA1 0 1.00 464.23 87.78 175.61 402.56 463.40 525.72 771.51 ▁▅▇▃▁
ODOKUMA2 0 1.00 464.42 87.70 166.62 403.45 465.92 525.32 729.88 ▁▃▇▅▁
ODOKUMA3 0 1.00 464.71 87.08 171.84 403.36 464.62 526.30 748.15 ▁▃▇▅▁
ODOKUMA4 0 1.00 464.61 87.40 184.83 402.52 464.89 524.91 739.18 ▁▅▇▅▁
ODOKUMA5 0 1.00 464.20 87.21 168.89 403.08 464.83 525.74 747.02 ▁▃▇▅▁
# Generate an automated data-profiling report

library(DataExplorer)

# create_report() writes to "report.html" by default, so successive calls for
# different data sets overwrite each other's output; use a dedicated file name.
create_report(midiPISA, output_file = "report_midiPISA.html")
## 
## 
## processing file: report.rmd
##   |                                             |                                     |   0%  |                                             |.                                    |   2%                                   |                                             |..                                   |   5% [global_options]                  |                                             |...                                  |   7%                                   |                                             |....                                 |  10% [introduce]                       |                                             |....                                 |  12%                                   |                                             |.....                                |  14% [plot_intro]
##   |                                             |......                               |  17%                                   |                                             |.......                              |  19% [data_structure]                  |                                             |........                             |  21%                                   |                                             |.........                            |  24% [missing_profile]
##   |                                             |..........                           |  26%                                   |                                             |...........                          |  29% [univariate_distribution_header]  |                                             |...........                          |  31%                                   |                                             |............                         |  33% [plot_histogram]
##   |                                             |.............                        |  36%                                   |                                             |..............                       |  38% [plot_density]                    |                                             |...............                      |  40%                                   |                                             |................                     |  43% [plot_frequency_bar]              |                                             |.................                    |  45%                                   |                                             |..................                   |  48% [plot_response_bar]               |                                             |..................                   |  50%                                   |                                             |...................                  |  52% [plot_with_bar]                   |                                             |....................                 |  55%                                   |                                             |.....................                |  57% [plot_normal_qq]
##   |                                             |......................               |  60%                                   |                                             |.......................              |  62% [plot_response_qq]                |                                             |........................             |  64%                                   |                                             |.........................            |  67% [plot_by_qq]                      |                                             |..........................           |  69%                                   |                                             |..........................           |  71% [correlation_analysis]
##   |                                             |...........................          |  74%                                   |                                             |............................         |  76% [principal_component_analysis]
##   |                                             |.............................        |  79%                                   |                                             |..............................       |  81% [bivariate_distribution_header]   |                                             |...............................      |  83%                                   |                                             |................................     |  86% [plot_response_boxplot]           |                                             |.................................    |  88%                                   |                                             |.................................    |  90% [plot_by_boxplot]                 |                                             |..................................   |  93%                                   |                                             |...................................  |  95% [plot_response_scatterplot]       |                                             |.................................... |  98%                                   |                                             |.....................................| 100% [plot_by_scatterplot]           
## output file: /Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 2.Yarıyıl/OLC731_R ile İleri İstatistik Uygulamaları_Doç. Dr. Kübra Atalay Kabasakal /1.Hafta 16.02.2026/report.knit.md
## '/Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 1.Yarıyıl/OLC733_R Yazılımı ile Veri Analizi_Doç. Dr. Kübra Atalay Kabasakal/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/pandoc' +RTS -K512m -RTS '/Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 2.Yarıyıl/OLC731_R ile İleri İstatistik Uygulamaları_Doç. Dr. Kübra Atalay Kabasakal /1.Hafta 16.02.2026/report.knit.md' --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc158044e5a80f.html --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/table-classes.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/7w/tm3yfklj31q6cm334dm97bj80000gn/T//RtmpyUcXLK/rmarkdown-str15805520712a.html
## 
## Output created: report.html
# Use a dedicated output file: with the default name ("report.html")
# successive create_report() calls overwrite each other's output.
create_report(screen, output_file = "report_screen.html")
## 
## 
## processing file: report.rmd
##   |                                             |                                     |   0%  |                                             |.                                    |   2%                                   |                                             |..                                   |   5% [global_options]                  |                                             |...                                  |   7%                                   |                                             |....                                 |  10% [introduce]                       |                                             |....                                 |  12%                                   |                                             |.....                                |  14% [plot_intro]
##   |                                             |......                               |  17%                                   |                                             |.......                              |  19% [data_structure]                  |                                             |........                             |  21%                                   |                                             |.........                            |  24% [missing_profile]
##   |                                             |..........                           |  26%                                   |                                             |...........                          |  29% [univariate_distribution_header]  |                                             |...........                          |  31%                                   |                                             |............                         |  33% [plot_histogram]
##   |                                             |.............                        |  36%                                   |                                             |..............                       |  38% [plot_density]                    |                                             |...............                      |  40%                                   |                                             |................                     |  43% [plot_frequency_bar]              |                                             |.................                    |  45%                                   |                                             |..................                   |  48% [plot_response_bar]               |                                             |..................                   |  50%                                   |                                             |...................                  |  52% [plot_with_bar]                   |                                             |....................                 |  55%                                   |                                             |.....................                |  57% [plot_normal_qq]
##   |                                             |......................               |  60%                                   |                                             |.......................              |  62% [plot_response_qq]                |                                             |........................             |  64%                                   |                                             |.........................            |  67% [plot_by_qq]                      |                                             |..........................           |  69%                                   |                                             |..........................           |  71% [correlation_analysis]
##   |                                             |...........................          |  74%                                   |                                             |............................         |  76% [principal_component_analysis]
##   |                                             |.............................        |  79%                                   |                                             |..............................       |  81% [bivariate_distribution_header]   |                                             |...............................      |  83%                                   |                                             |................................     |  86% [plot_response_boxplot]           |                                             |.................................    |  88%                                   |                                             |.................................    |  90% [plot_by_boxplot]                 |                                             |..................................   |  93%                                   |                                             |...................................  |  95% [plot_response_scatterplot]       |                                             |.................................... |  98%                                   |                                             |.....................................| 100% [plot_by_scatterplot]           
## output file: /Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 2.Yarıyıl/OLC731_R ile İleri İstatistik Uygulamaları_Doç. Dr. Kübra Atalay Kabasakal /1.Hafta 16.02.2026/report.knit.md
## '/Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 1.Yarıyıl/OLC733_R Yazılımı ile Veri Analizi_Doç. Dr. Kübra Atalay Kabasakal/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/pandoc' +RTS -K512m -RTS '/Users/meltematasoy/Desktop/Doktora Dersleri/Doktora Dersleri 2.Yarıyıl/OLC731_R ile İleri İstatistik Uygulamaları_Doç. Dr. Kübra Atalay Kabasakal /1.Hafta 16.02.2026/report.knit.md' --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc15803017ae15.html --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/table-classes.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/7w/tm3yfklj31q6cm334dm97bj80000gn/T//RtmpyUcXLK/rmarkdown-str158015a7a076.html
## 
## Output created: report.html
library(haven)

# Strip the label metadata carried over from the SPSS file; in the head()
# output below EMPLMNT prints as plain <dbl> instead of <dbl+lbl>.
# expss is called via `::`, so the package itself is not attached.
screen <- expss::drop_var_labs(screen)

head(screen, 10)
## # A tibble: 10 × 8
##    SUBNO TIMEDRS ATTDRUG ATTHOUSE INCOME EMPLMNT MSTATUS  RACE
##    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>   <dbl>   <dbl> <dbl>
##  1     1       1       8       27      5       1       2     1
##  2     2       3       7       20      6       0       2     1
##  3     3       0       8       23      3       0       2     1
##  4     4      13       9       28      8       1       2     1
##  5     5      15       7       24      1       1       2     1
##  6     6       3       8       25      4       0       2     1
##  7     7       2       7       30      6       1       2     1
##  8     8       0       7       24      6       1       2     1
##  9     9       7       7       20      2       1       2     1
## 10    10       4       8       30      8       0       1     1
library(naniar)
## 
## Attaching package: 'naniar'
## The following object is masked from 'package:skimr':
## 
##     n_complete
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
any_na(screen)
## [1] TRUE
n_miss(screen)
## [1] 27
prop_miss(screen)
## [1] 0.007258065
# Number of missing values in each column of `screen`.
colSums(is.na(screen))
##    SUBNO  TIMEDRS  ATTDRUG ATTHOUSE   INCOME  EMPLMNT  MSTATUS     RACE 
##        0        0        0        1       26        0        0        0
miss_var_summary(screen)
## # A tibble: 8 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <num>
## 1 INCOME       26    5.59 
## 2 ATTHOUSE      1    0.215
## 3 SUBNO         0    0    
## 4 TIMEDRS       0    0    
## 5 ATTDRUG       0    0    
## 6 EMPLMNT       0    0    
## 7 MSTATUS       0    0    
## 8 RACE          0    0
miss_var_table(screen)
## # A tibble: 3 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      6     75  
## 2             1      1     12.5
## 3            26      1     12.5
miss_case_table(screen)
## # A tibble: 2 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     438     94.2 
## 2              1      27      5.81
miss_case_summary(screen)
## # A tibble: 465 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1    52      1     12.5
##  2    64      1     12.5
##  3    69      1     12.5
##  4    77      1     12.5
##  5   118      1     12.5
##  6   135      1     12.5
##  7   161      1     12.5
##  8   172      1     12.5
##  9   173      1     12.5
## 10   174      1     12.5
## # ℹ 455 more rows
library(rlang)
## Warning: package 'rlang' was built under R version 4.5.2
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
## 
##     :=
library(UpSetR) #naniar paketi hata verirse önce bu iki paketi de aktifleştirelim

gg_miss_upset(screen) #bu sayede iki farklı değişkende ortak eksik veri var mı bunu görebiliriz
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the UpSetR package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the UpSetR package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

vis_miss(screen)

#ggplottaki biçimsel düzenlemelerin hepsini naniar paketi ile de yapabiliriz

gg_miss_upset(screen)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

MCAR TESTİ

#naniar ile devam

# MCAR test on columns 2 to 8, i.e. every variable except the subject
# number (SUBNO).
mcar_test(data = screen[, 2:8])
## # A tibble: 1 × 4
##   statistic    df p.value missing.patterns
##       <dbl> <dbl>   <dbl>            <int>
## 1      19.6    12  0.0752                3
# Work on a copy so `screen` stays untouched; INCOME_m duplicates INCOME and
# serves as the dependent variable in the missing-data comparison below.
screen_MAR <- dplyr::mutate(screen, INCOME_m = INCOME)

library(finalfit)

# Compare the explanatory variables between cases with observed and with
# missing INCOME_m, and print the comparison as a kable table.
explanatory <- c("TIMEDRS", "ATTDRUG", "ATTHOUSE")
dependent <- "INCOME_m"

knitr::kable(
  missing_compare(screen_MAR, dependent, explanatory),
  row.names = FALSE,
  align = c("l", "l", "r", "r", "r")
)
Missing data analysis: INCOME_m Not missing Missing p
TIMEDRS Mean (SD) 7.9 (11.1) 7.6 (7.4) 0.891
ATTDRUG Mean (SD) 7.7 (1.2) 7.9 (1.0) 0.368
ATTHOUSE Mean (SD) 23.5 (4.5) 23.7 (4.2) 0.860
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ rlang:::=()              masks data.table:::=()
## ✖ ggplot2::%+%()           masks psych::%+%()
## ✖ ggplot2::alpha()         masks psych::alpha()
## ✖ data.table::between()    masks dplyr::between()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ data.table::first()      masks dplyr::first()
## ✖ purrr::flatten()         masks rlang::flatten()
## ✖ purrr::flatten_chr()     masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl()     masks rlang::flatten_dbl()
## ✖ purrr::flatten_int()     masks rlang::flatten_int()
## ✖ purrr::flatten_lgl()     masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw()     masks rlang::flatten_raw()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ lubridate::hour()        masks data.table::hour()
## ✖ purrr::invoke()          masks rlang::invoke()
## ✖ lubridate::isoweek()     masks data.table::isoweek()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ data.table::last()       masks dplyr::last()
## ✖ lubridate::mday()        masks data.table::mday()
## ✖ lubridate::minute()      masks data.table::minute()
## ✖ lubridate::month()       masks data.table::month()
## ✖ lubridate::quarter()     masks data.table::quarter()
## ✖ lubridate::second()      masks data.table::second()
## ✖ purrr::splice()          masks rlang::splice()
## ✖ purrr::transpose()       masks data.table::transpose()
## ✖ lubridate::wday()        masks data.table::wday()
## ✖ lubridate::week()        masks data.table::week()
## ✖ lubridate::yday()        masks data.table::yday()
## ✖ lubridate::year()        masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Add a logical indicator column: TRUE where INCOME is missing.
miss_test <- dplyr::mutate(screen_MAR, miss_income = is.na(INCOME))

VERİ SİLME

Liste bazında silme (listwise deletion)

# Listwise deletion: print only the rows of `screen` without any missing value
# (438 of the 465 rows, per the output below). The result is not assigned, so
# `screen` itself is left unchanged.
na.omit(screen)
## # A tibble: 438 × 8
##    SUBNO TIMEDRS ATTDRUG ATTHOUSE INCOME EMPLMNT MSTATUS  RACE
##    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>   <dbl>   <dbl> <dbl>
##  1     1       1       8       27      5       1       2     1
##  2     2       3       7       20      6       0       2     1
##  3     3       0       8       23      3       0       2     1
##  4     4      13       9       28      8       1       2     1
##  5     5      15       7       24      1       1       2     1
##  6     6       3       8       25      4       0       2     1
##  7     7       2       7       30      6       1       2     1
##  8     8       0       7       24      6       1       2     1
##  9     9       7       7       20      2       1       2     1
## 10    10       4       8       30      8       0       1     1
## # ℹ 428 more rows

VERİ ATAMA

Ortalama atama

# Mean imputation: copy the data, then fill the missing INCOME values with the
# mean of the observed ones.
screen3 <- screen
screen3$INCOME <- replace(
  screen3$INCOME,
  is.na(screen3$INCOME),
  mean(screen3$INCOME, na.rm = TRUE)
)

sd(screen3$INCOME)
## [1] 2.350128
sd(screen$INCOME, na.rm = TRUE)
## [1] 2.418875
#ortalama atadığımız için standart sapma küçüldü

Döngü ile ortalama atama

# Mean imputation with a loop over columns 2 to 5 of `screen`.
screen4 <- screen[, 2:5]

# Use `[[i]]` to extract each column as a plain vector. On a tibble,
# `screen4[, i]` is still a one-column tibble, for which mean() returns NA
# (with a warning), so nothing would be imputed. `[[i]]` works for both
# data frames and tibbles. seq_along() also stays safe with zero columns.
for (i in seq_along(screen4)) {
  screen4[[i]][is.na(screen4[[i]])] <- mean(screen4[[i]], na.rm = TRUE)
}

any_na(screen4)
## [1] FALSE

Apply ailesi ile ortalama atama

# Mean imputation with lapply(): each column's missing values are replaced by
# that column's observed mean, and the result is collected into a data.frame.
# Start from the raw columns of `screen` rather than the `screen4` already
# processed by the loop above, so this version demonstrates (and the any_na()
# check verifies) the imputation on its own.
screen4 <- data.frame(lapply(screen[, 2:5], function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}))

any_na(screen4)
## [1] FALSE

NOT: MEDYAN ATAMAK ORTALAMA ATAMAKTAN DAHA İYİDİR ÇÜNKÜ DEĞİŞKENLİĞE ETKİSİ DAHA AZDIR.