Veri incelemede işe yarayacak kütüphaneler baştan etkinleştirilmiş, ardından genel düzeyde veri inceleme aşamalarına yer verilerek ilerlenmiştir.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)  #SPSS verilerini R ortamına aktarmak istedigimizde ise yarar
library(psych) # detailed descriptive statistics; also feeds the presentation-ready table
library(vtable)  #ozet tablolar icin
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(skimr) #veri seti detaylı ozet icin
library(DataExplorer) #veri seti raporu olusturma
library(funModeling) #veri setindeki eksik ve benzersiz degerler icin
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.6 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai

SCREEN adlı SPSS veri setini R ortamına aktarma:

# Import the SPSS data file; the path is relative to the working directory.
SCREEN <- haven::read_sav(file = "SCREEN.SAV")

Veri seti 8 değişken ve 465 gözlemden oluşmaktadır. Veri setinin ilk birkaç satırını görmek için:

# Print the first rows of the imported data for a quick visual check.
head(SCREEN)
## # A tibble: 6 × 8
##   SUBNO TIMEDRS ATTDRUG ATTHOUSE INCOME EMPLMNT      MSTATUS  RACE
##   <dbl>   <dbl>   <dbl>    <dbl>  <dbl> <dbl+lbl>      <dbl> <dbl>
## 1     1       1       8       27      5 1 [HOUSEWFE]       2     1
## 2     2       3       7       20      6 0 [PAIDWORK]       2     1
## 3     3       0       8       23      3 0 [PAIDWORK]       2     1
## 4     4      13       9       28      8 1 [HOUSEWFE]       2     1
## 5     5      15       7       24      1 1 [HOUSEWFE]       2     1
## 6     6       3       8       25      4 0 [PAIDWORK]       2     1

Veri seti özet istatistikleri:

# Per-column minimum, quartiles, mean, maximum and number of NAs.
summary(SCREEN)
##      SUBNO          TIMEDRS          ATTDRUG          ATTHOUSE    
##  Min.   :  1.0   Min.   : 0.000   Min.   : 5.000   Min.   : 2.00  
##  1st Qu.:137.0   1st Qu.: 2.000   1st Qu.: 7.000   1st Qu.:21.00  
##  Median :314.0   Median : 4.000   Median : 8.000   Median :24.00  
##  Mean   :317.4   Mean   : 7.901   Mean   : 7.686   Mean   :23.54  
##  3rd Qu.:483.0   3rd Qu.:10.000   3rd Qu.: 9.000   3rd Qu.:27.00  
##  Max.   :758.0   Max.   :81.000   Max.   :10.000   Max.   :35.00  
##                                                    NA's   :1      
##      INCOME         EMPLMNT         MSTATUS           RACE      
##  Min.   : 1.00   Min.   :0.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 2.50   1st Qu.:0.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 4.00   Median :0.000   Median :2.000   Median :1.000  
##  Mean   : 4.21   Mean   :0.471   Mean   :1.778   Mean   :1.088  
##  3rd Qu.: 6.00   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :10.00   Max.   :1.000   Max.   :2.000   Max.   :2.000  
##  NA's   :26

Veri setinin detaylı istatistikleri için describe fonksiyonu kullanılacaktır; öncesinde veri setinin sınıfı, boyutları ve yapısı incelenir:

# Class vector of the imported object.
class(SCREEN)
## [1] "tbl_df"     "tbl"        "data.frame"
# Dimensions: number of rows, then number of columns.
dim(SCREEN)
## [1] 465   8
# Column-by-column structure, including the attributes attached to each column.
str(SCREEN)
## tibble [465 × 8] (S3: tbl_df/tbl/data.frame)
##  $ SUBNO   : num [1:465] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "label")= chr "Subject number"
##   ..- attr(*, "format.spss")= chr "F3.0"
##  $ TIMEDRS : num [1:465] 1 3 0 13 15 3 2 0 7 4 ...
##   ..- attr(*, "label")= chr "Visits to health professionals"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ ATTDRUG : num [1:465] 8 7 8 9 7 8 7 7 7 8 ...
##   ..- attr(*, "label")= chr "Attitudes toward medication"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ ATTHOUSE: num [1:465] 27 20 23 28 24 25 30 24 20 30 ...
##   ..- attr(*, "label")= chr "Attitudes toward housework"
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ INCOME  : num [1:465] 5 6 3 8 1 4 6 6 2 8 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ EMPLMNT : dbl+lbl [1:465] 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, ...
##    ..@ label      : chr "Whether currently employed"
##    ..@ format.spss: chr "F1.0"
##    ..@ labels     : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "PAIDWORK" "HOUSEWFE"
##  $ MSTATUS : num [1:465] 2 2 2 2 2 2 2 2 2 1 ...
##   ..- attr(*, "label")= chr "Whether currently married"
##   ..- attr(*, "format.spss")= chr "F1.0"
##  $ RACE    : num [1:465] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "format.spss")= chr "F1.0"

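Aktarılan veri setindeki EMPLMNT değişkeni etiketli (labelled) olduğu için describe fonksiyonu hata verdi; bu nedenle tüm değişkenleri numeric’e çevirdim. Not: Hmisc paketi psych paketinden sonra yüklendiği için önek verilmeden çağrılan describe, psych::describe değil Hmisc::describe fonksiyonudur; aşağıdaki çıktı da Hmisc::describe çıktısıdır.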

# Coerce every column to plain numeric so that the labelled column
# (EMPLMNT, <dbl+lbl>) no longer carries its haven class; the result is a
# base data.frame.
SCREEN_clean <- data.frame(lapply(SCREEN, as.numeric))
# Hmisc is attached after psych, so a bare describe() resolves to
# Hmisc::describe. Qualify the call so the result does not depend on the
# order in which the packages were attached.
Hmisc::describe(SCREEN_clean)
## SCREEN_clean 
## 
##  8  Variables      465  Observations
## --------------------------------------------------------------------------------
## SUBNO 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##      465        0      465        1    317.4      313    223.3     28.2 
##      .10      .25      .50      .75      .90      .95 
##     55.4    137.0    314.0    483.0    567.6    590.8 
## 
## lowest :   1   2   3   4   5, highest: 754 755 756 757 758
## --------------------------------------------------------------------------------
## TIMEDRS 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##      465        0       43    0.993    7.901      5.5    9.232      0.0 
##      .10      .25      .50      .75      .90      .95 
##      1.0      2.0      4.0     10.0     17.6     27.0 
## 
## lowest :  0  1  2  3  4, highest: 57 58 60 75 81
## --------------------------------------------------------------------------------
## ATTDRUG 
##        n  missing distinct     Info     Mean  pMedian      Gmd 
##      465        0        6    0.936    7.686      7.5    1.278 
##                                               
## Value          5     6     7     8     9    10
## Frequency     13    60   126   149    95    22
## Proportion 0.028 0.129 0.271 0.320 0.204 0.047
## --------------------------------------------------------------------------------
## ATTHOUSE 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##      464        1       26    0.995    23.54     23.5    4.976    16.15 
##      .10      .25      .50      .75      .90      .95 
##    18.00    21.00    24.00    27.00    29.00    30.00 
## 
## lowest :  2 11 12 13 14, highest: 31 32 33 34 35
## --------------------------------------------------------------------------------
## INCOME 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##      439       26       10     0.98     4.21        4    2.704      1.0 
##      .10      .25      .50      .75      .90      .95 
##      1.0      2.5      4.0      6.0      8.0      9.0 
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency     71    39    79    84    46    36    36    19    14    15
## Proportion 0.162 0.089 0.180 0.191 0.105 0.082 0.082 0.043 0.032 0.034
## --------------------------------------------------------------------------------
## EMPLMNT 
##        n  missing distinct     Info      Sum     Mean 
##      465        0        2    0.747      219    0.471 
## 
## --------------------------------------------------------------------------------
## MSTATUS 
##        n  missing distinct     Info     Mean 
##      465        0        2    0.517    1.778 
##                       
## Value          1     2
## Frequency    103   362
## Proportion 0.222 0.778
## --------------------------------------------------------------------------------
## RACE 
##        n  missing distinct     Info     Mean 
##      465        0        2    0.241    1.088 
##                       
## Value          1     2
## Frequency    424    41
## Proportion 0.912 0.088
## --------------------------------------------------------------------------------

Sunuma hazır tablolar oluşturmak için:

library(gtsummary)
# Summarise columns 2 through 6 (TIMEDRS ... EMPLMNT): continuous variables
# are reported as "min, max" and the number of missing values is always shown.
tbl_summary(
  data = select(SCREEN_clean, 2:6),
  statistic = all_continuous() ~ "{min}, {max}",
  missing = "always"
)
Characteristic N = 4651
TIMEDRS 0, 81
    Unknown 0
ATTDRUG
    5 13 (2.8%)
    6 60 (13%)
    7 126 (27%)
    8 149 (32%)
    9 95 (20%)
    10 22 (4.7%)
    Unknown 0
ATTHOUSE 2.0, 35.0
    Unknown 1
INCOME 1.00, 10.00
    Unknown 26
EMPLMNT 219 (47%)
    Unknown 0
1 Min, Max; n (%)

vtable paketindeki sumtable fonksiyonu kullanılarak özet tablolar oluşturma:

# Summary table with the non-missing count, minimum and maximum per column.
sumtable(
  data = SCREEN_clean,
  summ = c("notNA(x)", "min(x)", "max(x)")
)
Summary Statistics
Variable NotNA Min Max
SUBNO 465 1 758
TIMEDRS 465 0 81
ATTDRUG 465 5 10
ATTHOUSE 464 2 35
INCOME 439 1 10
EMPLMNT 465 0 1
MSTATUS 465 1 2
RACE 465 1 2
# Non-missing count, minimum and maximum per column, with custom headers
# for the three statistics.
st(
  data = SCREEN_clean,
  summ = c("notNA(x)", "min(x)", "max(x)"),
  summ.names = c("Frekans", "Minimum", "Maximum")
)
Summary Statistics
Variable Frekans Minimum Maximum
SUBNO 465 1 758
TIMEDRS 465 0 81
ATTDRUG 465 5 10
ATTHOUSE 464 2 35
INCOME 439 1 10
EMPLMNT 465 0 1
MSTATUS 465 1 2
RACE 465 1 2

Markdown formatında tablolar için:

# Descriptive statistics table (n, mean, sd, min, max) rendered with kable.
# The previous unclass() + as.data.frame() combination discarded the row
# names, so the rows of the table carried no variable labels. as.data.frame()
# on the describe result keeps them; they are promoted to an explicit
# "Variable" column so every row is identified.
desc_stats <- as.data.frame(psych::describe(SCREEN_clean))
kable(
  data.frame(
    Variable = rownames(desc_stats),
    desc_stats[, c("n", "mean", "sd", "min", "max")],
    row.names = NULL
  ),
  digits = 2,
  caption = "Tablo 1\nBetimsel İstatistikler"
)
Tablo 1 Betimsel İstatistikler
Variable n mean sd min max
SUBNO 465 317.38 194.16 1 758
TIMEDRS 465 7.90 10.95 0 81
ATTDRUG 465 7.69 1.16 5 10
ATTHOUSE 464 23.54 4.48 2 35
INCOME 439 4.21 2.42 1 10
EMPLMNT 465 0.47 0.50 0 1
MSTATUS 465 1.78 0.42 1 2
RACE 465 1.09 0.28 1 2
# Detailed per-variable summary: missing count, completeness rate, mean, sd,
# quantiles and an inline histogram.
skim(SCREEN_clean)
Data summary
Name SCREEN_clean
Number of rows 465
Number of columns 8
_______________________
Column type frequency:
numeric 8
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SUBNO 0 1.00 317.38 194.16 1 137.0 314 483 758 ▇▆▆▇▁
TIMEDRS 0 1.00 7.90 10.95 0 2.0 4 10 81 ▇▁▁▁▁
ATTDRUG 0 1.00 7.69 1.16 5 7.0 8 9 10 ▃▇▇▅▁
ATTHOUSE 1 1.00 23.54 4.48 2 21.0 24 27 35 ▁▁▅▇▂
INCOME 26 0.94 4.21 2.42 1 2.5 4 6 10 ▆▇▅▃▂
EMPLMNT 0 1.00 0.47 0.50 0 0.0 0 1 1 ▇▁▁▁▇
MSTATUS 0 1.00 1.78 0.42 1 2.0 2 2 2 ▂▁▁▁▇
RACE 0 1.00 1.09 0.28 1 1.0 1 1 2 ▇▁▁▁▁
# Generate a data profiling report; the log below shows it rendered to
# report.html. This call writes files to disk as a side effect.
create_report(SCREEN_clean)
## 
## 
## processing file: report.rmd
##   |                                             |                                     |   0%  |                                             |.                                    |   2%                                   |                                             |..                                   |   5% [global_options]                  |                                             |...                                  |   7%                                   |                                             |....                                 |  10% [introduce]                       |                                             |....                                 |  12%                                   |                                             |.....                                |  14% [plot_intro]
##   |                                             |......                               |  17%                                   |                                             |.......                              |  19% [data_structure]                  |                                             |........                             |  21%                                   |                                             |.........                            |  24% [missing_profile]
##   |                                             |..........                           |  26%                                   |                                             |...........                          |  29% [univariate_distribution_header]  |                                             |...........                          |  31%                                   |                                             |............                         |  33% [plot_histogram]
##   |                                             |.............                        |  36%                                   |                                             |..............                       |  38% [plot_density]                    |                                             |...............                      |  40%                                   |                                             |................                     |  43% [plot_frequency_bar]              |                                             |.................                    |  45%                                   |                                             |..................                   |  48% [plot_response_bar]               |                                             |..................                   |  50%                                   |                                             |...................                  |  52% [plot_with_bar]                   |                                             |....................                 |  55%                                   |                                             |.....................                |  57% [plot_normal_qq]
##   |                                             |......................               |  60%                                   |                                             |.......................              |  62% [plot_response_qq]                |                                             |........................             |  64%                                   |                                             |.........................            |  67% [plot_by_qq]                      |                                             |..........................           |  69%                                   |                                             |..........................           |  71% [correlation_analysis]
##   |                                             |...........................          |  74%                                   |                                             |............................         |  76% [principal_component_analysis]
##   |                                             |.............................        |  79%                                   |                                             |..............................       |  81% [bivariate_distribution_header]   |                                             |...............................      |  83%                                   |                                             |................................     |  86% [plot_response_boxplot]           |                                             |.................................    |  88%                                   |                                             |.................................    |  90% [plot_by_boxplot]                 |                                             |..................................   |  93%                                   |                                             |...................................  |  95% [plot_response_scatterplot]       |                                             |.................................... |  98%                                   |                                             |.....................................| 100% [plot_by_scatterplot]           
## output file: /Users/kamilgurleyen/Desktop/R/report.knit.md
## /private/var/folders/r0/z3w3wh491q38g6cr3k4vvh_r0000gn/T/AppTranslocation/C0AAA363-406F-4DA7-B98B-0FAFAB7CB38F/d/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/pandoc +RTS -K512m -RTS /Users/kamilgurleyen/Desktop/R/report.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /Users/kamilgurleyen/Desktop/R/report.html --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/table-classes.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/r0/z3w3wh491q38g6cr3k4vvh_r0000gn/T//RtmpUWerk2/rmarkdown-str40b25e13c0c7.html
## 
## Output created: report.html
# Per-variable quantity and percentage of zeros, NAs and infinite values,
# plus the column type and the number of unique values.
df_status(SCREEN_clean)
##   variable q_zeros p_zeros q_na p_na q_inf p_inf    type unique
## 1    SUBNO       0    0.00    0 0.00     0     0 numeric    465
## 2  TIMEDRS      42    9.03    0 0.00     0     0 numeric     43
## 3  ATTDRUG       0    0.00    0 0.00     0     0 numeric      6
## 4 ATTHOUSE       0    0.00    1 0.22     0     0 numeric     26
## 5   INCOME       0    0.00   26 5.59     0     0 numeric     10
## 6  EMPLMNT     246   52.90    0 0.00     0     0 numeric      2
## 7  MSTATUS       0    0.00    0 0.00     0     0 numeric      2
## 8     RACE       0    0.00    0 0.00     0     0 numeric      2