Veri incelemede işe yarayacak kütüphaneler baştan etkinleştirilmiş, ardından genel düzeyde veri inceleme aşamalarına yer verilerek ilerlenmiştir.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven) # import SPSS data files into R
library(psych) # detailed descriptive statistics
# Was a second, redundant library(psych); the comment described
# presentation-ready tables, which is what gtsummary is used for further down.
library(gtsummary) # presentation-ready tables
library(vtable) # summary tables
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(skimr) # detailed summary of the dataset
library(DataExplorer) # generates a dataset report
library(funModeling) # missing and unique values in the dataset
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.6 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
SCREEN adlı SPSS veri setini R ortamına aktarma:
# Read the SPSS file "SCREEN.SAV" (relative path) into R with haven.
SCREEN <- haven::read_sav("SCREEN.SAV")
Veri seti 8 değişken ve 465 gözlemden oluşmaktadır. Veri setinin ilk birkaç satırını görmek için:
# Print the first rows of the imported data.
head(SCREEN)
## # A tibble: 6 × 8
## SUBNO TIMEDRS ATTDRUG ATTHOUSE INCOME EMPLMNT MSTATUS RACE
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl+lbl> <dbl> <dbl>
## 1 1 1 8 27 5 1 [HOUSEWFE] 2 1
## 2 2 3 7 20 6 0 [PAIDWORK] 2 1
## 3 3 0 8 23 3 0 [PAIDWORK] 2 1
## 4 4 13 9 28 8 1 [HOUSEWFE] 2 1
## 5 5 15 7 24 1 1 [HOUSEWFE] 2 1
## 6 6 3 8 25 4 0 [PAIDWORK] 2 1
Veri seti özet istatistikleri:
# Per-column summary: min, quartiles, mean, max and NA counts.
summary(SCREEN)
## SUBNO TIMEDRS ATTDRUG ATTHOUSE
## Min. : 1.0 Min. : 0.000 Min. : 5.000 Min. : 2.00
## 1st Qu.:137.0 1st Qu.: 2.000 1st Qu.: 7.000 1st Qu.:21.00
## Median :314.0 Median : 4.000 Median : 8.000 Median :24.00
## Mean :317.4 Mean : 7.901 Mean : 7.686 Mean :23.54
## 3rd Qu.:483.0 3rd Qu.:10.000 3rd Qu.: 9.000 3rd Qu.:27.00
## Max. :758.0 Max. :81.000 Max. :10.000 Max. :35.00
## NA's :1
## INCOME EMPLMNT MSTATUS RACE
## Min. : 1.00 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.: 2.50 1st Qu.:0.000 1st Qu.:2.000 1st Qu.:1.000
## Median : 4.00 Median :0.000 Median :2.000 Median :1.000
## Mean : 4.21 Mean :0.471 Mean :1.778 Mean :1.088
## 3rd Qu.: 6.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :10.00 Max. :1.000 Max. :2.000 Max. :2.000
## NA's :26
Veri setinin detaylı istatistikleri için psych kütüphanesindeki describe fonksiyonu kullanılacaktır; önce veri setinin sınıfı, boyutları ve yapısı incelenir:
# Class vector of the imported object.
class(SCREEN)
## [1] "tbl_df" "tbl" "data.frame"
# Number of rows and columns.
dim(SCREEN)
## [1] 465 8
# Compact structure: column types plus the label/format attributes kept from SPSS.
str(SCREEN)
## tibble [465 × 8] (S3: tbl_df/tbl/data.frame)
## $ SUBNO : num [1:465] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "label")= chr "Subject number"
## ..- attr(*, "format.spss")= chr "F3.0"
## $ TIMEDRS : num [1:465] 1 3 0 13 15 3 2 0 7 4 ...
## ..- attr(*, "label")= chr "Visits to health professionals"
## ..- attr(*, "format.spss")= chr "F2.0"
## $ ATTDRUG : num [1:465] 8 7 8 9 7 8 7 7 7 8 ...
## ..- attr(*, "label")= chr "Attitudes toward medication"
## ..- attr(*, "format.spss")= chr "F2.0"
## $ ATTHOUSE: num [1:465] 27 20 23 28 24 25 30 24 20 30 ...
## ..- attr(*, "label")= chr "Attitudes toward housework"
## ..- attr(*, "format.spss")= chr "F2.0"
## $ INCOME : num [1:465] 5 6 3 8 1 4 6 6 2 8 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## $ EMPLMNT : dbl+lbl [1:465] 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, ...
## ..@ label : chr "Whether currently employed"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "PAIDWORK" "HOUSEWFE"
## $ MSTATUS : num [1:465] 2 2 2 2 2 2 2 2 2 1 ...
## ..- attr(*, "label")= chr "Whether currently married"
## ..- attr(*, "format.spss")= chr "F1.0"
## $ RACE : num [1:465] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "format.spss")= chr "F1.0"
Aktarılan veri setindeki EMPLMNT değişkeni etiketli (labelled) olduğu için describe fonksiyonu hata verdi; bu nedenle tüm değişkenleri numeric'e çevirdim.
# Coerce every column to plain numeric; per the accompanying text, describe()
# failed on the labelled EMPLMNT column.
SCREEN_clean <- data.frame(lapply(SCREEN, as.numeric))
# Call psych's describe() explicitly: Hmisc (attached as a dependency of
# funModeling) masks psych::describe, so the bare name runs Hmisc::describe.
# NOTE(review): the output recorded below was produced by the masked Hmisc
# version; re-knit the document to refresh it.
psych::describe(SCREEN_clean)
## SCREEN_clean
##
## 8 Variables 465 Observations
## --------------------------------------------------------------------------------
## SUBNO
## n missing distinct Info Mean pMedian Gmd .05
## 465 0 465 1 317.4 313 223.3 28.2
## .10 .25 .50 .75 .90 .95
## 55.4 137.0 314.0 483.0 567.6 590.8
##
## lowest : 1 2 3 4 5, highest: 754 755 756 757 758
## --------------------------------------------------------------------------------
## TIMEDRS
## n missing distinct Info Mean pMedian Gmd .05
## 465 0 43 0.993 7.901 5.5 9.232 0.0
## .10 .25 .50 .75 .90 .95
## 1.0 2.0 4.0 10.0 17.6 27.0
##
## lowest : 0 1 2 3 4, highest: 57 58 60 75 81
## --------------------------------------------------------------------------------
## ATTDRUG
## n missing distinct Info Mean pMedian Gmd
## 465 0 6 0.936 7.686 7.5 1.278
##
## Value 5 6 7 8 9 10
## Frequency 13 60 126 149 95 22
## Proportion 0.028 0.129 0.271 0.320 0.204 0.047
## --------------------------------------------------------------------------------
## ATTHOUSE
## n missing distinct Info Mean pMedian Gmd .05
## 464 1 26 0.995 23.54 23.5 4.976 16.15
## .10 .25 .50 .75 .90 .95
## 18.00 21.00 24.00 27.00 29.00 30.00
##
## lowest : 2 11 12 13 14, highest: 31 32 33 34 35
## --------------------------------------------------------------------------------
## INCOME
## n missing distinct Info Mean pMedian Gmd .05
## 439 26 10 0.98 4.21 4 2.704 1.0
## .10 .25 .50 .75 .90 .95
## 1.0 2.5 4.0 6.0 8.0 9.0
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 71 39 79 84 46 36 36 19 14 15
## Proportion 0.162 0.089 0.180 0.191 0.105 0.082 0.082 0.043 0.032 0.034
## --------------------------------------------------------------------------------
## EMPLMNT
## n missing distinct Info Sum Mean
## 465 0 2 0.747 219 0.471
##
## --------------------------------------------------------------------------------
## MSTATUS
## n missing distinct Info Mean
## 465 0 2 0.517 1.778
##
## Value 1 2
## Frequency 103 362
## Proportion 0.222 0.778
## --------------------------------------------------------------------------------
## RACE
## n missing distinct Info Mean
## 465 0 2 0.241 1.088
##
## Value 1 2
## Frequency 424 41
## Proportion 0.912 0.088
## --------------------------------------------------------------------------------
Sunuma hazır tablolar oluşturmak için:
library(gtsummary)
# Presentation-ready table for columns 2 to 6 (TIMEDRS through EMPLMNT):
# continuous variables are reported as "min, max", and the count of missing
# values is always shown.
tbl_summary(
  data = select(SCREEN_clean, TIMEDRS:EMPLMNT),
  statistic = list(all_continuous() ~ "{min}, {max}"),
  missing = "always"
)
| Characteristic | N = 465¹ |
|---|---|
| TIMEDRS | 0, 81 |
| Unknown | 0 |
| ATTDRUG | |
| 5 | 13 (2.8%) |
| 6 | 60 (13%) |
| 7 | 126 (27%) |
| 8 | 149 (32%) |
| 9 | 95 (20%) |
| 10 | 22 (4.7%) |
| Unknown | 0 |
| ATTHOUSE | 2.0, 35.0 |
| Unknown | 1 |
| INCOME | 1.00, 10.00 |
| Unknown | 26 |
| EMPLMNT | 219 (47%) |
| Unknown | 0 |
| ¹ Min, Max; n (%) | |
vtable paketindeki sumtable fonksiyonu kullanılarak özet tablolar oluşturma:
# vtable summary table: non-missing count, minimum and maximum per variable.
sumtable(
  data = SCREEN_clean,
  summ = c("notNA(x)", "min(x)", "max(x)")
)
| Variable | NotNA | Min | Max |
|---|---|---|---|
| SUBNO | 465 | 1 | 758 |
| TIMEDRS | 465 | 0 | 81 |
| ATTDRUG | 465 | 5 | 10 |
| ATTHOUSE | 464 | 2 | 35 |
| INCOME | 439 | 1 | 10 |
| EMPLMNT | 465 | 0 | 1 |
| MSTATUS | 465 | 1 | 2 |
| RACE | 465 | 1 | 2 |
# Same three statistics via st(), with custom column headers.
st(
  data = SCREEN_clean,
  summ = c("notNA(x)", "min(x)", "max(x)"),
  summ.names = c("Frekans", "Minimum", "Maximum")
)
| Variable | Frekans | Minimum | Maximum |
|---|---|---|---|
| SUBNO | 465 | 1 | 758 |
| TIMEDRS | 465 | 0 | 81 |
| ATTDRUG | 465 | 5 | 10 |
| ATTHOUSE | 464 | 2 | 35 |
| INCOME | 439 | 1 | 10 |
| EMPLMNT | 465 | 0 | 1 |
| MSTATUS | 465 | 1 | 2 |
| RACE | 465 | 1 | 2 |
Markdown formatında tablolar için:
# Markdown table of n, mean, sd, min and max for every variable.
# unclass() + as.data.frame() discards the row names, so the table previously
# rendered without a variable column. The row names are restored from the
# column names of SCREEN_clean (all columns are numeric, so psych::describe()
# yields one row per column in the same order) and printed explicitly.
# NOTE(review): the table recorded below predates this fix; re-knit to refresh.
kable(
  as.data.frame(
    unclass(psych::describe(SCREEN_clean)),
    row.names = names(SCREEN_clean)
  )[c("n", "mean", "sd", "min", "max")],
  digits = 2,
  row.names = TRUE,
  caption = "Tablo 1\nBetimsel İstatistikler"
)
| n | mean | sd | min | max |
|---|---|---|---|---|
| 465 | 317.38 | 194.16 | 1 | 758 |
| 465 | 7.90 | 10.95 | 0 | 81 |
| 465 | 7.69 | 1.16 | 5 | 10 |
| 464 | 23.54 | 4.48 | 2 | 35 |
| 439 | 4.21 | 2.42 | 1 | 10 |
| 465 | 0.47 | 0.50 | 0 | 1 |
| 465 | 1.78 | 0.42 | 1 | 2 |
| 465 | 1.09 | 0.28 | 1 | 2 |
# skimr overview: missing counts, completeness, mean/sd, quantiles and inline histograms.
skim(SCREEN_clean)
| Name | SCREEN_clean |
| Number of rows | 465 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| SUBNO | 0 | 1.00 | 317.38 | 194.16 | 1 | 137.0 | 314 | 483 | 758 | ▇▆▆▇▁ |
| TIMEDRS | 0 | 1.00 | 7.90 | 10.95 | 0 | 2.0 | 4 | 10 | 81 | ▇▁▁▁▁ |
| ATTDRUG | 0 | 1.00 | 7.69 | 1.16 | 5 | 7.0 | 8 | 9 | 10 | ▃▇▇▅▁ |
| ATTHOUSE | 1 | 1.00 | 23.54 | 4.48 | 2 | 21.0 | 24 | 27 | 35 | ▁▁▅▇▂ |
| INCOME | 26 | 0.94 | 4.21 | 2.42 | 1 | 2.5 | 4 | 6 | 10 | ▆▇▅▃▂ |
| EMPLMNT | 0 | 1.00 | 0.47 | 0.50 | 0 | 0.0 | 0 | 1 | 1 | ▇▁▁▁▇ |
| MSTATUS | 0 | 1.00 | 1.78 | 0.42 | 1 | 2.0 | 2 | 2 | 2 | ▂▁▁▁▇ |
| RACE | 0 | 1.00 | 1.09 | 0.28 | 1 | 1.0 | 1 | 1 | 2 | ▇▁▁▁▁ |
# Render the DataExplorer profiling report; the log below shows it is written to report.html.
create_report(SCREEN_clean)
##
##
## processing file: report.rmd
## | | | 0% | |. | 2% | |.. | 5% [global_options] | |... | 7% | |.... | 10% [introduce] | |.... | 12% | |..... | 14% [plot_intro]
## | |...... | 17% | |....... | 19% [data_structure] | |........ | 21% | |......... | 24% [missing_profile]
## | |.......... | 26% | |........... | 29% [univariate_distribution_header] | |........... | 31% | |............ | 33% [plot_histogram]
## | |............. | 36% | |.............. | 38% [plot_density] | |............... | 40% | |................ | 43% [plot_frequency_bar] | |................. | 45% | |.................. | 48% [plot_response_bar] | |.................. | 50% | |................... | 52% [plot_with_bar] | |.................... | 55% | |..................... | 57% [plot_normal_qq]
## | |...................... | 60% | |....................... | 62% [plot_response_qq] | |........................ | 64% | |......................... | 67% [plot_by_qq] | |.......................... | 69% | |.......................... | 71% [correlation_analysis]
## | |........................... | 74% | |............................ | 76% [principal_component_analysis]
## | |............................. | 79% | |.............................. | 81% [bivariate_distribution_header] | |............................... | 83% | |................................ | 86% [plot_response_boxplot] | |................................. | 88% | |................................. | 90% [plot_by_boxplot] | |.................................. | 93% | |................................... | 95% [plot_response_scatterplot] | |.................................... | 98% | |.....................................| 100% [plot_by_scatterplot]
## output file: /Users/kamilgurleyen/Desktop/R/report.knit.md
## /private/var/folders/r0/z3w3wh491q38g6cr3k4vvh_r0000gn/T/AppTranslocation/C0AAA363-406F-4DA7-B98B-0FAFAB7CB38F/d/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/pandoc +RTS -K512m -RTS /Users/kamilgurleyen/Desktop/R/report.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /Users/kamilgurleyen/Desktop/R/report.html --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmarkdown/lua/table-classes.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/r0/z3w3wh491q38g6cr3k4vvh_r0000gn/T//RtmpUWerk2/rmarkdown-str40b25e13c0c7.html
##
## Output created: report.html
# funModeling status table: zeros, NAs, infinite values, type and unique counts per variable.
df_status(SCREEN_clean)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 SUBNO 0 0.00 0 0.00 0 0 numeric 465
## 2 TIMEDRS 42 9.03 0 0.00 0 0 numeric 43
## 3 ATTDRUG 0 0.00 0 0.00 0 0 numeric 6
## 4 ATTHOUSE 0 0.00 1 0.22 0 0 numeric 26
## 5 INCOME 0 0.00 26 5.59 0 0 numeric 10
## 6 EMPLMNT 246 52.90 0 0.00 0 0 numeric 2
## 7 MSTATUS 0 0.00 0 0.00 0 0 numeric 2
## 8 RACE 0 0.00 0 0.00 0 0 numeric 2