DATA: File1_Demo_ASQ4Test1
As of Sep 12, only the mom’s age variable is still pending. I asked Kimberly if I can be of any assistance. Last update: 12 September 2021
File2_Demo_ASQ3andSupplement
Kimberly, please click here to check the changes I made to the second spreadsheet
Definition: Data preprocessing is the process of converting raw data into a readable, well-structured format that can be used in statistical and psychometric analyses.
pacman::p_load(tidyverse, janitor, arsenal, DT, DataExplorer,summarytools, psych)
# Import the workbook (read_excel() reads the first sheet when `sheet` is not
# given) with an explicit type for each of the 93 declared columns, then keep
# an untouched copy of the raw import.
ds_1 <- readxl::read_excel(
  path = "C:/Users/luisf/Dropbox/Puc-Rio/Projeto - ASQ 4 2021/Datasets/ASQ3_Supplmnt and ASQ4 test1 8.19.21.xlsx",
  col_types = c(
    "text", "numeric", "text", "numeric", "date", "date", # columns 1-6
    rep("numeric", 4),                                    # columns 7-10
    "text", "text", "numeric", "text",                    # columns 11-14
    "numeric", rep("text", 3),                            # columns 15-18
    "numeric", rep("text", 6),                            # columns 19-25
    rep("numeric", 3), "text", "numeric",                 # columns 26-30
    "date", "date", "numeric", "text",                    # columns 31-34
    rep("numeric", 36),                                   # columns 35-70
    rep(c("numeric", "text"), times = 10),                # columns 71-90
    rep("numeric", 3)                                     # columns 91-93
  )
)
backup_ds_1 <- ds_1
Data was imported into R. Some warnings were reported.
Some columns and rows were completely empty (for example, weeks premie, Excel column 6). I’ve excluded these columns.
# Standardise the column names, then drop the rows and columns that are
# completely empty (remove_empty() falls back to both when `which` is unset).
ds_1 <- ds_1 %>%
  clean_names() %>%
  remove_empty()
value for "which" not specified, defaulting to c("rows", "cols")
According to our last e-mail, values equal to “.” (dots) were transformed into missing.
# Recode the "." placeholder as NA.
# Only character columns can contain ".", and na_if() in dplyr >= 1.1 raises
# an error when a numeric or date column is compared with a string, so the
# recode is restricted to character columns instead of excluding the date
# columns by name.
ds_1 <- ds_1 %>%
  mutate(across(where(is.character), ~ na_if(.x, ".")))
ALERT: Kimberly, please check the values of the ASQ-4 variables. The table below shows some inconsistencies. Please check it in Excel. Go to line 3524, column AJ and click there. You’ll see that Excel changed “10” to “10.484962406015”. The same thing happens at line 10476.
on Sep 10, Fixed after talking with Kimberly
# Collect the 30 ASQ-4 item columns (5 domains x 6 items), turn each one into
# a factor, and tabulate every distinct value so out-of-range scores show up.
x <- ds_1 %>%
  select(all_of(
    paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6)
  )) %>%
  mutate(across(everything(), factor))
summary(arsenal::tableby(~ ., data = x))
| Overall (N=17525) | |
|---|---|
| com_a4_1 | |
| N-Miss | 1 |
| 0 | 1098 (6.3%) |
| 5 | 2268 (12.9%) |
| 6 | 1 (0.0%) |
| 7 | 1 (0.0%) |
| 8.75 | 1 (0.0%) |
| 10 | 14153 (80.8%) |
| 10.4210526315789 | 1 (0.0%) |
| 10.484962406015 | 1 (0.0%) |
| com_a4_2 | |
| N-Miss | 1 |
| 0 | 1836 (10.5%) |
| 5 | 2579 (14.7%) |
| 6 | 1 (0.0%) |
| 7 | 1 (0.0%) |
| 9 | 2 (0.0%) |
| 10 | 13105 (74.8%) |
| com_a4_3 | |
| N-Miss | 1 |
| 0 | 2573 (14.7%) |
| 5 | 3344 (19.1%) |
| 7 | 1 (0.0%) |
| 10 | 11605 (66.2%) |
| 10.484962406015 | 1 (0.0%) |
| com_a4_4 | |
| N-Miss | 2 |
| 0 | 2912 (16.6%) |
| 5 | 2823 (16.1%) |
| 7 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 10 | 11786 (67.3%) |
| com_a4_5 | |
| N-Miss | 2 |
| 0 | 4124 (23.5%) |
| 5 | 3346 (19.1%) |
| 7 | 1 (0.0%) |
| 8.75 | 1 (0.0%) |
| 10 | 10050 (57.4%) |
| 55 | 1 (0.0%) |
| com_a4_6 | |
| N-Miss | 2 |
| 0 | 5170 (29.5%) |
| 5 | 3127 (17.8%) |
| 10 | 9226 (52.7%) |
| gm_a4_1 | |
| N-Miss | 1 |
| 0 | 551 (3.1%) |
| 5 | 1075 (6.1%) |
| 7.5 | 1 (0.0%) |
| 8 | 4 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 15891 (90.7%) |
| 20 | 1 (0.0%) |
| gm_a4_2 | |
| N-Miss | 2 |
| 0 | 1054 (6.0%) |
| 5 | 2025 (11.6%) |
| 7.5 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 10 | 14442 (82.4%) |
| gm_a4_3 | |
| N-Miss | 1 |
| 0 | 1613 (9.2%) |
| 5 | 1829 (10.4%) |
| 8 | 1 (0.0%) |
| 10 | 14081 (80.4%) |
| gm_a4_4 | |
| N-Miss | 1 |
| 0 | 1634 (9.3%) |
| 5 | 2207 (12.6%) |
| 8 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 13681 (78.1%) |
| gm_a4_5 | |
| N-Miss | 2 |
| 0 | 1930 (11.0%) |
| 5 | 2765 (15.8%) |
| 10 | 12828 (73.2%) |
| gm_a4_6 | |
| N-Miss | 2 |
| 0 | 3309 (18.9%) |
| 5 | 2350 (13.4%) |
| 7 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 11860 (67.7%) |
| 15 | 1 (0.0%) |
| fm_a4_1 | |
| N-Miss | 2 |
| 0 | 1350 (7.7%) |
| 5 | 2472 (14.1%) |
| 9 | 1 (0.0%) |
| 10 | 13700 (78.2%) |
| fm_a4_2 | |
| N-Miss | 1 |
| 0 | 1859 (10.6%) |
| 1 | 1 (0.0%) |
| 4 | 2 (0.0%) |
| 5 | 2617 (14.9%) |
| 10 | 13045 (74.4%) |
| fm_a4_3 | |
| N-Miss | 1 |
| 0 | 2603 (14.9%) |
| 5 | 3018 (17.2%) |
| 8 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 11901 (67.9%) |
| fm_a4_4 | |
| N-Miss | 1 |
| 0 | 2942 (16.8%) |
| 5 | 3220 (18.4%) |
| 7 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 11360 (64.8%) |
| fm_a4_5 | |
| N-Miss | 1 |
| 0 | 3221 (18.4%) |
| 5 | 3237 (18.5%) |
| 9 | 3 (0.0%) |
| 10 | 11063 (63.1%) |
| fm_a4_6 | |
| N-Miss | 1 |
| 0 | 4547 (25.9%) |
| 3 | 1 (0.0%) |
| 4 | 1 (0.0%) |
| 5 | 3339 (19.1%) |
| 6 | 1 (0.0%) |
| 7 | 2 (0.0%) |
| 8 | 2 (0.0%) |
| 9 | 2 (0.0%) |
| 10 | 9629 (54.9%) |
| cg_a4_1 | |
| N-Miss | 1 |
| 0 | 1429 (8.2%) |
| 5 | 2750 (15.7%) |
| 7 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 9 | 2 (0.0%) |
| 10 | 13341 (76.1%) |
| cg_a4_2 | |
| N-Miss | 1 |
| 0 | 1567 (8.9%) |
| 5 | 2772 (15.8%) |
| 6 | 1 (0.0%) |
| 8 | 2 (0.0%) |
| 10 | 13182 (75.2%) |
| cg_a4_3 | |
| N-Miss | 1 |
| 0 | 1871 (10.7%) |
| 5 | 2868 (16.4%) |
| 6 | 1 (0.0%) |
| 7 | 2 (0.0%) |
| 8 | 1 (0.0%) |
| 10 | 12781 (72.9%) |
| cg_a4_4 | |
| N-Miss | 1 |
| 0 | 2428 (13.9%) |
| 5 | 4162 (23.8%) |
| 6 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 10 | 10932 (62.4%) |
| cg_a4_5 | |
| N-Miss | 2 |
| 0 | 4226 (24.1%) |
| 4 | 1 (0.0%) |
| 5 | 3587 (20.5%) |
| 7 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 9706 (55.4%) |
| cg_a4_6 | |
| N-Miss | 1 |
| 0 | 3140 (17.9%) |
| 5 | 3576 (20.4%) |
| 10 | 10808 (61.7%) |
| ps_a4_1 | |
| N-Miss | 1 |
| 0 | 1096 (6.3%) |
| 4 | 1 (0.0%) |
| 5 | 2188 (12.5%) |
| 7 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 9.72556390977444 | 1 (0.0%) |
| 10 | 14236 (81.2%) |
| ps_a4_2 | |
| N-Miss | 1 |
| 0 | 1963 (11.2%) |
| 5 | 2571 (14.7%) |
| 6 | 1 (0.0%) |
| 8 | 1 (0.0%) |
| 10 | 12988 (74.1%) |
| ps_a4_3 | |
| N-Miss | 1 |
| 0 | 1518 (8.7%) |
| 5 | 2724 (15.5%) |
| 9 | 1 (0.0%) |
| 10 | 13280 (75.8%) |
| 110 | 1 (0.0%) |
| ps_a4_4 | |
| N-Miss | 1 |
| 0 | 1663 (9.5%) |
| 5 | 2654 (15.1%) |
| 10 | 13206 (75.4%) |
| 100 | 1 (0.0%) |
| ps_a4_5 | |
| N-Miss | 2 |
| 0 | 3275 (18.7%) |
| 5 | 3416 (19.5%) |
| 7 | 2 (0.0%) |
| 8 | 2 (0.0%) |
| 9 | 3 (0.0%) |
| 10 | 10825 (61.8%) |
| ps_a4_6 | |
| N-Miss | 1 |
| 0 | 3574 (20.4%) |
| 5 | 3358 (19.2%) |
| 9 | 1 (0.0%) |
| 10 | 10591 (60.4%) |
NA
Except for 0,5, and 10, I’ll change all other values to missing cases.
# Keep only the item scores 0, 5 and 10 in the 30 ASQ-4 item columns; any
# other value becomes NA. The helper table `x` is then removed.
ds_1 <- ds_1 %>%
  mutate(across(
    all_of(paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6)),
    ~ ifelse(.x %in% c(0, 5, 10), .x, NA)
  ))
rm(x)
Please tell me if you agree with my solution.
# Stack the 30 ASQ-4 item columns into one long column and report how often
# each remaining value occurs, with its share of all cells in percent.
ds_1 %>%
  select(all_of(
    paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6)
  )) %>%
  pivot_longer(cols = everything()) %>%
  count(value) %>%
  mutate(percent = round(100 * n / sum(n), digits = 1))
ALERT: Kimberly, please click “3” in the table below. You will see that about 7% (n=1291) of the totals of communication, gross motor, fine motor, problem solving, and personal-social are missing.
on Sep 10, Fixed after talking with Kimberly
# Missing-value profile of every column, most incomplete first, shown as a
# bare table (dom = "t" hides the search box and paging controls).
ds_1 %>%
  profile_missing() %>%
  arrange(desc(num_missing)) %>%
  mutate(pct_missing = formatC(pct_missing)) %>%
  DT::datatable(options = list(dom = "t"))
In the table below, you’ll see the number of missing cases for each source. For example, the source “ASQ BDI” has 519 missing cases.
# For each source, count the missing values in every column whose name
# contains "sum", then append a totals row and show the result.
ds_1 %>%
  group_by(source) %>%
  summarise(across(contains("sum"), ~ sum(is.na(.x)))) %>%
  adorn_totals() %>%
  datatable(options = list(dom = "t"))
NA
I’ll compute the scores for each child and then check the missing cases again
# Compute the five ASQ-4 domain scores as the sum of the six items of each
# domain, ignoring missing items.
# rowSums(across(...)) is vectorised and leaves ds_1 as an ordinary tibble.
# The earlier rowwise()/c_across() version never called ungroup(), so every
# later mutate() on ds_1 was evaluated one row at a time.
# NOTE(review): with na.rm = TRUE a child with all six items missing scores 0,
# not NA (same as before) - confirm that this is intended.
ds_1 <- ds_1 %>%
  mutate(
    csum = rowSums(across(all_of(paste0("com_a4_", 1:6))), na.rm = TRUE),
    gmsum = rowSums(across(all_of(paste0("gm_a4_", 1:6))), na.rm = TRUE),
    fmsum = rowSums(across(all_of(paste0("fm_a4_", 1:6))), na.rm = TRUE),
    cgsum_a4 = rowSums(across(all_of(paste0("cg_a4_", 1:6))), na.rm = TRUE),
    psum_a4 = rowSums(across(all_of(paste0("ps_a4_", 1:6))), na.rm = TRUE)
  )

# Re-check missingness in every column whose name contains "sum".
ds_1 %>%
  select(contains("sum")) %>%
  plot_missing()
Please tell me if you agree with my solution.
id and id_2 are pretty much the same, but for one participant. This was fixed and Kimberly will check the accuracy of this procedure.
# id_2_fix is a plain copy of id.
ds_1 <- mutate(ds_1, id_2_fix = id)
14 participants had gender number 3.
# Frequency of each gender code.
count(ds_1, gender)
I’ve created a new variable (gender_fix) in which these values were replaced with missing. We can change this later.
# gender_fix: 1 -> "Male", 2 -> "Female", anything else -> NA, as a factor.
# ungroup() clears any row-wise grouping left by an earlier step, so the
# factor levels are built over the whole column rather than row by row.
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(
    gender_fix = case_when(
      gender == 1 ~ "Male",
      gender == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    gender_fix = as.factor(gender_fix)
  )
In the current ds, mom education had 6 options. See table below.
# Frequency of each mother-education code.
count(ds_1, momed)
I transformed this 0 value into missing
# momed_fix: the code 0 is treated as missing and the rest becomes a factor.
# ungroup() clears any row-wise grouping left by an earlier step so the
# levels are built over the whole column. factor() already orders the levels
# of a numeric column numerically; the former fct_relevel(sort) re-sorted the
# level labels as text, which would put "10" before "2".
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(
    momed_fix = na_if(momed, 0),
    momed_fix = factor(momed_fix)
  )

ds_1 %>%
  count(momed_fix)
ALERT: Some unknown values are present in this variable. See the table below.
# Frequency of each mother-age value, shown as an interactive table.
datatable(count(ds_1, momage))
Kimberly, this variable was not changed (or fixed)
This variable was previously defined as numeric. I’ve changed to factor
# income_fix: income1 as a factor.
# ungroup() clears any row-wise grouping left by an earlier step so the
# levels are built over the whole column. as.factor() already sorts the
# levels; the former fct_relevel(sort) re-sorted the level labels as text,
# which would put "10" before "2".
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(income_fix = as.factor(income1))

ds_1 %>% count(income_fix)
In the current ds, disability had 4 options. See table below.
# Frequency of each disability code.
count(ds_1, disab)
I’ve created a disab_fix with 0 (no disability), 1 (disability), and missing cases (all other values)
# disab_fix: keeps the codes 0 and 1 (as "0"/"1"), everything else -> NA.
# ungroup() clears any row-wise grouping left by an earlier step, so the
# factor levels are built over the whole column rather than row by row.
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(
    disab_fix = case_when(
      disab == 0 ~ "0",
      disab == 1 ~ "1",
      TRUE ~ NA_character_
    ),
    disab_fix = as.factor(disab_fix)
  )

ds_1 %>% count(disab_fix)
The at-risk variable consisted mostly of dots (16270 missing cases). The value 1 appeared 63 times, 2 appeared 85 times, and 3 appeared 52 times.
# Frequency of each at-risk code.
count(ds_1, atrisk)
Updated on Sep 12. Missing cases were not transformed to 0.
# atrisk_fix: keeps the codes 0-3 as text labels, everything else -> NA.
# ungroup() clears any row-wise grouping left by an earlier step, so the
# factor levels are built over the whole column rather than row by row.
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(
    atrisk_fix = case_when(
      atrisk == 0 ~ "0",
      atrisk == 1 ~ "1",
      atrisk == 2 ~ "2",
      atrisk == 3 ~ "3",
      TRUE ~ NA_character_
    ),
    atrisk_fix = as.factor(atrisk_fix)
  )

ds_1 %>% count(atrisk_fix)
This variable was previously defined as numeric. I’ve changed to factor
# language_fix: language as a factor.
# ungroup() clears any row-wise grouping left by an earlier step, so the
# factor levels are built over the whole column rather than row by row.
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(language_fix = as.factor(language))

ds_1 %>% count(language_fix)
Row number 17527 (Excel) was used to compute the number of questionnaires. However, this count was entered in the “ID” column, so R (incorrectly) treated it as a participant with missing values in almost all variables. I’ve deleted this row.
# Frequency of each questionnaire code before the clean-up.
ds_1 %>% count(quest)

# Drop the spreadsheet's count row (its id cell holds "17524") and store
# quest as a factor.
# filter() discards rows where the condition is NA, so `id != "17524"` alone
# would also silently remove every row with a missing id; is.na(id) keeps
# them. ungroup() clears any row-wise grouping left by an earlier step so the
# factor levels are built over the whole column.
ds_1 <- ds_1 %>%
  ungroup() %>%
  filter(is.na(id) | id != "17524") %>%
  mutate(quest_fix = as.factor(quest))
This variable was previously defined as numeric. I’ve changed to factor
# race_fix: race as a factor.
# ungroup() clears any row-wise grouping left by an earlier step so the
# levels are built over the whole column. as.factor() already sorts the
# levels; the former fct_relevel(sort) re-sorted the level labels as text,
# which would put "10" before "2".
ds_1 <- ds_1 %>%
  ungroup() %>%
  mutate(race_fix = as.factor(race))

ds_1 %>% count(race_fix)
# Flag records whose source is "ASQ4website" as "online" and all others as
# "paper", write the cleaned data to an Excel file in the working directory,
# and open a summary report of the data.
ds_1 <- mutate(
  ds_1,
  website = if_else(source == "ASQ4website", "online", "paper")
)
writexl::write_xlsx(ds_1, "ds_asq4_luis.xlsx")
ds_1 %>%
  dfSummary() %>%
  view()
library(readxl)
# Import the second sheet of the workbook with an explicit type for each of
# the 120 declared columns, then keep an untouched copy of the raw import.
ds_2 <- read_excel(
  path = "C:/Users/luisf/Dropbox/Puc-Rio/Projeto - ASQ 4 2021/Datasets/ASQ3_Supplmnt and ASQ4 test1 8.19.21.xlsx",
  sheet = 2,
  col_types = c(
    "text", "numeric", "text", "numeric", "date", "date", # columns 1-6
    "text", "text", "numeric",                            # columns 7-9
    rep("text", 17),                                      # columns 10-26
    "numeric", "text", "text", "numeric",                 # columns 27-30
    "date", "date", "text", "text",                       # columns 31-34
    rep("numeric", 17),                                   # columns 35-51
    "text",                                               # column 52
    rep("numeric", 18),                                   # columns 53-70
    rep("text", 50)                                       # columns 71-120
  )
)
backup_ds_2 <- ds_2
Data was imported into R. Some warnings were reported.
Internal issue: this step just makes sure the variable names are correctly coded for statistical analysis.
# Standardise the column names.
ds_2 <- ds_2 %>% clean_names()
The number of rows was added in the excel row number 3305. R thinks this row represents a case. I removed this row.
# Drop the spreadsheet's count row (its language cell holds "3302").
# filter() discards rows where the condition is NA, so `language != "3302"`
# alone would also silently remove every row with a missing language;
# is.na(language) keeps them.
ds_2 <- ds_2 %>%
  filter(is.na(language) | language != "3302")
According to our last e-mail, values equal to “.” (dots) were transformed into missing.
# Recode the "." placeholder as NA.
# Only character columns can contain ".", and na_if() in dplyr >= 1.1 raises
# an error when a numeric or date column is compared with a string, so the
# recode is restricted to character columns instead of excluding the date
# columns by name.
ds_2 <- ds_2 %>%
  mutate(across(where(is.character), ~ na_if(.x, ".")))
Some columns were completely empty. Check “eval”, “valid”, “reliab”, and “com_a4_1”.
ALERT: Kimberly, com_a4_1 is 100% missing. Is this all right?
# Number of missing values in each column, as a named integer vector.
# map_int() returns that vector directly; purrr::simplify() is deprecated.
ds_2 %>% map_int(~ sum(is.na(.x)))
source language id_3 quest_4 dob_5 datcom_6 weeks_premature premie_8 age_9
0 0 0 0 0 0 95 9 0
gender race weight momed momage author_15 income1 disab what_disab
18 38 389 349 404 36 324 59 410
who_dx service what_service atrisk eval valid reliab intrtr state
3275 62 354 62 3302 3302 3302 3279 0
zip id_29 quest_30 dob_31 datcom_32 author_33 premie_34 age_35 com_a3_1
3283 0 0 0 0 36 9 1 0
com_a3_2 com_a3_3 com_a3_4 com_a3_5 com_a3_6 c_sum gm_a3_1 gm_a3_2 gm_a3_3
0 0 2 2 1 410 0 1 0
gm_a3_4 gm_a3_5 gm_a3_6 gm_sum fm_a3_1 fm_a3_2 fm_a3_3 fm_a3_4 fm_a3_5
0 0 0 410 1 1 5 1 4
fm_a3_6 fm_sum cg_a3_1 cg_a3_2 cg_a3_3 cg_a3_4 cg_a3_5 cg_a3_6 cg_sum
1 410 2 2 1 0 3 3 410
ps_a3_1 ps_a3_2 ps_a3_3 ps_a3_4 ps_a3_5 ps_a3_6 ps_sum overall_a3_1 overalltxt_a3_1
4 2 0 0 0 0 410 19 3284
overall_a3_2 overalltxt_a3_2 overall_a3_3 overalltxt_a3_3 overall_a3_4 overalltxt_a3_4 overall_a3_5 overalltxt_a3_5 overall_a3_6
22 3249 17 3278 17 3279 16 3287 24
overalltxt_a3_6 overall_a3_7 overalltxt_a3_7 overall_a3_8 overalltxt_a3_8 overall_a3_9 overalltxt_a3_9 overall_a3_10 overalltxt_a3_10
3278 116 3276 121 3272 671 3256 1759 3274
com_a4_1 com_a4_2 com_a4_3 com_a4_4 com_a4_5 com_a4_6 gm_a4_1 gm_a4_2 gm_a4_3
3302 3123 2995 2861 3025 2988 2927 2240 2707
gm_a4_4 gm_a4_5 gm_a4_6 fm_a4_1 fm_a4_2 fm_a4_3 fm_a4_4 fm_a4_5 fm_a4_6
2437 2877 2495 2910 2854 2442 2526 3023 2804
cg_a4_1 cg_a4_2 cg_a4_3 cg_a4_4 cg_a4_5 cg_a4_6 ps_a4_1 ps_a4_2 ps_a4_3
2558 2752 2504 2905 2364 2527 1410 2672 2205
ps_a4_4 ps_a4_5 ps_a4_6
2400 1614 2765
I’ve excluded these columns.
# Drop the rows and columns that are completely empty (remove_empty() falls
# back to both when `which` is unset).
ds_2 <- ds_2 %>% remove_empty()
value for "which" not specified, defaulting to c("rows", "cols")
ALERT: Kimberly, please check the values of the ASQ-3 variables (spreadsheet 2). The table below shows some inconsistencies. Please check it in your Excel file at row 2216, column AS (gm_a3_3) and/or row 2834, column AZ (fm_a3_3).
# Tabulate every distinct value of the 30 ASQ-3 items and the 29 ASQ-4 items
# selected here (com_a4_1 is not among them), each column turned into a
# factor first.
ds_2 %>%
  select(all_of(c(
    paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a3_", 1:6),
    setdiff(
      paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6),
      "com_a4_1"
    )
  ))) %>%
  mutate(across(everything(), factor)) %>%
  arsenal::tableby(~ ., data = .) %>%
  summary()
| Overall (N=3302) | |
|---|---|
| com_a3_1 | |
| 0 | 134 (4.1%) |
| 5 | 266 (8.1%) |
| 8.75 | 1 (0.0%) |
| 10 | 2901 (87.9%) |
| com_a3_2 | |
| 0 | 201 (6.1%) |
| 5 | 370 (11.2%) |
| 10 | 2731 (82.7%) |
| com_a3_3 | |
| 0 | 251 (7.6%) |
| 5 | 496 (15.0%) |
| 10 | 2555 (77.4%) |
| com_a3_4 | |
| N-Miss | 2 |
| 0 | 462 (14.0%) |
| 5 | 497 (15.1%) |
| 10 | 2341 (70.9%) |
| com_a3_5 | |
| N-Miss | 2 |
| 0 | 464 (14.1%) |
| 5 | 529 (16.0%) |
| 8.75 | 1 (0.0%) |
| 10 | 2306 (69.9%) |
| com_a3_6 | |
| N-Miss | 1 |
| 0 | 773 (23.4%) |
| 5 | 561 (17.0%) |
| 6 | 1 (0.0%) |
| 10 | 1966 (59.6%) |
| gm_a3_1 | |
| 0 | 84 (2.5%) |
| 5 | 194 (5.9%) |
| 9 | 1 (0.0%) |
| 10 | 3023 (91.6%) |
| gm_a3_2 | |
| N-Miss | 1 |
| 0 | 76 (2.3%) |
| 5 | 218 (6.6%) |
| 10 | 3007 (91.1%) |
| gm_a3_3 | |
| 0 | 144 (4.4%) |
| 5 | 269 (8.1%) |
| 8 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 2887 (87.4%) |
| gm_a3_4 | |
| 0 | 189 (5.7%) |
| 4 | 1 (0.0%) |
| 5 | 346 (10.5%) |
| 10 | 2766 (83.8%) |
| gm_a3_5 | |
| 0 | 276 (8.4%) |
| 5 | 439 (13.3%) |
| 9 | 1 (0.0%) |
| 10 | 2586 (78.3%) |
| gm_a3_6 | |
| 0 | 448 (13.6%) |
| 5 | 403 (12.2%) |
| 10 | 2451 (74.2%) |
| fm_a3_1 | |
| N-Miss | 1 |
| 0 | 216 (6.5%) |
| 5 | 375 (11.4%) |
| 10 | 2710 (82.1%) |
| fm_a3_2 | |
| N-Miss | 1 |
| 0 | 325 (9.8%) |
| 2 | 1 (0.0%) |
| 5 | 386 (11.7%) |
| 9 | 1 (0.0%) |
| 10 | 2588 (78.4%) |
| fm_a3_3 | |
| N-Miss | 5 |
| 0 | 398 (12.1%) |
| 10 | 2356 (71.5%) |
| 110 | 1 (0.0%) |
| 5 | 542 (16.4%) |
| fm_a3_4 | |
| N-Miss | 1 |
| 0 | 399 (12.1%) |
| 3 | 1 (0.0%) |
| 5 | 612 (18.5%) |
| 8 | 1 (0.0%) |
| 10 | 2287 (69.3%) |
| 19 | 1 (0.0%) |
| fm_a3_5 | |
| N-Miss | 4 |
| 0 | 539 (16.3%) |
| 5 | 500 (15.2%) |
| 7 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 2257 (68.4%) |
| fm_a3_6 | |
| N-Miss | 1 |
| 0 | 593 (18.0%) |
| 3 | 1 (0.0%) |
| 5 | 669 (20.3%) |
| 10 | 2038 (61.7%) |
| cg_a3_1 | |
| N-Miss | 2 |
| 0 | 183 (5.5%) |
| 5 | 233 (7.1%) |
| 10 | 2884 (87.4%) |
| cg_a3_2 | |
| N-Miss | 2 |
| 0 | 241 (7.3%) |
| 2 | 1 (0.0%) |
| 5 | 333 (10.1%) |
| 10 | 2725 (82.6%) |
| cg_a3_3 | |
| N-Miss | 1 |
| 0 | 185 (5.6%) |
| 5 | 344 (10.4%) |
| 10 | 2772 (84.0%) |
| cg_a3_4 | |
| 0 | 306 (9.3%) |
| 5 | 363 (11.0%) |
| 8 | 1 (0.0%) |
| 10 | 2632 (79.7%) |
| cg_a3_5 | |
| N-Miss | 3 |
| 0 | 539 (16.3%) |
| 5 | 514 (15.6%) |
| 10 | 2246 (68.1%) |
| cg_a3_6 | |
| N-Miss | 3 |
| 0 | 497 (15.1%) |
| 5 | 509 (15.4%) |
| 8 | 1 (0.0%) |
| 10 | 2292 (69.5%) |
| ps_a3_1 | |
| N-Miss | 4 |
| 0 | 209 (6.3%) |
| 5 | 404 (12.2%) |
| 8.75 | 1 (0.0%) |
| 10 | 2684 (81.4%) |
| ps_a3_2 | |
| N-Miss | 2 |
| 0 | 180 (5.5%) |
| 3 | 1 (0.0%) |
| 5 | 426 (12.9%) |
| 9 | 1 (0.0%) |
| 10 | 2692 (81.6%) |
| ps_a3_3 | |
| 0 | 194 (5.9%) |
| 5 | 344 (10.4%) |
| 10 | 2763 (83.7%) |
| 110 | 1 (0.0%) |
| ps_a3_4 | |
| 0 | 412 (12.5%) |
| 5 | 674 (20.4%) |
| 10 | 2216 (67.1%) |
| ps_a3_5 | |
| 0 | 480 (14.5%) |
| 5 | 553 (16.7%) |
| 7.5 | 1 (0.0%) |
| 8.75 | 1 (0.0%) |
| 9 | 1 (0.0%) |
| 10 | 2266 (68.6%) |
| ps_a3_6 | |
| 0 | 399 (12.1%) |
| 5 | 606 (18.4%) |
| 7.5 | 1 (0.0%) |
| 10 | 2296 (69.5%) |
| com_a4_2 | |
| N-Miss | 3123 |
| 0 | 18 (10.1%) |
| 10 | 119 (66.5%) |
| 5 | 42 (23.5%) |
| com_a4_3 | |
| N-Miss | 2995 |
| 0 | 4 (1.3%) |
| 10 | 278 (90.6%) |
| 3 | 1 (0.3%) |
| 5 | 24 (7.8%) |
| com_a4_4 | |
| N-Miss | 2861 |
| 0 | 23 (5.2%) |
| 10 | 383 (86.8%) |
| 5 | 35 (7.9%) |
| com_a4_5 | |
| N-Miss | 3025 |
| 0 | 15 (5.4%) |
| 10 | 242 (87.4%) |
| 5 | 20 (7.2%) |
| com_a4_6 | |
| N-Miss | 2988 |
| 0 | 26 (8.3%) |
| 10 | 265 (84.4%) |
| 5 | 23 (7.3%) |
| gm_a4_1 | |
| N-Miss | 2927 |
| 0 | 3 (0.8%) |
| 10 | 354 (94.4%) |
| 20 | 1 (0.3%) |
| 5 | 17 (4.5%) |
| gm_a4_2 | |
| N-Miss | 2240 |
| <U+F739> | 1 (0.1%) |
| 0 | 44 (4.1%) |
| 10 | 890 (83.8%) |
| 5 | 127 (12.0%) |
| gm_a4_3 | |
| N-Miss | 2707 |
| 0 | 52 (8.7%) |
| 10 | 474 (79.7%) |
| 5 | 69 (11.6%) |
| gm_a4_4 | |
| N-Miss | 2437 |
| 0 | 56 (6.5%) |
| 10 | 707 (81.7%) |
| 5 | 102 (11.8%) |
| gm_a4_5 | |
| N-Miss | 2877 |
| 0 | 24 (5.6%) |
| 10 | 365 (85.9%) |
| 5 | 36 (8.5%) |
| gm_a4_6 | |
| N-Miss | 2495 |
| 0 | 31 (3.8%) |
| 10 | 740 (91.7%) |
| 5 | 36 (4.5%) |
| fm_a4_1 | |
| N-Miss | 2910 |
| 0 | 22 (5.6%) |
| 10 | 307 (78.3%) |
| 20 | 1 (0.3%) |
| 5 | 62 (15.8%) |
| fm_a4_2 | |
| N-Miss | 2854 |
| 0 | 17 (3.8%) |
| 10 | 376 (83.9%) |
| 5 | 55 (12.3%) |
| fm_a4_3 | |
| N-Miss | 2442 |
| 0 | 104 (12.1%) |
| 10 | 535 (62.2%) |
| 5 | 221 (25.7%) |
| fm_a4_4 | |
| N-Miss | 2526 |
| 0 | 91 (11.7%) |
| 10 | 599 (77.2%) |
| 5 | 86 (11.1%) |
| fm_a4_5 | |
| N-Miss | 3023 |
| 0 | 25 (9.0%) |
| 10 | 195 (69.9%) |
| 5 | 59 (21.1%) |
| fm_a4_6 | |
| N-Miss | 2804 |
| 0 | 78 (15.7%) |
| 10 | 278 (55.8%) |
| 5 | 142 (28.5%) |
| cg_a4_1 | |
| N-Miss | 2558 |
| 0 | 24 (3.2%) |
| 10 | 636 (85.5%) |
| 5 | 84 (11.3%) |
| cg_a4_2 | |
| N-Miss | 2752 |
| 0 | 25 (4.5%) |
| 10 | 467 (84.9%) |
| 5 | 58 (10.5%) |
| cg_a4_3 | |
| N-Miss | 2504 |
| 0 | 38 (4.8%) |
| 10 | 656 (82.2%) |
| 5 | 104 (13.0%) |
| cg_a4_4 | |
| N-Miss | 2905 |
| 0 | 9 (2.3%) |
| 10 | 349 (87.9%) |
| 5 | 39 (9.8%) |
| cg_a4_5 | |
| N-Miss | 2364 |
| 0 | 83 (8.8%) |
| 10 | 695 (74.1%) |
| 5 | 160 (17.1%) |
| cg_a4_6 | |
| N-Miss | 2527 |
| 0 | 65 (8.4%) |
| 10 | 574 (74.1%) |
| 5 | 136 (17.5%) |
| ps_a4_1 | |
| N-Miss | 1410 |
| 0 | 165 (8.7%) |
| 10 | 1439 (76.1%) |
| 5 | 288 (15.2%) |
| ps_a4_2 | |
| N-Miss | 2672 |
| 0 | 41 (6.5%) |
| 10 | 441 (70.0%) |
| 5 | 148 (23.5%) |
| ps_a4_3 | |
| N-Miss | 2205 |
| 0 | 116 (10.6%) |
| 10 | 845 (77.0%) |
| 5 | 136 (12.4%) |
| ps_a4_4 | |
| N-Miss | 2400 |
| 0 | 57 (6.3%) |
| 10 | 643 (71.3%) |
| 5 | 202 (22.4%) |
| ps_a4_5 | |
| N-Miss | 1614 |
| 0 | 317 (18.8%) |
| 10 | 1048 (62.1%) |
| 5 | 323 (19.1%) |
| ps_a4_6 | |
| N-Miss | 2765 |
| 0 | 67 (12.5%) |
| 10 | 391 (72.8%) |
| 5 | 79 (14.7%) |
NA
Except for 0,5, and 10, I’ll change all other values to missing cases.
# Keep only the item scores 0, 5 and 10 in the ASQ-3 and ASQ-4 item columns
# (com_a4_1 is not among them); any other value becomes NA.
ds_2 <- ds_2 %>%
  mutate(across(
    all_of(c(
      paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a3_", 1:6),
      setdiff(
        paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6),
        "com_a4_1"
      )
    )),
    ~ ifelse(.x %in% c(0, 5, 10), .x, NA)
  ))
Please tell me if you agree with my solution.
# Tabulate the same ASQ-3 and ASQ-4 item columns again (com_a4_1 is not
# among them), each column turned into a factor first.
ds_2 %>%
  select(all_of(c(
    paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a3_", 1:6),
    setdiff(
      paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6),
      "com_a4_1"
    )
  ))) %>%
  mutate(across(everything(), factor)) %>%
  arsenal::tableby(~ ., data = .) %>%
  summary()
| Overall (N=3302) | |
|---|---|
| com_a3_1 | |
| N-Miss | 1 |
| 0 | 134 (4.1%) |
| 5 | 266 (8.1%) |
| 10 | 2901 (87.9%) |
| com_a3_2 | |
| 0 | 201 (6.1%) |
| 5 | 370 (11.2%) |
| 10 | 2731 (82.7%) |
| com_a3_3 | |
| 0 | 251 (7.6%) |
| 5 | 496 (15.0%) |
| 10 | 2555 (77.4%) |
| com_a3_4 | |
| N-Miss | 2 |
| 0 | 462 (14.0%) |
| 5 | 497 (15.1%) |
| 10 | 2341 (70.9%) |
| com_a3_5 | |
| N-Miss | 3 |
| 0 | 464 (14.1%) |
| 5 | 529 (16.0%) |
| 10 | 2306 (69.9%) |
| com_a3_6 | |
| N-Miss | 2 |
| 0 | 773 (23.4%) |
| 5 | 561 (17.0%) |
| 10 | 1966 (59.6%) |
| gm_a3_1 | |
| N-Miss | 1 |
| 0 | 84 (2.5%) |
| 5 | 194 (5.9%) |
| 10 | 3023 (91.6%) |
| gm_a3_2 | |
| N-Miss | 1 |
| 0 | 76 (2.3%) |
| 5 | 218 (6.6%) |
| 10 | 3007 (91.1%) |
| gm_a3_3 | |
| N-Miss | 2 |
| 0 | 144 (4.4%) |
| 5 | 269 (8.2%) |
| 10 | 2887 (87.5%) |
| gm_a3_4 | |
| N-Miss | 1 |
| 0 | 189 (5.7%) |
| 5 | 346 (10.5%) |
| 10 | 2766 (83.8%) |
| gm_a3_5 | |
| N-Miss | 1 |
| 0 | 276 (8.4%) |
| 5 | 439 (13.3%) |
| 10 | 2586 (78.3%) |
| gm_a3_6 | |
| 0 | 448 (13.6%) |
| 5 | 403 (12.2%) |
| 10 | 2451 (74.2%) |
| fm_a3_1 | |
| N-Miss | 1 |
| 0 | 216 (6.5%) |
| 5 | 375 (11.4%) |
| 10 | 2710 (82.1%) |
| fm_a3_2 | |
| N-Miss | 3 |
| 0 | 325 (9.9%) |
| 5 | 386 (11.7%) |
| 10 | 2588 (78.4%) |
| fm_a3_3 | |
| N-Miss | 6 |
| 0 | 398 (12.1%) |
| 10 | 2356 (71.5%) |
| 5 | 542 (16.4%) |
| fm_a3_4 | |
| N-Miss | 4 |
| 0 | 399 (12.1%) |
| 5 | 612 (18.6%) |
| 10 | 2287 (69.3%) |
| fm_a3_5 | |
| N-Miss | 6 |
| 0 | 539 (16.4%) |
| 5 | 500 (15.2%) |
| 10 | 2257 (68.5%) |
| fm_a3_6 | |
| N-Miss | 2 |
| 0 | 593 (18.0%) |
| 5 | 669 (20.3%) |
| 10 | 2038 (61.8%) |
| cg_a3_1 | |
| N-Miss | 2 |
| 0 | 183 (5.5%) |
| 5 | 233 (7.1%) |
| 10 | 2884 (87.4%) |
| cg_a3_2 | |
| N-Miss | 3 |
| 0 | 241 (7.3%) |
| 5 | 333 (10.1%) |
| 10 | 2725 (82.6%) |
| cg_a3_3 | |
| N-Miss | 1 |
| 0 | 185 (5.6%) |
| 5 | 344 (10.4%) |
| 10 | 2772 (84.0%) |
| cg_a3_4 | |
| N-Miss | 1 |
| 0 | 306 (9.3%) |
| 5 | 363 (11.0%) |
| 10 | 2632 (79.7%) |
| cg_a3_5 | |
| N-Miss | 3 |
| 0 | 539 (16.3%) |
| 5 | 514 (15.6%) |
| 10 | 2246 (68.1%) |
| cg_a3_6 | |
| N-Miss | 4 |
| 0 | 497 (15.1%) |
| 5 | 509 (15.4%) |
| 10 | 2292 (69.5%) |
| ps_a3_1 | |
| N-Miss | 5 |
| 0 | 209 (6.3%) |
| 5 | 404 (12.3%) |
| 10 | 2684 (81.4%) |
| ps_a3_2 | |
| N-Miss | 4 |
| 0 | 180 (5.5%) |
| 5 | 426 (12.9%) |
| 10 | 2692 (81.6%) |
| ps_a3_3 | |
| N-Miss | 1 |
| 0 | 194 (5.9%) |
| 5 | 344 (10.4%) |
| 10 | 2763 (83.7%) |
| ps_a3_4 | |
| 0 | 412 (12.5%) |
| 5 | 674 (20.4%) |
| 10 | 2216 (67.1%) |
| ps_a3_5 | |
| N-Miss | 3 |
| 0 | 480 (14.5%) |
| 5 | 553 (16.8%) |
| 10 | 2266 (68.7%) |
| ps_a3_6 | |
| N-Miss | 1 |
| 0 | 399 (12.1%) |
| 5 | 606 (18.4%) |
| 10 | 2296 (69.6%) |
| com_a4_2 | |
| N-Miss | 3123 |
| 0 | 18 (10.1%) |
| 10 | 119 (66.5%) |
| 5 | 42 (23.5%) |
| com_a4_3 | |
| N-Miss | 2996 |
| 0 | 4 (1.3%) |
| 10 | 278 (90.8%) |
| 5 | 24 (7.8%) |
| com_a4_4 | |
| N-Miss | 2861 |
| 0 | 23 (5.2%) |
| 10 | 383 (86.8%) |
| 5 | 35 (7.9%) |
| com_a4_5 | |
| N-Miss | 3025 |
| 0 | 15 (5.4%) |
| 10 | 242 (87.4%) |
| 5 | 20 (7.2%) |
| com_a4_6 | |
| N-Miss | 2988 |
| 0 | 26 (8.3%) |
| 10 | 265 (84.4%) |
| 5 | 23 (7.3%) |
| gm_a4_1 | |
| N-Miss | 2928 |
| 0 | 3 (0.8%) |
| 10 | 354 (94.7%) |
| 5 | 17 (4.5%) |
| gm_a4_2 | |
| N-Miss | 2241 |
| 0 | 44 (4.1%) |
| 10 | 890 (83.9%) |
| 5 | 127 (12.0%) |
| gm_a4_3 | |
| N-Miss | 2707 |
| 0 | 52 (8.7%) |
| 10 | 474 (79.7%) |
| 5 | 69 (11.6%) |
| gm_a4_4 | |
| N-Miss | 2437 |
| 0 | 56 (6.5%) |
| 10 | 707 (81.7%) |
| 5 | 102 (11.8%) |
| gm_a4_5 | |
| N-Miss | 2877 |
| 0 | 24 (5.6%) |
| 10 | 365 (85.9%) |
| 5 | 36 (8.5%) |
| gm_a4_6 | |
| N-Miss | 2495 |
| 0 | 31 (3.8%) |
| 10 | 740 (91.7%) |
| 5 | 36 (4.5%) |
| fm_a4_1 | |
| N-Miss | 2911 |
| 0 | 22 (5.6%) |
| 10 | 307 (78.5%) |
| 5 | 62 (15.9%) |
| fm_a4_2 | |
| N-Miss | 2854 |
| 0 | 17 (3.8%) |
| 10 | 376 (83.9%) |
| 5 | 55 (12.3%) |
| fm_a4_3 | |
| N-Miss | 2442 |
| 0 | 104 (12.1%) |
| 10 | 535 (62.2%) |
| 5 | 221 (25.7%) |
| fm_a4_4 | |
| N-Miss | 2526 |
| 0 | 91 (11.7%) |
| 10 | 599 (77.2%) |
| 5 | 86 (11.1%) |
| fm_a4_5 | |
| N-Miss | 3023 |
| 0 | 25 (9.0%) |
| 10 | 195 (69.9%) |
| 5 | 59 (21.1%) |
| fm_a4_6 | |
| N-Miss | 2804 |
| 0 | 78 (15.7%) |
| 10 | 278 (55.8%) |
| 5 | 142 (28.5%) |
| cg_a4_1 | |
| N-Miss | 2558 |
| 0 | 24 (3.2%) |
| 10 | 636 (85.5%) |
| 5 | 84 (11.3%) |
| cg_a4_2 | |
| N-Miss | 2752 |
| 0 | 25 (4.5%) |
| 10 | 467 (84.9%) |
| 5 | 58 (10.5%) |
| cg_a4_3 | |
| N-Miss | 2504 |
| 0 | 38 (4.8%) |
| 10 | 656 (82.2%) |
| 5 | 104 (13.0%) |
| cg_a4_4 | |
| N-Miss | 2905 |
| 0 | 9 (2.3%) |
| 10 | 349 (87.9%) |
| 5 | 39 (9.8%) |
| cg_a4_5 | |
| N-Miss | 2364 |
| 0 | 83 (8.8%) |
| 10 | 695 (74.1%) |
| 5 | 160 (17.1%) |
| cg_a4_6 | |
| N-Miss | 2527 |
| 0 | 65 (8.4%) |
| 10 | 574 (74.1%) |
| 5 | 136 (17.5%) |
| ps_a4_1 | |
| N-Miss | 1410 |
| 0 | 165 (8.7%) |
| 10 | 1439 (76.1%) |
| 5 | 288 (15.2%) |
| ps_a4_2 | |
| N-Miss | 2672 |
| 0 | 41 (6.5%) |
| 10 | 441 (70.0%) |
| 5 | 148 (23.5%) |
| ps_a4_3 | |
| N-Miss | 2205 |
| 0 | 116 (10.6%) |
| 10 | 845 (77.0%) |
| 5 | 136 (12.4%) |
| ps_a4_4 | |
| N-Miss | 2400 |
| 0 | 57 (6.3%) |
| 10 | 643 (71.3%) |
| 5 | 202 (22.4%) |
| ps_a4_5 | |
| N-Miss | 1614 |
| 0 | 317 (18.8%) |
| 10 | 1048 (62.1%) |
| 5 | 323 (19.1%) |
| ps_a4_6 | |
| N-Miss | 2765 |
| 0 | 67 (12.5%) |
| 10 | 391 (72.8%) |
| 5 | 79 (14.7%) |
NA
This is an internal R issue. You don’t need to worry about it.
# Convert the ASQ-3 and ASQ-4 item columns (com_a4_1 is not among them) to
# numeric.
ds_2 <- ds_2 %>%
  mutate(across(
    all_of(c(
      paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a3_", 1:6),
      setdiff(
        paste0(rep(c("com", "gm", "fm", "cg", "ps"), each = 6), "_a4_", 1:6),
        "com_a4_1"
      )
    )),
    ~ as.numeric(.x)
  ))
Kimberly, in the table below you’ll see the number of missing cases for each source.
For example, the source “LaneCo HST” has 208 missing cases. “YaleChildCenter” has 19.
# For each source, count the missing values in every column whose name
# contains "sum", then append a totals row and show the result.
ds_2 %>%
  group_by(source) %>%
  summarise(across(contains("sum"), ~ sum(is.na(.x)))) %>%
  adorn_totals() %>%
  datatable(options = list(dom = "t"))
NA
I’ll compute the scores for each child and then check the missing cases again
# Compute the five ASQ-3 domain scores as the sum of the six items of each
# domain, ignoring missing items.
# rowSums(across(...)) is vectorised and leaves ds_2 as an ordinary tibble.
# The earlier rowwise()/c_across() version never called ungroup(), so every
# later step on ds_2 ran on a row-wise data frame.
# NOTE(review): with na.rm = TRUE a child with all six items missing scores 0,
# not NA (same as before) - confirm that this is intended.
ds_2 <- ds_2 %>%
  mutate(
    c_sum = rowSums(across(all_of(paste0("com_a3_", 1:6))), na.rm = TRUE),
    gm_sum = rowSums(across(all_of(paste0("gm_a3_", 1:6))), na.rm = TRUE),
    fm_sum = rowSums(across(all_of(paste0("fm_a3_", 1:6))), na.rm = TRUE),
    cg_sum = rowSums(across(all_of(paste0("cg_a3_", 1:6))), na.rm = TRUE),
    ps_sum = rowSums(across(all_of(paste0("ps_a3_", 1:6))), na.rm = TRUE)
  )

# Re-check missingness in every column whose name contains "sum".
ds_2 %>%
  select(contains("sum")) %>%
  plot_missing()
ALERT: Kimberly, 6 children were repeated. Please check the table below.
# Show the ids that occur more than once, together with every column whose
# name contains "sum".
ds_2 %>%
  rename(id = id_3) %>%
  add_count(id) %>%
  filter(n > 1) %>%
  select(id, contains("sum")) %>%
  arrange(id)

# Keep only the first row of each id.
ds_2 <- distinct(ds_2, id_3, .keep_all = TRUE)
I’ll drop the duplicated values. Is this solution adequate?
ALERT: Kimberly, 6 children have different ids across the dataset. Please let me know what to do with them.
# List the id_3 values of the rows whose two id columns disagree.
select(filter(ds_2, id_3 != id_29), id_3)
# Stack the ids of both datasets and list each id that occurs more than once.
# The pipe after filter() was missing, so distinct() ran as a separate call
# with no data frame and failed; it is now part of the chain. The former
# mutate(ds = ...) steps were dropped because select(id) discarded that
# column straight away. ungroup() clears any row-wise grouping left by an
# earlier step so add_count() counts over the whole column.
bind_rows(
  ds_1 %>% ungroup() %>% select(id),
  ds_2 %>% ungroup() %>% select(id = id_3)
) %>%
  add_count(id) %>%
  filter(n > 1) %>%
  distinct(id, .keep_all = TRUE)
# Open a summary report of the second dataset.
ds_2 %>%
  dfSummary() %>%
  view()