compareGroups (without considering weight)
setwd("C:/Users/mvx13/OneDrive - Texas State University/Papers/2025/00_NewPapers/Survey/PewSurvey/W152/Arka/V3")
library(readxl)
library(ggplot2)
library(dplyr)
library(compareGroups)
library(vtree)
library(DT)
df= read.csv("ATP W152 Data_cleanedFin_AC.csv")
df1= df[, c(1:13, 15)]
res1 <- compareGroups(DrCell ~ ., df1, ref = 1, max.ylev = 50,
max.xlev = 50, chisq.test.perm=TRUE, chisq.test.B=10000)
res2= createTable(res1, show.ratio = TRUE)
res2
##
## --------Summary descriptives table by 'DrCell'---------
##
## _________________________________________________________________________________________________
## Major problem Minor problem No Response Not a problem p.overall
## N=4253 N=996 N=13 N=148
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Area: 0.114
## Metropolitan 3746 (88.1%) 855 (85.8%) 12 (92.3%) 123 (83.1%)
## No Response 1 (0.02%) 2 (0.20%) 0 (0.00%) 0 (0.00%)
## Non-metropolitan 506 (11.9%) 139 (14.0%) 1 (7.69%) 25 (16.9%)
## Regn: 0.064
## Midwest 857 (20.2%) 198 (19.9%) 3 (23.1%) 31 (20.9%)
## No Response 1 (0.02%) 2 (0.20%) 0 (0.00%) 0 (0.00%)
## Northeast 703 (16.5%) 193 (19.4%) 5 (38.5%) 34 (23.0%)
## South 1663 (39.1%) 355 (35.6%) 4 (30.8%) 54 (36.5%)
## West 1029 (24.2%) 248 (24.9%) 1 (7.69%) 29 (19.6%)
## Land: 0.101
## Refused 44 (1.03%) 10 (1.00%) 0 (0.00%) 3 (2.03%)
## Rural 948 (22.3%) 237 (23.8%) 3 (23.1%) 47 (31.8%)
## Suburban 2224 (52.3%) 527 (52.9%) 5 (38.5%) 60 (40.5%)
## Urban 1037 (24.4%) 222 (22.3%) 5 (38.5%) 38 (25.7%)
## Age: 0.001
## 18-29 575 (13.5%) 152 (15.3%) 4 (30.8%) 32 (21.6%)
## 30-49 1491 (35.1%) 330 (33.1%) 3 (23.1%) 59 (39.9%)
## 50-64 1103 (25.9%) 208 (20.9%) 3 (23.1%) 28 (18.9%)
## 65+ 1062 (25.0%) 303 (30.4%) 2 (15.4%) 28 (18.9%)
## Refused 22 (0.52%) 3 (0.30%) 1 (7.69%) 1 (0.68%)
## Gender: <0.001
## A man 2089 (49.1%) 527 (52.9%) 5 (38.5%) 68 (45.9%)
## A woman 2127 (50.0%) 457 (45.9%) 6 (46.2%) 77 (52.0%)
## In some other way 22 (0.52%) 7 (0.70%) 0 (0.00%) 2 (1.35%)
## Refused 15 (0.35%) 5 (0.50%) 2 (15.4%) 1 (0.68%)
## Edu: 0.001
## College graduate+ 1805 (42.4%) 458 (46.0%) 5 (38.5%) 41 (27.7%)
## H.S. graduate or less 1116 (26.2%) 271 (27.2%) 5 (38.5%) 66 (44.6%)
## Refused 9 (0.21%) 3 (0.30%) 1 (7.69%) 0 (0.00%)
## Some College 1323 (31.1%) 264 (26.5%) 2 (15.4%) 41 (27.7%)
## Race: 0.124
## Cuban 45 (1.06%) 5 (0.50%) 0 (0.00%) 2 (1.35%)
## Dominican 19 (0.45%) 7 (0.70%) 0 (0.00%) 3 (2.03%)
## Mexican 363 (8.54%) 56 (5.62%) 3 (23.1%) 13 (8.78%)
## No Response 3570 (83.9%) 877 (88.1%) 10 (76.9%) 115 (77.7%)
## Other Central American 36 (0.85%) 5 (0.50%) 0 (0.00%) 1 (0.68%)
## Other country 18 (0.42%) 4 (0.40%) 0 (0.00%) 0 (0.00%)
## Other South American 74 (1.74%) 13 (1.31%) 0 (0.00%) 3 (2.03%)
## Puerto Rican 62 (1.46%) 10 (1.00%) 0 (0.00%) 7 (4.73%)
## Refused 12 (0.28%) 4 (0.40%) 0 (0.00%) 1 (0.68%)
## Salvadoran 20 (0.47%) 4 (0.40%) 0 (0.00%) 1 (0.68%)
## Spanish 34 (0.80%) 11 (1.10%) 0 (0.00%) 2 (1.35%)
## Born: 0.215
## Foreign 699 (16.4%) 156 (15.7%) 4 (30.8%) 23 (15.5%)
## Refused 23 (0.54%) 8 (0.80%) 0 (0.00%) 3 (2.03%)
## U.S. 3531 (83.0%) 832 (83.5%) 9 (69.2%) 122 (82.4%)
## Marr: <0.001
## Divorced 376 (8.84%) 102 (10.2%) 1 (7.69%) 15 (10.1%)
## Living with a partner 394 (9.26%) 83 (8.33%) 0 (0.00%) 21 (14.2%)
## Married 2365 (55.6%) 513 (51.5%) 6 (46.2%) 52 (35.1%)
## Never been married 837 (19.7%) 219 (22.0%) 4 (30.8%) 50 (33.8%)
## Refused 19 (0.45%) 7 (0.70%) 2 (15.4%) 1 (0.68%)
## Separated 70 (1.65%) 20 (2.01%) 0 (0.00%) 2 (1.35%)
## Widowed 192 (4.51%) 52 (5.22%) 0 (0.00%) 7 (4.73%)
## Politics: 0.027
## No Response 2419 (56.9%) 550 (55.2%) 9 (69.2%) 67 (45.3%)
## Refused 156 (3.67%) 32 (3.21%) 2 (15.4%) 7 (4.73%)
## The Democratic Party 885 (20.8%) 216 (21.7%) 1 (7.69%) 46 (31.1%)
## The Republican Party 793 (18.6%) 198 (19.9%) 1 (7.69%) 28 (18.9%)
## Income: <0.001
## $100,000 or more 1399 (32.9%) 320 (32.1%) 3 (23.1%) 19 (12.8%)
## $30,000 to less than $40,000 340 (7.99%) 79 (7.93%) 0 (0.00%) 17 (11.5%)
## $40,000 to less than $50,000 329 (7.74%) 69 (6.93%) 2 (15.4%) 16 (10.8%)
## $50,000 to less than $60,000 308 (7.24%) 59 (5.92%) 2 (15.4%) 9 (6.08%)
## $60,000 to less than $70,000 292 (6.87%) 63 (6.33%) 0 (0.00%) 14 (9.46%)
## $70,000 to less than $80,000 283 (6.65%) 75 (7.53%) 0 (0.00%) 12 (8.11%)
## $80,000 to less than $90,000 251 (5.90%) 44 (4.42%) 0 (0.00%) 9 (6.08%)
## $90,000 to less than $100,000 247 (5.81%) 77 (7.73%) 1 (7.69%) 2 (1.35%)
## Less than $30,000 597 (14.0%) 165 (16.6%) 1 (7.69%) 46 (31.1%)
## Refused 207 (4.87%) 45 (4.52%) 4 (30.8%) 4 (2.70%)
## Ideology: <0.001
## Conservative 1039 (24.4%) 245 (24.6%) 3 (23.1%) 27 (18.2%)
## Liberal 766 (18.0%) 196 (19.7%) 1 (7.69%) 19 (12.8%)
## Moderate 1696 (39.9%) 358 (35.9%) 3 (23.1%) 70 (47.3%)
## Refused 84 (1.98%) 17 (1.71%) 4 (30.8%) 3 (2.03%)
## Very conservative 351 (8.25%) 95 (9.54%) 2 (15.4%) 16 (10.8%)
## Very liberal 317 (7.45%) 85 (8.53%) 0 (0.00%) 13 (8.78%)
## IntUse: <0.001
## About once a day 153 (3.60%) 53 (5.32%) 0 (0.00%) 21 (14.2%)
## Almost constantly 2106 (49.5%) 458 (46.0%) 6 (46.2%) 59 (39.9%)
## Do not use the internet 61 (1.43%) 23 (2.31%) 0 (0.00%) 0 (0.00%)
## Less often 68 (1.60%) 15 (1.51%) 3 (23.1%) 6 (4.05%)
## Refused 17 (0.40%) 4 (0.40%) 1 (7.69%) 1 (0.68%)
## Several times a day 1759 (41.4%) 419 (42.1%) 3 (23.1%) 55 (37.2%)
## Several times a week 89 (2.09%) 24 (2.41%) 0 (0.00%) 6 (4.05%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
Alternative (considering weight)
library(survey)
vars <- c("Area","Regn","Land","Age","Gender","Edu","Race","Born","Marr",
"Politics","Income","Ideology","IntUse","DrCell","Wght")
df1 <- df[, vars]
# treat as categorical (edit if any should be numeric)
cat_vars <- setdiff(vars, "Wght")
df1[cat_vars] <- lapply(df1[cat_vars], function(x) factor(x, exclude = NULL))
# survey design (ids=~1 is OK if you only have weights)
des <- svydesign(ids = ~1, weights = ~Wght, data = df1)
# ---- helper: weighted % by group + survey-weighted chi-square p-value ----
wtab_cat <- function(var, by, design) {
f <- as.formula(paste0("~", var, "+", by))
tab <- svytable(f, design) # weighted counts
colp <- prop.table(tab, margin = 2) * 100 # % within DrCell groups
# design-based Rao-Scott test
p <- tryCatch(svychisq(as.formula(paste0("~", var, "+", by)),
design, statistic = "F")$p.value,
error = function(e) NA_real_)
# return long data.frame
out <- as.data.frame.matrix(round(colp, 2))
out$Level <- rownames(out)
rownames(out) <- NULL
out$Variable <- var
out$p_value <- p
out[, c("Variable","Level", setdiff(names(out), c("Variable","Level")))]
}
# build table for all predictors (exclude DrCell and Wght)
preds <- setdiff(names(df1), c("DrCell","Wght"))
tbl_list <- lapply(preds, wtab_cat, by = "DrCell", design = des)
tbl_weighted <- do.call(rbind, tbl_list)
tbl_weighted
## Variable Level Major problem Minor problem
## 1 Area Metropolitan 86.91 83.55
## 2 Area No Response 0.03 0.19
## 3 Area Non-metropolitan 13.06 16.26
## 4 Regn Midwest 20.60 20.10
## 5 Regn No Response 0.03 0.19
## 6 Regn Northeast 16.74 20.19
## 7 Regn South 39.00 35.23
## 8 Regn West 23.63 24.29
## 9 Land Refused 1.13 0.96
## 10 Land Rural 24.70 26.76
## 11 Land Suburban 49.96 51.24
## 12 Land Urban 24.21 21.05
## 13 Age 18-29 18.89 21.72
## 14 Age 30-49 33.65 30.93
## 15 Age 50-64 25.40 21.04
## 16 Age 65+ 21.60 26.11
## 17 Age Refused 0.46 0.19
## 18 Gender A man 47.34 52.42
## 19 Gender A woman 51.61 45.55
## 20 Gender In some other way 0.56 1.26
## 21 Gender Refused 0.49 0.77
## 22 Edu College graduate+ 34.09 36.51
## 23 Edu H.S. graduate or less 35.13 35.84
## 24 Edu Refused 0.38 0.57
## 25 Edu Some College 30.40 27.08
## 26 Race Cuban 0.69 0.38
## 27 Race Dominican 0.37 0.76
## 28 Race Mexican 9.58 6.04
## 29 Race No Response 83.24 86.69
## 30 Race Other Central American 1.07 0.45
## 31 Race Other country 0.41 0.33
## 32 Race Other South American 1.43 0.99
## 33 Race Puerto Rican 1.67 1.59
## 34 Race Refused 0.32 0.70
## 35 Race Salvadoran 0.61 0.67
## 36 Race Spanish 0.62 1.41
## 37 Born Foreign 15.61 11.89
## 38 Born Refused 0.87 0.58
## 39 Born U.S. 83.53 87.53
## 40 Marr Divorced 9.01 9.54
## 41 Marr Living with a partner 10.15 9.00
## 42 Marr Married 51.34 46.05
## 43 Marr Never been married 23.05 25.78
## 44 Marr Refused 0.44 0.83
## 45 Marr Separated 1.36 2.66
## 46 Marr Widowed 4.65 6.14
## 47 Politics No Response 53.82 52.51
## 48 Politics Refused 6.75 5.83
## 49 Politics The Democratic Party 19.65 22.50
## 50 Politics The Republican Party 19.79 19.16
## 51 Income $100,000 or more 27.25 27.18
## 52 Income $30,000 to less than $40,000 8.99 8.81
## 53 Income $40,000 to less than $50,000 8.88 9.08
## 54 Income $50,000 to less than $60,000 7.26 5.44
## 55 Income $60,000 to less than $70,000 6.93 6.75
## 56 Income $70,000 to less than $80,000 6.58 7.69
## 57 Income $80,000 to less than $90,000 5.99 4.16
## 58 Income $90,000 to less than $100,000 5.10 6.80
## 59 Income Less than $30,000 17.60 19.68
## 60 Income Refused 5.41 4.41
## 61 Ideology Conservative 24.57 24.92
## 62 Ideology Liberal 16.24 18.41
## 63 Ideology Moderate 40.62 36.01
## 64 Ideology Refused 3.02 3.25
## 65 Ideology Very conservative 8.84 9.29
## 66 Ideology Very liberal 6.70 8.12
## 67 IntUse About once a day 4.85 7.74
## 68 IntUse Almost constantly 42.68 39.00
## 69 IntUse Do not use the internet 2.64 3.46
## 70 IntUse Less often 2.88 2.51
## 71 IntUse Refused 0.42 0.41
## 72 IntUse Several times a day 43.66 42.49
## 73 IntUse Several times a week 2.86 4.38
## No Response Not a problem p_value
## 1 94.34 85.54 1.733991e-01
## 2 0.00 0.00 1.733991e-01
## 3 5.66 14.46 1.733991e-01
## 4 35.49 23.10 1.276890e-01
## 5 0.00 0.00 1.276890e-01
## 6 32.21 21.73 1.276890e-01
## 7 30.41 38.76 1.276890e-01
## 8 1.89 16.41 1.276890e-01
## 9 0.00 1.85 1.363321e-01
## 10 25.63 34.87 1.363321e-01
## 11 34.24 36.94 1.363321e-01
## 12 40.13 26.34 1.363321e-01
## 13 33.07 30.87 1.511489e-05
## 14 29.47 36.00 1.511489e-05
## 15 9.55 18.57 1.511489e-05
## 16 19.84 14.47 1.511489e-05
## 17 8.07 0.09 1.511489e-05
## 18 24.92 45.50 3.964284e-09
## 19 55.11 51.15 3.964284e-09
## 20 0.00 1.22 3.964284e-09
## 21 19.97 2.13 3.964284e-09
## 22 28.14 17.00 4.930719e-06
## 23 56.45 60.22 4.930719e-06
## 24 5.95 0.00 4.930719e-06
## 25 9.46 22.78 4.930719e-06
## 26 0.00 0.54 5.690414e-03
## 27 0.00 3.64 5.690414e-03
## 28 29.32 9.20 5.690414e-03
## 29 70.68 75.11 5.690414e-03
## 30 0.00 0.18 5.690414e-03
## 31 0.00 0.00 5.690414e-03
## 32 0.00 4.77 5.690414e-03
## 33 0.00 2.93 5.690414e-03
## 34 0.00 0.42 5.690414e-03
## 35 0.00 2.13 5.690414e-03
## 36 0.00 1.07 5.690414e-03
## 37 36.36 16.04 6.250988e-03
## 38 0.00 4.35 6.250988e-03
## 39 63.64 79.61 6.250988e-03
## 40 5.66 7.78 1.078551e-11
## 41 0.00 12.52 1.078551e-11
## 42 51.00 30.83 1.078551e-11
## 43 25.26 39.72 1.078551e-11
## 44 18.08 0.09 1.078551e-11
## 45 0.00 2.32 1.078551e-11
## 46 0.00 6.74 1.078551e-11
## 47 62.76 41.08 3.078118e-02
## 48 19.84 8.40 3.078118e-02
## 49 11.90 33.45 3.078118e-02
## 50 5.51 17.07 3.078118e-02
## 51 4.89 9.78 1.426458e-05
## 52 0.00 11.32 1.426458e-05
## 53 17.13 14.44 1.426458e-05
## 54 30.28 6.43 1.426458e-05
## 55 0.00 10.07 1.426458e-05
## 56 0.00 8.65 1.426458e-05
## 57 0.00 5.97 1.426458e-05
## 58 8.07 0.42 1.426458e-05
## 59 9.71 28.95 1.426458e-05
## 60 29.92 3.98 1.426458e-05
## 61 20.60 21.71 1.197302e-03
## 62 1.90 12.50 1.197302e-03
## 63 25.25 42.16 1.197302e-03
## 64 29.24 1.72 1.197302e-03
## 65 23.00 10.50 1.197302e-03
## 66 0.00 11.41 1.197302e-03
## 67 0.00 18.32 4.556412e-14
## 68 28.25 32.02 4.556412e-14
## 69 0.00 0.00 4.556412e-14
## 70 39.99 5.82 4.556412e-14
## 71 8.07 1.18 4.556412e-14
## 72 23.70 38.22 4.556412e-14
## 73 0.00 4.43 4.556412e-14
# write.csv(tbl_weighted, "Table1_weighted.csv", row.names = FALSE)
Single Var
single_var <- df %>%
group_by(Age) %>%
summarise(
Count = n(),
Weighted = sum(Wght),
.groups = "drop"
) %>%
mutate(
Unweighted_Percent = 100 * Count / sum(Count),
Weighted_Percent = 100 * Weighted / sum(Weighted)
)
single_var
## # A tibble: 5 × 5
## Age Count Weighted Unweighted_Percent Weighted_Percent
## <chr> <int> <dbl> <dbl> <dbl>
## 1 18-29 763 1074. 14.1 19.8
## 2 30-49 1883 1797. 34.8 33.2
## 3 50-64 1342 1316. 24.8 24.3
## 4 65+ 1395 1201. 25.8 22.2
## 5 Refused 27 23.0 0.499 0.426
Two Var
two_var <- df %>%
group_by(Regn, Land) %>%
summarise(
Count = n(),
Weighted = sum(Wght),
.groups = "drop"
) %>%
group_by(Regn) %>% # <-- cluster by first column
mutate(
Unweighted_Percent = 100 * Count / sum(Count),
Weighted_Percent = 100 * Weighted / sum(Weighted)
) %>%
ungroup()
two_var
## # A tibble: 18 × 6
## Regn Land Count Weighted Unweighted_Percent Weighted_Percent
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Midwest Refused 15 19.9 1.38 1.79
## 2 Midwest Rural 313 341. 28.7 30.5
## 3 Midwest Suburban 532 532. 48.9 47.7
## 4 Midwest Urban 229 223. 21.0 20.0
## 5 No Response Rural 2 2.11 66.7 66.3
## 6 No Response Suburban 1 1.07 33.3 33.7
## 7 Northeast Refused 9 7.88 0.963 0.828
## 8 Northeast Rural 186 221. 19.9 23.2
## 9 Northeast Suburban 496 469. 53.0 49.3
## 10 Northeast Urban 244 254. 26.1 26.7
## 11 South Refused 22 18.7 1.06 0.905
## 12 South Rural 532 596. 25.6 28.8
## 13 South Suburban 1075 1020. 51.8 49.3
## 14 South Urban 447 436. 21.5 21.1
## 15 West Refused 11 14.0 0.842 1.10
## 16 West Rural 202 215. 15.5 17.0
## 17 West Suburban 712 668. 54.5 52.7
## 18 West Urban 382 371. 29.2 29.3