## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## Warning: package 'statsr' was built under R version 4.2.3
## Warning: package 'BayesFactor' was built under R version 4.2.3
## Warning: package 'coda' was built under R version 4.2.3
## Warning: package 'Matrix' was built under R version 4.2.3
There is a potential of bias in this dataset based on the possibility of underrepresented groups even after weighting, the ability of individuals to recall information specific to the interview, and the possibility of individuals altering their responses due to the knowledge of being interviewed. Any bias present increases the possibility of Type I and Type II errors in the inferences, so it is essential to minimize this as much as possible.
In total 4 area will be considered of the following types: one numerical variable and one categorical variable of two levels, one numerical variable and one categorical variable of more than two levels, two categorical variables of two levels each, and two categorical variables of more than two levels each.
Area 1: With the advent of technological advances, a possible area of interest is the effect of age on the likelihood of a respondent to complete an in-person or phone-based interview. Is there a significant difference in the average age of respondents by sex?
Area 2: The data set contains various possible responses for political affiliation. From this data, is there any difference average number of Children among the Republican, Democrat, and Independent political parties?
Area 3: For various reasons, individuals may be unable to work and require government aid. Is the proportion of males who received aid greater than the proportion of females who received aid?
Area 4: In the past of the United States, minority groups have unknowingly been experimented upon, and in the modern time this may be linked to a distrust in medicine. Is there a significant difference in the response of individuals by race in their confidence of those who lead medical institutions?
Area 1
## Factor w/ 2 levels "Male","Female": 2 1 2 2 2 1 1 1 2 2 ...
## int [1:57061] 23 70 48 27 61 26 28 27 21 30 ...
## age n
## 1 18 206
## 2 19 777
## 3 20 818
## 4 21 930
## 5 22 970
## 6 23 1130
## 7 24 1109
## 8 25 1231
## 9 26 1216
## 10 27 1253
## 11 28 1314
## 12 29 1177
## 13 30 1289
## 14 31 1200
## 15 32 1291
## 16 33 1232
## 17 34 1262
## 18 35 1247
## 19 36 1230
## 20 37 1205
## 21 38 1221
## 22 39 1087
## 23 40 1154
## 24 41 1092
## 25 42 1076
## 26 43 1102
## 27 44 1043
## 28 45 994
## 29 46 1002
## 30 47 976
## 31 48 989
## 32 49 1011
## 33 50 918
## 34 51 932
## 35 52 897
## 36 53 874
## 37 54 862
## 38 55 781
## 39 56 860
## 40 57 762
## 41 58 810
## 42 59 764
## 43 60 795
## 44 61 709
## 45 62 741
## 46 63 721
## 47 64 631
## 48 65 696
## 49 66 631
## 50 67 683
## 51 68 634
## 52 69 583
## 53 70 610
## 54 71 536
## 55 72 548
## 56 73 488
## 57 74 515
## 58 75 441
## 59 76 439
## 60 77 409
## 61 78 368
## 62 79 320
## 63 80 284
## 64 81 285
## 65 82 248
## 66 83 220
## 67 84 192
## 68 85 168
## 69 86 151
## 70 87 126
## 71 88 99
## 72 89 294
## 73 NA 202
## sex n
## 1 Male 25146
## 2 Female 31915
## sex n
## 1 Male 62
## 2 Female 140
male_age <- gss$age[gss$sex == "Male" & !is.na(gss$age)]
female_age <- gss$age[gss$sex == "Female" & !is.na(gss$age)]
ggplot(gss) +
geom_histogram(aes(x = age)) +
facet_wrap(~sex) +
labs(
title = "Age of Respondents by Sex",
x = "Respondent Age",
y = "Total"
) +
theme_classic()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_bin()`).
## [1] 0.4468543
## [1] 0.4339224
ggplot(gss %>% mutate(age = sqrt(age))) +
geom_histogram(aes(x = age)) +
facet_wrap(~sex) +
labs(
title = "Age of Respondents by Sex",
x = "Respondent Age",
y = "Total"
) +
theme_classic()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_bin()`).
##
## F test to compare two variances
##
## data: male_age and female_age
## F = 0.93135, num df = 25083, denom df = 31774, p-value = 2.772e-09
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9098136 0.9534278
## sample estimates:
## ratio of variances
## 0.9313502
## # A tibble: 2 × 2
## sex `average age`
## <fct> <dbl>
## 1 Male 45.0
## 2 Female 46.3
## [1] 6.58424
## [1] 6.675937
##
## Welch Two Sample t-test
##
## data: male_age and female_age
## t = -8.4458, df = 54637, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.11297786 -0.07041751
## sample estimates:
## mean of x mean of y
## 6.584240 6.675937
Area 2
gss<- gss %>%
mutate(
partyid =
as_factor(case_when(
str_detect(partyid, "Demo") ~ "Democrat",
str_detect(partyid, "Repu") ~ "Republican",
str_detect(partyid, "Ind") ~ "Independent",
str_detect(partyid, "Oth") ~ "Other Party"
)
)
)
gss %>% count(partyid)## partyid n
## 1 Independent 20163
## 2 Democrat 21157
## 3 Republican 14553
## 4 Other Party 861
## 5 <NA> 327
gss %>%
filter(partyid %in% c("Republican", "Democrat", "Independent")) %>%
group_by(partyid) %>%
summarize(`average number of Children` = mean(childs, na.rm = TRUE))## # A tibble: 3 × 2
## partyid `average number of Children`
## <fct> <dbl>
## 1 Independent 1.81
## 2 Democrat 2.09
## 3 Republican 1.96
gss %>%
filter(partyid %in% c("Republican", "Democrat", "Independent")) %>%
ggplot() +
geom_boxplot(aes(x = partyid, y = childs)) +
labs(
title = "Number of Children by Political Affiliation",
x = "Political Affiliation",
y = "Number of Children"
) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))## Warning: Removed 141 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
gss %>%
filter(partyid %in% c("Republican", "Democrat", "Independent")) %>%
ggplot() +
geom_histogram(aes(x = childs)) +
facet_wrap(~partyid) +
labs(
title = "Reported Number of Children by Political Affiliation",
x = "Reported Number of Children",
y = "Total"
) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 141 rows containing non-finite outside the scale range
## (`stat_bin()`).
rep_Children <- gss$childs[gss$partyid == "Republican"]
dem_Children <- gss$childs[gss$partyid == "Democrat"]
ind_Children <- gss$childs[gss$partyid == "Independent"]
skewness(rep_Children, na.rm = TRUE)## [1] 0.9398297
## [1] 1.000218
## [1] 1.075862
gss <- gss %>% mutate(childs = sqrt(childs))
gss %>%
filter(partyid %in% c("Republican", "Democrat", "Independent")) %>%
ggplot() +
geom_histogram(aes(x = childs)) +
facet_wrap(~partyid) +
labs(
title = "Reported Number of Children by Political Affiliation",
x = "Reported Number of Children",
y = "Total"
) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 141 rows containing non-finite outside the scale range
## (`stat_bin()`).
gss %>%
filter(partyid %in% c("Republican", "Democrat", "Independent")) %>%
group_by(partyid) %>%
summarize(`group variance` = var(childs, na.rm = TRUE)) ## # A tibble: 3 × 2
## partyid `group variance`
## <fct> <dbl>
## 1 Independent 0.636
## 2 Democrat 0.653
## 3 Republican 0.603
gss_party <- gss %>% filter(partyid %in% c("Republican", "Democrat", "Independent"))
summary(aov(childs ~ partyid, gss_party))## Df Sum Sq Mean Sq F value Pr(>F)
## partyid 2 138 68.82 108.6 <2e-16 ***
## Residuals 55729 35329 0.63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 141 observations deleted due to missingness
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = childs ~ partyid, data = gss_party)
##
## $partyid
## diff lwr upr p adj
## Democrat-Independent 0.11390851 0.09551857 0.13229846 0.00e+00
## Republican-Independent 0.07777236 0.05744880 0.09809592 0.00e+00
## Republican-Democrat -0.03613615 -0.05625456 -0.01601774 7.59e-05
Area 3
## sex n
## 1 Male 25146
## 2 Female 31915
## govaid n
## 1 Yes 4325
## 2 No 7760
## 3 <NA> 44976
##
## Yes No
## Male 2090 3298
## Female 2235 4462
##
## Yes No
## Male 2090 3298
## Female 2235 4462
## [1] TRUE
## [1] TRUE
## [1] TRUE
## [1] TRUE
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(2090, 2235) out of c(5388, 6697)
## X-squared = 37.887, df = 1, p-value = 3.747e-10
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.03954131 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.3878990 0.3337315
Area 4
## race n
## 1 White 46350
## 2 Black 7926
## 3 Other 2785
## conmedic n
## 1 A Great Deal 17931
## 2 Only Some 17159
## 3 Hardly Any 3222
## 4 <NA> 18749
##
## A Great Deal Only Some Hardly Any
## White 14831 13943 2608
## Black 2353 2471 479
## Other 747 745 135
chis_sq_matrix <- as.matrix(table(gss$race, gss$conmedic))
expected_value <- matrix(data = NA, nrow = 3, ncol = 3)
for(j in 1:3){
for(i in 1:3){
expected_value[i,j] = (sum(chis_sq_matrix[i, ]) * sum(chis_sq_matrix[, j])) / 38312
}
}
expected_value## [,1] [,2] [,3]
## [1,] 14687.5820 14055.2239 2639.1941
## [2,] 2481.9402 2375.0829 445.9769
## [3,] 761.4778 728.6932 136.8290
##
## Pearson's Chi-squared test
##
## data: gss$race and gss$conmedic
## X-squared = 16.347, df = 4, p-value = 0.002587
## [1] 0.710723
## [1] 9.487729