stat_test_Tima
background
load('/Users/korshe/Downloads/data.rda')
library(broom)
library(stargazer)##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(rstatix)##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(ggplot2)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
X<-split(table04c1, table04c1$Age)
A <- X[[1]]
B <- X[[2]]
C <- X[[3]]
D <- X[[4]]
E <- X[[5]]
fitA <- glm(If_voted ~ freq, data = A, family = 'binomial')
fitB <- glm(If_voted ~ freq, data = B, family = 'binomial')
fitC <- glm(If_voted ~ freq, data = C, family = 'binomial')
fitD <- glm(If_voted ~ freq, data = D, family = 'binomial')
fitE <- glm(If_voted ~ freq, data = E, family = 'binomial')
stargazer(fitA, fitB, fitC, fitD, fitE, type = "text")##
## =====================================================================
## Dependent variable:
## ---------------------------------------------------
## If_voted
## (1) (2) (3) (4) (5)
## ---------------------------------------------------------------------
## freq 0.00000 0.00000* 0.00000** 0.00000*** 0.00000***
## (0.00000) (0.00000) (0.00000) (0.00000) (0.00000)
##
## Constant -0.050 -0.383 -0.656** -1.017*** -1.257***
## (0.287) (0.301) (0.329) (0.354) (0.377)
##
## ---------------------------------------------------------------------
## Observations 92 92 92 92 92
## Log Likelihood -63.738 -62.035 -59.600 -55.160 -51.896
## Akaike Inf. Crit. 131.476 128.071 123.201 114.321 107.791
## =====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
tidyfitA <- tidy(fitA)
tidyfitB <- tidy(fitB)
tidyfitC <- tidy(fitC)
tidyfitD <- tidy(fitD)
tidyfitE <- tidy(fitE)
tidyfitA <- subset(tidyfitA, term != "(Intercept)")
tidyfitB <- subset(tidyfitB, term != "(Intercept)")
tidyfitC <- subset(tidyfitC, term != "(Intercept)")
tidyfitD <- subset(tidyfitD, term != "(Intercept)")
tidyfitE <- subset(tidyfitE, term != "(Intercept)")
tidyfitA$OR <- exp(tidyfitA$estimate)
tidyfitA$LL <- exp(tidyfitA$estimate - (1.96 * tidyfitA$std.error))
tidyfitA$UL <- exp(tidyfitA$estimate + (1.96 * tidyfitA$std.error))
tidyfitB$OR <- exp(tidyfitB$estimate)
tidyfitB$LL <- exp(tidyfitB$estimate - (1.96 * tidyfitB$std.error))
tidyfitB$UL <- exp(tidyfitB$estimate + (1.96 * tidyfitB$std.error))
tidyfitC$OR <- exp(tidyfitC$estimate)
tidyfitC$LL <- exp(tidyfitC$estimate - (1.96 * tidyfitC$std.error))
tidyfitC$UL <- exp(tidyfitC$estimate + (1.96 * tidyfitC$std.error))
tidyfitD$OR <- exp(tidyfitD$estimate)
tidyfitD$LL <- exp(tidyfitD$estimate - (1.96 * tidyfitD$std.error))
tidyfitD$UL <- exp(tidyfitD$estimate + (1.96 * tidyfitD$std.error))
tidyfitE$OR <- exp(tidyfitE$estimate)
tidyfitE$LL <- exp(tidyfitE$estimate - (1.96 * tidyfitE$std.error))
tidyfitE$UL <- exp(tidyfitE$estimate + (1.96 * tidyfitE$std.error))
new <- rbind(tidyfitA, tidyfitB, tidyfitC, tidyfitD, tidyfitE)
new[1, 1] = "18-24"
new[2, 1] = "25-34"
new[3, 1] = '35-44'
new[4, 1] = '45-64'
new[5, 1] = '65+'
ggplot(new,
aes(x = term, y = OR, ymin = LL, ymax = UL)) +
geom_pointrange(aes(col = factor(term)),
position=position_dodge(width=0.30)) +
ylab("Odds ratio & 95% CI") +
geom_hline(aes(yintercept = 1)) +
scale_color_discrete(name = "Term") +
xlab("") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))Testing associations
Preparing table from long to wide
table04c1_wide <- spread(table04c1, key = State, value = freq)
table04c1_wide$total <- rowSums(as.matrix(table04c1_wide[,c(3:48)]))
table04c1_wide_key <- as.data.frame(table04c1_wide[,c(1,2,49)])
table04c1_wide_t <- as.data.frame(t(table04c1_wide))
colnames(table04c1_wide_t) <- table04c1_wide_t[1,]
table04c1_wide_t <- table04c1_wide_t[-1,]
colnames(table04c1_wide_t) <- paste(colnames(table04c1_wide_t), table04c1_wide_t[1,])
colnames(table04c1_wide_t) <- gsub(' ','_',colnames(table04c1_wide_t) )
table04c1_wide_t <- table04c1_wide_t[-1,]example of tab
head(table04c1_wide_t)## 18_to_24_not_voted 18_to_24_voted 25_to_34_not_voted 25_to_34_voted
## ALABAMA 233000 194000 272000 315000
## ARIZONA 302000 327000 271000 579000
## ARKANSAS 177000 84000 213000 176000
## CALIFORNIA 1548000 1786000 1981000 2976000
## COLORADO 224000 257000 334000 608000
## CONNECTICUT 130000 141000 147000 225000
## 35_to_44_not_voted 35_to_44_voted 45_to_64_not_voted 45_to_64_voted
## ALABAMA 243000 365000 422000 824000
## ARIZONA 247000 509000 385000 1242000
## ARKANSAS 140000 179000 279000 401000
## CALIFORNIA 1462000 2554000 2439000 5534000
## COLORADO 209000 420000 380000 922000
## CONNECTICUT 138000 228000 282000 660000
## 65+_not_voted 65+_voted
## ALABAMA 299000 549000
## ARIZONA 221000 992000
## ARKANSAS 201000 346000
## CALIFORNIA 1622000 4044000
## COLORADO 214000 631000
## CONNECTICUT 145000 428000
Checking if there is a significant differences between groups
table04c1_wide_t_Numeric <- mutate_all(table04c1_wide_t, function(x) as.numeric(as.character(x)))
t.test(table04c1_wide_t_Numeric$`18_to_24_not_voted`, table04c1_wide_t_Numeric$`18_to_24_voted`)##
## Welch Two Sample t-test
##
## data: table04c1_wide_t_Numeric$`18_to_24_not_voted` and table04c1_wide_t_Numeric$`18_to_24_voted`
## t = -0.079061, df = 91.729, p-value = 0.9372
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -814782.4 752399.4
## sample estimates:
## mean of x mean of y
## 547276.6 578468.1
t.test(table04c1_wide_t_Numeric$`25_to_34_not_voted`, table04c1_wide_t_Numeric$`25_to_34_voted`)##
## Welch Two Sample t-test
##
## data: table04c1_wide_t_Numeric$`25_to_34_not_voted` and table04c1_wide_t_Numeric$`25_to_34_voted`
## t = -0.57371, df = 79.719, p-value = 0.5678
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1539400.8 850464.7
## sample estimates:
## mean of x mean of y
## 667234 1011702
t.test(table04c1_wide_t_Numeric$`35_to_44_not_voted`, table04c1_wide_t_Numeric$`35_to_44_voted`)##
## Welch Two Sample t-test
##
## data: table04c1_wide_t_Numeric$`35_to_44_not_voted` and table04c1_wide_t_Numeric$`35_to_44_voted`
## t = -0.82432, df = 70.57, p-value = 0.4125
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1556810.8 646172.5
## sample estimates:
## mean of x mean of y
## 528212.8 983531.9
t.test(table04c1_wide_t_Numeric$`45_to_64_not_voted`, table04c1_wide_t_Numeric$`45_to_64_voted`)##
## Welch Two Sample t-test
##
## data: table04c1_wide_t_Numeric$`45_to_64_not_voted` and table04c1_wide_t_Numeric$`45_to_64_voted`
## t = -1.1043, df = 61.102, p-value = 0.2738
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3744255 1079915
## sample estimates:
## mean of x mean of y
## 925787.2 2257957.4
t.test(table04c1_wide_t_Numeric$`65+_not_voted`, table04c1_wide_t_Numeric$`65+_voted`)##
## Welch Two Sample t-test
##
## data: table04c1_wide_t_Numeric$`65+_not_voted` and table04c1_wide_t_Numeric$`65+_voted`
## t = -1.257, df = 56.739, p-value = 0.2139
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2850150 652022
## sample estimates:
## mean of x mean of y
## 573872.3 1672936.2
The results show that the diferences between ‘Voted’ and ‘Not voted’ status increase with age: for individuals older than 35 the pvalue of ttest is 0.007806 (below 0.05), which means that voted and not voted groups are different. And for older groups this diference increases (65+ 6.1e-05)
DT::datatable(table04c1,extensions = "Buttons",
options = list(paging = TRUE,
scrollX=TRUE,
searching = TRUE,
ordering = TRUE,
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf') ))table04c1$freq_k <- table04c1$freq/1000000
ggplot(table04c1, aes(x = Age,
y = freq_k,
fill = If_voted)) +
geom_bar(stat = "identity", position ='dodge') +
#facet_wrap(~ test3) +
scale_y_continuous("Number of citizens, million") +
theme(plot.title = element_text(hjust = 0.5), panel.grid.major.x = element_blank()) + theme_light() + xlab('Age group')library(ggplot2)
library(ggpubr)
ggballoonplot(table04c1, x = "If_voted", y = "Age", size = "freq_k",
fill = "freq_k",
ggtheme = theme_bw()) +
scale_fill_viridis_c(option = "C")check state with the biggest difference
# table04c1_wide_t_Numeric <- mutate_all(table04c1_wide_t, function(x) as.numeric(as.character(x)))
table04c1_wide_t_Numeric$difference_18_24 <- table04c1_wide_t_Numeric$`18_to_24_voted`/sum(table04c1_wide_t_Numeric$`18_to_24_not_voted` , table04c1_wide_t_Numeric$`18_to_24_voted`)
table04c1_wide_t_Numeric <- table04c1_wide_t_Numeric[order(table04c1_wide_t_Numeric$difference_18_24, decreasing = T),]
table04c1_wide_t_Numeric$state <- rownames(table04c1_wide_t_Numeric)
head(table04c1_wide_t_Numeric[,c('state','difference_18_24')])## state difference_18_24
## total total 0.25692686
## CALIFORNIA CALIFORNIA 0.03375543
## TEXAS TEXAS 0.01935362
## NEW YORK NEW YORK 0.01319221
## FLORIDA FLORIDA 0.01224721
## ILLINOIS ILLINOIS 0.01101871
# ggplot(breadth_data, aes(x=fct_reorder(Stakeholder, Value), y=Value)) +
library(forcats)
ggplot(data=table04c1_wide_t_Numeric, aes(x=fct_reorder(state,difference_18_24, .desc = F), y=difference_18_24)) + coord_flip() +
geom_bar(stat="identity", fill="steelblue")+
# geom_text(aes(label=difference_18_24), vjust=1.6, color="white", size=3.5)+
theme_minimal() + xlab('Proportion of voted citizens to the whole population by state')