stat_test_Tima

background

load('/Users/korshe/Downloads/data.rda')


library(broom)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
X<-split(table04c1, table04c1$Age)

A <- X[[1]]
B <- X[[2]]
C <- X[[3]]
D <- X[[4]]
E <- X[[5]]

fitA <- glm(If_voted ~ freq, data = A, family = 'binomial')
fitB <- glm(If_voted ~ freq, data = B, family = 'binomial')
fitC <- glm(If_voted ~ freq, data = C, family = 'binomial')
fitD <- glm(If_voted ~ freq, data = D, family = 'binomial')
fitE <- glm(If_voted ~ freq, data = E, family = 'binomial')

stargazer(fitA, fitB, fitC, fitD, fitE, type = "text")
## 
## =====================================================================
##                                   Dependent variable:                
##                   ---------------------------------------------------
##                                        If_voted                      
##                      (1)       (2)       (3)       (4)        (5)    
## ---------------------------------------------------------------------
## freq               0.00000  0.00000*  0.00000** 0.00000*** 0.00000***
##                   (0.00000) (0.00000) (0.00000) (0.00000)  (0.00000) 
##                                                                      
## Constant           -0.050    -0.383   -0.656**  -1.017***  -1.257*** 
##                    (0.287)   (0.301)   (0.329)   (0.354)    (0.377)  
##                                                                      
## ---------------------------------------------------------------------
## Observations         92        92        92         92         92    
## Log Likelihood     -63.738   -62.035   -59.600   -55.160    -51.896  
## Akaike Inf. Crit.  131.476   128.071   123.201   114.321    107.791  
## =====================================================================
## Note:                                     *p<0.1; **p<0.05; ***p<0.01
tidyfitA <- tidy(fitA)
tidyfitB <- tidy(fitB)
tidyfitC <- tidy(fitC)
tidyfitD <- tidy(fitD)
tidyfitE <- tidy(fitE)

tidyfitA <- subset(tidyfitA, term != "(Intercept)")
tidyfitB <- subset(tidyfitB, term != "(Intercept)")
tidyfitC <- subset(tidyfitC, term != "(Intercept)")
tidyfitD <- subset(tidyfitD, term != "(Intercept)")
tidyfitE <- subset(tidyfitE, term != "(Intercept)")

tidyfitA$OR <- exp(tidyfitA$estimate)
tidyfitA$LL <- exp(tidyfitA$estimate - (1.96 * tidyfitA$std.error))
tidyfitA$UL <- exp(tidyfitA$estimate + (1.96 * tidyfitA$std.error))

tidyfitB$OR <- exp(tidyfitB$estimate)
tidyfitB$LL <- exp(tidyfitB$estimate - (1.96 * tidyfitB$std.error))
tidyfitB$UL <- exp(tidyfitB$estimate + (1.96 * tidyfitB$std.error))

tidyfitC$OR <- exp(tidyfitC$estimate)
tidyfitC$LL <- exp(tidyfitC$estimate - (1.96 * tidyfitC$std.error))
tidyfitC$UL <- exp(tidyfitC$estimate + (1.96 * tidyfitC$std.error))

tidyfitD$OR <- exp(tidyfitD$estimate)
tidyfitD$LL <- exp(tidyfitD$estimate - (1.96 * tidyfitD$std.error))
tidyfitD$UL <- exp(tidyfitD$estimate + (1.96 * tidyfitD$std.error))

tidyfitE$OR <- exp(tidyfitE$estimate)
tidyfitE$LL <- exp(tidyfitE$estimate - (1.96 * tidyfitE$std.error))
tidyfitE$UL <- exp(tidyfitE$estimate + (1.96 * tidyfitE$std.error))

new <- rbind(tidyfitA, tidyfitB, tidyfitC, tidyfitD, tidyfitE)

new[1, 1] = "18-24"
new[2, 1] = "25-34"
new[3, 1] = '35-44'
new[4, 1] = '45-64'
new[5, 1] = '65+'

ggplot(new,
       aes(x = term, y = OR, ymin = LL, ymax = UL)) + 
  geom_pointrange(aes(col = factor(term)), 
                  position=position_dodge(width=0.30)) + 
  ylab("Odds ratio & 95% CI") + 
  geom_hline(aes(yintercept = 1)) + 
  scale_color_discrete(name = "Term") + 
  xlab("") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Testing associations

Preparing table from long to wide

table04c1_wide <- spread(table04c1, key = State, value = freq)
table04c1_wide$total <- rowSums(as.matrix(table04c1_wide[,c(3:48)]))
table04c1_wide_key <- as.data.frame(table04c1_wide[,c(1,2,49)])

table04c1_wide_t <- as.data.frame(t(table04c1_wide))
colnames(table04c1_wide_t) <- table04c1_wide_t[1,]
table04c1_wide_t <- table04c1_wide_t[-1,]
colnames(table04c1_wide_t) <- paste(colnames(table04c1_wide_t), table04c1_wide_t[1,])
colnames(table04c1_wide_t)  <- gsub(' ','_',colnames(table04c1_wide_t) )
table04c1_wide_t <- table04c1_wide_t[-1,]

example of tab

head(table04c1_wide_t)
##             18_to_24_not_voted 18_to_24_voted 25_to_34_not_voted 25_to_34_voted
## ALABAMA                 233000         194000             272000         315000
## ARIZONA                 302000         327000             271000         579000
## ARKANSAS                177000          84000             213000         176000
## CALIFORNIA             1548000        1786000            1981000        2976000
## COLORADO                224000         257000             334000         608000
## CONNECTICUT             130000         141000             147000         225000
##             35_to_44_not_voted 35_to_44_voted 45_to_64_not_voted 45_to_64_voted
## ALABAMA                 243000         365000             422000         824000
## ARIZONA                 247000         509000             385000        1242000
## ARKANSAS                140000         179000             279000         401000
## CALIFORNIA             1462000        2554000            2439000        5534000
## COLORADO                209000         420000             380000         922000
## CONNECTICUT             138000         228000             282000         660000
##             65+_not_voted 65+_voted
## ALABAMA            299000    549000
## ARIZONA            221000    992000
## ARKANSAS           201000    346000
## CALIFORNIA        1622000   4044000
## COLORADO           214000    631000
## CONNECTICUT        145000    428000

Checking if there is a significant differences between groups

table04c1_wide_t_Numeric <- mutate_all(table04c1_wide_t, function(x) as.numeric(as.character(x)))
t.test(table04c1_wide_t_Numeric$`18_to_24_not_voted`, table04c1_wide_t_Numeric$`18_to_24_voted`)
## 
##  Welch Two Sample t-test
## 
## data:  table04c1_wide_t_Numeric$`18_to_24_not_voted` and table04c1_wide_t_Numeric$`18_to_24_voted`
## t = -0.079061, df = 91.729, p-value = 0.9372
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -814782.4  752399.4
## sample estimates:
## mean of x mean of y 
##  547276.6  578468.1
t.test(table04c1_wide_t_Numeric$`25_to_34_not_voted`, table04c1_wide_t_Numeric$`25_to_34_voted`)
## 
##  Welch Two Sample t-test
## 
## data:  table04c1_wide_t_Numeric$`25_to_34_not_voted` and table04c1_wide_t_Numeric$`25_to_34_voted`
## t = -0.57371, df = 79.719, p-value = 0.5678
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1539400.8   850464.7
## sample estimates:
## mean of x mean of y 
##    667234   1011702
t.test(table04c1_wide_t_Numeric$`35_to_44_not_voted`, table04c1_wide_t_Numeric$`35_to_44_voted`)
## 
##  Welch Two Sample t-test
## 
## data:  table04c1_wide_t_Numeric$`35_to_44_not_voted` and table04c1_wide_t_Numeric$`35_to_44_voted`
## t = -0.82432, df = 70.57, p-value = 0.4125
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1556810.8   646172.5
## sample estimates:
## mean of x mean of y 
##  528212.8  983531.9
t.test(table04c1_wide_t_Numeric$`45_to_64_not_voted`, table04c1_wide_t_Numeric$`45_to_64_voted`)
## 
##  Welch Two Sample t-test
## 
## data:  table04c1_wide_t_Numeric$`45_to_64_not_voted` and table04c1_wide_t_Numeric$`45_to_64_voted`
## t = -1.1043, df = 61.102, p-value = 0.2738
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3744255  1079915
## sample estimates:
## mean of x mean of y 
##  925787.2 2257957.4
t.test(table04c1_wide_t_Numeric$`65+_not_voted`, table04c1_wide_t_Numeric$`65+_voted`)
## 
##  Welch Two Sample t-test
## 
## data:  table04c1_wide_t_Numeric$`65+_not_voted` and table04c1_wide_t_Numeric$`65+_voted`
## t = -1.257, df = 56.739, p-value = 0.2139
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2850150   652022
## sample estimates:
## mean of x mean of y 
##  573872.3 1672936.2

The results show that the diferences between ‘Voted’ and ‘Not voted’ status increase with age: for individuals older than 35 the pvalue of ttest is 0.007806 (below 0.05), which means that voted and not voted groups are different. And for older groups this diference increases (65+ 6.1e-05)

DT::datatable(table04c1,extensions = "Buttons", 
            options = list(paging = TRUE,
                           scrollX=TRUE, 
                           searching = TRUE,
                           ordering = TRUE,
                           dom = 'Bfrtip',
                           buttons = c('copy', 'csv', 'excel', 'pdf') ))
table04c1$freq_k <- table04c1$freq/1000000
ggplot(table04c1, aes(x = Age, 
                         y = freq_k, 
                         fill = If_voted)) +  
  geom_bar(stat = "identity", position ='dodge') +
  #facet_wrap(~ test3) + 
  scale_y_continuous("Number of citizens, million") +
  theme(plot.title = element_text(hjust = 0.5), panel.grid.major.x = element_blank()) + theme_light() + xlab('Age group')

library(ggplot2)
library(ggpubr)

ggballoonplot(table04c1, x = "If_voted", y = "Age", size = "freq_k",
              fill = "freq_k",
              ggtheme = theme_bw()) +
  scale_fill_viridis_c(option = "C")

check state with the biggest difference

# table04c1_wide_t_Numeric <- mutate_all(table04c1_wide_t, function(x) as.numeric(as.character(x)))


table04c1_wide_t_Numeric$difference_18_24 <-  table04c1_wide_t_Numeric$`18_to_24_voted`/sum(table04c1_wide_t_Numeric$`18_to_24_not_voted` , table04c1_wide_t_Numeric$`18_to_24_voted`)

table04c1_wide_t_Numeric <- table04c1_wide_t_Numeric[order(table04c1_wide_t_Numeric$difference_18_24, decreasing = T),]
table04c1_wide_t_Numeric$state <- rownames(table04c1_wide_t_Numeric)
head(table04c1_wide_t_Numeric[,c('state','difference_18_24')])
##                 state difference_18_24
## total           total       0.25692686
## CALIFORNIA CALIFORNIA       0.03375543
## TEXAS           TEXAS       0.01935362
## NEW YORK     NEW YORK       0.01319221
## FLORIDA       FLORIDA       0.01224721
## ILLINOIS     ILLINOIS       0.01101871
# ggplot(breadth_data, aes(x=fct_reorder(Stakeholder, Value), y=Value)) +
library(forcats)
ggplot(data=table04c1_wide_t_Numeric, aes(x=fct_reorder(state,difference_18_24, .desc = F), y=difference_18_24)) +  coord_flip() + 

  geom_bar(stat="identity", fill="steelblue")+
 # geom_text(aes(label=difference_18_24), vjust=1.6, color="white", size=3.5)+
  theme_minimal() + xlab('Proportion of voted citizens to the whole population by state')