To summarize what we discussed, we are interested in knowing if the
answers to questions 12 ("fellowship_yn"), 15
("enfolded_postgrad_yn"), 16
("private_academic"), 17 ("fellowship_years"),
and 18 ("fellowship_field") differ based on year of
training ("current_year"), debt ("debt"),
gender ("gender"), race ("race"), age
("age"), marital status ("marital_status"),
and family planning ("children").
The data set went from 265 to 257 rows: 8 cases from the original CSV were removed because 50% or fewer of their questions were answered.
# Load the survey export and keep only rows with an answer to the
# fellowship question (fellowship_yn).
df <- read.csv("maggie2.csv")
df <- df[!is.na(df$fellowship_yn), ]

# Keep the 12 analysis columns, chosen by their position in the CSV.
df2 <- df[c(2, 7, 20, 22:23, 31:33, 35, 37:39)]

# Collapse the numeric survey codes into labelled categories. The comparisons
# use `==`, so a missing code stays NA; any other unlisted code falls into the
# final "else" label of each recode.

# Training year: 1-2 junior, 6-7 senior, everything else midlevel.
df2$pgy_cat <- ifelse(
  df2$current_year == 1 | df2$current_year == 2,
  "junior",
  ifelse(
    df2$current_year == 7 | df2$current_year == 6,
    "senior",
    "midlevel"
  )
)

# Fellowship intent: 1-2 yes, 4-5 no, everything else undecided.
df2$fellow_cat <- ifelse(
  df2$fellowship_yn == 1 | df2$fellowship_yn == 2,
  "Probably or Definitely Yes",
  ifelse(
    df2$fellowship_yn == 4 | df2$fellowship_yn == 5,
    "Probably or Definitely No",
    "Undecided"
  )
)

# Age band: codes 2, 3 and 4 are labelled explicitly; every other code is
# labelled "36-45 y/o".
df2$age_cat <- ifelse(
  df2$age == 2, "22-25 y/o",
  ifelse(
    df2$age == 3, "26-30 y/o",
    ifelse(df2$age == 4, "31-35 y/o", "36-45 y/o")
  )
)

# Race: codes 1-5 are labelled explicitly; every other code is
# "Other/No Response".
df2$race_cat <- ifelse(
  df2$race == 1, "Native American or Alaska Native",
  ifelse(
    df2$race == 2, "Asian",
    ifelse(
      df2$race == 3, "Black or African American",
      ifelse(
        df2$race == 4, "Native Hawaiian or other Pacific Islander",
        ifelse(df2$race == 5, "White", "Other/No Response")
      )
    )
  )
)

# Gender: codes 1-4 and 7 are labelled explicitly; every other code is "Other".
df2$gender_cat <- ifelse(
  df2$gender == 1, "Cis Woman",
  ifelse(
    df2$gender == 2, "Cis Man",
    ifelse(
      df2$gender == 3, "NB",
      ifelse(
        df2$gender == 4, "Trans Woman",
        ifelse(df2$gender == 7, "No Response", "Other")
      )
    )
  )
)
# Race analysis set: drop race codes 6 and 7 (both labelled
# "Other/No Response" in race_cat) and rows without a fellowship answer.
#
# The filter is one NA-safe mask. Subsetting with `df2[!df2$race == 7, ]`
# turns every row whose race is NA into an all-NA phantom row; those rows were
# previously removed only as a side effect of the is.na(fellowship_yn) step.
# The rows kept here are the same, without relying on that side effect.
df_race <- df2[
  !is.na(df2$race) &
    !(df2$race %in% c(6, 7)) &
    !is.na(df2$fellowship_yn),
]
# Cross-tabulation of fellowship intent (rows) by race (columns).
race <- table(df_race$fellow_cat, df_race$race_cat)

# The same counts in long format, one row per (fellow_cat, race) cell.
race2 <- as.data.frame(race)
colnames(race2) <- c("fellow_cat", "race", "count")

# Share of each fellowship category within a race, as a whole-number percent.
# setDT() converts race2 to a data.table by reference.
race_pct <- setDT(race2)[
  , .(Sum_Count = sum(count)), keyby = .(race, fellow_cat)
][
  , Count_Pct := round(Sum_Count / sum(Sum_Count), 2) * 100, by = race
][]

# Share of each race within a fellowship category, as a whole-number percent.
race_pct2 <- setDT(race2)[
  , .(sum_count = sum(count)), keyby = .(fellow_cat, race)
][
  , pct := round(sum_count / sum(sum_count), 2) * 100, by = fellow_cat
][]
# Dodged bar chart of the within-race percentages, one bar per fellowship
# category, with the percentage printed above each bar.
# `gghisto` is defined outside this section of the script.
p_all <- ggplot(race_pct, aes(x = race, y = Count_Pct, fill = fellow_cat)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_brewer(palette = "Blues") +
  gghisto +
  ggtitle("fellowship committments for all races") +
  # scale_x_discrete(labels=c("NA or Alaskan","Asian","Black or AA", "Hawaiian or PI","White")) +
  ylim(0, 100) +
  geom_text(
    aes(label = Count_Pct),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  ) +
  theme(axis.text.x = element_text(size = 6, angle = 15))
p_all
Fisher's exact tests between race (race_cat) and fellowship category (fellow_cat)
# Overall race x fellowship-category test from the two raw vectors.
# The p-value is simulated from 2000 replicates.
# NOTE(review): no seed is set in the visible code, so it can vary between runs.
fisher.test(
  x = df_race$race_cat,
  y = df_race$fellow_cat,
  B = 2000,
  simulate.p.value = TRUE
)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: df_race$race_cat and df_race$fellow_cat
p-value = 0.0004998
alternative hypothesis: two.sided
# Same overall test, run on the contingency table instead of the raw vectors.
# The p-value is simulated from 2000 replicates.
fisher.test(
  x = table(df_race$fellow_cat, df_race$race_cat),
  B = 2000,
  simulate.p.value = TRUE
)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: table(df_race$fellow_cat, df_race$race_cat)
p-value = 0.0004998
alternative hypothesis: two.sided
Conclusion from Fisher's exact test: fellowship plans differ significantly by race (simulated p-value = 0.0004998).
Below: pairwise (two-group) Fisher's exact tests.
Native American or Alaska Native is excluded because of its low sample size (n = 4).
# Contingency table of fellowship category (rows) by race (columns), limited
# to the two race groups being compared.
#
# Columns are picked by name rather than by position (14 / 16), so the tables
# do not silently change if the column selection for df2 is edited, and the
# row filter is written once instead of twice per pair.
race_pair_table <- function(group_a, group_b) {
  in_pair <- df_race$race_cat == group_a | df_race$race_cat == group_b
  table(df_race[in_pair, "fellow_cat"], df_race[in_pair, "race_cat"])
}

# One table per pair of race groups.
white.asian <- race_pair_table("White", "Asian")
white.black <- race_pair_table("White", "Black or African American")
white.NHPA <- race_pair_table(
  "White", "Native Hawaiian or other Pacific Islander"
)
asian.black <- race_pair_table("Asian", "Black or African American")
asian.NHPA <- race_pair_table(
  "Asian", "Native Hawaiian or other Pacific Islander"
)
black.NHPA <- race_pair_table(
  "Black or African American", "Native Hawaiian or other Pacific Islander"
)
# Pairwise tests (S = significant, NS = not significant)
# White vs Asian: S
fisher.test(x = white.asian, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: white.asian
p-value = 0.0004998
alternative hypothesis: two.sided
# White vs Black or African American: NS
fisher.test(x = white.black, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: white.black
p-value = 0.07746
alternative hypothesis: two.sided
# White vs Native Hawaiian or other Pacific Islander: S
fisher.test(x = white.NHPA, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: white.NHPA
p-value = 0.0009995
alternative hypothesis: two.sided
# Asian vs Black or African American: S uncorrected, NS after corrections.
fisher.test(x = asian.black, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: asian.black
p-value = 0.01549
alternative hypothesis: two.sided
# Asian vs Native Hawaiian or other Pacific Islander: NS
fisher.test(x = asian.NHPA, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: asian.NHPA
p-value = 1
alternative hypothesis: two.sided
# Black or African American vs Native Hawaiian or other Pacific Islander: NS
# The printed result for this pair is a non-simulated exact test with an odds
# ratio, even though simulate.p.value = TRUE is passed.
fisher.test(x = black.NHPA, B = 2000, simulate.p.value = TRUE)
Fisher's Exact Test for Count Data
data: black.NHPA
p-value = 0.09753
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.6423599 35.5109712
sample estimates:
odds ratio
4.298978
# Summary table of training level, fellowship intent and age, split by race,
# with a p-value per characteristic.
#
# The default exact Fisher test fails on the larger tables here (FEXACT
# workspace errors, p-value omitted), so the Fisher tests are given a simulated
# p-value with 2000 replicates, matching the stand-alone fisher.test() calls.
# Columns are selected by name instead of by position (13:16).
df_race[, c("pgy_cat", "fellow_cat", "age_cat", "race_cat")] %>%
  tbl_summary(by = race_cat) %>%
  add_p(
    test.args = all_tests("fisher.test") ~
      list(simulate.p.value = TRUE, B = 2000)
  )
There was an error in 'add_p()/add_difference()' for variable 'fellow_cat', p-value omitted:
Error in stats::fisher.test(c("Probably or Definitely No", "Probably or Definitely Yes", : FEXACT error 7(location). LDSTP=18630 is too small for this problem,
(pastp=19.1276, ipn_0:=ipoin[itp=423]=279, stp[ipn_0]=16.7221).
Increase workspace or consider using 'simulate.p.value=TRUE'
There was an error in 'add_p()/add_difference()' for variable 'age_cat', p-value omitted:
Error in stats::fisher.test(c("36-45 y/o", "26-30 y/o", "36-45 y/o", "31-35 y/o", : FEXACT error 6. LDKEY=621 is too small for this problem,
(ii := key2[itp=295] = 3790869, ldstp=18630)
Try increasing the size of the workspace and possibly 'mult'
| Characteristic | Asian, N = 321 | Black or African American, N = 221 | Native American or Alaska Native, N = 41 | Native Hawaiian or other Pacific Islander, N = 121 | White, N = 1541 | p-value2 |
|---|---|---|---|---|---|---|
| pgy_cat | 0.053 | |||||
| junior | 5 (16%) | 7 (33%) | 0 (0%) | 7 (58%) | 34 (22%) | |
| midlevel | 18 (56%) | 12 (57%) | 4 (100%) | 3 (25%) | 96 (62%) | |
| senior | 9 (28%) | 2 (9.5%) | 0 (0%) | 2 (17%) | 24 (16%) | |
| Unknown | 0 | 1 | 0 | 0 | 0 | |
| fellow_cat | ||||||
| Probably or Definitely No | 1 (3.1%) | 0 (0%) | 0 (0%) | 0 (0%) | 8 (5.2%) | |
| Probably or Definitely Yes | 17 (53%) | 19 (86%) | 1 (25%) | 7 (58%) | 141 (92%) | |
| Undecided | 14 (44%) | 3 (14%) | 3 (75%) | 5 (42%) | 5 (3.2%) | |
| age_cat | ||||||
| 22-25 y/o | 4 (12%) | 2 (9.1%) | 0 (0%) | 3 (25%) | 18 (12%) | |
| 26-30 y/o | 8 (25%) | 8 (36%) | 1 (25%) | 5 (42%) | 76 (49%) | |
| 31-35 y/o | 16 (50%) | 11 (50%) | 0 (0%) | 4 (33%) | 50 (32%) | |
| 36-45 y/o | 4 (12%) | 1 (4.5%) | 3 (75%) | 0 (0%) | 10 (6.5%) | |
| 1 n (%) | ||||||
| 2 Fisher's exact test | ||||||
Native American or Alaskan: Prob/Def No 0% Prob/Def Yes 25% Unsure 75%
Asian Prob/Def No 3% Prob/Def Yes 53% Unsure 44%
Black or African American Prob/Def No 0% Prob/Def Yes 86% Unsure 14%
Native Hawaiian or Pacific Islander Prob/Def No 0% Prob/Def Yes 58% Unsure 42%
White Prob/Def No 5% Prob/Def Yes 92% Unsure 3%
# Gender analysis set: drop gender codes 5 and 6 (both labelled "Other" in
# gender_cat) and rows without a fellowship answer.
#
# The filter is one NA-safe mask. Subsetting with `df2[!df2$gender == 5, ]`
# turns every row whose gender is NA into an all-NA phantom row; those rows
# were previously removed only as a side effect of the is.na(fellowship_yn)
# step. The rows kept here are the same, without relying on that side effect.
df_gen <- df2[
  !is.na(df2$gender) &
    !(df2$gender %in% c(5, 6)) &
    !is.na(df2$fellowship_yn),
]
#df_gen2 is only cis men, cis women, and NR
# (codes 3 = "NB" and 4 = "Trans Woman" are removed)
df_gen2 <- df_gen[!(df_gen$gender %in% c(3, 4)), ]
# Cross-tabulation of fellowship intent (rows) by gender (columns).
gender <- table(df_gen$fellow_cat, df_gen$gender_cat)

# The same counts in long format, one row per (fellow_cat, gender) cell.
gender2 <- as.data.frame(gender)
colnames(gender2) <- c("fellow_cat", "gender", "count")

# Share of each gender within a fellowship category, as a whole-number percent.
# setDT() converts gender2 to a data.table by reference.
gen_pct <- setDT(gender2)[
  , .(sum_count = sum(count)), keyby = .(fellow_cat, gender)
][
  , pct := round(sum_count / sum(sum_count), 2) * 100, by = fellow_cat
][]

# Share of each fellowship category within a gender, as a whole-number percent.
gen_pct2 <- setDT(gender2)[
  , .(sum_count = sum(count)), keyby = .(fellow_cat, gender)
][
  , pct := round(sum_count / sum(sum_count), 2) * 100, by = gender
][]
# Dodged bar chart of the within-gender percentages, one bar per fellowship
# category, with the percentage printed above each bar.
# `gghisto` is defined outside this section of the script.
p_all <- ggplot(gen_pct2, aes(x = gender, y = pct, fill = fellow_cat)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_brewer(palette = "Blues") +
  gghisto +
  ggtitle("fellowship committments for all genders") +
  # scale_x_discrete(labels=c("Cis Woman","Cis Man","NB", "T Woman","No Response")) +
  ylim(0, 100) +
  geom_text(
    aes(label = pct),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  ) +
  theme(axis.text.x = element_text(size = 12, angle = 15))
p_all
The TW (trans woman) and NB (non-binary) categories are not normally distributed compared to the other 3 categories. “In 2018, 0.7% of matriculating medical students identified as TGNB” (source: https://www.aamc.org/data-reports/students-residents/report/matriculating-student-questionnaire-msq).
# Summary table of training level, fellowship intent, age and race, split by
# gender, with a p-value per characteristic.
#
# The default exact Fisher test fails on the larger tables here (FEXACT errors,
# p-value omitted), so the Fisher tests are given a simulated p-value with
# 2000 replicates, matching the stand-alone fisher.test() calls.
# Columns are selected by name instead of by position (13:17).
df_gen[, c("pgy_cat", "fellow_cat", "age_cat", "race_cat", "gender_cat")] %>%
  tbl_summary(by = gender_cat) %>%
  add_p(
    test.args = all_tests("fisher.test") ~
      list(simulate.p.value = TRUE, B = 2000)
  )
There was an error in 'add_p()/add_difference()' for variable 'age_cat', p-value omitted:
Error in stats::fisher.test(c("36-45 y/o", "26-30 y/o", "26-30 y/o", "36-45 y/o", : FEXACT error 6. LDKEY=621 is too small for this problem,
(ii := key2[itp=562] = 5767166, ldstp=18630)
Try increasing the size of the workspace and possibly 'mult'
There was an error in 'add_p()/add_difference()' for variable 'race_cat', p-value omitted:
Error in stats::fisher.test(c("White", "Other/No Response", "White", "White", : FEXACT[f3xact()] error: hash key 7e+09 > INT_MAX, kyy=153, it[i (= nco = 6)]= -13532483.
Rather set 'simulate.p.value=TRUE'
| Characteristic | Cis Man, N = 1511 | Cis Woman, N = 591 | NB, N = 81 | No Response, N = 111 | Trans Woman, N = 151 | p-value2 |
|---|---|---|---|---|---|---|
| pgy_cat | 0.048 | |||||
| junior | 26 (17%) | 20 (35%) | 4 (50%) | 1 (20%) | 5 (33%) | |
| midlevel | 95 (63%) | 28 (49%) | 2 (25%) | 2 (40%) | 8 (53%) | |
| senior | 30 (20%) | 9 (16%) | 2 (25%) | 2 (40%) | 2 (13%) | |
| Unknown | 0 | 2 | 0 | 6 | 0 | |
| fellow_cat | <0.001 | |||||
| Probably or Definitely No | 5 (3.3%) | 4 (6.8%) | 0 (0%) | 1 (9.1%) | 0 (0%) | |
| Probably or Definitely Yes | 133 (88%) | 51 (86%) | 5 (62%) | 9 (82%) | 6 (40%) | |
| Undecided | 13 (8.6%) | 4 (6.8%) | 3 (38%) | 1 (9.1%) | 9 (60%) | |
| age_cat | ||||||
| 22-25 y/o | 18 (12%) | 4 (7.0%) | 0 (0%) | 0 (0%) | 1 (6.7%) | |
| 26-30 y/o | 61 (40%) | 32 (56%) | 2 (25%) | 3 (27%) | 5 (33%) | |
| 31-35 y/o | 57 (38%) | 16 (28%) | 5 (62%) | 5 (45%) | 7 (47%) | |
| 36-45 y/o | 15 (9.9%) | 5 (8.8%) | 1 (12%) | 3 (27%) | 2 (13%) | |
| Unknown | 0 | 2 | 0 | 0 | 0 | |
| race_cat | ||||||
| Asian | 15 (9.9%) | 6 (10%) | 0 (0%) | 0 (0%) | 9 (60%) | |
| Black or African American | 15 (9.9%) | 1 (1.7%) | 1 (12%) | 1 (9.1%) | 2 (13%) | |
| Native American or Alaska Native | 4 (2.6%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Native Hawaiian or other Pacific Islander | 3 (2.0%) | 0 (0%) | 6 (75%) | 0 (0%) | 3 (20%) | |
| Other/No Response | 11 (7.3%) | 4 (6.9%) | 0 (0%) | 10 (91%) | 0 (0%) | |
| White | 103 (68%) | 47 (81%) | 1 (12%) | 0 (0%) | 1 (6.7%) | |
| Unknown | 0 | 1 | 0 | 0 | 0 | |
| 1 n (%) | ||||||
| 2 Fisher's exact test | ||||||
# Rebuild the long-format counts from df_gen2 (the reduced gender set).
gender2 <- as.data.frame(
  table(df_gen2$fellow_cat, df_gen2$gender_cat)
)
colnames(gender2) <- c("fellow_cat", "gender", "count")

# Share of each gender within a fellowship category, as a whole-number percent.
# setDT() converts gender2 to a data.table by reference.
gen_pct <- setDT(gender2)[
  , .(sum_count = sum(count)), keyby = .(fellow_cat, gender)
][
  , pct := round(sum_count / sum(sum_count), 2) * 100, by = fellow_cat
][]

# Share of each fellowship category within a gender, as a whole-number percent.
gen_pct2 <- setDT(gender2)[
  , .(sum_count = sum(count)), keyby = .(fellow_cat, gender)
][
  , pct := round(sum_count / sum(sum_count), 2) * 100, by = gender
][]
# Dodged bar chart of the within-gender percentages for the reduced gender set.
# NOTE(review): the title still reads "all genders" although gen_pct2 is built
# from df_gen2 here - confirm whether that is intended.
# `gghisto` is defined outside this section of the script.
p_all <- ggplot(gen_pct2, aes(x = gender, y = pct, fill = fellow_cat)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_brewer(palette = "Blues") +
  gghisto +
  ggtitle("fellowship committments for all genders") +
  ylim(0, 100) +
  geom_text(
    aes(label = pct),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )
p_all
Fisher's exact tests between gender (gender_cat) and fellowship category (fellow_cat)
# Gender x fellowship-category test on the reduced gender set, from the two
# raw vectors. The p-value is simulated from 2000 replicates.
# NOTE(review): no seed is set in the visible code, so it can vary between runs.
fisher.test(
  x = df_gen2$gender_cat,
  y = df_gen2$fellow_cat,
  B = 2000,
  simulate.p.value = TRUE
)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: df_gen2$gender_cat and df_gen2$fellow_cat
p-value = 0.4973
alternative hypothesis: two.sided
# Same test, run on the contingency table instead of the raw vectors.
# The p-value is simulated from 2000 replicates.
fisher.test(
  x = table(df_gen2$fellow_cat, df_gen2$gender_cat),
  B = 2000,
  simulate.p.value = TRUE
)
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: table(df_gen2$fellow_cat, df_gen2$gender_cat)
p-value = 0.5012
alternative hypothesis: two.sided
# Summary table of training level, fellowship intent, age and race, split by
# gender, for the reduced gender set (df_gen2), with a p-value per
# characteristic.
#
# The default exact Fisher test fails on the race_cat table here (FEXACT
# workspace error, p-value omitted), so the Fisher tests are given a simulated
# p-value with 2000 replicates, matching the stand-alone fisher.test() calls.
# Columns are selected by name instead of by position (13:17).
df_gen2[, c("pgy_cat", "fellow_cat", "age_cat", "race_cat", "gender_cat")] %>%
  tbl_summary(by = gender_cat) %>%
  add_p(
    test.args = all_tests("fisher.test") ~
      list(simulate.p.value = TRUE, B = 2000)
  )
There was an error in 'add_p()/add_difference()' for variable 'race_cat', p-value omitted:
Error in stats::fisher.test(c("White", "Other/No Response", "White", "White", : FEXACT error 7(location). LDSTP=18630 is too small for this problem,
(pastp=26.1932, ipn_0:=ipoin[itp=38]=1202, stp[ipn_0]=28.4756).
Increase workspace or consider using 'simulate.p.value=TRUE'
| Characteristic | Cis Man, N = 1511 | Cis Woman, N = 591 | No Response, N = 111 | p-value2 |
|---|---|---|---|---|
| pgy_cat | 0.043 | |||
| junior | 26 (17%) | 20 (35%) | 1 (20%) | |
| midlevel | 95 (63%) | 28 (49%) | 2 (40%) | |
| senior | 30 (20%) | 9 (16%) | 2 (40%) | |
| Unknown | 0 | 2 | 6 | |
| fellow_cat | 0.5 | |||
| Probably or Definitely No | 5 (3.3%) | 4 (6.8%) | 1 (9.1%) | |
| Probably or Definitely Yes | 133 (88%) | 51 (86%) | 9 (82%) | |
| Undecided | 13 (8.6%) | 4 (6.8%) | 1 (9.1%) | |
| age_cat | 0.2 | |||
| 22-25 y/o | 18 (12%) | 4 (7.0%) | 0 (0%) | |
| 26-30 y/o | 61 (40%) | 32 (56%) | 3 (27%) | |
| 31-35 y/o | 57 (38%) | 16 (28%) | 5 (45%) | |
| 36-45 y/o | 15 (9.9%) | 5 (8.8%) | 3 (27%) | |
| Unknown | 0 | 2 | 0 | |
| race_cat | ||||
| Asian | 15 (9.9%) | 6 (10%) | 0 (0%) | |
| Black or African American | 15 (9.9%) | 1 (1.7%) | 1 (9.1%) | |
| Native American or Alaska Native | 4 (2.6%) | 0 (0%) | 0 (0%) | |
| Native Hawaiian or other Pacific Islander | 3 (2.0%) | 0 (0%) | 0 (0%) | |
| Other/No Response | 11 (7.3%) | 4 (6.9%) | 10 (91%) | |
| White | 103 (68%) | 47 (81%) | 0 (0%) | |
| Unknown | 0 | 1 | 0 | |
| 1 n (%) | ||||
| 2 Fisher's exact test | ||||