# import data -------------------------------------------------------------
df <- read.csv(file="fa_data.csv", header=T, na.strings = c("#N/A","NA",""," "))
names(df)
##  [1] "ID"                             "pre_FeltLikeSciencePerson"     
##  [3] "pre_SeeMyselfSciencePerson"     "pre_FamilySeeSciencePerson"    
##  [5] "pre_InstructorSeeSciencePerson" "pre_PeerSeeSciencePerson"      
##  [7] "pre_EnjoyScience"               "pre_InterestedScience"         
##  [9] "pre_UnderstandPreviousScience"  "pre_UnderstandNewScience"      
## [11] "pre_OvercomeSetbacks"           "pre_ConfidentOutsideClass"     
## [13] "pre_ConfidentExams"             "pre_OthersAskHelp"             
## [15] "pre_OutsideClassInSubject1"     "pre_OutsideClassInSubject2"    
## [17] "pre_RealWorldIssues"            "pre_FindArticles"              
## [19] "pre_CriticallyRead"             "pre_IdentifyPatterns"          
## [21] "pre_RecognizeArgument"          "pre_DevelopArgument"           
## [23] "pre_WriteDocuments"             "pre_WorkWithOthers"            
## [25] "pre_OralPresentation"           "pre_Enthusiastic"              
## [27] "pre_DiscussWithFriends"         "pre_PlanningAdditionalClasses" 
## [29] "pre_PursuringCareer"            "pre_UnderstandSubject"         
## [31] "pre_SucceedSubject"             "pre_ComplexIdeas"              
## [33] "pre_AskingForHelp"              "pre_ConnectIdeas"              
## [35] "pre_ApplyingOutsideClass"       "pre_SystematicReasoning"       
## [37] "pre_AnalyzingData"              "pre_Course"                    
## [39] "pre_Ethnicity"                  "pre_UnlistedEthnicity"         
## [41] "pre_Gender"                     "pre_Career.Goal"               
## [43] "pre_OtherCareer"                "pre_PreviousCourses"           
## [45] "term"                           "pre_Gender2"                   
## [47] "freq"
# clean race/ethnicity ----------------------------------------------------
table(df$pre_Ethnicity)
## 
##                                0                          African 
##                                3                                3 
##           African American/Black An Ethnicity Which is not Listed 
##                               18                               47 
##              Asian: Asian Indian                   Asian: Chinese 
##                               43                               83 
##                  Asian: Filipinx                  Asian: Japanese 
##                               86                               13 
##                    Asian: Korean                  Asian: Vietmese 
##                               15                               43 
##                Asian: Vietnamese         Latinx: Central American 
##                               84                               30 
##        Latinx: Chicanx / Mexican           Latinx: South American 
##                              161                               15 
##                  White: European             White: Midde Eastern 
##                              162                               24 
##             White: North African 
##                                7
unlisted <- data.frame(table(df$pre_UnlistedEthnicity))
unlisted$rvar <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[1]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[2]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[3]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[4]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[5]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[6]] <- "Native American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[7]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[8]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[9]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[10]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[11]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[12]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[13]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[14]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[16]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[17]] <- "Latinx: South American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[18]] <- "Asian: Southeast Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[19]] <- "Asian: Southeast Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[20]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[21]] <- "Latinx: Chicanx / Mexican"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[22]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[23]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[24]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[25]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[26]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[27]] <- "Hispanic"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[28]] <- "Hispanic"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[29]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[30]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[31]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[32]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[33]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[34]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[35]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[36]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[37]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[38]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[39]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[40]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[41]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[42]] <- "Native American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[43]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[44]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[45]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[46]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[47]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[48]] <- "White: European"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[49]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[50]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[51]] <- "White: European"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[52]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[53]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[54]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[55]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[56]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[57]] <- "Biracial"

table(unlisted$rvar, useNA = "always")
## 
##         Asian: East Asian           Asian: Filipinx        Asian: South Asian 
##                         4                         3                         8 
##    Asian: Southeast Asian                  Biracial                  Hispanic 
##                         2                        19                         2 
##  Latinx: Central American Latinx: Chicanx / Mexican    Latinx: South American 
##                         3                         1                         1 
##           Native American               No Response           White: European 
##                         2                         7                         2 
##     White: Middle Eastern                      <NA> 
##                         3                         0
table(df$pre_Ethnicity, useNA = "always")
## 
##                                0                          African 
##                                3                                3 
##           African American/Black An Ethnicity Which is not Listed 
##                               18                               47 
##              Asian: Asian Indian                   Asian: Chinese 
##                               43                               83 
##                  Asian: Filipinx                  Asian: Japanese 
##                               86                               13 
##                    Asian: Korean                  Asian: Vietmese 
##                               15                               43 
##                Asian: Vietnamese         Latinx: Central American 
##                               84                               30 
##        Latinx: Chicanx / Mexican           Latinx: South American 
##                              161                               15 
##                  White: European             White: Midde Eastern 
##                              162                               24 
##             White: North African                             <NA> 
##                                7                                9
df$pre_Ethnicity[is.na(df$pre_Ethnicity)] <- "No Response"

df2 <- merge(df, unlisted, by.x = "pre_UnlistedEthnicity", by.y = "Var1")
df2$pre_Ethnicity[df2$pre_Ethnicity == "An Ethnicity Which is not Listed"] <- df2$rvar[df2$pre_Ethnicity == "An Ethnicity Which is not Listed"]
df2$pre_Ethnicity[df2$pre_Ethnicity == "0"] <- df2$rvar[df2$pre_Ethnicity == "0"]
df2$pre_Ethnicity[df2$pre_Ethnicity == "Asian: Vietmese"] <- "Asian: Vietnamese"
df2$pre_Ethnicity[df2$pre_Ethnicity == "White: Midde Eastern"] <- "White: Middle Eastern"
table(df2$pre_Ethnicity, useNA = "always")
## 
##                   African    African American/Black       Asian: Asian Indian 
##                         3                        18                        42 
##            Asian: Chinese         Asian: East Asian           Asian: Filipinx 
##                        83                         4                        87 
##           Asian: Japanese             Asian: Korean        Asian: South Asian 
##                        13                        14                         8 
##    Asian: Southeast Asian         Asian: Vietnamese                  Biracial 
##                         3                       127                        15 
##                  Hispanic  Latinx: Central American Latinx: Chicanx / Mexican 
##                         3                        32                       161 
##    Latinx: South American           Native American               No Response 
##                        14                         1                        16 
##           White: European     White: Middle Eastern      White: North African 
##                       163                        27                         7 
##                      <NA> 
##                         0
df2$eth_cond[df2$pre_Ethnicity == "African" |
               df2$pre_Ethnicity == "African American/Black" |
               df2$pre_Ethnicity == "Native American"] <- "Black & Indigenous"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Chinese" |
               df2$pre_Ethnicity ==  "Asian: East Asian" |
               df2$pre_Ethnicity == "Asian: Japanese" |
               df2$pre_Ethnicity == "Asian: Korean"] <- "Asian: East Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Asian Indian" |
               df2$pre_Ethnicity == "Asian: South Asian"] <- "Asian: South Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Filipinx" |
               df2$pre_Ethnicity == "Asian: Southeast Asian" |
               df2$pre_Ethnicity == "Asian: Vietnamese"] <- "Asian: Southeast Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Filipinx" |
               df2$pre_Ethnicity == "Asian: Southeast Asian" |
               df2$pre_Ethnicity == "Asian: Vietnamese"] <- "Asian: Southeast Asian"
df2$eth_cond[df2$pre_Ethnicity == "Hispanic" |
               df2$pre_Ethnicity == "Latinx: Central American" |
               df2$pre_Ethnicity == "Latinx: Chicanx / Mexican" |
               df2$pre_Ethnicity == "Latinx: South American"] <- "Hispanic & Latino"
df2$eth_cond[df2$pre_Ethnicity == "White: European" |
               df2$pre_Ethnicity == "White: Middle Eastern" |
               df2$pre_Ethnicity == "White: North African"] <- "White"
df2$eth_cond[df2$pre_Ethnicity == "Biracial"] <- "Biracial"
df2$eth_cond[df2$pre_Ethnicity == "No Response"] <- "No Response"

table(df2$pre_Ethnicity, df2$eth_cond, useNA = "always")
##                            
##                             Asian: East Asian Asian: South Asian
##   African                                   0                  0
##   African American/Black                    0                  0
##   Asian: Asian Indian                       0                 42
##   Asian: Chinese                           83                  0
##   Asian: East Asian                         4                  0
##   Asian: Filipinx                           0                  0
##   Asian: Japanese                          13                  0
##   Asian: Korean                            14                  0
##   Asian: South Asian                        0                  8
##   Asian: Southeast Asian                    0                  0
##   Asian: Vietnamese                         0                  0
##   Biracial                                  0                  0
##   Hispanic                                  0                  0
##   Latinx: Central American                  0                  0
##   Latinx: Chicanx / Mexican                 0                  0
##   Latinx: South American                    0                  0
##   Native American                           0                  0
##   No Response                               0                  0
##   White: European                           0                  0
##   White: Middle Eastern                     0                  0
##   White: North African                      0                  0
##   <NA>                                      0                  0
##                            
##                             Asian: Southeast Asian Biracial Black & Indigenous
##   African                                        0        0                  3
##   African American/Black                         0        0                 18
##   Asian: Asian Indian                            0        0                  0
##   Asian: Chinese                                 0        0                  0
##   Asian: East Asian                              0        0                  0
##   Asian: Filipinx                               87        0                  0
##   Asian: Japanese                                0        0                  0
##   Asian: Korean                                  0        0                  0
##   Asian: South Asian                             0        0                  0
##   Asian: Southeast Asian                         3        0                  0
##   Asian: Vietnamese                            127        0                  0
##   Biracial                                       0       15                  0
##   Hispanic                                       0        0                  0
##   Latinx: Central American                       0        0                  0
##   Latinx: Chicanx / Mexican                      0        0                  0
##   Latinx: South American                         0        0                  0
##   Native American                                0        0                  1
##   No Response                                    0        0                  0
##   White: European                                0        0                  0
##   White: Middle Eastern                          0        0                  0
##   White: North African                           0        0                  0
##   <NA>                                           0        0                  0
##                            
##                             Hispanic & Latino No Response White <NA>
##   African                                   0           0     0    0
##   African American/Black                    0           0     0    0
##   Asian: Asian Indian                       0           0     0    0
##   Asian: Chinese                            0           0     0    0
##   Asian: East Asian                         0           0     0    0
##   Asian: Filipinx                           0           0     0    0
##   Asian: Japanese                           0           0     0    0
##   Asian: Korean                             0           0     0    0
##   Asian: South Asian                        0           0     0    0
##   Asian: Southeast Asian                    0           0     0    0
##   Asian: Vietnamese                         0           0     0    0
##   Biracial                                  0           0     0    0
##   Hispanic                                  3           0     0    0
##   Latinx: Central American                 32           0     0    0
##   Latinx: Chicanx / Mexican               161           0     0    0
##   Latinx: South American                   14           0     0    0
##   Native American                           0           0     0    0
##   No Response                               0          16     0    0
##   White: European                           0           0   163    0
##   White: Middle Eastern                     0           0    27    0
##   White: North African                      0           0     7    0
##   <NA>                                      0           0     0    0
table(df2$eth_cond, useNA = "always")
## 
##      Asian: East Asian     Asian: South Asian Asian: Southeast Asian 
##                    114                     50                    217 
##               Biracial     Black & Indigenous      Hispanic & Latino 
##                     15                     22                    210 
##            No Response                  White                   <NA> 
##                     16                    197                      0
# clean gender ------------------------------------------------------------
table(df2$pre_Gender, useNA = "always")
## 
##       F  Female       M    Male Nonbiry    <NA> 
##     322     169     126      65       1     158
table(df2$pre_Gender2, useNA = "always")
## 
##    B    F    M    N <NA> 
##    1  268  110    1  461
table(df2$pre_Gender, df2$pre_Gender2, useNA = "always")
##          
##             B   F   M   N <NA>
##   F         0   0   0   0  322
##   Female    0 167   0   1    1
##   M         0   0   0   0  126
##   Male      0   0  65   0    0
##   Nonbiry   1   0   0   0    0
##   <NA>      0 101  45   0   12
df2$pre_Gender3 <- df2$pre_Gender
df2$pre_Gender3[is.na(df2$pre_Gender)] <- df2$pre_Gender2[is.na(df2$pre_Gender)]
df2$pre_Gender3[df2$pre_Gender3 == "Female"] <- "F"
df2$pre_Gender3[df2$pre_Gender3 == "Male"] <- "M"
df2$pre_Gender3[df2$pre_Gender3 == "Nonbiry"] <- "B"
df2$pre_Gender3[df2$pre_Gender2 == "N"] <- "B"

table(df2$pre_Gender3, useNA = "always")
## 
##    B    F    M <NA> 
##    2  591  236   12
# clean career goal -------------------------------------------------------
table(df2$pre_Career.Goal)
## 
##                                             0 
##                                             4 
##               Biologist/Biomedical Researcher 
##                                            11 
##                                    Counseling 
##                                            10 
##               Dental Assisting/Dental Hygiene 
##                                            58 
##                                       Dentist 
##                                             8 
##                           Dietician/Nutrition 
##                                            10 
##                             EMS/EMT/Paramedic 
##                                             7 
## Kinesiology/Sports Medicine/Athletic Training 
##                                            19 
##                                Medical School 
##                                            51 
##                                       Nursing 
##                                           271 
##                            Occupatiol Therapy 
##                                             2 
##                          Occupational Therapy 
##                                             9 
##                         Other: Health/Science 
##                                            64 
##                     Other: Non-Health/Science 
##                                            57 
##              Pharmacy Technologist/Pharmacist 
##                                            13 
##                              Physical Therapy 
##                                            23 
##                           Physician Assistant 
##                                            22 
##                                 Public Health 
##                                            10 
##                                     Radiology 
##                                            83 
##                           Respiratory Therapy 
##                                            25 
##                                   Social Work 
##                                             5 
##                                Speech Therapy 
##                                             3 
##                                       Teacher 
##                                             9 
##    Veterinary Assistant/Veterinary Technology 
##                                            56 
##        Veteriry Assistant/Veteriry Technology 
##                                             7
df2$pre_Career <- NA
df2$pre_Career[df2$pre_Career.Goal == "Dental Assisting/Dental Hygiene" |
              df2$pre_Career.Goal == "Dietician/Nutrition" |
              df2$pre_Career.Goal == "EMS/EMT/Paramedic" |
              df2$pre_Career.Goal == "Kinesiology/Sports Medicine/Athletic Training" |
              df2$pre_Career.Goal == "Occupational Therapy" |
              df2$pre_Career.Goal == "Physical Therapy" |
              df2$pre_Career.Goal == "Physician Assistant" |
              df2$pre_Career.Goal == "Radiology" |
              df2$pre_Career.Goal == "Respiratory Therapy" |
              df2$pre_Career.Goal == "Speech Therapy"] <- "AH"

df2$pre_Career[df2$pre_Career.Goal == "Nursing"] <- "N"

df2$pre_Career[df2$pre_Career.Goal == "Social Work" |
              df2$pre_Career.Goal == "Teacher"] <- "Non-STEM"

df2$pre_Career[df2$pre_Career.Goal == "Public Health" |
              df2$pre_Career.Goal == "Dentist" |
              df2$pre_Career.Goal == "Medical School" |
              df2$pre_Career.Goal == "Pharmacy Technologist/Pharmacist"] <- "Other Health"

df2$pre_Career[df2$pre_Career.Goal == "Veterinary Assistant/Veterinary Technology"] <- "Veterinary"

df2$pre_Career[df2$pre_Career.Goal == "Counseling" |
              df2$pre_Career.Goal == "Biologist/Biomedical Researcher"] <- "STEM"

table(df2$pre_OtherCareer)
## 
##                                                                                                                          Please Specify 
##                                                                                                                                       1 
##                                                                                                                                       0 
##                                                                                                                                       6 
##                                                                                                      A career related to art and design 
##                                                                                                                                       1 
##                                                                                                                              Accountant 
##                                                                                                                                       2 
##                                                                                                                      also public health 
##                                                                                                                                       1 
##                                                                                                                      and public health  
##                                                                                                                                       1 
##                                                                                                                      Anesthesiologists  
##                                                                                                                                       1 
##                                                                                                                   Art and Gaming Design 
##                                                                                                                                       1 
##                                                                                                                                  artist 
##                                                                                                                                       1 
##                                                                                                                              Biochemist 
##                                                                                                                                       1 
##                                                                                                                            Biochemistry 
##                                                                                                                                       1 
##                                                                                                                          Bioengineering 
##                                                                                                                                       1 
##                                                                                                                     Biomedical Engineer 
##                                                                                                                                       1 
##                                                                                                                                Business 
##                                                                                                                                       2 
##                                                                                                                               business  
##                                                                                                                                       2 
##                                                                                                                 Business Administration 
##                                                                                                                                       1 
##                                                                                                                business administration  
##                                                                                                                                       1 
##                                                                                                                     Business Psychology 
##                                                                                                                                       1 
##                                                                                                                              Cardiology 
##                                                                                                                                       1 
##                                                                                                                  cardiovascular science 
##                                                                                                                                       1 
##                                                                                                           Clinical Laboratory Scientist 
##                                                                                                                                       1 
##                                                                                                                   clinical psychologist 
##                                                                                                                                       1 
##                                                                                                                        College student  
##                                                                                                                                       1 
##                                                                                                                          communicating  
##                                                                                                                                       1 
##                                                                                                                          communications 
##                                                                                                                                       1 
##                                                                                                                          Communications 
##                                                                                                                                       1 
##                                                                                                                    Computer engineering 
##                                                                                                                                       1 
##                                                                                                                    Computer Engineering 
##                                                                                                                                       2 
##                                                                                                                        computer science 
##                                                                                                                                       1 
##                                                                                                                        Computer Science 
##                                                                                                                                       2 
##                                                                                                                            Conservation 
##                                                                                                                                       1 
##                                                                                                                 construction management 
##                                                                                                                                       1 
##                                                                                                                        Criminal Justice 
##                                                                                                                                       1 
##                                                                                                                             Criminology 
##                                                                                                                                       1 
##                                                                                                                            Data Science 
##                                                                                                                                       1 
##                                                                                                                          dental hygiene 
##                                                                                                                                       1 
##                                                                                                                        Dental Hygienist 
##                                                                                                                                       1 
##                                                                                                                           Dental School 
##                                                                                                                                       1 
##                                                                                                                 Dental School (DDS/DMD) 
##                                                                                                                                       1 
##                                                                                                                                 Dentist 
##                                                                                                                                       2 
##                                                                                                                               Dentistry 
##                                                                                                                                       1 
##                                                                                                                           determatology 
##                                                                                                                                       1 
##                                                                                                           Diagnostic medical Sonography 
##                                                                                                                                       1 
##                                                                                                           Diagnostic Medical Sonography 
##                                                                                                                                       1 
##                                                                                                                               Dietetics 
##                                                                                                                                       1 
##                                                                                                                              dietician  
##                                                                                                                                       1 
##                                                                                                                         Echocardiogram  
##                                                                                                                                       1 
##                                                                                                                                 ecology 
##                                                                                                                                       1 
##                                                                                                                               Economist 
##                                                                                                                                       1 
##                                                                                                                              Economist  
##                                                                                                                                       1 
##                                                                                                                 Electrical engineering  
##                                                                                                                                       1 
##                                                                                                                             Electrician 
##                                                                                                                                       1 
##                                                                                                                      Engineer, Chemical 
##                                                                                                                                       1 
##                                                                                                                             Engineering 
##                                                                                                                                       5 
##                                                                          Financial Advisor or Economic Consultant within the STEM field 
##                                                                                                                                       1 
##                                                                                                                            Fire fighter 
##                                                                                                                                       1 
##                                                                                                                            Food science 
##                                                                                                                                       1 
##                                                                                                                             Government  
##                                                                                                                                       1 
##                                                                                                                          Graphic Design 
##                                                                                                                                       1 
##                                                                                             HealthCare Administration or Speech Therapy 
##                                                                                                                                       1 
## I already have an undergraduate degree in counseling and am mostly taking classes at Foothill to explore other science related fields.  
##                                                                                                                                       1 
##                                                                                                                         i dont know yet 
##                                                                                                                                       1 
##                                                                                                    Industrial Design, can apply science 
##                                                                                                                                       1 
##                                                                                                                              Journalism 
##                                                                                                                                       1 
##                                                                                                                       Journalism or law 
##                                                                                                                                       1 
##                                                                                                                                     Law 
##                                                                                                                                       1 
##                                                                                                                         Law Enforcement 
##                                                                                                                                       1 
##                                                                                                                          Marine Biology 
##                                                                                                                                       2 
##                                                                                                                  marketing/advertising  
##                                                                                                                                       1 
##                                                                                                                         Massage therapy 
##                                                                                                                                       1 
##                                                                                                                        Math or Business 
##                                                                                                                                       1 
##                                                                                                                     Medical Sonographer 
##                                                                                                                                       1 
##                                                                                                                        Military aviator 
##                                                                                                                                       1 
##                                                                                                                                     MLT 
##                                                                                                                                       1 
##                                                                                                    more related to computer programming 
##                                                                                                                                       1 
##                                                                                                                          Music Business 
##                                                                                                                                       1 
##                                                                                                                                     N/A 
##                                                                                                                                       2 
##                                                                                                                                    Nada 
##                                                                                                                                       1 
##                                                                                                                            not sure yet 
##                                                                                                                                       1 
##                                                                                                                        Nurse Practioner 
##                                                                                                                                       2 
##                                                                                                                      Nurse Practitioner 
##                                                                                                                                       1 
##                                                                                                             Nursing first then NP or MD 
##                                                                                                                                       1 
##                                                                                                                              Nutrition  
##                                                                                                                                       1 
##                                                                                                                 Nutrition and Dietetics 
##                                                                                                                                       1 
##                                                                                                                               Optometry 
##                                                                                                                                       2 
##                                                                                                                              Optometry  
##                                                                                                                                       1 
##                                                                                                                            Or Radiology 
##                                                                                                                                       1 
##                                                                                                                           Or radiology  
##                                                                                                                                       1 
##                                                                                             or stay in biologics process development... 
##                                                                                                                                       1 
##                                                                                                                                      PA 
##                                                                                                                                       1 
##                                                                                                                             Park Ranger 
##                                                                                                                                       1 
##                                                                                                                              Pharmacist 
##                                                                                                                                       1 
##                                                                                                                              Phlebotomy 
##                                                                                                                                       1 
##                                                                                                                       physical therapy  
##                                                                                                                                       1 
##                                                                                                                   Physician's Assistant 
##                                                                                                                                       2 
##                                                                                                                    Physician Assisstant 
##                                                                                                                                       1 
##                                                                                                                     Physician Assistant 
##                                                                                                                                       2 
##                                                                                                                    Physician Assistant  
##                                                                                                                                       1 
##                                                                                                              Physician Assistant School 
##                                                                                                                                       1 
##                                                                                                                          Please Specify 
##                                                                                                                                     679 
##                                                                                                                primatology/anthropology 
##                                                                                                                                       1 
##                                                                                                                             Prosthetist 
##                                                                                                                                       1 
##                                                                                                                              Psychiatry 
##                                                                                                                                       2 
##                                                                                                                              psychology 
##                                                                                                                                       2 
##                                                                                                                              Psychology 
##                                                                                                                                       5 
##                                                                                                                             Psychology  
##                                                                                                                                       2 
##                                                                                                           Psychology and Public Health  
##                                                                                                                                       1 
##                                                                                                                               radiology 
##                                                                                                                                       1 
##                                                                                                                               Radiology 
##                                                                                                                                       1 
##                                                                                                                Research in Dermatology  
##                                                                                                                                       1 
##                                                                                                                      Research Scientist 
##                                                                                                                                       1 
##                                                                                Research, rehabilitation, conservation of marine animals 
##                                                                                                                                       1 
##                                                                                                                     Respiratory Therapy 
##                                                                                                                                       1 
##                                                                                                          social and clinical psychology 
##                                                                                                                                       1 
##                                                                                                                              Sociology  
##                                                                                                                                       1 
##                                                                                                                                Software 
##                                                                                                                                       1 
##                                                                                                                       software engineer 
##                                                                                                                                       1 
##                                                                                                                       Software Engineer 
##                                                                                                                                       1 
##                                                                                                                    Software Engineering 
##                                                                                                                                       1 
##                                                                                              Something with sociology, but still unsure 
##                                                                                                                                       1 
##                                                                                                                              Sonography 
##                                                                                                                                       2 
##                                                                                                                             Sonography  
##                                                                                                                                       1 
##                                                                                                        Sonography/Ultrasound technology 
##                                                                                                                                       1 
##                                                                                                                        Sports Marketing 
##                                                                                                                                       1 
##                                                                                                                            Statistician 
##                                                                                                                                       1 
##                                                                                                                     Theater Technician  
##                                                                                                                                       1 
##                                                                                                                               Therapist 
##                                                                                                                                       1 
##                                                                                                                                  Unsure 
##                                                                                                                                       1 
##                                                                                                                            Veterinarian 
##                                                                                                                                       1 
##                                                                                                                       Veterinary school 
##                                                                                                                                       1
df2$pre_Career[df2$pre_OtherCareer == "dental hygiene"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Dietetics"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Nutrition and Dietetics"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "PA"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician's Assistant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assisstant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assistant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assistant School"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Radiology"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Respiratory Therapy"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Dental School (DDS/DMD)"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Dentist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Psychiatry"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Nurse Practioner"] <- "N"
df2$pre_Career[df2$pre_OtherCareer == "Pharmacist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Veterinarian"] <- "Veterinary"
df2$pre_Career[df2$pre_OtherCareer == "Veterinary school"] <- "Veterinary"
df2$pre_Career[df2$pre_OtherCareer == "Anesthesiologists"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Diagnostic medical Sonography"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Echocardiogram"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Massage therapy"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Medical Sonographer"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Optometry"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Prosthetist"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Sonography"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Biochemist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Bioengineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Biomedical Engineer"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "cardiovascular science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Clinical Laboratory Scientist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "clinical psychologist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer Science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Conservation"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "ecology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineer, Chemical"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Industrial Design, can apply science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "social and clinical psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Software Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Statistician"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "College student"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Criminology"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "HealthCare Administration or Speech Therapy"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Military aviator"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "MLT"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Park Ranger"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Psychology and Public Health"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Something that involves Physics or a Product designer"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Unsure"] <- NA

df2$pre_Career[df2$pre_OtherCareer == "Economist"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Electrician"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Financial Advisor or Economic Consultant within the STEM field"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Therapist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Math or Business"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Something with sociology, but still unsure"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "A career related to art and design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Art and Gaming Design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "artist"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business Administration"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business Psychology"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "communications"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "I already have an undergraduate degree in counseling and am mostly taking classes at Foothill to explore other science related fields."] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "i dont know yet"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Journalism"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Journalism or law"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Law"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Law Enforcement"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "marketing/advertising"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Sports Marketing"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Theater Technician"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Accountant"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Fire fighter"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Government"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Graphic Design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "construction management"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Criminal Justice"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Biochemistry"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "computer science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Data Science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Electrical engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Research, rehabilitation, conservation of marine animals"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Sociology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Software"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "software engineer"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Marine Biology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineering"] <- "STEM"

table(df2$pre_Career, useNA = "always")
## 
##           AH            N     Non-STEM Other Health         STEM   Veterinary 
##          279          272           39           88           54           58 
##         <NA> 
##           51
# create composites -------------------------------------------------------
names(df2)
##  [1] "pre_UnlistedEthnicity"          "ID"                            
##  [3] "pre_FeltLikeSciencePerson"      "pre_SeeMyselfSciencePerson"    
##  [5] "pre_FamilySeeSciencePerson"     "pre_InstructorSeeSciencePerson"
##  [7] "pre_PeerSeeSciencePerson"       "pre_EnjoyScience"              
##  [9] "pre_InterestedScience"          "pre_UnderstandPreviousScience" 
## [11] "pre_UnderstandNewScience"       "pre_OvercomeSetbacks"          
## [13] "pre_ConfidentOutsideClass"      "pre_ConfidentExams"            
## [15] "pre_OthersAskHelp"              "pre_OutsideClassInSubject1"    
## [17] "pre_OutsideClassInSubject2"     "pre_RealWorldIssues"           
## [19] "pre_FindArticles"               "pre_CriticallyRead"            
## [21] "pre_IdentifyPatterns"           "pre_RecognizeArgument"         
## [23] "pre_DevelopArgument"            "pre_WriteDocuments"            
## [25] "pre_WorkWithOthers"             "pre_OralPresentation"          
## [27] "pre_Enthusiastic"               "pre_DiscussWithFriends"        
## [29] "pre_PlanningAdditionalClasses"  "pre_PursuringCareer"           
## [31] "pre_UnderstandSubject"          "pre_SucceedSubject"            
## [33] "pre_ComplexIdeas"               "pre_AskingForHelp"             
## [35] "pre_ConnectIdeas"               "pre_ApplyingOutsideClass"      
## [37] "pre_SystematicReasoning"        "pre_AnalyzingData"             
## [39] "pre_Course"                     "pre_Ethnicity"                 
## [41] "pre_Gender"                     "pre_Career.Goal"               
## [43] "pre_OtherCareer"                "pre_PreviousCourses"           
## [45] "term"                           "pre_Gender2"                   
## [47] "freq"                           "Freq"                          
## [49] "rvar"                           "eth_cond"                      
## [51] "pre_Gender3"                    "pre_Career"
df3 <- subset(df2, select=c(2,3:4,5:7,8:9,10:11,13:14,21:23,27:30,35:37,16:18,50,51,52))
names(df3)
##  [1] "ID"                             "pre_FeltLikeSciencePerson"     
##  [3] "pre_SeeMyselfSciencePerson"     "pre_FamilySeeSciencePerson"    
##  [5] "pre_InstructorSeeSciencePerson" "pre_PeerSeeSciencePerson"      
##  [7] "pre_EnjoyScience"               "pre_InterestedScience"         
##  [9] "pre_UnderstandPreviousScience"  "pre_UnderstandNewScience"      
## [11] "pre_ConfidentOutsideClass"      "pre_ConfidentExams"            
## [13] "pre_IdentifyPatterns"           "pre_RecognizeArgument"         
## [15] "pre_DevelopArgument"            "pre_Enthusiastic"              
## [17] "pre_DiscussWithFriends"         "pre_PlanningAdditionalClasses" 
## [19] "pre_PursuringCareer"            "pre_ConnectIdeas"              
## [21] "pre_ApplyingOutsideClass"       "pre_SystematicReasoning"       
## [23] "pre_OutsideClassInSubject1"     "pre_OutsideClassInSubject2"    
## [25] "pre_RealWorldIssues"            "eth_cond"                      
## [27] "pre_Gender3"                    "pre_Career"
rename_vars <- c("id",
  paste("gen",1:2,sep=""),
  paste("rec",1:3,sep=""),
  paste("int",1:2,sep=""),
  paste("pc",1:4,sep=""),
  paste("pc_verb",1:3,sep=""),
  paste("int2",1:4,sep=""),
  paste("sciapp",1:3,sep=""),
  paste("connect",1:3,sep=""),
  "raceeth","gender","careergoal"
)
itemlist <- cbind(colnames(df3), rename_vars)
colnames(df3) <- rename_vars
head(df3)
##         id gen1 gen2 rec1 rec2 rec3 int1 int2 pc1 pc2 pc3 pc4 pc_verb1 pc_verb2
## 1 33939702    5    5    5    5    5    5    5   5   4   3   4        4        5
## 2 33756918    4    5    4    4    4    5    5   5   5   5   5        5        5
## 3 61089066    5    5    5    5    5    5    5   5   5   5   5        5        5
## 4 60927867    1    5    5    5    5    5    5   5   5   5   5        5        5
## 5 61129203    3    4    4    3    4    4    4   4   4   4   4        4        4
## 6 61037895    5    5    5    4    4    5    5   5   5   5   4        5        5
##   pc_verb3 int21 int22 int23 int24 sciapp1 sciapp2 sciapp3 connect1 connect2
## 1        3     5     5     5    NA       5       5       5        5        5
## 2        5     5     5     5     5       4       5       5        5        5
## 3        5     5     5     5     5       5       5       5        5        5
## 4        5     5     5     5     5       5       5       5        5        5
## 5        4     4     5     4     4       5       5       5        4        4
## 6        5     5     5     5     5       4       4       5        5        4
##   connect3                raceeth gender   careergoal
## 1        5 Asian: Southeast Asian      F           AH
## 2        5            No Response      F            N
## 3        5      Asian: East Asian      M Other Health
## 4        4                  White      M            N
## 5        4                  White      F           AH
## 6        5                  White      F            N
# load libraries ----------------------------------------------------------
library(afex)
## Loading required package: lme4
## Loading required package: Matrix
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## ************
## Welcome to afex. For support visit: http://afex.singmann.science/
## - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
## - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
## - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
## - NEWS: library('emmeans') now needs to be called explicitly!
## - Get and set global package options with: afex_options()
## - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
## - For example analyses see: browseVignettes("afex")
## ************
## 
## Attaching package: 'afex'
## The following object is masked from 'package:lme4':
## 
##     lmer
library(emmeans)
library(psych)
library(DT)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(ggsignif)
## Warning: package 'ggsignif' was built under R version 4.0.5
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(Rmisc)
## Warning: package 'Rmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:rstatix':
## 
##     desc, mutate
library(afex)
library(emmeans)


# univariate normality ----------------------------------------------------
norm <- describe(df3[2:25])
datatable(norm) %>%
  formatRound(1:13) %>%
  formatStyle(11:12, color = styleInterval(c(-2, 2), c('red', 'black', 'red')))
hist(df3$connect3, breaks = 5)

# Outliers ----------------------------------------------------------------
names(df3)
##  [1] "id"         "gen1"       "gen2"       "rec1"       "rec2"      
##  [6] "rec3"       "int1"       "int2"       "pc1"        "pc2"       
## [11] "pc3"        "pc4"        "pc_verb1"   "pc_verb2"   "pc_verb3"  
## [16] "int21"      "int22"      "int23"      "int24"      "sciapp1"   
## [21] "sciapp2"    "sciapp3"    "connect1"   "connect2"   "connect3"  
## [26] "raceeth"    "gender"     "careergoal"
data_imp <- subset(df3, select=c(1:24))
d1 <- na.omit(data_imp)

m_dist <- mahalanobis(d1[-1], colMeans(d1[-1]), cov(d1[-1]))
d1$MD <- round(m_dist, 1)
plot(d1$MD)
describe(m_dist)
##    vars   n  mean    sd median trimmed   mad  min    max  range skew kurtosis
## X1    1 803 22.97 16.81  18.15   20.34 11.97 1.86 112.21 110.35 1.89     4.86
##      se
## X1 0.59
cut <- qchisq(.99, df=(ncol(d1)-1))
abline(a=cut, b=0, col="red")

d1$outlier <- "No"
d1$outlier[d1$MD > cut] <- "Yes"
table(d1$outlier)
## 
##  No Yes 
## 722  81
81/841
## [1] 0.09631391
names(d1)
##  [1] "id"       "gen1"     "gen2"     "rec1"     "rec2"     "rec3"    
##  [7] "int1"     "int2"     "pc1"      "pc2"      "pc3"      "pc4"     
## [13] "pc_verb1" "pc_verb2" "pc_verb3" "int21"    "int22"    "int23"   
## [19] "int24"    "sciapp1"  "sciapp2"  "sciapp3"  "connect1" "connect2"
## [25] "MD"       "outlier"
d2 <- subset(d1, outlier == "No", select=c(1,outlier))
df4 <- subset(df3, id %in% d2$id)


# prepare data ------------------------------------------------------------
df4$gen <- (df4$gen1 + df4$gen2)/2
df4$rec <- (df4$rec1 + df4$rec2 + df4$rec3)/3
df4$int1 <- (df4$int1 + df4$int2)/2
df4$pc <- (df4$pc1 + df4$pc2 + df4$pc3 + df4$pc4)/4
df4$pc_verb <- (df4$pc_verb1 + df4$pc_verb2 + df4$pc_verb3)/3
df4$int2 <- (df4$int21 + df4$int22 + df4$int23 + df4$int24)/4
df4$sciapp <- (df4$sciapp1 + df4$sciapp2 + df4$sciapp3)/3
df4$connect <- (df4$connect1 + df4$connect2)/2
head(df4)
##         id gen1 gen2 rec1 rec2 rec3 int1 int2 pc1 pc2 pc3 pc4 pc_verb1 pc_verb2
## 2 33756918    4    5    4    4    4    5 5.00   5   5   5   5        5        5
## 3 61089066    5    5    5    5    5    5 5.00   5   5   5   5        5        5
## 4 60927867    1    5    5    5    5    5 5.00   5   5   5   5        5        5
## 5 61129203    3    4    4    3    4    4 4.25   4   4   4   4        4        4
## 6 61037895    5    5    5    4    4    5 5.00   5   5   5   4        5        5
## 7 60245619    4    4    3    3    4    4 5.00   4   4   4   4        4        4
##   pc_verb3 int21 int22 int23 int24 sciapp1 sciapp2 sciapp3 connect1 connect2
## 2        5     5     5     5     5       4       5       5        5        5
## 3        5     5     5     5     5       5       5       5        5        5
## 4        5     5     5     5     5       5       5       5        5        5
## 5        4     4     5     4     4       5       5       5        4        4
## 6        5     5     5     5     5       4       4       5        5        4
## 7        4     5     5     5     5       4       4       4        5        5
##   connect3           raceeth gender   careergoal gen      rec   pc pc_verb
## 2        5       No Response      F            N 4.5 4.000000 5.00       5
## 3        5 Asian: East Asian      M Other Health 5.0 5.000000 5.00       5
## 4        4             White      M            N 3.0 5.000000 5.00       5
## 5        4             White      F           AH 3.5 3.666667 4.00       4
## 6        5             White      F            N 5.0 4.333333 4.75       5
## 7        5 Hispanic & Latino      F Other Health 4.0 3.333333 4.00       4
##     sciapp connect
## 2 4.666667     5.0
## 3 5.000000     5.0
## 4 5.000000     5.0
## 5 5.000000     4.0
## 6 4.333333     4.5
## 7 4.000000     5.0
str(df4)
## 'data.frame':    722 obs. of  34 variables:
##  $ id        : int  33756918 61089066 60927867 61129203 61037895 60245619 61120473 61045896 60837294 60778827 ...
##  $ gen1      : int  4 5 1 3 5 4 4 3 3 4 ...
##  $ gen2      : int  5 5 5 4 5 4 5 4 3 3 ...
##  $ rec1      : int  4 5 5 4 5 3 5 5 4 3 ...
##  $ rec2      : int  4 5 5 3 4 3 4 3 3 3 ...
##  $ rec3      : int  4 5 5 4 4 4 5 3 3 3 ...
##  $ int1      : num  5 5 5 4 5 4 5 4 5 4 ...
##  $ int2      : num  5 5 5 4.25 5 5 5 5 5 4 ...
##  $ pc1       : int  5 5 5 4 5 4 4 4 5 3 ...
##  $ pc2       : int  5 5 5 4 5 4 4 5 5 3 ...
##  $ pc3       : int  5 5 5 4 5 4 5 4 5 3 ...
##  $ pc4       : int  5 5 5 4 4 4 3 5 4 3 ...
##  $ pc_verb1  : int  5 5 5 4 5 4 5 5 3 3 ...
##  $ pc_verb2  : int  5 5 5 4 5 4 4 5 3 3 ...
##  $ pc_verb3  : int  5 5 5 4 5 4 4 4 4 3 ...
##  $ int21     : int  5 5 5 4 5 5 5 5 5 4 ...
##  $ int22     : int  5 5 5 5 5 5 5 5 5 4 ...
##  $ int23     : int  5 5 5 4 5 5 5 5 5 4 ...
##  $ int24     : int  5 5 5 4 5 5 5 5 5 4 ...
##  $ sciapp1   : int  4 5 5 5 4 4 5 4 4 3 ...
##  $ sciapp2   : int  5 5 5 5 4 4 5 5 4 3 ...
##  $ sciapp3   : int  5 5 5 5 5 4 5 4 4 3 ...
##  $ connect1  : int  5 5 5 4 5 5 5 5 5 3 ...
##  $ connect2  : int  5 5 5 4 4 5 5 4 4 3 ...
##  $ connect3  : int  5 5 4 4 5 5 5 5 5 3 ...
##  $ raceeth   : chr  "No Response" "Asian: East Asian" "White" "White" ...
##  $ gender    : chr  "F" "M" "M" "F" ...
##  $ careergoal: chr  "N" "Other Health" "N" "AH" ...
##  $ gen       : num  4.5 5 3 3.5 5 4 4.5 3.5 3 3.5 ...
##  $ rec       : num  4 5 5 3.67 4.33 ...
##  $ pc        : num  5 5 5 4 4.75 4 4 4.5 4.75 3 ...
##  $ pc_verb   : num  5 5 5 4 5 ...
##  $ sciapp    : num  4.67 5 5 5 4.33 ...
##  $ connect   : num  5 5 5 4 4.5 5 5 4.5 4.5 3 ...
df4$raceeth <- as.factor(df4$raceeth)
df4$gender <- as.factor(df4$gender)
df4$careergoal <- as.factor(df4$careergoal)


# examine distribution of each dv by each group ---------------------------
# focus on general science identity first ---------------------------------
ggplot(df4, aes(sample = gen)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(gen))$gen)

## [1]  44 215
car::qqPlot(subset(df4, careergoal == "N", select=c(gen))$gen)

## [1]  49 129
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(gen))$gen)

## [1] 22 15
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(gen))$gen)

## [1]  8 69
car::qqPlot(subset(df4, careergoal == "STEM", select=c(gen))$gen)

## [1]  6 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(gen))$gen)

## [1]  8 44
bartlett.test(gen ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  gen by careergoal
## Bartlett's K-squared = 10.078, df = 5, p-value = 0.07306
leveneTest(gen ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   5  2.0374 0.07155 .
##       675                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df5 <- df4 %>%
  reorder_levels(careergoal, order = c("AH","N","Other Health","STEM","Veterinary","Non-STEM"))

df5 %>%  
  group_by(careergoal) %>%
  get_summary_stats(gen, type = "common")
## # A tibble: 7 x 11
##   careergoal   variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>        <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH           gen        239   1       5    4     1    3.65 0.918 0.059 0.117
## 2 N            gen        238   1       5    4     1.5  3.67 0.956 0.062 0.122
## 3 Other Health gen         80   1.5     5    4     1    3.92 0.823 0.092 0.183
## 4 STEM         gen         42   1       5    4     2    3.77 1.20  0.184 0.373
## 5 Veterinary   gen         47   1       5    3.5   1    3.37 1.07  0.155 0.313
## 6 Non-STEM     gen         35   1       5    3     2    3.06 1.01  0.171 0.348
## 7 <NA>         gen         41   1       5    3     1.5  3.17 0.885 0.138 0.279
df5 %>% kruskal_test(gen ~ careergoal)
## # A tibble: 1 x 6
##   .y.       n statistic    df        p method        
## * <chr> <int>     <dbl> <int>    <dbl> <chr>         
## 1 gen     722      23.7     5 0.000248 Kruskal-Wallis
df5 %>% kruskal_effsize(gen ~ careergoal)
## # A tibble: 1 x 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 gen     722  0.0261 eta2[H] small
df5 %>% dunn_test(gen ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
##    .y.   group1    group2       n1    n2 statistic        p   p.adj p.adj.signif
##  * <chr> <chr>     <chr>     <int> <int>     <dbl>    <dbl>   <dbl> <chr>       
##  1 gen   AH        N           239   238     0.397  6.92e-1 1.00e+0 ns          
##  2 gen   AH        Other He~   239    80     2.38   1.71e-2 2.56e-1 ns          
##  3 gen   AH        STEM        239    42     1.34   1.79e-1 1.00e+0 ns          
##  4 gen   AH        Veterina~   239    47    -1.50   1.34e-1 1.00e+0 ns          
##  5 gen   AH        Non-STEM    239    35    -3.12   1.78e-3 2.67e-2 *           
##  6 gen   N         Other He~   238    80     2.10   3.55e-2 5.32e-1 ns          
##  7 gen   N         STEM        238    42     1.13   2.60e-1 1.00e+0 ns          
##  8 gen   N         Veterina~   238    47    -1.72   8.46e-2 1.00e+0 ns          
##  9 gen   N         Non-STEM    238    35    -3.32   8.86e-4 1.33e-2 *           
## 10 gen   Other He~ STEM         80    42    -0.437  6.62e-1 1.00e+0 ns          
## 11 gen   Other He~ Veterina~    80    47    -2.98   2.92e-3 4.37e-2 *           
## 12 gen   Other He~ Non-STEM     80    35    -4.31   1.63e-5 2.44e-4 ***         
## 13 gen   STEM      Veterina~    42    47    -2.18   2.90e-2 4.34e-1 ns          
## 14 gen   STEM      Non-STEM     42    35    -3.45   5.54e-4 8.32e-3 **          
## 15 gen   Veterina~ Non-STEM     47    35    -1.46   1.44e-1 1.00e+0 ns
ggplot(data = df5, aes(x=careergoal, y=gen, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "*", y_position = 5) +
  geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "*", y_position = 5.25) +
  geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "*", y_position = 5.5) +
  geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.75) +
  geom_signif(comparisons = list(c("STEM","Non-STEM")), annotations = "**", y_position = 6) +
  ylab("General Science Identity") +
  labs(caption = "* <.05, ** < .01, *** < .001")

# focus on recognition second ---------------------------------------------
ggplot(df4, aes(sample = rec)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(rec))$rec)

## [1]  44 112
car::qqPlot(subset(df4, careergoal == "N", select=c(rec))$rec)

## [1] 49 61
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(rec))$rec)

## [1] 16 22
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(rec))$rec)

## [1] 32 69
car::qqPlot(subset(df4, careergoal == "STEM", select=c(rec))$rec)

## [1] 6 1
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(rec))$rec)

## [1]  8 44
bartlett.test(rec ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  rec by careergoal
## Bartlett's K-squared = 9.1596, df = 5, p-value = 0.1029
mod1 <- aov_ez(data = df4, id = "id", dv = "rec", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod1)
## Levene's Test for Homogeneity of Variance (center = center)
##        Df F value  Pr(>F)  
## group   5  2.3134 0.04242 *
##       675                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
## Anova Table (Type 3 tests)
## 
## Response: rec
##            num Df den Df     MSE      F      ges    Pr(>F)    
## careergoal      5    675 0.76831 8.7611 0.060942 4.587e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod1, specs = "careergoal")
##  careergoal   emmean     SE  df lower.CL upper.CL
##  AH             3.38 0.0567 675     3.27     3.49
##  N              3.45 0.0568 675     3.33     3.56
##  Non-STEM       2.90 0.1482 675     2.60     3.19
##  Other Health   3.84 0.0980 675     3.65     4.03
##  STEM           3.61 0.1353 675     3.35     3.88
##  Veterinary     3.01 0.1279 675     2.76     3.27
## 
## Confidence level used: 0.95
pairs(emmeans(mod1, specs = "careergoal"))
##  contrast                  estimate     SE  df t.ratio p.value
##  AH - N                     -0.0646 0.0803 675 -0.805  0.9666 
##  AH - (Non-STEM)             0.4855 0.1586 675  3.060  0.0277 
##  AH - Other Health          -0.4609 0.1132 675 -4.071  0.0007 
##  AH - STEM                  -0.2304 0.1467 675 -1.571  0.6182 
##  AH - Veterinary             0.3666 0.1399 675  2.621  0.0936 
##  N - (Non-STEM)              0.5501 0.1587 675  3.467  0.0074 
##  N - Other Health           -0.3963 0.1133 675 -3.498  0.0066 
##  N - STEM                   -0.1657 0.1467 675 -1.130  0.8690 
##  N - Veterinary              0.4312 0.1399 675  3.082  0.0260 
##  (Non-STEM) - Other Health  -0.9464 0.1776 675 -5.328  <.0001 
##  (Non-STEM) - STEM          -0.7159 0.2006 675 -3.568  0.0051 
##  (Non-STEM) - Veterinary    -0.1189 0.1957 675 -0.608  0.9905 
##  Other Health - STEM         0.2306 0.1670 675  1.380  0.7389 
##  Other Health - Veterinary   0.8275 0.1611 675  5.137  <.0001 
##  STEM - Veterinary           0.5969 0.1861 675  3.207  0.0176 
## 
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=rec, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "*", y_position = 5.25) +
  geom_signif(comparisons = list(c("Non-STEM","Other Health")), annotations = "***", y_position = 5.25) +
  geom_signif(comparisons = list(c("AH","Other Health")), annotations = "***", y_position = 5.5) +
  geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "***", y_position = 5) +
  geom_signif(comparisons = list(c("STEM","Veterinary")), annotations = "*", y_position = 5) +
  geom_signif(comparisons = list(c("N","Other Health")), annotations = "*", y_position = 5.75) +
  geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "***", y_position = 5.75) +
  geom_signif(comparisons = list(c("N","Veterinary")), annotations = "*", y_position = 6) +
  geom_signif(comparisons = list(c("Non-STEM","STEM")), annotations = "**", y_position = 6.25) +
  ylab("Science Identity (Recognition)") +
  labs(caption = "* <.05, ** < .01, *** < .001")

# focus on pc third -------------------------------------------------------
ggplot(df4, aes(sample = pc)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(pc))$pc)

## [1] 187 212
car::qqPlot(subset(df4, careergoal == "N", select=c(pc))$pc)

## [1] 216 173
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(pc))$pc)

## [1] 15 16
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(pc))$pc)

## [1] 5 3
car::qqPlot(subset(df4, careergoal == "STEM", select=c(pc))$pc)

## [1] 42 27
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(pc))$pc)

## [1] 18 32
bartlett.test(pc ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  pc by careergoal
## Bartlett's K-squared = 3.7307, df = 5, p-value = 0.5888
mod2 <- aov_ez(data = df4, id = "id", dv = "pc", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod2)
## Levene's Test for Homogeneity of Variance (center = center)
##        Df F value Pr(>F)
## group   5  0.4764  0.794
##       675
summary(mod2)
## Anova Table (Type 3 tests)
## 
## Response: pc
##            num Df den Df     MSE      F     ges   Pr(>F)   
## careergoal      5    675 0.49828 3.8203 0.02752 0.002025 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod2, specs = "careergoal")
##  careergoal   emmean     SE  df lower.CL upper.CL
##  AH             3.90 0.0457 675     3.81     3.99
##  N              3.89 0.0458 675     3.80     3.98
##  Non-STEM       3.72 0.1193 675     3.49     3.96
##  Other Health   4.08 0.0789 675     3.93     4.24
##  STEM           4.08 0.1089 675     3.86     4.29
##  Veterinary     3.60 0.1030 675     3.39     3.80
## 
## Confidence level used: 0.95
pairs(emmeans(mod2, specs = "careergoal"))
##  contrast                  estimate     SE  df t.ratio p.value
##  AH - N                     0.00358 0.0646 675  0.055  1.0000 
##  AH - (Non-STEM)            0.17606 0.1278 675  1.378  0.7402 
##  AH - Other Health         -0.18689 0.0912 675 -2.050  0.3154 
##  AH - STEM                 -0.17989 0.1181 675 -1.523  0.6494 
##  AH - Veterinary            0.30174 0.1126 675  2.679  0.0808 
##  N - (Non-STEM)             0.17248 0.1278 675  1.350  0.7569 
##  N - Other Health          -0.19047 0.0912 675 -2.088  0.2948 
##  N - STEM                  -0.18347 0.1181 675 -1.553  0.6299 
##  N - Veterinary             0.29816 0.1127 675  2.646  0.0878 
##  (Non-STEM) - Other Health -0.36295 0.1431 675 -2.537  0.1149 
##  (Non-STEM) - STEM         -0.35595 0.1616 675 -2.203  0.2375 
##  (Non-STEM) - Veterinary    0.12568 0.1576 675  0.797  0.9679 
##  Other Health - STEM        0.00699 0.1345 675  0.052  1.0000 
##  Other Health - Veterinary  0.48863 0.1297 675  3.766  0.0025 
##  STEM - Veterinary          0.48164 0.1499 675  3.213  0.0172 
## 
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=pc, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "**", y_position = 5.25) +
  geom_signif(comparisons = list(c("STEM","Veterinary")), annotations = "*", y_position = 5) +
  ylab("Science Identity (Performance/Competence)") +
  labs(caption = "* <.05, ** < .01, *** < .001")

# focus on interest last --------------------------------------------------
ggplot(df4, aes(sample = int1)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(int1))$int1)

## [1] 187 212
car::qqPlot(subset(df4, careergoal == "N", select=c(int1))$int1)

## [1] 216 156
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(int1))$int1)

## [1]  7 35
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(int1))$int1)

## [1] 27 68
car::qqPlot(subset(df4, careergoal == "STEM", select=c(int1))$int1)

## [1] 42 14
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(int1))$int1)

## [1] 18 32
bartlett.test(int1 ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  int1 by careergoal
## Bartlett's K-squared = 8.216, df = 5, p-value = 0.1447
leveneTest(int1 ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   5  0.6145 0.6888
##       675
df5 %>%  
  group_by(careergoal) %>%
  get_summary_stats(int1, type = "common")
## # A tibble: 7 x 11
##   careergoal   variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>        <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH           int1       239   2       5    4.5   1    4.33 0.701 0.045 0.089
## 2 N            int1       238   1       5    4.5   1    4.38 0.688 0.045 0.088
## 3 Other Health int1        80   2.5     5    5     1    4.51 0.593 0.066 0.132
## 4 STEM         int1        42   2       5    5     1    4.5  0.796 0.123 0.248
## 5 Veterinary   int1        47   2       5    4     1    4.28 0.728 0.106 0.214
## 6 Non-STEM     int1        35   1.5     5    4     0.5  4.03 0.84  0.142 0.288
## 7 <NA>         int1        41   2       5    4     0.5  4.04 0.778 0.121 0.245
df5 %>% kruskal_test(int1 ~ careergoal)
## # A tibble: 1 x 6
##   .y.       n statistic    df      p method        
## * <chr> <int>     <dbl> <int>  <dbl> <chr>         
## 1 int1    722      15.0     5 0.0104 Kruskal-Wallis
df5 %>% kruskal_effsize(int1 ~ careergoal)
## # A tibble: 1 x 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 int1    722  0.0140 eta2[H] small
df5 %>% dunn_test(int1 ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
##    .y.   group1     group2        n1    n2 statistic       p  p.adj p.adj.signif
##  * <chr> <chr>      <chr>      <int> <int>     <dbl>   <dbl>  <dbl> <chr>       
##  1 int1  AH         N            239   238     0.922 0.357   1      ns          
##  2 int1  AH         Other Hea~   239    80     1.92  0.0544  0.815  ns          
##  3 int1  AH         STEM         239    42     2.13  0.0330  0.495  ns          
##  4 int1  AH         Veterinary   239    47    -0.403 0.687   1      ns          
##  5 int1  AH         Non-STEM     239    35    -2.09  0.0365  0.548  ns          
##  6 int1  N          Other Hea~   238    80     1.27  0.204   1      ns          
##  7 int1  N          STEM         238    42     1.63  0.104   1      ns          
##  8 int1  N          Veterinary   238    47    -0.932 0.351   1      ns          
##  9 int1  N          Non-STEM     238    35    -2.56  0.0106  0.159  ns          
## 10 int1  Other Hea~ STEM          80    42     0.568 0.570   1      ns          
## 11 int1  Other Hea~ Veterinary    80    47    -1.70  0.0887  1      ns          
## 12 int1  Other Hea~ Non-STEM      80    35    -3.09  0.00198 0.0297 *           
## 13 int1  STEM       Veterinary    42    47    -1.98  0.0473  0.710  ns          
## 14 int1  STEM       Non-STEM      42    35    -3.21  0.00132 0.0197 *           
## 15 int1  Veterinary Non-STEM      47    35    -1.41  0.159   1      ns
ggplot(data = df5, aes(x=careergoal, y=int1, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.5) +
  ylab("Science Identity (Interest)") +
  labs(caption = "* <.05, ** < .01, *** < .001")

SALG

# pc verbal ---------------------------------------------------------------
ggplot(df4, aes(sample = pc_verb)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(pc_verb))$pc_verb)

## [1] 85 54
car::qqPlot(subset(df4, careergoal == "N", select=c(pc_verb))$pc_verb)

## [1] 129 134
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(pc_verb))$pc_verb)

## [1]  7 13
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(pc_verb))$pc_verb)

## [1] 76 39
car::qqPlot(subset(df4, careergoal == "STEM", select=c(pc_verb))$pc_verb)

## [1]  1 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(pc_verb))$pc_verb)

## [1] 29  7
bartlett.test(pc_verb ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  pc_verb by careergoal
## Bartlett's K-squared = 6.4157, df = 5, p-value = 0.2678
mod2.5 <- aov_ez(data = df4, id = "id", dv = "pc_verb", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod2.5)
## Levene's Test for Homogeneity of Variance (center = center)
##        Df F value Pr(>F)
## group   5  0.5391 0.7467
##       675
summary(mod2.5)
## Anova Table (Type 3 tests)
## 
## Response: pc_verb
##            num Df den Df     MSE      F     ges    Pr(>F)    
## careergoal      5    675 0.51939 5.0676 0.03618 0.0001438 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod2.5, specs = "careergoal")
##  careergoal   emmean     SE  df lower.CL upper.CL
##  AH             3.90 0.0466 675     3.81     3.99
##  N              3.84 0.0467 675     3.75     3.93
##  Non-STEM       4.08 0.1218 675     3.84     4.32
##  Other Health   4.22 0.0806 675     4.06     4.38
##  STEM           4.08 0.1112 675     3.86     4.30
##  Veterinary     3.69 0.1051 675     3.48     3.89
## 
## Confidence level used: 0.95
pairs(emmeans(mod2.5, specs = "careergoal"))
##  contrast                  estimate     SE  df t.ratio p.value
##  AH - N                     0.05923 0.0660 675  0.898  0.9471 
##  AH - (Non-STEM)           -0.17382 0.1304 675 -1.333  0.7668 
##  AH - Other Health         -0.31846 0.0931 675 -3.421  0.0086 
##  AH - STEM                 -0.17699 0.1206 675 -1.468  0.6850 
##  AH - Veterinary            0.21443 0.1150 675  1.865  0.4250 
##  N - (Non-STEM)            -0.23305 0.1305 675 -1.786  0.4753 
##  N - Other Health          -0.37770 0.0931 675 -4.055  0.0008 
##  N - STEM                  -0.23623 0.1206 675 -1.958  0.3676 
##  N - Veterinary             0.15519 0.1150 675  1.349  0.7573 
##  (Non-STEM) - Other Health -0.14464 0.1461 675 -0.990  0.9210 
##  (Non-STEM) - STEM         -0.00317 0.1649 675 -0.019  1.0000 
##  (Non-STEM) - Veterinary    0.38825 0.1609 675  2.413  0.1532 
##  Other Health - STEM        0.14147 0.1373 675  1.030  0.9078 
##  Other Health - Veterinary  0.53289 0.1325 675  4.023  0.0009 
##  STEM - Veterinary          0.39142 0.1530 675  2.558  0.1093 
## 
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=pc_verb, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("AH","Other Health")), annotations = "**", y_position = 5.25) +
  geom_signif(comparisons = list(c("N","Other Health")), annotations = "***", y_position = 5) +
  geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "***", y_position = 5.25) +
  ylab("Verbal Performance/Competence") +
  labs(caption = "* <.05, ** < .01, *** < .001")

# using interest2 ---------------------------------------------------------
ggplot(df4, aes(sample = int2)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(int2))$int2)

## [1] 216  29
car::qqPlot(subset(df4, careergoal == "N", select=c(int2))$int2)

## [1] 129 173
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(int2))$int2)

## [1] 24 15
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(int2))$int2)

## [1] 75 40
car::qqPlot(subset(df4, careergoal == "STEM", select=c(int2))$int2)

## [1]  1 27
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(int2))$int2)

## [1] 18  3
bartlett.test(int2 ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  int2 by careergoal
## Bartlett's K-squared = 11.731, df = 5, p-value = 0.03867
leveneTest(int2 ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   5  1.5892 0.1609
##       675
df5 %>%  
  group_by(careergoal) %>%
  get_summary_stats(int2, type = "common")
## # A tibble: 7 x 11
##   careergoal   variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>        <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH           int2       239  1.25     5   4.25 1      4.26 0.672 0.043 0.086
## 2 N            int2       238  1        5   4.25 1      4.28 0.666 0.043 0.085
## 3 Other Health int2        80  2.25     5   4.5  1      4.38 0.646 0.072 0.144
## 4 STEM         int2        42  1.75     5   4.62 1.25   4.23 0.881 0.136 0.275
## 5 Veterinary   int2        47  1.25     5   4    1.25   4.08 0.843 0.123 0.247
## 6 Non-STEM     int2        35  1.75     5   3.25 0.625  3.41 0.74  0.125 0.254
## 7 <NA>         int2        41  2        5   3.75 1      3.81 0.825 0.129 0.26
df5 %>% kruskal_test(int2 ~ careergoal)
## # A tibble: 1 x 6
##   .y.       n statistic    df            p method        
## * <chr> <int>     <dbl> <int>        <dbl> <chr>         
## 1 int2    722      43.7     5 0.0000000267 Kruskal-Wallis
df5 %>% kruskal_effsize(int2 ~ careergoal)
## # A tibble: 1 x 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 int2    722  0.0540 eta2[H] small
df5 %>% dunn_test(int2 ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
##    .y.   group1   group2      n1    n2 statistic        p     p.adj p.adj.signif
##  * <chr> <chr>    <chr>    <int> <int>     <dbl>    <dbl>     <dbl> <chr>       
##  1 int2  AH       N          239   238    0.327  7.44e- 1   1.00e+0 ns          
##  2 int2  AH       Other H~   239    80    1.44   1.51e- 1   1.00e+0 ns          
##  3 int2  AH       STEM       239    42    0.267  7.89e- 1   1.00e+0 ns          
##  4 int2  AH       Veterin~   239    47   -1.15   2.52e- 1   1.00e+0 ns          
##  5 int2  AH       Non-STEM   239    35   -5.89   3.83e- 9   5.74e-8 ****        
##  6 int2  N        Other H~   238    80    1.20   2.29e- 1   1.00e+0 ns          
##  7 int2  N        STEM       238    42    0.0882 9.30e- 1   1.00e+0 ns          
##  8 int2  N        Veterin~   238    47   -1.33   1.82e- 1   1.00e+0 ns          
##  9 int2  N        Non-STEM   238    35   -6.06   1.40e- 9   2.11e-8 ****        
## 10 int2  Other H~ STEM        80    42   -0.738  4.60e- 1   1.00e+0 ns          
## 11 int2  Other H~ Veterin~    80    47   -2.00   4.51e- 2   6.77e-1 ns          
## 12 int2  Other H~ Non-STEM    80    35   -6.18   6.58e-10   9.87e-9 ****        
## 13 int2  STEM     Veterin~    42    47   -1.07   2.84e- 1   1.00e+0 ns          
## 14 int2  STEM     Non-STEM    42    35   -4.85   1.21e- 6   1.82e-5 ****        
## 15 int2  Veterin~ Non-STEM    47    35   -3.96   7.60e- 5   1.14e-3 **
ggplot(data = df5, aes(x=careergoal, y=int2, fill=careergoal)) +
  geom_boxplot() +
  geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "***", y_position = 5) +
  geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "***", y_position = 5.25) +
  geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.5) +
  geom_signif(comparisons = list(c("STEM","Non-STEM")), annotations = "***", y_position = 5.75) +
  geom_signif(comparisons = list(c("Veterinary","Non-STEM")), annotations = "***", y_position = 6) +
  ylab("Science Interest") +
  labs(caption = "* <.05, ** < .01, *** < .001")

# sciapp and connect ------------------------------------------------------
ggplot(df4, aes(sample = sciapp)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(sciapp))$sciapp)

## [1] 228  39
car::qqPlot(subset(df4, careergoal == "N", select=c(sciapp))$sciapp)

## [1] 223 129
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(sciapp))$sciapp)

## [1]  2 33
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(sciapp))$sciapp)

## [1] 3 7
car::qqPlot(subset(df4, careergoal == "STEM", select=c(sciapp))$sciapp)

## [1] 23 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(sciapp))$sciapp)

## [1] 18 30
bartlett.test(sciapp ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  sciapp by careergoal
## Bartlett's K-squared = 5.9911, df = 5, p-value = 0.3071
mod3 <- aov_ez(data = df4, id = "id", dv = "sciapp", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod3)
## Levene's Test for Homogeneity of Variance (center = center)
##        Df F value Pr(>F)
## group   5  1.3666 0.2349
##       675
summary(mod3)
## Anova Table (Type 3 tests)
## 
## Response: sciapp
##            num Df den Df     MSE      F      ges Pr(>F)
## careergoal      5    675 0.45821 1.6775 0.012273 0.1378
ggplot(df4, aes(sample = connect)) +
  stat_qq() +
  facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(connect))$connect)

## [1] 42 85
car::qqPlot(subset(df4, careergoal == "N", select=c(connect))$connect)

## [1] 223  35
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(connect))$connect)

## [1] 14 22
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(connect))$connect)

## [1] 16  3
car::qqPlot(subset(df4, careergoal == "STEM", select=c(connect))$connect)

## [1]  1 18
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(connect))$connect)

## [1]  8 18
bartlett.test(connect ~ careergoal, data = df4)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  connect by careergoal
## Bartlett's K-squared = 7.7098, df = 5, p-value = 0.173
mod4 <- aov_ez(data = df4, id = "id", dv = "connect", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod4)
## Levene's Test for Homogeneity of Variance (center = center)
##        Df F value Pr(>F)
## group   5  1.5081  0.185
##       675
summary(mod4)
## Anova Table (Type 3 tests)
## 
## Response: connect
##            num Df den Df     MSE      F      ges  Pr(>F)  
## careergoal      5    675 0.48215 2.6083 0.018954 0.02389 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod4, specs = "careergoal")
##  careergoal   emmean     SE  df lower.CL upper.CL
##  AH             4.18 0.0449 675     4.10     4.27
##  N              4.19 0.0450 675     4.10     4.28
##  Non-STEM       3.86 0.1174 675     3.63     4.09
##  Other Health   4.26 0.0776 675     4.11     4.41
##  STEM           4.40 0.1071 675     4.19     4.62
##  Veterinary     4.16 0.1013 675     3.96     4.36
## 
## Confidence level used: 0.95
pairs(emmeans(mod4, specs = "careergoal"))
##  contrast                  estimate     SE  df t.ratio p.value
##  AH - N                    -0.00918 0.0636 675 -0.144  1.0000 
##  AH - (Non-STEM)            0.32696 0.1257 675  2.602  0.0982 
##  AH - Other Health         -0.07840 0.0897 675 -0.874  0.9526 
##  AH - STEM                 -0.22066 0.1162 675 -1.899  0.4034 
##  AH - Veterinary            0.02453 0.1108 675  0.221  0.9999 
##  N - (Non-STEM)             0.33613 0.1257 675  2.674  0.0818 
##  N - Other Health          -0.06922 0.0897 675 -0.771  0.9722 
##  N - STEM                  -0.21148 0.1162 675 -1.820  0.4536 
##  N - Veterinary             0.03370 0.1108 675  0.304  0.9997 
##  (Non-STEM) - Other Health -0.40536 0.1407 675 -2.881  0.0469 
##  (Non-STEM) - STEM         -0.54762 0.1589 675 -3.446  0.0079 
##  (Non-STEM) - Veterinary   -0.30243 0.1550 675 -1.951  0.3721 
##  Other Health - STEM       -0.14226 0.1323 675 -1.075  0.8912 
##  Other Health - Veterinary  0.10293 0.1276 675  0.807  0.9663 
##  STEM - Veterinary          0.24519 0.1474 675  1.663  0.5569 
## 
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df5, aes(x=careergoal, y=connect, fill=careergoal)) +
  geom_boxplot() +
  ylab("Connect Ideas") +
  labs(caption = "* <.05, ** < .01, *** < .001")