# import data -------------------------------------------------------------
df <- read.csv(file="fa_data.csv", header=T, na.strings = c("#N/A","NA",""," "))
names(df)
## [1] "ID" "pre_FeltLikeSciencePerson"
## [3] "pre_SeeMyselfSciencePerson" "pre_FamilySeeSciencePerson"
## [5] "pre_InstructorSeeSciencePerson" "pre_PeerSeeSciencePerson"
## [7] "pre_EnjoyScience" "pre_InterestedScience"
## [9] "pre_UnderstandPreviousScience" "pre_UnderstandNewScience"
## [11] "pre_OvercomeSetbacks" "pre_ConfidentOutsideClass"
## [13] "pre_ConfidentExams" "pre_OthersAskHelp"
## [15] "pre_OutsideClassInSubject1" "pre_OutsideClassInSubject2"
## [17] "pre_RealWorldIssues" "pre_FindArticles"
## [19] "pre_CriticallyRead" "pre_IdentifyPatterns"
## [21] "pre_RecognizeArgument" "pre_DevelopArgument"
## [23] "pre_WriteDocuments" "pre_WorkWithOthers"
## [25] "pre_OralPresentation" "pre_Enthusiastic"
## [27] "pre_DiscussWithFriends" "pre_PlanningAdditionalClasses"
## [29] "pre_PursuringCareer" "pre_UnderstandSubject"
## [31] "pre_SucceedSubject" "pre_ComplexIdeas"
## [33] "pre_AskingForHelp" "pre_ConnectIdeas"
## [35] "pre_ApplyingOutsideClass" "pre_SystematicReasoning"
## [37] "pre_AnalyzingData" "pre_Course"
## [39] "pre_Ethnicity" "pre_UnlistedEthnicity"
## [41] "pre_Gender" "pre_Career.Goal"
## [43] "pre_OtherCareer" "pre_PreviousCourses"
## [45] "term" "pre_Gender2"
## [47] "freq"
# clean race/ethnicity ----------------------------------------------------
table(df$pre_Ethnicity)
##
## 0 African
## 3 3
## African American/Black An Ethnicity Which is not Listed
## 18 47
## Asian: Asian Indian Asian: Chinese
## 43 83
## Asian: Filipinx Asian: Japanese
## 86 13
## Asian: Korean Asian: Vietmese
## 15 43
## Asian: Vietnamese Latinx: Central American
## 84 30
## Latinx: Chicanx / Mexican Latinx: South American
## 161 15
## White: European White: Midde Eastern
## 162 24
## White: North African
## 7
unlisted <- data.frame(table(df$pre_UnlistedEthnicity))
unlisted$rvar <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[1]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[2]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[3]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[4]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[5]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[6]] <- "Native American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[7]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[8]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[9]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[10]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[11]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[12]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[13]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[14]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[16]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[17]] <- "Latinx: South American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[18]] <- "Asian: Southeast Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[19]] <- "Asian: Southeast Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[20]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[21]] <- "Latinx: Chicanx / Mexican"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[22]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[23]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[24]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[25]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[26]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[27]] <- "Hispanic"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[28]] <- "Hispanic"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[29]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[30]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[31]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[32]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[33]] <- "White: Middle Eastern"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[34]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[35]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[36]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[37]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[38]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[39]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[40]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[41]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[42]] <- "Native American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[43]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[44]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[45]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[46]] <- "No Response"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[47]] <- "Asian: Filipinx"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[48]] <- "White: European"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[49]] <- "Latinx: Central American"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[50]] <- "Asian: South Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[51]] <- "White: European"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[52]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[53]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[54]] <- "Asian: East Asian"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[55]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[56]] <- "Biracial"
unlisted$rvar[unlisted$Var1 == unlisted$Var1[57]] <- "Biracial"
table(unlisted$rvar, useNA = "always")
##
## Asian: East Asian Asian: Filipinx Asian: South Asian
## 4 3 8
## Asian: Southeast Asian Biracial Hispanic
## 2 19 2
## Latinx: Central American Latinx: Chicanx / Mexican Latinx: South American
## 3 1 1
## Native American No Response White: European
## 2 7 2
## White: Middle Eastern <NA>
## 3 0
table(df$pre_Ethnicity, useNA = "always")
##
## 0 African
## 3 3
## African American/Black An Ethnicity Which is not Listed
## 18 47
## Asian: Asian Indian Asian: Chinese
## 43 83
## Asian: Filipinx Asian: Japanese
## 86 13
## Asian: Korean Asian: Vietmese
## 15 43
## Asian: Vietnamese Latinx: Central American
## 84 30
## Latinx: Chicanx / Mexican Latinx: South American
## 161 15
## White: European White: Midde Eastern
## 162 24
## White: North African <NA>
## 7 9
df$pre_Ethnicity[is.na(df$pre_Ethnicity)] <- "No Response"
df2 <- merge(df, unlisted, by.x = "pre_UnlistedEthnicity", by.y = "Var1")
df2$pre_Ethnicity[df2$pre_Ethnicity == "An Ethnicity Which is not Listed"] <- df2$rvar[df2$pre_Ethnicity == "An Ethnicity Which is not Listed"]
df2$pre_Ethnicity[df2$pre_Ethnicity == "0"] <- df2$rvar[df2$pre_Ethnicity == "0"]
df2$pre_Ethnicity[df2$pre_Ethnicity == "Asian: Vietmese"] <- "Asian: Vietnamese"
df2$pre_Ethnicity[df2$pre_Ethnicity == "White: Midde Eastern"] <- "White: Middle Eastern"
table(df2$pre_Ethnicity, useNA = "always")
##
## African African American/Black Asian: Asian Indian
## 3 18 42
## Asian: Chinese Asian: East Asian Asian: Filipinx
## 83 4 87
## Asian: Japanese Asian: Korean Asian: South Asian
## 13 14 8
## Asian: Southeast Asian Asian: Vietnamese Biracial
## 3 127 15
## Hispanic Latinx: Central American Latinx: Chicanx / Mexican
## 3 32 161
## Latinx: South American Native American No Response
## 14 1 16
## White: European White: Middle Eastern White: North African
## 163 27 7
## <NA>
## 0
df2$eth_cond[df2$pre_Ethnicity == "African" |
df2$pre_Ethnicity == "African American/Black" |
df2$pre_Ethnicity == "Native American"] <- "Black & Indigenous"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Chinese" |
df2$pre_Ethnicity == "Asian: East Asian" |
df2$pre_Ethnicity == "Asian: Japanese" |
df2$pre_Ethnicity == "Asian: Korean"] <- "Asian: East Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Asian Indian" |
df2$pre_Ethnicity == "Asian: South Asian"] <- "Asian: South Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Filipinx" |
df2$pre_Ethnicity == "Asian: Southeast Asian" |
df2$pre_Ethnicity == "Asian: Vietnamese"] <- "Asian: Southeast Asian"
df2$eth_cond[df2$pre_Ethnicity == "Asian: Filipinx" |
df2$pre_Ethnicity == "Asian: Southeast Asian" |
df2$pre_Ethnicity == "Asian: Vietnamese"] <- "Asian: Southeast Asian"
df2$eth_cond[df2$pre_Ethnicity == "Hispanic" |
df2$pre_Ethnicity == "Latinx: Central American" |
df2$pre_Ethnicity == "Latinx: Chicanx / Mexican" |
df2$pre_Ethnicity == "Latinx: South American"] <- "Hispanic & Latino"
df2$eth_cond[df2$pre_Ethnicity == "White: European" |
df2$pre_Ethnicity == "White: Middle Eastern" |
df2$pre_Ethnicity == "White: North African"] <- "White"
df2$eth_cond[df2$pre_Ethnicity == "Biracial"] <- "Biracial"
df2$eth_cond[df2$pre_Ethnicity == "No Response"] <- "No Response"
table(df2$pre_Ethnicity, df2$eth_cond, useNA = "always")
##
## Asian: East Asian Asian: South Asian
## African 0 0
## African American/Black 0 0
## Asian: Asian Indian 0 42
## Asian: Chinese 83 0
## Asian: East Asian 4 0
## Asian: Filipinx 0 0
## Asian: Japanese 13 0
## Asian: Korean 14 0
## Asian: South Asian 0 8
## Asian: Southeast Asian 0 0
## Asian: Vietnamese 0 0
## Biracial 0 0
## Hispanic 0 0
## Latinx: Central American 0 0
## Latinx: Chicanx / Mexican 0 0
## Latinx: South American 0 0
## Native American 0 0
## No Response 0 0
## White: European 0 0
## White: Middle Eastern 0 0
## White: North African 0 0
## <NA> 0 0
##
## Asian: Southeast Asian Biracial Black & Indigenous
## African 0 0 3
## African American/Black 0 0 18
## Asian: Asian Indian 0 0 0
## Asian: Chinese 0 0 0
## Asian: East Asian 0 0 0
## Asian: Filipinx 87 0 0
## Asian: Japanese 0 0 0
## Asian: Korean 0 0 0
## Asian: South Asian 0 0 0
## Asian: Southeast Asian 3 0 0
## Asian: Vietnamese 127 0 0
## Biracial 0 15 0
## Hispanic 0 0 0
## Latinx: Central American 0 0 0
## Latinx: Chicanx / Mexican 0 0 0
## Latinx: South American 0 0 0
## Native American 0 0 1
## No Response 0 0 0
## White: European 0 0 0
## White: Middle Eastern 0 0 0
## White: North African 0 0 0
## <NA> 0 0 0
##
## Hispanic & Latino No Response White <NA>
## African 0 0 0 0
## African American/Black 0 0 0 0
## Asian: Asian Indian 0 0 0 0
## Asian: Chinese 0 0 0 0
## Asian: East Asian 0 0 0 0
## Asian: Filipinx 0 0 0 0
## Asian: Japanese 0 0 0 0
## Asian: Korean 0 0 0 0
## Asian: South Asian 0 0 0 0
## Asian: Southeast Asian 0 0 0 0
## Asian: Vietnamese 0 0 0 0
## Biracial 0 0 0 0
## Hispanic 3 0 0 0
## Latinx: Central American 32 0 0 0
## Latinx: Chicanx / Mexican 161 0 0 0
## Latinx: South American 14 0 0 0
## Native American 0 0 0 0
## No Response 0 16 0 0
## White: European 0 0 163 0
## White: Middle Eastern 0 0 27 0
## White: North African 0 0 7 0
## <NA> 0 0 0 0
table(df2$eth_cond, useNA = "always")
##
## Asian: East Asian Asian: South Asian Asian: Southeast Asian
## 114 50 217
## Biracial Black & Indigenous Hispanic & Latino
## 15 22 210
## No Response White <NA>
## 16 197 0
# clean gender ------------------------------------------------------------
table(df2$pre_Gender, useNA = "always")
##
## F Female M Male Nonbiry <NA>
## 322 169 126 65 1 158
table(df2$pre_Gender2, useNA = "always")
##
## B F M N <NA>
## 1 268 110 1 461
table(df2$pre_Gender, df2$pre_Gender2, useNA = "always")
##
## B F M N <NA>
## F 0 0 0 0 322
## Female 0 167 0 1 1
## M 0 0 0 0 126
## Male 0 0 65 0 0
## Nonbiry 1 0 0 0 0
## <NA> 0 101 45 0 12
df2$pre_Gender3 <- df2$pre_Gender
df2$pre_Gender3[is.na(df2$pre_Gender)] <- df2$pre_Gender2[is.na(df2$pre_Gender)]
df2$pre_Gender3[df2$pre_Gender3 == "Female"] <- "F"
df2$pre_Gender3[df2$pre_Gender3 == "Male"] <- "M"
df2$pre_Gender3[df2$pre_Gender3 == "Nonbiry"] <- "B"
df2$pre_Gender3[df2$pre_Gender2 == "N"] <- "B"
table(df2$pre_Gender3, useNA = "always")
##
## B F M <NA>
## 2 591 236 12
# clean career goal -------------------------------------------------------
table(df2$pre_Career.Goal)
##
## 0
## 4
## Biologist/Biomedical Researcher
## 11
## Counseling
## 10
## Dental Assisting/Dental Hygiene
## 58
## Dentist
## 8
## Dietician/Nutrition
## 10
## EMS/EMT/Paramedic
## 7
## Kinesiology/Sports Medicine/Athletic Training
## 19
## Medical School
## 51
## Nursing
## 271
## Occupatiol Therapy
## 2
## Occupational Therapy
## 9
## Other: Health/Science
## 64
## Other: Non-Health/Science
## 57
## Pharmacy Technologist/Pharmacist
## 13
## Physical Therapy
## 23
## Physician Assistant
## 22
## Public Health
## 10
## Radiology
## 83
## Respiratory Therapy
## 25
## Social Work
## 5
## Speech Therapy
## 3
## Teacher
## 9
## Veterinary Assistant/Veterinary Technology
## 56
## Veteriry Assistant/Veteriry Technology
## 7
df2$pre_Career <- NA
df2$pre_Career[df2$pre_Career.Goal == "Dental Assisting/Dental Hygiene" |
df2$pre_Career.Goal == "Dietician/Nutrition" |
df2$pre_Career.Goal == "EMS/EMT/Paramedic" |
df2$pre_Career.Goal == "Kinesiology/Sports Medicine/Athletic Training" |
df2$pre_Career.Goal == "Occupational Therapy" |
df2$pre_Career.Goal == "Physical Therapy" |
df2$pre_Career.Goal == "Physician Assistant" |
df2$pre_Career.Goal == "Radiology" |
df2$pre_Career.Goal == "Respiratory Therapy" |
df2$pre_Career.Goal == "Speech Therapy"] <- "AH"
df2$pre_Career[df2$pre_Career.Goal == "Nursing"] <- "N"
df2$pre_Career[df2$pre_Career.Goal == "Social Work" |
df2$pre_Career.Goal == "Teacher"] <- "Non-STEM"
df2$pre_Career[df2$pre_Career.Goal == "Public Health" |
df2$pre_Career.Goal == "Dentist" |
df2$pre_Career.Goal == "Medical School" |
df2$pre_Career.Goal == "Pharmacy Technologist/Pharmacist"] <- "Other Health"
df2$pre_Career[df2$pre_Career.Goal == "Veterinary Assistant/Veterinary Technology"] <- "Veterinary"
df2$pre_Career[df2$pre_Career.Goal == "Counseling" |
df2$pre_Career.Goal == "Biologist/Biomedical Researcher"] <- "STEM"
table(df2$pre_OtherCareer)
##
## Please Specify
## 1
## 0
## 6
## A career related to art and design
## 1
## Accountant
## 2
## also public health
## 1
## and public health
## 1
## Anesthesiologists
## 1
## Art and Gaming Design
## 1
## artist
## 1
## Biochemist
## 1
## Biochemistry
## 1
## Bioengineering
## 1
## Biomedical Engineer
## 1
## Business
## 2
## business
## 2
## Business Administration
## 1
## business administration
## 1
## Business Psychology
## 1
## Cardiology
## 1
## cardiovascular science
## 1
## Clinical Laboratory Scientist
## 1
## clinical psychologist
## 1
## College student
## 1
## communicating
## 1
## communications
## 1
## Communications
## 1
## Computer engineering
## 1
## Computer Engineering
## 2
## computer science
## 1
## Computer Science
## 2
## Conservation
## 1
## construction management
## 1
## Criminal Justice
## 1
## Criminology
## 1
## Data Science
## 1
## dental hygiene
## 1
## Dental Hygienist
## 1
## Dental School
## 1
## Dental School (DDS/DMD)
## 1
## Dentist
## 2
## Dentistry
## 1
## determatology
## 1
## Diagnostic medical Sonography
## 1
## Diagnostic Medical Sonography
## 1
## Dietetics
## 1
## dietician
## 1
## Echocardiogram
## 1
## ecology
## 1
## Economist
## 1
## Economist
## 1
## Electrical engineering
## 1
## Electrician
## 1
## Engineer, Chemical
## 1
## Engineering
## 5
## Financial Advisor or Economic Consultant within the STEM field
## 1
## Fire fighter
## 1
## Food science
## 1
## Government
## 1
## Graphic Design
## 1
## HealthCare Administration or Speech Therapy
## 1
## I already have an undergraduate degree in counseling and am mostly taking classes at Foothill to explore other science related fields.
## 1
## i dont know yet
## 1
## Industrial Design, can apply science
## 1
## Journalism
## 1
## Journalism or law
## 1
## Law
## 1
## Law Enforcement
## 1
## Marine Biology
## 2
## marketing/advertising
## 1
## Massage therapy
## 1
## Math or Business
## 1
## Medical Sonographer
## 1
## Military aviator
## 1
## MLT
## 1
## more related to computer programming
## 1
## Music Business
## 1
## N/A
## 2
## Nada
## 1
## not sure yet
## 1
## Nurse Practioner
## 2
## Nurse Practitioner
## 1
## Nursing first then NP or MD
## 1
## Nutrition
## 1
## Nutrition and Dietetics
## 1
## Optometry
## 2
## Optometry
## 1
## Or Radiology
## 1
## Or radiology
## 1
## or stay in biologics process development...
## 1
## PA
## 1
## Park Ranger
## 1
## Pharmacist
## 1
## Phlebotomy
## 1
## physical therapy
## 1
## Physician's Assistant
## 2
## Physician Assisstant
## 1
## Physician Assistant
## 2
## Physician Assistant
## 1
## Physician Assistant School
## 1
## Please Specify
## 679
## primatology/anthropology
## 1
## Prosthetist
## 1
## Psychiatry
## 2
## psychology
## 2
## Psychology
## 5
## Psychology
## 2
## Psychology and Public Health
## 1
## radiology
## 1
## Radiology
## 1
## Research in Dermatology
## 1
## Research Scientist
## 1
## Research, rehabilitation, conservation of marine animals
## 1
## Respiratory Therapy
## 1
## social and clinical psychology
## 1
## Sociology
## 1
## Software
## 1
## software engineer
## 1
## Software Engineer
## 1
## Software Engineering
## 1
## Something with sociology, but still unsure
## 1
## Sonography
## 2
## Sonography
## 1
## Sonography/Ultrasound technology
## 1
## Sports Marketing
## 1
## Statistician
## 1
## Theater Technician
## 1
## Therapist
## 1
## Unsure
## 1
## Veterinarian
## 1
## Veterinary school
## 1
df2$pre_Career[df2$pre_OtherCareer == "dental hygiene"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Dietetics"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Nutrition and Dietetics"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "PA"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician's Assistant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assisstant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assistant"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Physician Assistant School"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Radiology"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Respiratory Therapy"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Dental School (DDS/DMD)"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Dentist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Psychiatry"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Nurse Practioner"] <- "N"
df2$pre_Career[df2$pre_OtherCareer == "Pharmacist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Veterinarian"] <- "Veterinary"
df2$pre_Career[df2$pre_OtherCareer == "Veterinary school"] <- "Veterinary"
df2$pre_Career[df2$pre_OtherCareer == "Anesthesiologists"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Diagnostic medical Sonography"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Echocardiogram"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Massage therapy"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Medical Sonographer"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Optometry"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Prosthetist"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Sonography"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Biochemist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Bioengineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Biomedical Engineer"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "cardiovascular science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Clinical Laboratory Scientist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "clinical psychologist"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer Science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Conservation"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "ecology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineer, Chemical"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Industrial Design, can apply science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "social and clinical psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Software Engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Statistician"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "College student"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Criminology"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "HealthCare Administration or Speech Therapy"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Military aviator"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "MLT"] <- "AH"
df2$pre_Career[df2$pre_OtherCareer == "Park Ranger"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Psychology and Public Health"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Something that involves Physics or a Product designer"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Unsure"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Economist"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Electrician"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Financial Advisor or Economic Consultant within the STEM field"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Therapist"] <- "Other Health"
df2$pre_Career[df2$pre_OtherCareer == "Math or Business"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "Something with sociology, but still unsure"] <- NA
df2$pre_Career[df2$pre_OtherCareer == "A career related to art and design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Art and Gaming Design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "artist"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business Administration"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Business Psychology"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "communications"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "I already have an undergraduate degree in counseling and am mostly taking classes at Foothill to explore other science related fields."] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "i dont know yet"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Journalism"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Journalism or law"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Law"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Law Enforcement"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "marketing/advertising"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Sports Marketing"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Theater Technician"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Accountant"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Fire fighter"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Government"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Graphic Design"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "construction management"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Criminal Justice"] <- "Non-STEM"
df2$pre_Career[df2$pre_OtherCareer == "Biochemistry"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Computer engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "computer science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Data Science"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Electrical engineering"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "psychology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Research, rehabilitation, conservation of marine animals"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Sociology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Software"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "software engineer"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Marine Biology"] <- "STEM"
df2$pre_Career[df2$pre_OtherCareer == "Engineering"] <- "STEM"
table(df2$pre_Career, useNA = "always")
##
## AH N Non-STEM Other Health STEM Veterinary
## 279 272 39 88 54 58
## <NA>
## 51
# create composites -------------------------------------------------------
names(df2)
## [1] "pre_UnlistedEthnicity" "ID"
## [3] "pre_FeltLikeSciencePerson" "pre_SeeMyselfSciencePerson"
## [5] "pre_FamilySeeSciencePerson" "pre_InstructorSeeSciencePerson"
## [7] "pre_PeerSeeSciencePerson" "pre_EnjoyScience"
## [9] "pre_InterestedScience" "pre_UnderstandPreviousScience"
## [11] "pre_UnderstandNewScience" "pre_OvercomeSetbacks"
## [13] "pre_ConfidentOutsideClass" "pre_ConfidentExams"
## [15] "pre_OthersAskHelp" "pre_OutsideClassInSubject1"
## [17] "pre_OutsideClassInSubject2" "pre_RealWorldIssues"
## [19] "pre_FindArticles" "pre_CriticallyRead"
## [21] "pre_IdentifyPatterns" "pre_RecognizeArgument"
## [23] "pre_DevelopArgument" "pre_WriteDocuments"
## [25] "pre_WorkWithOthers" "pre_OralPresentation"
## [27] "pre_Enthusiastic" "pre_DiscussWithFriends"
## [29] "pre_PlanningAdditionalClasses" "pre_PursuringCareer"
## [31] "pre_UnderstandSubject" "pre_SucceedSubject"
## [33] "pre_ComplexIdeas" "pre_AskingForHelp"
## [35] "pre_ConnectIdeas" "pre_ApplyingOutsideClass"
## [37] "pre_SystematicReasoning" "pre_AnalyzingData"
## [39] "pre_Course" "pre_Ethnicity"
## [41] "pre_Gender" "pre_Career.Goal"
## [43] "pre_OtherCareer" "pre_PreviousCourses"
## [45] "term" "pre_Gender2"
## [47] "freq" "Freq"
## [49] "rvar" "eth_cond"
## [51] "pre_Gender3" "pre_Career"
df3 <- subset(df2, select=c(2,3:4,5:7,8:9,10:11,13:14,21:23,27:30,35:37,16:18,50,51,52))
names(df3)
## [1] "ID" "pre_FeltLikeSciencePerson"
## [3] "pre_SeeMyselfSciencePerson" "pre_FamilySeeSciencePerson"
## [5] "pre_InstructorSeeSciencePerson" "pre_PeerSeeSciencePerson"
## [7] "pre_EnjoyScience" "pre_InterestedScience"
## [9] "pre_UnderstandPreviousScience" "pre_UnderstandNewScience"
## [11] "pre_ConfidentOutsideClass" "pre_ConfidentExams"
## [13] "pre_IdentifyPatterns" "pre_RecognizeArgument"
## [15] "pre_DevelopArgument" "pre_Enthusiastic"
## [17] "pre_DiscussWithFriends" "pre_PlanningAdditionalClasses"
## [19] "pre_PursuringCareer" "pre_ConnectIdeas"
## [21] "pre_ApplyingOutsideClass" "pre_SystematicReasoning"
## [23] "pre_OutsideClassInSubject1" "pre_OutsideClassInSubject2"
## [25] "pre_RealWorldIssues" "eth_cond"
## [27] "pre_Gender3" "pre_Career"
rename_vars <- c("id",
paste("gen",1:2,sep=""),
paste("rec",1:3,sep=""),
paste("int",1:2,sep=""),
paste("pc",1:4,sep=""),
paste("pc_verb",1:3,sep=""),
paste("int2",1:4,sep=""),
paste("sciapp",1:3,sep=""),
paste("connect",1:3,sep=""),
"raceeth","gender","careergoal"
)
itemlist <- cbind(colnames(df3), rename_vars)
colnames(df3) <- rename_vars
head(df3)
## id gen1 gen2 rec1 rec2 rec3 int1 int2 pc1 pc2 pc3 pc4 pc_verb1 pc_verb2
## 1 33939702 5 5 5 5 5 5 5 5 4 3 4 4 5
## 2 33756918 4 5 4 4 4 5 5 5 5 5 5 5 5
## 3 61089066 5 5 5 5 5 5 5 5 5 5 5 5 5
## 4 60927867 1 5 5 5 5 5 5 5 5 5 5 5 5
## 5 61129203 3 4 4 3 4 4 4 4 4 4 4 4 4
## 6 61037895 5 5 5 4 4 5 5 5 5 5 4 5 5
## pc_verb3 int21 int22 int23 int24 sciapp1 sciapp2 sciapp3 connect1 connect2
## 1 3 5 5 5 NA 5 5 5 5 5
## 2 5 5 5 5 5 4 5 5 5 5
## 3 5 5 5 5 5 5 5 5 5 5
## 4 5 5 5 5 5 5 5 5 5 5
## 5 4 4 5 4 4 5 5 5 4 4
## 6 5 5 5 5 5 4 4 5 5 4
## connect3 raceeth gender careergoal
## 1 5 Asian: Southeast Asian F AH
## 2 5 No Response F N
## 3 5 Asian: East Asian M Other Health
## 4 4 White M N
## 5 4 White F AH
## 6 5 White F N
# load libraries ----------------------------------------------------------
library(afex)
## Loading required package: lme4
## Loading required package: Matrix
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## ************
## Welcome to afex. For support visit: http://afex.singmann.science/
## - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
## - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
## - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
## - NEWS: library('emmeans') now needs to be called explicitly!
## - Get and set global package options with: afex_options()
## - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
## - For example analyses see: browseVignettes("afex")
## ************
##
## Attaching package: 'afex'
## The following object is masked from 'package:lme4':
##
## lmer
library(emmeans)
library(psych)
library(DT)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(ggsignif)
## Warning: package 'ggsignif' was built under R version 4.0.5
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(Rmisc)
## Warning: package 'Rmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:rstatix':
##
## desc, mutate
library(afex)
library(emmeans)
# univariate normality ----------------------------------------------------
norm <- describe(df3[2:25])
datatable(norm) %>%
formatRound(1:13) %>%
formatStyle(11:12, color = styleInterval(c(-2, 2), c('red', 'black', 'red')))
hist(df3$connect3, breaks = 5)

# Outliers ----------------------------------------------------------------
names(df3)
## [1] "id" "gen1" "gen2" "rec1" "rec2"
## [6] "rec3" "int1" "int2" "pc1" "pc2"
## [11] "pc3" "pc4" "pc_verb1" "pc_verb2" "pc_verb3"
## [16] "int21" "int22" "int23" "int24" "sciapp1"
## [21] "sciapp2" "sciapp3" "connect1" "connect2" "connect3"
## [26] "raceeth" "gender" "careergoal"
data_imp <- subset(df3, select=c(1:24))
d1 <- na.omit(data_imp)
m_dist <- mahalanobis(d1[-1], colMeans(d1[-1]), cov(d1[-1]))
d1$MD <- round(m_dist, 1)
plot(d1$MD)
describe(m_dist)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 803 22.97 16.81 18.15 20.34 11.97 1.86 112.21 110.35 1.89 4.86
## se
## X1 0.59
cut <- qchisq(.99, df=(ncol(d1)-1))
abline(a=cut, b=0, col="red")

d1$outlier <- "No"
d1$outlier[d1$MD > cut] <- "Yes"
table(d1$outlier)
##
## No Yes
## 722 81
81/841
## [1] 0.09631391
names(d1)
## [1] "id" "gen1" "gen2" "rec1" "rec2" "rec3"
## [7] "int1" "int2" "pc1" "pc2" "pc3" "pc4"
## [13] "pc_verb1" "pc_verb2" "pc_verb3" "int21" "int22" "int23"
## [19] "int24" "sciapp1" "sciapp2" "sciapp3" "connect1" "connect2"
## [25] "MD" "outlier"
d2 <- subset(d1, outlier == "No", select=c(1,outlier))
df4 <- subset(df3, id %in% d2$id)
# prepare data ------------------------------------------------------------
df4$gen <- (df4$gen1 + df4$gen2)/2
df4$rec <- (df4$rec1 + df4$rec2 + df4$rec3)/3
df4$int1 <- (df4$int1 + df4$int2)/2
df4$pc <- (df4$pc1 + df4$pc2 + df4$pc3 + df4$pc4)/4
df4$pc_verb <- (df4$pc_verb1 + df4$pc_verb2 + df4$pc_verb3)/3
df4$int2 <- (df4$int21 + df4$int22 + df4$int23 + df4$int24)/4
df4$sciapp <- (df4$sciapp1 + df4$sciapp2 + df4$sciapp3)/3
df4$connect <- (df4$connect1 + df4$connect2)/2
head(df4)
## id gen1 gen2 rec1 rec2 rec3 int1 int2 pc1 pc2 pc3 pc4 pc_verb1 pc_verb2
## 2 33756918 4 5 4 4 4 5 5.00 5 5 5 5 5 5
## 3 61089066 5 5 5 5 5 5 5.00 5 5 5 5 5 5
## 4 60927867 1 5 5 5 5 5 5.00 5 5 5 5 5 5
## 5 61129203 3 4 4 3 4 4 4.25 4 4 4 4 4 4
## 6 61037895 5 5 5 4 4 5 5.00 5 5 5 4 5 5
## 7 60245619 4 4 3 3 4 4 5.00 4 4 4 4 4 4
## pc_verb3 int21 int22 int23 int24 sciapp1 sciapp2 sciapp3 connect1 connect2
## 2 5 5 5 5 5 4 5 5 5 5
## 3 5 5 5 5 5 5 5 5 5 5
## 4 5 5 5 5 5 5 5 5 5 5
## 5 4 4 5 4 4 5 5 5 4 4
## 6 5 5 5 5 5 4 4 5 5 4
## 7 4 5 5 5 5 4 4 4 5 5
## connect3 raceeth gender careergoal gen rec pc pc_verb
## 2 5 No Response F N 4.5 4.000000 5.00 5
## 3 5 Asian: East Asian M Other Health 5.0 5.000000 5.00 5
## 4 4 White M N 3.0 5.000000 5.00 5
## 5 4 White F AH 3.5 3.666667 4.00 4
## 6 5 White F N 5.0 4.333333 4.75 5
## 7 5 Hispanic & Latino F Other Health 4.0 3.333333 4.00 4
## sciapp connect
## 2 4.666667 5.0
## 3 5.000000 5.0
## 4 5.000000 5.0
## 5 5.000000 4.0
## 6 4.333333 4.5
## 7 4.000000 5.0
str(df4)
## 'data.frame': 722 obs. of 34 variables:
## $ id : int 33756918 61089066 60927867 61129203 61037895 60245619 61120473 61045896 60837294 60778827 ...
## $ gen1 : int 4 5 1 3 5 4 4 3 3 4 ...
## $ gen2 : int 5 5 5 4 5 4 5 4 3 3 ...
## $ rec1 : int 4 5 5 4 5 3 5 5 4 3 ...
## $ rec2 : int 4 5 5 3 4 3 4 3 3 3 ...
## $ rec3 : int 4 5 5 4 4 4 5 3 3 3 ...
## $ int1 : num 5 5 5 4 5 4 5 4 5 4 ...
## $ int2 : num 5 5 5 4.25 5 5 5 5 5 4 ...
## $ pc1 : int 5 5 5 4 5 4 4 4 5 3 ...
## $ pc2 : int 5 5 5 4 5 4 4 5 5 3 ...
## $ pc3 : int 5 5 5 4 5 4 5 4 5 3 ...
## $ pc4 : int 5 5 5 4 4 4 3 5 4 3 ...
## $ pc_verb1 : int 5 5 5 4 5 4 5 5 3 3 ...
## $ pc_verb2 : int 5 5 5 4 5 4 4 5 3 3 ...
## $ pc_verb3 : int 5 5 5 4 5 4 4 4 4 3 ...
## $ int21 : int 5 5 5 4 5 5 5 5 5 4 ...
## $ int22 : int 5 5 5 5 5 5 5 5 5 4 ...
## $ int23 : int 5 5 5 4 5 5 5 5 5 4 ...
## $ int24 : int 5 5 5 4 5 5 5 5 5 4 ...
## $ sciapp1 : int 4 5 5 5 4 4 5 4 4 3 ...
## $ sciapp2 : int 5 5 5 5 4 4 5 5 4 3 ...
## $ sciapp3 : int 5 5 5 5 5 4 5 4 4 3 ...
## $ connect1 : int 5 5 5 4 5 5 5 5 5 3 ...
## $ connect2 : int 5 5 5 4 4 5 5 4 4 3 ...
## $ connect3 : int 5 5 4 4 5 5 5 5 5 3 ...
## $ raceeth : chr "No Response" "Asian: East Asian" "White" "White" ...
## $ gender : chr "F" "M" "M" "F" ...
## $ careergoal: chr "N" "Other Health" "N" "AH" ...
## $ gen : num 4.5 5 3 3.5 5 4 4.5 3.5 3 3.5 ...
## $ rec : num 4 5 5 3.67 4.33 ...
## $ pc : num 5 5 5 4 4.75 4 4 4.5 4.75 3 ...
## $ pc_verb : num 5 5 5 4 5 ...
## $ sciapp : num 4.67 5 5 5 4.33 ...
## $ connect : num 5 5 5 4 4.5 5 5 4.5 4.5 3 ...
df4$raceeth <- as.factor(df4$raceeth)
df4$gender <- as.factor(df4$gender)
df4$careergoal <- as.factor(df4$careergoal)
# examine distribution of each dv by each group ---------------------------
# focus on general science identity first ---------------------------------
ggplot(df4, aes(sample = gen)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(gen))$gen)

## [1] 44 215
car::qqPlot(subset(df4, careergoal == "N", select=c(gen))$gen)

## [1] 49 129
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(gen))$gen)

## [1] 22 15
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(gen))$gen)

## [1] 8 69
car::qqPlot(subset(df4, careergoal == "STEM", select=c(gen))$gen)

## [1] 6 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(gen))$gen)

## [1] 8 44
bartlett.test(gen ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: gen by careergoal
## Bartlett's K-squared = 10.078, df = 5, p-value = 0.07306
leveneTest(gen ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 2.0374 0.07155 .
## 675
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df5 <- df4 %>%
reorder_levels(careergoal, order = c("AH","N","Other Health","STEM","Veterinary","Non-STEM"))
df5 %>%
group_by(careergoal) %>%
get_summary_stats(gen, type = "common")
## # A tibble: 7 x 11
## careergoal variable n min max median iqr mean sd se ci
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH gen 239 1 5 4 1 3.65 0.918 0.059 0.117
## 2 N gen 238 1 5 4 1.5 3.67 0.956 0.062 0.122
## 3 Other Health gen 80 1.5 5 4 1 3.92 0.823 0.092 0.183
## 4 STEM gen 42 1 5 4 2 3.77 1.20 0.184 0.373
## 5 Veterinary gen 47 1 5 3.5 1 3.37 1.07 0.155 0.313
## 6 Non-STEM gen 35 1 5 3 2 3.06 1.01 0.171 0.348
## 7 <NA> gen 41 1 5 3 1.5 3.17 0.885 0.138 0.279
df5 %>% kruskal_test(gen ~ careergoal)
## # A tibble: 1 x 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 gen 722 23.7 5 0.000248 Kruskal-Wallis
df5 %>% kruskal_effsize(gen ~ careergoal)
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 gen 722 0.0261 eta2[H] small
df5 %>% dunn_test(gen ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 gen AH N 239 238 0.397 6.92e-1 1.00e+0 ns
## 2 gen AH Other He~ 239 80 2.38 1.71e-2 2.56e-1 ns
## 3 gen AH STEM 239 42 1.34 1.79e-1 1.00e+0 ns
## 4 gen AH Veterina~ 239 47 -1.50 1.34e-1 1.00e+0 ns
## 5 gen AH Non-STEM 239 35 -3.12 1.78e-3 2.67e-2 *
## 6 gen N Other He~ 238 80 2.10 3.55e-2 5.32e-1 ns
## 7 gen N STEM 238 42 1.13 2.60e-1 1.00e+0 ns
## 8 gen N Veterina~ 238 47 -1.72 8.46e-2 1.00e+0 ns
## 9 gen N Non-STEM 238 35 -3.32 8.86e-4 1.33e-2 *
## 10 gen Other He~ STEM 80 42 -0.437 6.62e-1 1.00e+0 ns
## 11 gen Other He~ Veterina~ 80 47 -2.98 2.92e-3 4.37e-2 *
## 12 gen Other He~ Non-STEM 80 35 -4.31 1.63e-5 2.44e-4 ***
## 13 gen STEM Veterina~ 42 47 -2.18 2.90e-2 4.34e-1 ns
## 14 gen STEM Non-STEM 42 35 -3.45 5.54e-4 8.32e-3 **
## 15 gen Veterina~ Non-STEM 47 35 -1.46 1.44e-1 1.00e+0 ns
ggplot(data = df5, aes(x=careergoal, y=gen, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "*", y_position = 5) +
geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "*", y_position = 5.25) +
geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "*", y_position = 5.5) +
geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.75) +
geom_signif(comparisons = list(c("STEM","Non-STEM")), annotations = "**", y_position = 6) +
ylab("General Science Identity") +
labs(caption = "* <.05, ** < .01, *** < .001")

# focus on recognition second ---------------------------------------------
ggplot(df4, aes(sample = rec)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(rec))$rec)

## [1] 44 112
car::qqPlot(subset(df4, careergoal == "N", select=c(rec))$rec)

## [1] 49 61
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(rec))$rec)

## [1] 16 22
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(rec))$rec)

## [1] 32 69
car::qqPlot(subset(df4, careergoal == "STEM", select=c(rec))$rec)

## [1] 6 1
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(rec))$rec)

## [1] 8 44
bartlett.test(rec ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: rec by careergoal
## Bartlett's K-squared = 9.1596, df = 5, p-value = 0.1029
mod1 <- aov_ez(data = df4, id = "id", dv = "rec", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod1)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 5 2.3134 0.04242 *
## 675
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
## Anova Table (Type 3 tests)
##
## Response: rec
## num Df den Df MSE F ges Pr(>F)
## careergoal 5 675 0.76831 8.7611 0.060942 4.587e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod1, specs = "careergoal")
## careergoal emmean SE df lower.CL upper.CL
## AH 3.38 0.0567 675 3.27 3.49
## N 3.45 0.0568 675 3.33 3.56
## Non-STEM 2.90 0.1482 675 2.60 3.19
## Other Health 3.84 0.0980 675 3.65 4.03
## STEM 3.61 0.1353 675 3.35 3.88
## Veterinary 3.01 0.1279 675 2.76 3.27
##
## Confidence level used: 0.95
pairs(emmeans(mod1, specs = "careergoal"))
## contrast estimate SE df t.ratio p.value
## AH - N -0.0646 0.0803 675 -0.805 0.9666
## AH - (Non-STEM) 0.4855 0.1586 675 3.060 0.0277
## AH - Other Health -0.4609 0.1132 675 -4.071 0.0007
## AH - STEM -0.2304 0.1467 675 -1.571 0.6182
## AH - Veterinary 0.3666 0.1399 675 2.621 0.0936
## N - (Non-STEM) 0.5501 0.1587 675 3.467 0.0074
## N - Other Health -0.3963 0.1133 675 -3.498 0.0066
## N - STEM -0.1657 0.1467 675 -1.130 0.8690
## N - Veterinary 0.4312 0.1399 675 3.082 0.0260
## (Non-STEM) - Other Health -0.9464 0.1776 675 -5.328 <.0001
## (Non-STEM) - STEM -0.7159 0.2006 675 -3.568 0.0051
## (Non-STEM) - Veterinary -0.1189 0.1957 675 -0.608 0.9905
## Other Health - STEM 0.2306 0.1670 675 1.380 0.7389
## Other Health - Veterinary 0.8275 0.1611 675 5.137 <.0001
## STEM - Veterinary 0.5969 0.1861 675 3.207 0.0176
##
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=rec, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "*", y_position = 5.25) +
geom_signif(comparisons = list(c("Non-STEM","Other Health")), annotations = "***", y_position = 5.25) +
geom_signif(comparisons = list(c("AH","Other Health")), annotations = "***", y_position = 5.5) +
geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "***", y_position = 5) +
geom_signif(comparisons = list(c("STEM","Veterinary")), annotations = "*", y_position = 5) +
geom_signif(comparisons = list(c("N","Other Health")), annotations = "*", y_position = 5.75) +
geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "***", y_position = 5.75) +
geom_signif(comparisons = list(c("N","Veterinary")), annotations = "*", y_position = 6) +
geom_signif(comparisons = list(c("Non-STEM","STEM")), annotations = "**", y_position = 6.25) +
ylab("Science Identity (Recognition)") +
labs(caption = "* <.05, ** < .01, *** < .001")

# focus on pc third -------------------------------------------------------
ggplot(df4, aes(sample = pc)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(pc))$pc)

## [1] 187 212
car::qqPlot(subset(df4, careergoal == "N", select=c(pc))$pc)

## [1] 216 173
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(pc))$pc)

## [1] 15 16
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(pc))$pc)

## [1] 5 3
car::qqPlot(subset(df4, careergoal == "STEM", select=c(pc))$pc)

## [1] 42 27
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(pc))$pc)

## [1] 18 32
bartlett.test(pc ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: pc by careergoal
## Bartlett's K-squared = 3.7307, df = 5, p-value = 0.5888
mod2 <- aov_ez(data = df4, id = "id", dv = "pc", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod2)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 5 0.4764 0.794
## 675
summary(mod2)
## Anova Table (Type 3 tests)
##
## Response: pc
## num Df den Df MSE F ges Pr(>F)
## careergoal 5 675 0.49828 3.8203 0.02752 0.002025 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod2, specs = "careergoal")
## careergoal emmean SE df lower.CL upper.CL
## AH 3.90 0.0457 675 3.81 3.99
## N 3.89 0.0458 675 3.80 3.98
## Non-STEM 3.72 0.1193 675 3.49 3.96
## Other Health 4.08 0.0789 675 3.93 4.24
## STEM 4.08 0.1089 675 3.86 4.29
## Veterinary 3.60 0.1030 675 3.39 3.80
##
## Confidence level used: 0.95
pairs(emmeans(mod2, specs = "careergoal"))
## contrast estimate SE df t.ratio p.value
## AH - N 0.00358 0.0646 675 0.055 1.0000
## AH - (Non-STEM) 0.17606 0.1278 675 1.378 0.7402
## AH - Other Health -0.18689 0.0912 675 -2.050 0.3154
## AH - STEM -0.17989 0.1181 675 -1.523 0.6494
## AH - Veterinary 0.30174 0.1126 675 2.679 0.0808
## N - (Non-STEM) 0.17248 0.1278 675 1.350 0.7569
## N - Other Health -0.19047 0.0912 675 -2.088 0.2948
## N - STEM -0.18347 0.1181 675 -1.553 0.6299
## N - Veterinary 0.29816 0.1127 675 2.646 0.0878
## (Non-STEM) - Other Health -0.36295 0.1431 675 -2.537 0.1149
## (Non-STEM) - STEM -0.35595 0.1616 675 -2.203 0.2375
## (Non-STEM) - Veterinary 0.12568 0.1576 675 0.797 0.9679
## Other Health - STEM 0.00699 0.1345 675 0.052 1.0000
## Other Health - Veterinary 0.48863 0.1297 675 3.766 0.0025
## STEM - Veterinary 0.48164 0.1499 675 3.213 0.0172
##
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=pc, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "**", y_position = 5.25) +
geom_signif(comparisons = list(c("STEM","Veterinary")), annotations = "*", y_position = 5) +
ylab("Science Identity (Performance/Competence)") +
labs(caption = "* <.05, ** < .01, *** < .001")

# focus on interest last --------------------------------------------------
ggplot(df4, aes(sample = int1)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(int1))$int1)

## [1] 187 212
car::qqPlot(subset(df4, careergoal == "N", select=c(int1))$int1)

## [1] 216 156
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(int1))$int1)

## [1] 7 35
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(int1))$int1)

## [1] 27 68
car::qqPlot(subset(df4, careergoal == "STEM", select=c(int1))$int1)

## [1] 42 14
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(int1))$int1)

## [1] 18 32
bartlett.test(int1 ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: int1 by careergoal
## Bartlett's K-squared = 8.216, df = 5, p-value = 0.1447
leveneTest(int1 ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 0.6145 0.6888
## 675
df5 %>%
group_by(careergoal) %>%
get_summary_stats(int1, type = "common")
## # A tibble: 7 x 11
## careergoal variable n min max median iqr mean sd se ci
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH int1 239 2 5 4.5 1 4.33 0.701 0.045 0.089
## 2 N int1 238 1 5 4.5 1 4.38 0.688 0.045 0.088
## 3 Other Health int1 80 2.5 5 5 1 4.51 0.593 0.066 0.132
## 4 STEM int1 42 2 5 5 1 4.5 0.796 0.123 0.248
## 5 Veterinary int1 47 2 5 4 1 4.28 0.728 0.106 0.214
## 6 Non-STEM int1 35 1.5 5 4 0.5 4.03 0.84 0.142 0.288
## 7 <NA> int1 41 2 5 4 0.5 4.04 0.778 0.121 0.245
df5 %>% kruskal_test(int1 ~ careergoal)
## # A tibble: 1 x 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 int1 722 15.0 5 0.0104 Kruskal-Wallis
df5 %>% kruskal_effsize(int1 ~ careergoal)
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 int1 722 0.0140 eta2[H] small
df5 %>% dunn_test(int1 ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 int1 AH N 239 238 0.922 0.357 1 ns
## 2 int1 AH Other Hea~ 239 80 1.92 0.0544 0.815 ns
## 3 int1 AH STEM 239 42 2.13 0.0330 0.495 ns
## 4 int1 AH Veterinary 239 47 -0.403 0.687 1 ns
## 5 int1 AH Non-STEM 239 35 -2.09 0.0365 0.548 ns
## 6 int1 N Other Hea~ 238 80 1.27 0.204 1 ns
## 7 int1 N STEM 238 42 1.63 0.104 1 ns
## 8 int1 N Veterinary 238 47 -0.932 0.351 1 ns
## 9 int1 N Non-STEM 238 35 -2.56 0.0106 0.159 ns
## 10 int1 Other Hea~ STEM 80 42 0.568 0.570 1 ns
## 11 int1 Other Hea~ Veterinary 80 47 -1.70 0.0887 1 ns
## 12 int1 Other Hea~ Non-STEM 80 35 -3.09 0.00198 0.0297 *
## 13 int1 STEM Veterinary 42 47 -1.98 0.0473 0.710 ns
## 14 int1 STEM Non-STEM 42 35 -3.21 0.00132 0.0197 *
## 15 int1 Veterinary Non-STEM 47 35 -1.41 0.159 1 ns
ggplot(data = df5, aes(x=careergoal, y=int1, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.5) +
ylab("Science Identity (Interest)") +
labs(caption = "* <.05, ** < .01, *** < .001")

SALG
# pc verbal ---------------------------------------------------------------
ggplot(df4, aes(sample = pc_verb)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(pc_verb))$pc_verb)

## [1] 85 54
car::qqPlot(subset(df4, careergoal == "N", select=c(pc_verb))$pc_verb)

## [1] 129 134
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(pc_verb))$pc_verb)

## [1] 7 13
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(pc_verb))$pc_verb)

## [1] 76 39
car::qqPlot(subset(df4, careergoal == "STEM", select=c(pc_verb))$pc_verb)

## [1] 1 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(pc_verb))$pc_verb)

## [1] 29 7
bartlett.test(pc_verb ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: pc_verb by careergoal
## Bartlett's K-squared = 6.4157, df = 5, p-value = 0.2678
mod2.5 <- aov_ez(data = df4, id = "id", dv = "pc_verb", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod2.5)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 5 0.5391 0.7467
## 675
summary(mod2.5)
## Anova Table (Type 3 tests)
##
## Response: pc_verb
## num Df den Df MSE F ges Pr(>F)
## careergoal 5 675 0.51939 5.0676 0.03618 0.0001438 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod2.5, specs = "careergoal")
## careergoal emmean SE df lower.CL upper.CL
## AH 3.90 0.0466 675 3.81 3.99
## N 3.84 0.0467 675 3.75 3.93
## Non-STEM 4.08 0.1218 675 3.84 4.32
## Other Health 4.22 0.0806 675 4.06 4.38
## STEM 4.08 0.1112 675 3.86 4.30
## Veterinary 3.69 0.1051 675 3.48 3.89
##
## Confidence level used: 0.95
pairs(emmeans(mod2.5, specs = "careergoal"))
## contrast estimate SE df t.ratio p.value
## AH - N 0.05923 0.0660 675 0.898 0.9471
## AH - (Non-STEM) -0.17382 0.1304 675 -1.333 0.7668
## AH - Other Health -0.31846 0.0931 675 -3.421 0.0086
## AH - STEM -0.17699 0.1206 675 -1.468 0.6850
## AH - Veterinary 0.21443 0.1150 675 1.865 0.4250
## N - (Non-STEM) -0.23305 0.1305 675 -1.786 0.4753
## N - Other Health -0.37770 0.0931 675 -4.055 0.0008
## N - STEM -0.23623 0.1206 675 -1.958 0.3676
## N - Veterinary 0.15519 0.1150 675 1.349 0.7573
## (Non-STEM) - Other Health -0.14464 0.1461 675 -0.990 0.9210
## (Non-STEM) - STEM -0.00317 0.1649 675 -0.019 1.0000
## (Non-STEM) - Veterinary 0.38825 0.1609 675 2.413 0.1532
## Other Health - STEM 0.14147 0.1373 675 1.030 0.9078
## Other Health - Veterinary 0.53289 0.1325 675 4.023 0.0009
## STEM - Veterinary 0.39142 0.1530 675 2.558 0.1093
##
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df4, aes(x=careergoal, y=pc_verb, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("AH","Other Health")), annotations = "**", y_position = 5.25) +
geom_signif(comparisons = list(c("N","Other Health")), annotations = "***", y_position = 5) +
geom_signif(comparisons = list(c("Other Health","Veterinary")), annotations = "***", y_position = 5.25) +
ylab("Verbal Performance/Competence") +
labs(caption = "* <.05, ** < .01, *** < .001")

# using interest2 ---------------------------------------------------------
ggplot(df4, aes(sample = int2)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(int2))$int2)

## [1] 216 29
car::qqPlot(subset(df4, careergoal == "N", select=c(int2))$int2)

## [1] 129 173
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(int2))$int2)

## [1] 24 15
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(int2))$int2)

## [1] 75 40
car::qqPlot(subset(df4, careergoal == "STEM", select=c(int2))$int2)

## [1] 1 27
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(int2))$int2)

## [1] 18 3
bartlett.test(int2 ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: int2 by careergoal
## Bartlett's K-squared = 11.731, df = 5, p-value = 0.03867
leveneTest(int2 ~ careergoal, data = df4)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 1.5892 0.1609
## 675
df5 %>%
group_by(careergoal) %>%
get_summary_stats(int2, type = "common")
## # A tibble: 7 x 11
## careergoal variable n min max median iqr mean sd se ci
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH int2 239 1.25 5 4.25 1 4.26 0.672 0.043 0.086
## 2 N int2 238 1 5 4.25 1 4.28 0.666 0.043 0.085
## 3 Other Health int2 80 2.25 5 4.5 1 4.38 0.646 0.072 0.144
## 4 STEM int2 42 1.75 5 4.62 1.25 4.23 0.881 0.136 0.275
## 5 Veterinary int2 47 1.25 5 4 1.25 4.08 0.843 0.123 0.247
## 6 Non-STEM int2 35 1.75 5 3.25 0.625 3.41 0.74 0.125 0.254
## 7 <NA> int2 41 2 5 3.75 1 3.81 0.825 0.129 0.26
df5 %>% kruskal_test(int2 ~ careergoal)
## # A tibble: 1 x 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 int2 722 43.7 5 0.0000000267 Kruskal-Wallis
df5 %>% kruskal_effsize(int2 ~ careergoal)
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 int2 722 0.0540 eta2[H] small
df5 %>% dunn_test(int2 ~ careergoal, p.adjust.method = "bonferroni")
## # A tibble: 15 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 int2 AH N 239 238 0.327 7.44e- 1 1.00e+0 ns
## 2 int2 AH Other H~ 239 80 1.44 1.51e- 1 1.00e+0 ns
## 3 int2 AH STEM 239 42 0.267 7.89e- 1 1.00e+0 ns
## 4 int2 AH Veterin~ 239 47 -1.15 2.52e- 1 1.00e+0 ns
## 5 int2 AH Non-STEM 239 35 -5.89 3.83e- 9 5.74e-8 ****
## 6 int2 N Other H~ 238 80 1.20 2.29e- 1 1.00e+0 ns
## 7 int2 N STEM 238 42 0.0882 9.30e- 1 1.00e+0 ns
## 8 int2 N Veterin~ 238 47 -1.33 1.82e- 1 1.00e+0 ns
## 9 int2 N Non-STEM 238 35 -6.06 1.40e- 9 2.11e-8 ****
## 10 int2 Other H~ STEM 80 42 -0.738 4.60e- 1 1.00e+0 ns
## 11 int2 Other H~ Veterin~ 80 47 -2.00 4.51e- 2 6.77e-1 ns
## 12 int2 Other H~ Non-STEM 80 35 -6.18 6.58e-10 9.87e-9 ****
## 13 int2 STEM Veterin~ 42 47 -1.07 2.84e- 1 1.00e+0 ns
## 14 int2 STEM Non-STEM 42 35 -4.85 1.21e- 6 1.82e-5 ****
## 15 int2 Veterin~ Non-STEM 47 35 -3.96 7.60e- 5 1.14e-3 **
ggplot(data = df5, aes(x=careergoal, y=int2, fill=careergoal)) +
geom_boxplot() +
geom_signif(comparisons = list(c("AH","Non-STEM")), annotations = "***", y_position = 5) +
geom_signif(comparisons = list(c("N","Non-STEM")), annotations = "***", y_position = 5.25) +
geom_signif(comparisons = list(c("Other Health","Non-STEM")), annotations = "***", y_position = 5.5) +
geom_signif(comparisons = list(c("STEM","Non-STEM")), annotations = "***", y_position = 5.75) +
geom_signif(comparisons = list(c("Veterinary","Non-STEM")), annotations = "***", y_position = 6) +
ylab("Science Interest") +
labs(caption = "* <.05, ** < .01, *** < .001")

# sciapp and connect ------------------------------------------------------
ggplot(df4, aes(sample = sciapp)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(sciapp))$sciapp)

## [1] 228 39
car::qqPlot(subset(df4, careergoal == "N", select=c(sciapp))$sciapp)

## [1] 223 129
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(sciapp))$sciapp)

## [1] 2 33
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(sciapp))$sciapp)

## [1] 3 7
car::qqPlot(subset(df4, careergoal == "STEM", select=c(sciapp))$sciapp)

## [1] 23 42
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(sciapp))$sciapp)

## [1] 18 30
bartlett.test(sciapp ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: sciapp by careergoal
## Bartlett's K-squared = 5.9911, df = 5, p-value = 0.3071
mod3 <- aov_ez(data = df4, id = "id", dv = "sciapp", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod3)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 5 1.3666 0.2349
## 675
summary(mod3)
## Anova Table (Type 3 tests)
##
## Response: sciapp
## num Df den Df MSE F ges Pr(>F)
## careergoal 5 675 0.45821 1.6775 0.012273 0.1378
ggplot(df4, aes(sample = connect)) +
stat_qq() +
facet_wrap(~careergoal, scales = "free")

car::qqPlot(subset(df4, careergoal == "AH", select=c(connect))$connect)

## [1] 42 85
car::qqPlot(subset(df4, careergoal == "N", select=c(connect))$connect)

## [1] 223 35
car::qqPlot(subset(df4, careergoal == "Non-STEM", select=c(connect))$connect)

## [1] 14 22
car::qqPlot(subset(df4, careergoal == "Other Health", select=c(connect))$connect)

## [1] 16 3
car::qqPlot(subset(df4, careergoal == "STEM", select=c(connect))$connect)

## [1] 1 18
car::qqPlot(subset(df4, careergoal == "Veterinary", select=c(connect))$connect)

## [1] 8 18
bartlett.test(connect ~ careergoal, data = df4)
##
## Bartlett test of homogeneity of variances
##
## data: connect by careergoal
## Bartlett's K-squared = 7.7098, df = 5, p-value = 0.173
mod4 <- aov_ez(data = df4, id = "id", dv = "connect", between = c("careergoal"))
## Warning: Missing values for following ID(s):
## 30907380, 31849689, 31904526, 33978435, 34099323, 60031050, 60053202, 60484917, 60717393, 60783132, 60786471, 60820386, 60896259, 60920796, 60975375, 60984777, 60996597, 61012992, 61014561, 61031031, 61035267, 61037175, 61049004, 61054185, 61065885, 61066638, 61079886, 61094778, 61099134, 61107876, 61120572, 61132227, 61151703, 61152804, 61158807, 61163778, 61198677, 61221978, 61228515, 61252080, 722937306
## Removing those cases from the analysis.
## Contrasts set to contr.sum for the following variables: careergoal
test_levene(mod4)
## Levene's Test for Homogeneity of Variance (center = center)
## Df F value Pr(>F)
## group 5 1.5081 0.185
## 675
summary(mod4)
## Anova Table (Type 3 tests)
##
## Response: connect
## num Df den Df MSE F ges Pr(>F)
## careergoal 5 675 0.48215 2.6083 0.018954 0.02389 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
emmeans(mod4, specs = "careergoal")
## careergoal emmean SE df lower.CL upper.CL
## AH 4.18 0.0449 675 4.10 4.27
## N 4.19 0.0450 675 4.10 4.28
## Non-STEM 3.86 0.1174 675 3.63 4.09
## Other Health 4.26 0.0776 675 4.11 4.41
## STEM 4.40 0.1071 675 4.19 4.62
## Veterinary 4.16 0.1013 675 3.96 4.36
##
## Confidence level used: 0.95
pairs(emmeans(mod4, specs = "careergoal"))
## contrast estimate SE df t.ratio p.value
## AH - N -0.00918 0.0636 675 -0.144 1.0000
## AH - (Non-STEM) 0.32696 0.1257 675 2.602 0.0982
## AH - Other Health -0.07840 0.0897 675 -0.874 0.9526
## AH - STEM -0.22066 0.1162 675 -1.899 0.4034
## AH - Veterinary 0.02453 0.1108 675 0.221 0.9999
## N - (Non-STEM) 0.33613 0.1257 675 2.674 0.0818
## N - Other Health -0.06922 0.0897 675 -0.771 0.9722
## N - STEM -0.21148 0.1162 675 -1.820 0.4536
## N - Veterinary 0.03370 0.1108 675 0.304 0.9997
## (Non-STEM) - Other Health -0.40536 0.1407 675 -2.881 0.0469
## (Non-STEM) - STEM -0.54762 0.1589 675 -3.446 0.0079
## (Non-STEM) - Veterinary -0.30243 0.1550 675 -1.951 0.3721
## Other Health - STEM -0.14226 0.1323 675 -1.075 0.8912
## Other Health - Veterinary 0.10293 0.1276 675 0.807 0.9663
## STEM - Veterinary 0.24519 0.1474 675 1.663 0.5569
##
## P value adjustment: tukey method for comparing a family of 6 estimates
ggplot(data = df5, aes(x=careergoal, y=connect, fill=careergoal)) +
geom_boxplot() +
ylab("Connect Ideas") +
labs(caption = "* <.05, ** < .01, *** < .001")
