library(haven)
# Load the SPSS file into a tibble and preview its first six rows.
baby2.0 <- read_sav(file = "TheaWulff_Dataset 9.17.24Updated.sav")
head(baby2.0, n = 6)
## # A tibble: 6 × 39
##   FamilyID demo3mA demo46mA     demo47bmA demo47a4mA demo47cmA demo40mA demo41mA
##      <dbl>   <dbl> <dbl+lbl>    <dbl+lbl> <chr>      <dbl+lbl> <dbl+lb> <dbl+lb>
## 1        4      25 1 [Hispanic… 5 [White] ""         10 [Whit… 4 [Bach… NA      
## 2        6      24 0 [Not Hisp… 5 [White] ""          9 [Whit… 3 [Asso… NA      
## 3        8      36 0 [Not Hisp… 6 [Prefe… ""         11 [Self… 5 [Any …  3 [Emp…
## 4        9      31 0 [Not Hisp… 6 [Prefe… ""         11 [Self… 4 [Bach…  3 [Emp…
## 5       10      27 0 [Not Hisp… 2 [Asian] ""          3 [Asia… 5 [Any …  3 [Emp…
## 6       16      27 0 [Not Hisp… 5 [White] ""          9 [Whit… 4 [Bach…  3 [Emp…
## # ℹ 31 more variables: demo42mA <dbl+lbl>, demo44mA <dbl+lbl>, demo52mA <dbl>,
## #   sexmB <dbl+lbl>, DERStotmA <dbl>, RetentionC <dbl+lbl>, demo35mC <dbl+lbl>,
## #   demo36bmC <dbl+lbl>, demo36amC <chr>, demo36cmC <dbl+lbl>,
## #   ConsiderExcluding <dbl>, TsensmC1 <dbl>, TintrmC1 <dbl>, TdetmC1 <dbl>,
## #   TprmC1 <dbl>, TdistcC1 <dbl>, RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>,
## #   ITSEAintmD1 <dbl>, ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>,
## #   ITSEAextTmD1 <dbl>, ITSEAintTmD1 <dbl>, ITSEAdysTmD1 <dbl>, …
# Rename column for readability. 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# demo47bmA becomes MotherRace; all other columns are left untouched.
baby2.0 <- rename(baby2.0, MotherRace = demo47bmA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID demo3mA demo46mA    MotherRace demo47a4mA demo47cmA demo40mA demo41mA
##      <dbl>   <dbl> <dbl+lbl>   <dbl+lbl>  <chr>      <dbl+lbl> <dbl+lb> <dbl+lb>
## 1        4      25 1 [Hispani… 5 [White]  ""         10 [Whit… 4 [Bach… NA      
## 2        6      24 0 [Not His… 5 [White]  ""          9 [Whit… 3 [Asso… NA      
## 3        8      36 0 [Not His… 6 [Prefer… ""         11 [Self… 5 [Any …  3 [Emp…
## 4        9      31 0 [Not His… 6 [Prefer… ""         11 [Self… 4 [Bach…  3 [Emp…
## 5       10      27 0 [Not His… 2 [Asian]  ""          3 [Asia… 5 [Any …  3 [Emp…
## 6       16      27 0 [Not His… 5 [White]  ""          9 [Whit… 4 [Bach…  3 [Emp…
## # ℹ 31 more variables: demo42mA <dbl+lbl>, demo44mA <dbl+lbl>, demo52mA <dbl>,
## #   sexmB <dbl+lbl>, DERStotmA <dbl>, RetentionC <dbl+lbl>, demo35mC <dbl+lbl>,
## #   demo36bmC <dbl+lbl>, demo36amC <chr>, demo36cmC <dbl+lbl>,
## #   ConsiderExcluding <dbl>, TsensmC1 <dbl>, TintrmC1 <dbl>, TdetmC1 <dbl>,
## #   TprmC1 <dbl>, TdistcC1 <dbl>, RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>,
## #   ITSEAintmD1 <dbl>, ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>,
## #   ITSEAextTmD1 <dbl>, ITSEAintTmD1 <dbl>, ITSEAdysTmD1 <dbl>, …
# Count how many rows have no MotherRace value
sum(is.na(baby2.0[["MotherRace"]]))
## [1] 12
# Drop every row whose MotherRace is missing.
# NOTE: this overwrites baby2.0, so the dropped rows are also absent from
# every statistic computed further down the script.
baby2.0 <- filter(baby2.0, !is.na(MotherRace))

# Share (in percent) of mothers carrying each MotherRace code.
# mean() of a logical vector is the proportion of TRUE values.
RacePercentage <- summarise(
  baby2.0,
  AmericanIndian_percentage = 100 * mean(MotherRace == 1),
  Asian_percentage = 100 * mean(MotherRace == 2),
  PacificIslander_percentage = 100 * mean(MotherRace == 3),
  Black_percentage = 100 * mean(MotherRace == 4),
  White_percentage = 100 * mean(MotherRace == 5),
  SelfReport_percentage = 100 * mean(MotherRace == 6),
  MoreThanOneRace_percentage = 100 * mean(MotherRace == 7),
  DeclineToAnswer_percentage = 100 * mean(MotherRace == 99)
)

print(RacePercentage)
## # A tibble: 1 × 8
##   AmericanIndian_percentage Asian_percentage PacificIslander_percentage
##                       <dbl>            <dbl>                      <dbl>
## 1                      1.88             6.97                       1.88
## # ℹ 5 more variables: Black_percentage <dbl>, White_percentage <dbl>,
## #   SelfReport_percentage <dbl>, MoreThanOneRace_percentage <dbl>,
## #   DeclineToAnswer_percentage <dbl>
# Hand-entered summary table of race / ethnicity percentages.
# NOTE(review): these figures are typed in, not computed from baby2.0, and
# differ slightly from the RacePercentage output printed above (e.g. 1.89 vs
# 1.88) -- presumably they were taken after a later filtering step; confirm.
race <- c(
  "American Indian or Alaskan Native",
  "Asian",
  "Native Hawaiian or Other Pacific Islander",
  "Black or African American",
  "White",
  "Hispanic or Latinx",
  "Prefer to self-report",
  "More than 1 race"
)
percentage <- c(
  1.89,
  7.03,
  1.89,
  2.97,
  76.76,
  22.20,
  5.14,
  4.32
)

# Build the table column by column; labels and values pair up by position.
RaceTable <- data.frame(Race = race)
RaceTable$Percentage <- percentage

print(RaceTable)
##                                        Race Percentage
## 1         American Indian or Alaskan Native       1.89
## 2                                     Asian       7.03
## 3 Native Hawaiian or Other Pacific Islander       1.89
## 4                 Black or African American       2.97
## 5                                     White      76.76
## 6                        Hispanic or Latinx      22.20
## 7                     Prefer to self-report       5.14
## 8                          More than 1 race       4.32
# Average mother age
# Step 1: give demo3mA the descriptive name MotherAge, then preview the data.
library(dplyr)
baby2.0 <- rename(baby2.0, MotherAge = demo3mA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID MotherAge demo46mA  MotherRace demo47a4mA demo47cmA demo40mA demo41mA
##      <dbl>     <dbl> <dbl+lbl> <dbl+lbl>  <chr>      <dbl+lbl> <dbl+lb> <dbl+lb>
## 1        4        25 1 [Hispa… 5 [White]  ""         10 [Whit… 4 [Bach… NA      
## 2        6        24 0 [Not H… 5 [White]  ""          9 [Whit… 3 [Asso… NA      
## 3        8        36 0 [Not H… 6 [Prefer… ""         11 [Self… 5 [Any …  3 [Emp…
## 4        9        31 0 [Not H… 6 [Prefer… ""         11 [Self… 4 [Bach…  3 [Emp…
## 5       10        27 0 [Not H… 2 [Asian]  ""          3 [Asia… 5 [Any …  3 [Emp…
## 6       16        27 0 [Not H… 5 [White]  ""          9 [Whit… 4 [Bach…  3 [Emp…
## # ℹ 31 more variables: demo42mA <dbl+lbl>, demo44mA <dbl+lbl>, demo52mA <dbl>,
## #   sexmB <dbl+lbl>, DERStotmA <dbl>, RetentionC <dbl+lbl>, demo35mC <dbl+lbl>,
## #   demo36bmC <dbl+lbl>, demo36amC <chr>, demo36cmC <dbl+lbl>,
## #   ConsiderExcluding <dbl>, TsensmC1 <dbl>, TintrmC1 <dbl>, TdetmC1 <dbl>,
## #   TprmC1 <dbl>, TdistcC1 <dbl>, RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>,
## #   ITSEAintmD1 <dbl>, ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>,
## #   ITSEAextTmD1 <dbl>, ITSEAintTmD1 <dbl>, ITSEAdysTmD1 <dbl>, …
# Mean of MotherAge, ignoring rows where the age is missing.
average_MotherAge <- mean(baby2.0[["MotherAge"]], na.rm = TRUE)

print(average_MotherAge)
## [1] 29.38606
# Median household income
# Step 1: rename demo44mA to MedianIncome (a labelled <dbl+lbl> column), then
# preview the data.
library(dplyr)
baby2.0 <- rename(baby2.0, MedianIncome = demo44mA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID MotherAge demo46mA  MotherRace demo47a4mA demo47cmA demo40mA demo41mA
##      <dbl>     <dbl> <dbl+lbl> <dbl+lbl>  <chr>      <dbl+lbl> <dbl+lb> <dbl+lb>
## 1        4        25 1 [Hispa… 5 [White]  ""         10 [Whit… 4 [Bach… NA      
## 2        6        24 0 [Not H… 5 [White]  ""          9 [Whit… 3 [Asso… NA      
## 3        8        36 0 [Not H… 6 [Prefer… ""         11 [Self… 5 [Any …  3 [Emp…
## 4        9        31 0 [Not H… 6 [Prefer… ""         11 [Self… 4 [Bach…  3 [Emp…
## 5       10        27 0 [Not H… 2 [Asian]  ""          3 [Asia… 5 [Any …  3 [Emp…
## 6       16        27 0 [Not H… 5 [White]  ""          9 [Whit… 4 [Bach…  3 [Emp…
## # ℹ 31 more variables: demo42mA <dbl+lbl>, MedianIncome <dbl+lbl>,
## #   demo52mA <dbl>, sexmB <dbl+lbl>, DERStotmA <dbl>, RetentionC <dbl+lbl>,
## #   demo35mC <dbl+lbl>, demo36bmC <dbl+lbl>, demo36amC <chr>,
## #   demo36cmC <dbl+lbl>, ConsiderExcluding <dbl>, TsensmC1 <dbl>,
## #   TintrmC1 <dbl>, TdetmC1 <dbl>, TprmC1 <dbl>, TdistcC1 <dbl>,
## #   RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>, ITSEAintmD1 <dbl>,
## #   ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>, ITSEAextTmD1 <dbl>, …
# MedianIncome is a labelled (coded) variable, so median() returns the code of
# the middle respondent rather than a dollar amount.
median_income <- median(baby2.0$MedianIncome, na.rm = TRUE)

print(median_income)

# Report the value label attached to that code so the result can be read as an
# income bracket. haven stores value labels in the "labels" attribute as a
# named vector (names = label text, values = codes). The lookup gives NA when
# the median falls between two codes, which can happen with an even number of
# respondents.
income_labels <- attr(baby2.0$MedianIncome, "labels")
print(names(income_labels)[match(as.numeric(median_income), as.numeric(income_labels))])
## [1] 8
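## Interpretation: the 8 is a category code, not a dollar amount, because the
## income variable is stored as coded brackets. SPSS labels code 8 as
## $50,000-$79,999, so the median household falls in that income bracket.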
# Maternal education
# Step 1: rename demo40mA to Education, then preview the data.
library(dplyr)
baby2.0 <- rename(baby2.0, Education = demo40mA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID MotherAge demo46mA MotherRace demo47a4mA demo47cmA Education demo41mA
##      <dbl>     <dbl> <dbl+lb> <dbl+lbl>  <chr>      <dbl+lbl> <dbl+lbl> <dbl+lb>
## 1        4        25 1 [Hisp… 5 [White]  ""         10 [Whit… 4 [Bache… NA      
## 2        6        24 0 [Not … 5 [White]  ""          9 [Whit… 3 [Assoc… NA      
## 3        8        36 0 [Not … 6 [Prefer… ""         11 [Self… 5 [Any g…  3 [Emp…
## 4        9        31 0 [Not … 6 [Prefer… ""         11 [Self… 4 [Bache…  3 [Emp…
## 5       10        27 0 [Not … 2 [Asian]  ""          3 [Asia… 5 [Any g…  3 [Emp…
## 6       16        27 0 [Not … 5 [White]  ""          9 [Whit… 4 [Bache…  3 [Emp…
## # ℹ 31 more variables: demo42mA <dbl+lbl>, MedianIncome <dbl+lbl>,
## #   demo52mA <dbl>, sexmB <dbl+lbl>, DERStotmA <dbl>, RetentionC <dbl+lbl>,
## #   demo35mC <dbl+lbl>, demo36bmC <dbl+lbl>, demo36amC <chr>,
## #   demo36cmC <dbl+lbl>, ConsiderExcluding <dbl>, TsensmC1 <dbl>,
## #   TintrmC1 <dbl>, TdetmC1 <dbl>, TprmC1 <dbl>, TdistcC1 <dbl>,
## #   RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>, ITSEAintmD1 <dbl>,
## #   ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>, ITSEAextTmD1 <dbl>, …
# Percent of mothers whose Education code is 4 or 5 (bachelor's degree or any
# graduate school), computed among mothers with a recorded Education value.

education_4or5 <- baby2.0$Education

# %in% never returns NA, so missing responses are simply not selected here.
filtered_4or5 <- education_4or5[education_4or5 %in% c(4, 5)]

# Divide by the number of non-missing responses. Using length() here would
# silently count mothers with no Education answer as not holding a degree.
percentage <- (length(filtered_4or5) / sum(!is.na(education_4or5))) * 100

print(percentage)
## [1] 54.42359
# Hispanic/Latinx percentage
# Step 1: rename demo46mA to MotherHisp, then preview the data.
baby2.0 <- rename(baby2.0, MotherHisp = demo46mA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID MotherAge MotherHisp        MotherRace demo47a4mA demo47cmA Education
##      <dbl>     <dbl> <dbl+lbl>         <dbl+lbl>  <chr>      <dbl+lbl> <dbl+lbl>
## 1        4        25 1 [Hispanic or L… 5 [White]  ""         10 [Whit… 4 [Bache…
## 2        6        24 0 [Not Hispanic … 5 [White]  ""          9 [Whit… 3 [Assoc…
## 3        8        36 0 [Not Hispanic … 6 [Prefer… ""         11 [Self… 5 [Any g…
## 4        9        31 0 [Not Hispanic … 6 [Prefer… ""         11 [Self… 4 [Bache…
## 5       10        27 0 [Not Hispanic … 2 [Asian]  ""          3 [Asia… 5 [Any g…
## 6       16        27 0 [Not Hispanic … 5 [White]  ""          9 [Whit… 4 [Bache…
## # ℹ 32 more variables: demo41mA <dbl+lbl>, demo42mA <dbl+lbl>,
## #   MedianIncome <dbl+lbl>, demo52mA <dbl>, sexmB <dbl+lbl>, DERStotmA <dbl>,
## #   RetentionC <dbl+lbl>, demo35mC <dbl+lbl>, demo36bmC <dbl+lbl>,
## #   demo36amC <chr>, demo36cmC <dbl+lbl>, ConsiderExcluding <dbl>,
## #   TsensmC1 <dbl>, TintrmC1 <dbl>, TdetmC1 <dbl>, TprmC1 <dbl>,
## #   TdistcC1 <dbl>, RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>,
## #   ITSEAintmD1 <dbl>, ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>, …
# Count missing Hispanic/Latinx responses before they are filtered out below.
# (MotherRace was already cleaned earlier, so checking it here always gave 0
# and said nothing about MotherHisp.)
sum(is.na(baby2.0$MotherHisp))
## [1] 0
# Keep only mothers with a recorded Hispanic/Latinx response.
# NOTE: this overwrites baby2.0, so the dropped rows are gone for every later
# step as well.
baby2.0 <- filter(baby2.0, !is.na(MotherHisp))

# Find the percentage of mothers who are Hispanic or Latinx (MotherHisp == 1).
# This reuses, and therefore replaces, the RacePercentage object.
RacePercentage <- summarise(
  baby2.0,
  Hispanic_percentage = 100 * mean(MotherHisp == 1)
)
print(RacePercentage)
## # A tibble: 1 × 1
##   Hispanic_percentage
##                 <dbl>
## 1                22.2
# (The Hispanic/Latinx percentage is computed and printed once, directly above;
# a verbatim repeat of that summarise()/print() pair was removed here.)
# Find the percentage of mothers who are White and NOT Hispanic or Latinx
# Step 1: rename demo47cmA to MotherRace2, then preview the data.

baby2.0 <- rename(baby2.0, MotherRace2 = demo47cmA)
head(baby2.0)
## # A tibble: 6 × 39
##   FamilyID MotherAge MotherHisp      MotherRace demo47a4mA MotherRace2 Education
##      <dbl>     <dbl> <dbl+lbl>       <dbl+lbl>  <chr>      <dbl+lbl>   <dbl+lbl>
## 1        4        25 1 [Hispanic or… 5 [White]  ""         10 [White … 4 [Bache…
## 2        6        24 0 [Not Hispani… 5 [White]  ""          9 [White … 3 [Assoc…
## 3        8        36 0 [Not Hispani… 6 [Prefer… ""         11 [Self-r… 5 [Any g…
## 4        9        31 0 [Not Hispani… 6 [Prefer… ""         11 [Self-r… 4 [Bache…
## 5       10        27 0 [Not Hispani… 2 [Asian]  ""          3 [Asian … 5 [Any g…
## 6       16        27 0 [Not Hispani… 5 [White]  ""          9 [White … 4 [Bache…
## # ℹ 32 more variables: demo41mA <dbl+lbl>, demo42mA <dbl+lbl>,
## #   MedianIncome <dbl+lbl>, demo52mA <dbl>, sexmB <dbl+lbl>, DERStotmA <dbl>,
## #   RetentionC <dbl+lbl>, demo35mC <dbl+lbl>, demo36bmC <dbl+lbl>,
## #   demo36amC <chr>, demo36cmC <dbl+lbl>, ConsiderExcluding <dbl>,
## #   TsensmC1 <dbl>, TintrmC1 <dbl>, TdetmC1 <dbl>, TprmC1 <dbl>,
## #   TdistcC1 <dbl>, RetentionD1 <dbl+lbl>, ITSEAextmD1 <dbl>,
## #   ITSEAintmD1 <dbl>, ITSEAdysmD1 <dbl>, ITSEAcompmD1 <dbl>, …
# Percentages from the MotherRace2 column (renamed from demo47cmA), one output
# column per MotherRace2 code.
# Every comparison reads MotherRace2. Testing these codes against the race-only
# MotherRace column reports the plain race shares under the wrong names.
# na.rm = TRUE keeps a missing MotherRace2 value from turning a share into NA;
# each share is then taken among rows with a recorded MotherRace2 value.
# NOTE(review): codes 1-14 alternate NotHL (odd) / HL (even), but 15 and 16 are
# named HL / NotHL -- confirm against the SPSS value labels.
RacePercentage2 <- baby2.0 %>%
  summarise(
    AmericanIndianNotHL_percentage = mean(MotherRace2 == 1, na.rm = TRUE) * 100,
    AmericanIndianHL_percentage = mean(MotherRace2 == 2, na.rm = TRUE) * 100,
    AsianNotHL_percentage = mean(MotherRace2 == 3, na.rm = TRUE) * 100,
    AsianHL_percentage = mean(MotherRace2 == 4, na.rm = TRUE) * 100,
    PacificIslanderNotHL_percentage = mean(MotherRace2 == 5, na.rm = TRUE) * 100,
    PacificIslanderHL_percentage = mean(MotherRace2 == 6, na.rm = TRUE) * 100,
    BlackNotHL_percentage = mean(MotherRace2 == 7, na.rm = TRUE) * 100,
    BlackHL_percentage = mean(MotherRace2 == 8, na.rm = TRUE) * 100,
    WhiteNotHL_percentage = mean(MotherRace2 == 9, na.rm = TRUE) * 100,
    WhiteHL_percentage = mean(MotherRace2 == 10, na.rm = TRUE) * 100,
    SelfReportNotHL_percentage = mean(MotherRace2 == 11, na.rm = TRUE) * 100,
    SelfReportHL_percentage = mean(MotherRace2 == 12, na.rm = TRUE) * 100,
    MoreThanOneRaceNotHL_percentage = mean(MotherRace2 == 13, na.rm = TRUE) * 100,
    MoreThanOneRaceHL_percentage = mean(MotherRace2 == 14, na.rm = TRUE) * 100,
    NoRaceSelectedHL_percentage = mean(MotherRace2 == 15, na.rm = TRUE) * 100,
    NoRaceSelectedNotHL_percentage = mean(MotherRace2 == 16, na.rm = TRUE) * 100,
    DeclineToAnswer_percentage = mean(MotherRace2 == 99, na.rm = TRUE) * 100
  )

print(RacePercentage2)
## # A tibble: 1 × 17
##   AmericanIndianNotHL_percentage AmericanIndianHL_percen…¹ AsianNotHL_percentage
##                            <dbl>                     <dbl>                 <dbl>
## 1                           1.89                      7.03                  1.89
## # ℹ abbreviated name: ¹​AmericanIndianHL_percentage
## # ℹ 14 more variables: AsianHL_percentage <dbl>,
## #   PacificIslanderNotHL_percentage <dbl>, PacificIslanderHL_percentage <dbl>,
## #   BlackNotHL_percentage <dbl>, BlackHL_percentage <dbl>,
## #   WhiteNotHL_percentage <dbl>, WhiteHL_percentage <dbl>,
## #   SelfReportNotHL_percentage <dbl>, SelfReportHL_percentage <dbl>,
## #   MoreThanOneRaceNotHL_percentage <dbl>, …
# Share of mothers coded 9 in MotherRace2 (White, not Hispanic or Latinx).
mother_race_9 <- baby2.0$MotherRace2

# which() discards comparisons that evaluate to NA. Plain logical indexing
# would keep one NA element per missing value and inflate the count.
filtered_9 <- mother_race_9[which(mother_race_9 == 9)]

# Percentage among mothers with a recorded MotherRace2 value.
percentage_9 <- (length(filtered_9) / sum(!is.na(mother_race_9))) * 100

print(percentage_9)
## [1] 59.18919