package requirements

library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.4
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(psych)
## Warning: package 'psych' was built under R version 3.4.4

read in datasets

# Load the two CSV extracts that are joined by ID further down.
# NOTE(review): both paths are absolute and machine-specific - confirm they
# exist before running this script on another computer.
d.1 <- read.csv(
  file = "C:/Users/wvillano/Downloads/addhealth part 1.csv"
)
d.2 <- read.csv(
  file = "C:/Users/wvillano/Downloads/addhealth part 2.csv"
)

merge datasets by ID

# Align d.2 to d.1 by ID so that row k of `Append` belongs to row k of d.1.
#
# match() gives, for every row of d.1, the position of the first d.2 row with
# the same ID (NA when d.2 has no such ID). Indexing d.2 with that vector
# returns exactly nrow(d.1) rows in d.1's order, with all-NA rows for IDs that
# are absent from d.2. This replaces an rbind()-inside-a-loop, which copied the
# growing table on every iteration and, because it walked unique(d.1$ID), could
# fall out of step with d.1 whenever an ID was duplicated in either file.
id_lookup <- match(d.1$ID, d.2$ID)
# a missing ID in d.1 must never pair with a missing ID in d.2
id_lookup[is.na(d.1$ID)] <- NA
Append <- d.2[id_lookup, , drop = FALSE]
# drop the row names inherited from d.2 so d.combined gets plain 1..n names
rownames(Append) <- NULL
# 1 = no d.2 record for this d.1 row, 0 = record found
missing_index <- as.integer(is.na(id_lookup))
# any missing data in this lookup?
if (sum(missing_index) > 0) {
  print(paste0("Missing ", sum(missing_index)," rows of data."))
}
# Only the first d.2 row per ID is used; say so instead of failing later.
if (anyDuplicated(d.2$ID[!is.na(d.2$ID)]) > 0) {
  warning("d.2 contains duplicated IDs; only the first row per ID was merged.",
          call. = FALSE)
}
# combine d.1 and d.2 -> d.combined (d.1 columns first, then d.2 columns)
d.combined <- cbind(d.1, Append)

clean income data

# preallocate categorical income variable w/ NAs
# Bin Income into five bands and store the band number as a factor:
#   1: below 20,000
#   2: 20,000 up to (not including) 50,000
#   3: 50,000 up to (not including) 75,000
#   4: 75,000 up to and including 100,000
#   5: above 100,000
# Rows with a missing Income keep NA.
income_values <- d.combined$Income
# findInterval() counts how many of the lower cut points each value has
# reached (0 to 3); adding 1 turns that into bands 1 to 4, so everything from
# 75,000 upwards starts out in band 4.
income_band <- findInterval(income_values, c(20000, 50000, 75000)) + 1
# values strictly above 100,000 move up to the top band
income_band[which(income_values > 100000)] <- 5
# convert variable to factor
d.combined$income_cat <- as.factor(income_band)

clean ethnicity data

# preallocate ethnicity factor variable w/ NAs
# Recode the Ethnicity codes 1-5 into a labelled factor. The position of a
# label in this vector is the code it stands for:
#   1 Latino, 2 African American, 3 Asian American, 4 American Indian, 5 White
ethnicity_labels <- c("Latino", "African American", "Asian American",
                      "American Indian", "White")
# match() returns the code's position (or NA for any other or missing code),
# which then selects the label; unmatched rows stay NA.
ethnicity_code <- match(d.combined$Ethnicity, seq_along(ethnicity_labels))
# convert variable to factor
d.combined$ethnicity_factor <- as.factor(ethnicity_labels[ethnicity_code])

clean sex data

# preallocate sex factor variable w/ NAs
# Recode Sex (0 = Male, 1 = Female) into a labelled factor.
# Any other or missing code has no match and therefore stays NA.
sex_codes <- c(0, 1)
sex_labels <- c("Male", "Female")
# convert variable to factor
d.combined$sex_factor <- as.factor(sex_labels[match(d.combined$Sex, sex_codes)])

compute self-esteem scale

# convert self-esteem items that are not on 1-5 scale to NA (missing data)
# Self-esteem items: every column whose name contains the literal text "PF".
# The earlier pattern "PF*" was read as a regular expression ("P followed by
# zero or more F") and therefore also selected any column that merely
# contained a "P".
se_cols <- grep("PF", colnames(d.combined), fixed = TRUE)
# Responses above 5 are off the 1-5 scale and are recoded to NA (missing).
# which() ignores cells that are already NA; the former cell-by-cell
# `if (value > 5)` stopped the script on the first NA it met.
for (col in se_cols) {
  item <- d.combined[[col]]
  item[which(item > 5)] <- NA
  d.combined[[col]] <- item
}
# no reverse coding in self-esteem scale - the score is the MEAN of the
# answered items (rows with no answered item get NaN).
# drop = FALSE keeps a data frame even if only one item column is found.
d.combined$SE_score <- rowMeans(d.combined[, se_cols, drop = FALSE], na.rm = TRUE)

compute depression scale

# convert self-esteem items that are not on 0-3 scale to NA (missing data)
# Depression items: every column whose name contains "FS".
dep_cols <- grep("FS", colnames(d.combined))
# Responses above 3 are off the 0-3 scale and are recoded to NA (missing).
# which() ignores cells that are already NA; the former cell-by-cell
# `if (value > 3)` stopped the script on the first NA it met.
for (col in dep_cols) {
  item <- d.combined[[col]]
  item[which(item > 3)] <- NA
  d.combined[[col]] <- item
}
# handle reverse coded items (4, 8, 11, 15): on a 0-3 scale, 3 - x flips the
# direction of the response; NA stays NA.
for (reversed_item in c("FS4", "FS8", "FS11", "FS15")) {
  d.combined[[reversed_item]] <- 3 - d.combined[[reversed_item]]
}
# the score is the MEAN of the answered items (rows with no answered item get
# NaN). drop = FALSE keeps a data frame even if only one item column is found.
d.combined$DEP_score <- rowMeans(d.combined[, dep_cols, drop = FALSE], na.rm = TRUE)

compute age at assessment

# convert dates to Posix format
# Parse the month/day/year strings into POSIXct. The zone is pinned to UTC so
# that every date is a valid midnight: in the session's local zone a
# daylight-saving change can shift the gap between two midnights by an hour
# and, on some platforms, make a midnight unparseable.
d.combined$BirthDate_posix <- as.POSIXct(as.character(d.combined$BirthDate), format = "%m/%d/%Y", tz = "UTC")
d.combined$SurveyDate_posix <- as.POSIXct(as.character(d.combined$SurveyDate), format = "%m/%d/%Y", tz = "UTC")
# Age in calendar years (with fraction) at the survey date. Dividing a day
# count by 365 ignores leap days and overstates age by about one day for every
# four years lived; measuring the interval in years accounts for year length.
d.combined$age <- time_length(
  interval(d.combined$BirthDate_posix, d.combined$SurveyDate_posix),
  unit = "years"
)

assign Fall and Spring semester categories

# preallocate semester factor variable w/ NAs
# Classify each survey date into a semester by calendar month:
#   Spring = January-June, Fall = July-December.
# The earlier bounds (month < 6 for Spring, month >= 7 for Fall) left June in
# neither group, so June surveys were silently excluded from every table.
# NOTE(review): June is assigned to Spring here - confirm that is the intended
# academic calendar.
survey_month <- month(d.combined$SurveyDate_posix)
# preallocate semester factor variable w/ NAs (unparseable dates stay NA)
d.combined$semester_factor <- rep(NA, nrow(d.combined))
# assign semester categories
d.combined$semester_factor[which(survey_month >= 1 & survey_month <= 6)] <- "Spring"
d.combined$semester_factor[which(survey_month >= 7 & survey_month <= 12)] <- "Fall"
# convert to factor
d.combined$semester_factor <- as.factor(d.combined$semester_factor)

generate summary table

row_names <- c("Male", "Female", "White", "Latino", "African American", "Asian American", "American Indian", "< $20,000", "$20,000-$49,999", "$50,000-$74,999", "$75,000-$100,000", "> $100,000")

# Count the rows of one semester's data in each demographic category, in the
# same order as `row_names`: sex (2), ethnicity (5), income band (5).
# na.rm = TRUE is applied to every count; previously only the ethnicity counts
# had it, so a single missing sex or income value turned those cells into NA.
count_categories <- function(semester_data) {
  c(
    sum(semester_data$sex_factor == "Male", na.rm = TRUE),
    sum(semester_data$sex_factor == "Female", na.rm = TRUE),
    sum(semester_data$ethnicity_factor == "White", na.rm = TRUE),
    sum(semester_data$ethnicity_factor == "Latino", na.rm = TRUE),
    sum(semester_data$ethnicity_factor == "African American", na.rm = TRUE),
    sum(semester_data$ethnicity_factor == "Asian American", na.rm = TRUE),
    sum(semester_data$ethnicity_factor == "American Indian", na.rm = TRUE),
    sum(semester_data$income_cat == 1, na.rm = TRUE),
    sum(semester_data$income_cat == 2, na.rm = TRUE),
    sum(semester_data$income_cat == 3, na.rm = TRUE),
    sum(semester_data$income_cat == 4, na.rm = TRUE),
    sum(semester_data$income_cat == 5, na.rm = TRUE)
  )
}

# which() drops rows whose semester is NA instead of returning all-NA rows
Spring_data <- d.combined[which(d.combined$semester_factor == "Spring"),]
Spring_data.num <- count_categories(Spring_data)
# percentages use all rows of the semester as the denominator, including rows
# with a missing value for the variable in question
Spring_data.Percentages <- round(Spring_data.num / nrow(Spring_data) * 100,2)

Fall_data <- d.combined[which(d.combined$semester_factor == "Fall"),]
Fall_data.num <- count_categories(Fall_data)
Fall_data.Percentages <- round(Fall_data.num / nrow(Fall_data) * 100,2)

# table part 1: gender, ethnicity, parent income
Summary_table <- data.frame(Spring_data.num, Spring_data.Percentages, Fall_data.num, Fall_data.Percentages)
rownames(Summary_table) <- row_names


# table part 2: age, depression score, self-esteem score

# Means and ranges of age, depression score and self-esteem score per
# semester. Each *.M vector holds three means in that order; each *.Range
# vector holds six values: (min, max) for age, then depression, then
# self-esteem.
# na.rm = TRUE is required: age is NA when a date fails to parse, and the two
# scale scores are NaN for respondents with no valid item, either of which
# would otherwise turn the whole statistic into NA/NaN.
Spring.M <- c(round(mean(Spring_data$age, na.rm = TRUE), 2),
              round(mean(Spring_data$DEP_score, na.rm = TRUE), 2),
              round(mean(Spring_data$SE_score, na.rm = TRUE), 2))
Spring.Range <- c(round(range(Spring_data$age, na.rm = TRUE), 2),
                  round(range(Spring_data$DEP_score, na.rm = TRUE), 2),
                  round(range(Spring_data$SE_score, na.rm = TRUE), 2))

Fall.M <- c(round(mean(Fall_data$age, na.rm = TRUE), 2),
            round(mean(Fall_data$DEP_score, na.rm = TRUE), 2),
            round(mean(Fall_data$SE_score, na.rm = TRUE), 2))
Fall.Range <- c(round(range(Fall_data$age, na.rm = TRUE), 2),
                round(range(Fall_data$DEP_score, na.rm = TRUE), 2),
                round(range(Fall_data$SE_score, na.rm = TRUE), 2))

# show the demographic table (part 1)
print(Summary_table)
##                  Spring_data.num Spring_data.Percentages Fall_data.num
## Male                         220                   51.89           239
## Female                       204                   48.11           249
## White                        134                   31.60           167
## Latino                        67                   15.80            72
## African American             108                   25.47           111
## Asian American                55                   12.97            64
## American Indian               59                   13.92            73
## < $20,000                     66                   15.57           103
## $20,000-$49,999              124                   29.25           134
## $50,000-$74,999              105                   24.76           109
## $75,000-$100,000             110                   25.94           121
## > $100,000                    19                    4.48            21
##                  Fall_data.Percentages
## Male                             48.98
## Female                           51.02
## White                            34.22
## Latino                           14.75
## African American                 22.75
## Asian American                   13.11
## American Indian                  14.96
## < $20,000                        21.11
## $20,000-$49,999                  27.46
## $50,000-$74,999                  22.34
## $75,000-$100,000                 24.80
## > $100,000                        4.30