package requirements
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.4
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
read in datasets
# Folder that holds both addhealth CSV files. Change this single value to run
# the script against a copy of the data stored elsewhere.
data_dir <- "C:/Users/wvillano/Downloads"
# The two parts are read separately; they are combined by ID further down.
d.1 <- read.csv(file.path(data_dir, "addhealth part 1.csv"))
d.2 <- read.csv(file.path(data_dir, "addhealth part 2.csv"))
merge datasets by ID
# For every row of d.1, find the row of d.2 that carries the same ID.
# match() returns the position of the first matching ID in d.2, or NA when the
# ID is absent, so the lookup always has exactly one entry per d.1 row. This
# replaces a loop that grew Append with rbind() on every iteration and could
# end up with a different number of rows than d.1 (duplicated IDs), which
# makes the cbind() below fail.
id_lookup <- match(d.1$ID, d.2$ID)
# Indexing a data frame with NA produces an all-NA row, which stands in for
# the d.2 data of IDs that were not found.
Append <- d.2[id_lookup, , drop = FALSE]
rownames(Append) <- NULL
# 1 = no d.2 row for this ID, 0 = row found
missing_index <- as.numeric(is.na(id_lookup))
# any missing data in this lookup?
if (sum(missing_index) > 0) {
  print(paste0("Missing ", sum(missing_index), " rows of data."))
}
# combine d.1 and d.2 -> d.combined (rows are aligned by the lookup above)
d.combined <- cbind(d.1, Append)
clean income data
# Bin Income into five brackets coded 1 (lowest) to 5 (highest); rows with a
# missing Income stay NA.
income_values <- d.combined$Income
# findInterval() returns 0-3 for the left-closed cut points 20000 / 50000 /
# 75000, so adding 1 gives codes 1-4. Incomes strictly above 100000 are bumped
# one step further to code 5, which keeps exactly 100000 in code 4.
income_code <- findInterval(income_values, c(20000, 50000, 75000)) + 1 +
  (income_values > 100000)
# store the codes as a factor
d.combined$income_cat <- as.factor(income_code)
clean ethnicity data
# Translate the numeric Ethnicity codes into labels. A label's position in the
# vector is the code it stands for (1 = Latino ... 5 = White); any other code,
# or a missing value, is left as NA.
ethnicity_labels <- c("Latino", "African American", "Asian American",
                      "American Indian", "White")
ethnicity_code <- match(d.combined$Ethnicity, seq_along(ethnicity_labels))
# store the labels as a factor
d.combined$ethnicity_factor <- as.factor(ethnicity_labels[ethnicity_code])
clean sex data
# Translate the numeric Sex codes into labels: 0 -> "Male", 1 -> "Female".
# Any other code, or a missing value, is left as NA.
sex_codes <- c(0, 1)
sex_labels <- c("Male", "Female")
# store the labels as a factor
d.combined$sex_factor <- as.factor(sex_labels[match(d.combined$Sex, sex_codes)])
compute self-esteem scale
# Self-esteem items are the columns whose name contains "PF". The match is
# literal on purpose: the regex "PF*" means "P followed by any number of F"
# and therefore also selects every other column with a capital P in its name
# (for example DEP_score once that column exists).
se_cols <- grep("PF", colnames(d.combined), fixed = TRUE)
# Responses above 5 lie outside the 1-5 scale and are recoded to NA (missing
# data). which() ignores comparisons against NA, so items that are already
# missing no longer abort the script.
for (col in se_cols) {
  d.combined[[col]][which(d.combined[[col]] > 5)] <- NA
}
# no reverse coding in self-esteem scale - the score is the mean of the
# answered items (na.rm = TRUE skips missing responses). drop = FALSE keeps a
# data frame even if only one item column is found.
d.combined$SE_score <- rowMeans(d.combined[, se_cols, drop = FALSE], na.rm = TRUE)
compute depression scale
# Depression items are the columns whose name contains "FS".
dep_cols <- grep("FS", colnames(d.combined))
# Responses above 3 lie outside the 0-3 scale and are recoded to NA (missing
# data). which() ignores comparisons against NA, so items that are already
# missing no longer abort the script.
for (col in dep_cols) {
  d.combined[[col]][which(d.combined[[col]] > 3)] <- NA
}
# handle reverse coded items (4, 8, 11, 15): on a 0-3 scale, 3 - x flips the
# direction of the response
d.combined$FS4 <- 3 - d.combined$FS4
d.combined$FS8 <- 3 - d.combined$FS8
d.combined$FS11 <- 3 - d.combined$FS11
d.combined$FS15 <- 3 - d.combined$FS15
# the score is the mean of the answered items (na.rm = TRUE skips missing
# responses). drop = FALSE keeps a data frame even if only one item column is
# found.
d.combined$DEP_score <- rowMeans(d.combined[, dep_cols, drop = FALSE], na.rm = TRUE)
compute age at assessment
# Convert the month/day/year date strings to POSIXct. A fixed UTC time zone
# keeps daylight-saving shifts of the local zone out of the date arithmetic.
d.combined$BirthDate_posix <- as.POSIXct(as.character(d.combined$BirthDate),
                                         format = "%m/%d/%Y", tz = "UTC")
d.combined$SurveyDate_posix <- as.POSIXct(as.character(d.combined$SurveyDate),
                                          format = "%m/%d/%Y", tz = "UTC")
# Age in years at the survey date. Measuring the calendar interval with
# lubridate accounts for leap years; dividing the day count by 365 overstates
# age by roughly one day for every four years lived.
d.combined$age <- time_length(
  interval(d.combined$BirthDate_posix, d.combined$SurveyDate_posix),
  unit = "years"
)
assign Fall and Spring semester categories
# Classify each survey by the calendar month it was taken in:
# January-June -> "Spring", July-December -> "Fall".
# June is included in Spring so that the two ranges cover the whole year; with
# the previous bounds (month < 6 / month >= 7) June surveys were left NA and
# silently dropped from both semester summaries.
survey_month <- month(d.combined$SurveyDate_posix)
# preallocate semester factor variable w/ NAs (stays NA when the date is missing)
d.combined$semester_factor <- rep(NA, dim(d.combined)[1])
# assign semester categories
d.combined$semester_factor[which(survey_month >= 1 & survey_month <= 6)] <- "Spring"
d.combined$semester_factor[which(survey_month >= 7 & survey_month <= 12)] <- "Fall"
# convert to factor
d.combined$semester_factor <- as.factor(d.combined$semester_factor)
generate summary table
# Row labels of the summary table, in the same order as the counts returned by
# count_demographics() below.
row_names <- c("Male", "Female", "White", "Latino", "African American", "Asian American", "American Indian", "< $20,000", "$20,000-$49,999", "$50,000-$74,999", "$75,000-$100,000", "> $100,000")

# Count the rows of `data` in each sex, ethnicity and income category.
# Returns 12 counts ordered like row_names. na.rm = TRUE is used for every
# count so that one respondent with a missing value does not turn the whole
# count into NA (previously only the ethnicity counts were protected).
count_demographics <- function(data) {
  c(
    sum(data$sex_factor == "Male", na.rm = TRUE),
    sum(data$sex_factor == "Female", na.rm = TRUE),
    sum(data$ethnicity_factor == "White", na.rm = TRUE),
    sum(data$ethnicity_factor == "Latino", na.rm = TRUE),
    sum(data$ethnicity_factor == "African American", na.rm = TRUE),
    sum(data$ethnicity_factor == "Asian American", na.rm = TRUE),
    sum(data$ethnicity_factor == "American Indian", na.rm = TRUE),
    sum(data$income_cat == 1, na.rm = TRUE),
    sum(data$income_cat == 2, na.rm = TRUE),
    sum(data$income_cat == 3, na.rm = TRUE),
    sum(data$income_cat == 4, na.rm = TRUE),
    sum(data$income_cat == 5, na.rm = TRUE)
  )
}

# Spring semester: counts and percentages of all Spring respondents
Spring_data <- d.combined[which(d.combined$semester_factor == "Spring"), ]
Spring_data.num <- count_demographics(Spring_data)
Spring_data.Percentages <- round(Spring_data.num / nrow(Spring_data) * 100, 2)

# Fall semester: counts and percentages of all Fall respondents
Fall_data <- d.combined[which(d.combined$semester_factor == "Fall"), ]
Fall_data.num <- count_demographics(Fall_data)
Fall_data.Percentages <- round(Fall_data.num / nrow(Fall_data) * 100, 2)

# table part 1: gender, ethnicity, parent income
Summary_table <- data.frame(Spring_data.num, Spring_data.Percentages, Fall_data.num, Fall_data.Percentages)
rownames(Summary_table) <- row_names
# table part 2: age, depression score, self-esteem score
# Each *.M vector holds three means in the order age, DEP_score, SE_score.
# Each *.Range vector holds six values: (min, max) for age, then DEP_score,
# then SE_score. na.rm = TRUE keeps one respondent with an unparseable date
# or no answered items from turning the whole statistic into NA.
Spring.M <- c(round(mean(Spring_data$age, na.rm = TRUE), 2),
              round(mean(Spring_data$DEP_score, na.rm = TRUE), 2),
              round(mean(Spring_data$SE_score, na.rm = TRUE), 2))
Spring.Range <- c(round(range(Spring_data$age, na.rm = TRUE), 2),
                  round(range(Spring_data$DEP_score, na.rm = TRUE), 2),
                  round(range(Spring_data$SE_score, na.rm = TRUE), 2))
Fall.M <- c(round(mean(Fall_data$age, na.rm = TRUE), 2),
            round(mean(Fall_data$DEP_score, na.rm = TRUE), 2),
            round(mean(Fall_data$SE_score, na.rm = TRUE), 2))
Fall.Range <- c(round(range(Fall_data$age, na.rm = TRUE), 2),
                round(range(Fall_data$DEP_score, na.rm = TRUE), 2),
                round(range(Fall_data$SE_score, na.rm = TRUE), 2))
# show table part 1 (counts and percentages per semester)
print(Summary_table)
## Spring_data.num Spring_data.Percentages Fall_data.num
## Male 220 51.89 239
## Female 204 48.11 249
## White 134 31.60 167
## Latino 67 15.80 72
## African American 108 25.47 111
## Asian American 55 12.97 64
## American Indian 59 13.92 73
## < $20,000 66 15.57 103
## $20,000-$49,999 124 29.25 134
## $50,000-$74,999 105 24.76 109
## $75,000-$100,000 110 25.94 121
## > $100,000 19 4.48 21
## Fall_data.Percentages
## Male 48.98
## Female 51.02
## White 34.22
## Latino 14.75
## African American 22.75
## Asian American 13.11
## American Indian 14.96
## < $20,000 21.11
## $20,000-$49,999 27.46
## $50,000-$74,999 22.34
## $75,000-$100,000 24.80
## > $100,000 4.30