#1
# Install haven only when it is missing. Calling install.packages() on every
# run re-downloads the package, or warns that it "is in use and will not be
# installed" when it is already loaded.
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven")
}
library("haven")
# Read the Stata file and print the resulting tibble (the result is not stored
# here; later sections assign it to d1).
read_dta("AddHealth_w1_simple.dta")
## # A tibble: 6,504 × 52
## id intmonth intyear sch_yr sex birthmonth birthyear grade chores
## <chr> <dbl+lb> <dbl+lb> <dbl+l> <dbl+l> <dbl+lbl> <dbl+lbl> <dbl+lb> <dbl+l>
## 1 5710… 6 [(6) … 95 [(95… 1 [(1)… 2 [(2)… 10 [(10) … 77 [(77)… NA 2 [(2)…
## 2 5710… 5 [(5) … 95 [(95… 1 [(1)… 2 [(2)… 11 [(11) … 76 [(76)… 11 [(11… 1 [(1)…
## 3 5710… 6 [(6) … 95 [(95… 0 [(0)… 1 [(1)… 10 [(10) … 79 [(79)… 10 [(10… 1 [(1)…
## 4 5710… 7 [(7) … 95 [(95… 0 [(0)… 1 [(1)… 1 [(1) J… 77 [(77)… 12 [(12… 2 [(2)…
## 5 5710… 7 [(7) … 95 [(95… 1 [(1)… 2 [(2)… 6 [(6) J… 76 [(76)… 12 [(12… 2 [(2)…
## 6 5710… 6 [(6) … 95 [(95… 1 [(1)… 1 [(1)… 12 [(12) … 81 [(81)… 7 [(7)… 1 [(1)…
## 7 5710… 5 [(5) … 95 [(95… 1 [(1)… 1 [(1)… 10 [(10) … 83 [(83)… 7 [(7)… 3 [(3)…
## 8 5710… 6 [(6) … 95 [(95… 1 [(1)… 1 [(1)… 3 [(3) M… 81 [(81)… 8 [(8)… 1 [(1)…
## 9 5711… 6 [(6) … 95 [(95… 0 [(0)… 1 [(1)… 9 [(9) S… 81 [(81)… 8 [(8)… 2 [(2)…
## 10 5711… 8 [(8) … 95 [(95… 0 [(0)… 1 [(1)… 6 [(6) J… 81 [(81)… 8 [(8)… 1 [(1)…
## # ℹ 6,494 more rows
## # ℹ 43 more variables: tvhrs <dbl+lbl>, sleep <dbl+lbl>, sch_skip <dbl+lbl>,
## # suspended <dbl+lbl>, sch_focus <dbl+lbl>, sr_intel <dbl+lbl>,
## # H1TO30 <dbl+lbl>, H1TO53 <dbl+lbl>, H1PA7 <dbl+lbl>, nghbrs <dbl+lbl>,
## # coll_desire <dbl+lbl>, coll_likely <dbl+lbl>, S1 <dbl+lbl>, S3 <dbl+lbl>,
## # S10A <dbl+lbl>, S10B <dbl+lbl>, S44 <dbl+lbl>, S45A <dbl+lbl>,
## # S45B <dbl+lbl>, S45C <dbl+lbl>, S45D <dbl+lbl>, S45E <dbl+lbl>, …
#2
##Variables: 52 variables (the tibble has 52 columns; each column is one variable)
##Observations: 6,504 observations (the tibble has 6,504 rows)
##it seems like the data set is talking about general characteristics like sex, ethnicity, happiness, etc. And the digits correlate to what is in the article that determine individual lifestyle.
#3
#nominal: sex
#ordinal: birth month
#interval-ratio: H1EE8 (Allowance each week)
#4
library(haven)
library(dplyr)
d1 <- read_dta("AddHealth_w1_simple.dta")
# Split weekly TV hours at 10 and store the result as an ordered factor.
# Rows where tvhrs is missing match neither condition and stay NA.
d1 <- mutate(
  d1,
  tv10orless = ordered(
    case_when(
      tvhrs <= 10 ~ "Watches 10 hours or less per week",
      tvhrs > 10 ~ "Watches more than 10 hours per week",
      TRUE ~ NA_character_
    ),
    levels = c(
      "Watches 10 hours or less per week",
      "Watches more than 10 hours per week"
    )
  )
)
# Frequency table, including the NA count when there is one.
table(d1$tv10orless, useNA = "ifany")
##
## Watches 10 hours or less per week Watches more than 10 hours per week
## 3221 3256
## <NA>
## 27
##note, original code wasn't detecting tvhrs for some reason, this is the fixed code
#b
library(dplyr)
# Collapse the p1_edu codes into four ordered education categories; any code
# outside 1-7 (and missing values) becomes NA.
# NOTE(review): in the table printed below, only 34 respondents fall into
# "High school diploma or GED" and 2,284 are NA. Confirm the code-to-category
# mapping against the variable's value labels, e.g. attr(d1$p1_edu, "labels").
d1 <- mutate(
  d1,
  p1_edu2 = ordered(
    case_when(
      p1_edu %in% c(1, 2) ~ "Less than a high school diploma",
      p1_edu %in% 3 ~ "High school diploma or GED",
      p1_edu %in% c(4, 5) ~ "Some postsecondary education",
      p1_edu %in% c(6, 7) ~ "College degree or higher",
      TRUE ~ NA_character_
    ),
    levels = c(
      "Less than a high school diploma",
      "High school diploma or GED",
      "Some postsecondary education",
      "College degree or higher"
    )
  )
)
# Frequency table, including the NA count when there is one.
table(d1$p1_edu2, useNA = "ifany")
##
## Less than a high school diploma High school diploma or GED
## 824 34
## Some postsecondary education College degree or higher
## 1692 1670
## <NA>
## 2284
##fixed code again, R was not detecting p1_edu
#5
library(dplyr)
# Age at interview = interview year - birth year.
d1 <- mutate(d1, age = intyear - birthyear)
summary(d1$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 15.00 16.00 16.04 17.00 21.00 3
##R is detecting the d1 variables now; not quite sure why it wasn't working earlier.
#6
summary(d1$PA55)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 22.0 40.0 47.7 60.0 999.0 1575
##summary says everything
library(haven)
library(ggplot2)
d1 <- read_dta("AddHealth_w1_simple.dta")
# Drop the haven labels so PA55 behaves as a plain numeric vector.
d1$PA55 <- as.numeric(d1$PA55)
# Keep only strictly positive, non-missing incomes (zeros are excluded too).
income <- d1$PA55[d1$PA55 > 0 & !is.na(d1$PA55)]
mean_income <- mean(income)
sd_income <- sd(income)
# Density-scaled histogram. after_stat(density) replaces the `..density..`
# notation that ggplot2 deprecated in 3.4.0.
ggplot(data.frame(income), aes(x = income)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "green",
    color = "black"
  ) +
  labs(
    title = "Household Income (PA55)",
    x = "Household Income (PA55)",
    y = "Density"
  ) +
  theme_minimal()

##fixed code again by ai, code kept breaking
##The family income is positively skewed as shown by the histogram, signifying that while most make a decent, moderate income, there are a very select few that make substantially more than the rest of the population.
#b.
library(haven)
library(ggplot2)
d1 <- read_dta("AddHealth_w1_simple.dta")
# Drop the haven labels so sleep behaves as a plain numeric vector.
d1$sleep <- as.numeric(d1$sleep)
# Keep only strictly positive, non-missing values.
sleep_data <- d1$sleep[d1$sleep > 0 & !is.na(d1$sleep)]
mean_sleep <- mean(sleep_data)
sd_sleep <- sd(sleep_data)
# Density-scaled histogram. after_stat(density) replaces the `..density..`
# notation that ggplot2 deprecated in 3.4.0.
ggplot(data.frame(sleep_data), aes(x = sleep_data)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20,
    fill = "blue",
    color = "black"
  ) +
  labs(
    title = "Typical Hours of Sleep per Night",
    x = "Hours of Sleep per Night",
    y = "Density"
  ) +
  theme_minimal()

##just rewrote previous code.
##For the typical hours of sleep per night, the data appear to be approximately normally distributed, with the average amount of sleep at around 8 hours for most respondents (who are adolescents aged 12 to 21, not adults).
#c
library(haven)
library(dplyr)
d1 <- read_dta("AddHealth_w1_simple.dta")
# Age at time of interview = interview year - birth year. Both year columns
# are converted to plain numerics before subtracting.
d1 <- mutate(
  d1,
  birthyear = as.numeric(birthyear),
  intyear = as.numeric(intyear),
  age = intyear - birthyear
)
# Age summary for respondents with a non-missing Household Income (PA55).
# Rows with missing age are filtered out first, so the summaries see no NAs.
income_age <- d1 %>%
  filter(!is.na(PA55), !is.na(age)) %>%
  summarise(
    mean_age = round(mean(age), 1),
    min_age = min(age),
    max_age = max(age),
    count = n()
  )
# Same summary for respondents with a non-missing typical hours of sleep.
sleep_age <- d1 %>%
  filter(!is.na(sleep), !is.na(age)) %>%
  summarise(
    mean_age = round(mean(age), 1),
    min_age = min(age),
    max_age = max(age),
    count = n()
  )
cat("Age distribution for respondents reporting Household Income (PA55):\n")
## Age distribution for respondents reporting Household Income (PA55):
print(income_age)
## # A tibble: 1 × 4
## mean_age min_age max_age count
## <dbl> <dbl> <dbl> <int>
## 1 15.9 12 21 4927
cat("\nAge distribution for respondents reporting Typical Hours of Sleep (sleep):\n")
##
## Age distribution for respondents reporting Typical Hours of Sleep (sleep):
print(sleep_age)
## # A tibble: 1 × 4
## mean_age min_age max_age count
## <dbl> <dbl> <dbl> <int>
## 1 16 12 21 6477
##most respondents were around 16 years of age when responding to this question.
#7
library(haven)
d1 <- read_dta("AddHealth_w1_simple.dta")
# List the column names to locate the suspension variable.
names(d1)
## [1] "id" "intmonth" "intyear" "sch_yr" "sex"
## [6] "birthmonth" "birthyear" "grade" "chores" "tvhrs"
## [11] "sleep" "sch_skip" "suspended" "sch_focus" "sr_intel"
## [16] "H1TO30" "H1TO53" "H1PA7" "nghbrs" "coll_desire"
## [21] "coll_likely" "S1" "S3" "S10A" "S10B"
## [26] "S44" "S45A" "S45B" "S45C" "S45D"
## [31] "S45E" "S45F" "S47" "S49" "S50"
## [36] "p1_edu" "p1_work" "p1_happy" "p1_benes" "PA55"
## [41] "AH_PVT" "AH_RAW" "hisp" "yob" "white"
## [46] "black" "natam" "asian" "race" "milk"
## [51] "health" "famst8"
# Drop the haven labels, discard missing values, then take the share of the
# remaining responses that equal 1, expressed as a percentage.
d1$suspended <- as.numeric(d1$suspended)
is_answered <- !is.na(d1$suspended)
valid_data <- d1$suspended[is_answered]
percent_suspended <- 100 * mean(valid_data == 1)
percent_suspended
## [1] 27.75894
##percent = 27.76% -> 28% of respondents were suspended at least once during their adolescent period.
library(haven)
library(dplyr)
library(ggplot2)
d1 <- read_dta("AddHealth_w1_simple.dta")
# Inspect the raw S10B codes before recoding.
table(d1$S10B, useNA = "ifany")
##
## 1 2 3 4 5 <NA>
## 1063 1357 923 513 171 2477
# Recode S10B into letter-grade labels; any code not listed becomes NA.
d1 <- d1 %>%
  mutate(math_grade = case_when(
    S10B == 1 ~ "A",
    S10B == 2 ~ "B",
    S10B == 3 ~ "C",
    S10B == 4 ~ "D or F",
    S10B == 6 ~ "Not in school",
    TRUE ~ NA_character_
  ))
#1=A, 2=B, 3=C, 4=D/F, 6=Not in school
##fixed code here
# NOTE(review): the table above shows code 5 (n = 171) and no code 6, so the
# "Not in school" branch never matches and code 5 is set to NA. Confirm the
# intended code against the value labels, e.g. attr(d1$S10B, "labels").
# Report any observed code the recode does not cover instead of dropping it
# silently.
handled_codes <- c(1, 2, 3, 4, 6)
observed_codes <- unique(as.numeric(d1$S10B))
dropped_codes <- setdiff(
  observed_codes[!is.na(observed_codes)],
  handled_codes
)
if (length(dropped_codes) > 0) {
  message(
    "S10B codes not covered by the math_grade recode (set to NA): ",
    paste(sort(dropped_codes), collapse = ", ")
  )
}
# Counts and percentages among respondents with a recoded grade.
grade_distribution <- d1 %>%
  filter(!is.na(math_grade)) %>%
  count(math_grade) %>%
  mutate(percentage = round(100 * n / sum(n), 1))
print(grade_distribution)
## # A tibble: 4 × 3
## math_grade n percentage
## <chr> <int> <dbl>
## 1 A 1063 27.6
## 2 B 1357 35.2
## 3 C 923 23.9
## 4 D or F 513 13.3
##it appears in the distribution, that only a small portion of the population (13.3%) had poor grades, whereas the remaining (86.7%) reached passing/exemplary grades.
library(haven)
library(dplyr)
library(ggplot2)
# NOTE(review): this section reads a different file name than the earlier
# sections ("AddHealth_w1_simple.dta"); confirm this is the intended file and
# that it contains the `suspended` column.
d1 <- read_dta("UPDATED_AddHealth_w1_simple.dta")
#1 = Yes (suspended at least once)
#0 = No (never suspended)
# Recode from `suspended` itself. The previous version tested S10B (the
# math-grade item), so only rows with S10B == 1 were kept and every one of
# them was labelled "Suspended".
d1 <- d1 %>%
  mutate(suspended = case_when(
    suspended == 1 ~ "Suspended",
    suspended == 0 ~ "Not suspended",
    TRUE ~ NA_character_
  ))
# Keep only respondents with a valid suspension status.
d1_clean <- d1 %>%
  filter(!is.na(suspended))
# Counts and percentages per suspension status.
freq_table <- d1_clean %>%
  count(suspended) %>%
  mutate(percentage = round(100 * n / sum(n), 1))
print(freq_table)
## (Output must be regenerated after the fix. The earlier calculation using
## suspended == 1 on "AddHealth_w1_simple.dta" gave 27.76% suspended.)
# Bar chart of the percentages, with the value printed above each bar.
ggplot(freq_table, aes(x = suspended, y = percentage, fill = suspended)) +
  geom_col(color = "black") +
  geom_text(
    aes(label = paste0(percentage, "%")),
    vjust = -0.5,
    size = 4
  ) +
  labs(
    title = "Percentage of Students Suspended vs. Not Suspended",
    x = "Suspension Status",
    y = "Percentage of Respondents"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

##The earlier version of this chart showed 100% "Suspended" because the recode used S10B (the math-grade item) instead of the suspended variable, so the "Not suspended" group never appeared. With the recode based on suspended, both groups are shown; the earlier calculation found that about 28% of respondents had been suspended at least once.