# RAssignment1Code

#1

# Install haven only when it is not already available. Calling
# install.packages() unconditionally re-installs the package on every run, and
# merely produces a warning when the package is already loaded.
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven")
}

library(haven)

# Read the Stata file and print it so its dimensions and column types show.
read_dta("AddHealth_w1_simple.dta")

## # A tibble: 6,504 × 52
##    id    intmonth intyear  sch_yr  sex     birthmonth birthyear grade    chores 
##    <chr> <dbl+lb> <dbl+lb> <dbl+l> <dbl+l> <dbl+lbl>  <dbl+lbl> <dbl+lb> <dbl+l>
##  1 5710… 6 [(6) … 95 [(95… 1 [(1)… 2 [(2)… 10 [(10) … 77 [(77)… NA       2 [(2)…
##  2 5710… 5 [(5) … 95 [(95… 1 [(1)… 2 [(2)… 11 [(11) … 76 [(76)… 11 [(11… 1 [(1)…
##  3 5710… 6 [(6) … 95 [(95… 0 [(0)… 1 [(1)… 10 [(10) … 79 [(79)… 10 [(10… 1 [(1)…
##  4 5710… 7 [(7) … 95 [(95… 0 [(0)… 1 [(1)…  1 [(1) J… 77 [(77)… 12 [(12… 2 [(2)…
##  5 5710… 7 [(7) … 95 [(95… 1 [(1)… 2 [(2)…  6 [(6) J… 76 [(76)… 12 [(12… 2 [(2)…
##  6 5710… 6 [(6) … 95 [(95… 1 [(1)… 1 [(1)… 12 [(12) … 81 [(81)…  7 [(7)… 1 [(1)…
##  7 5710… 5 [(5) … 95 [(95… 1 [(1)… 1 [(1)… 10 [(10) … 83 [(83)…  7 [(7)… 3 [(3)…
##  8 5710… 6 [(6) … 95 [(95… 1 [(1)… 1 [(1)…  3 [(3) M… 81 [(81)…  8 [(8)… 1 [(1)…
##  9 5711… 6 [(6) … 95 [(95… 0 [(0)… 1 [(1)…  9 [(9) S… 81 [(81)…  8 [(8)… 2 [(2)…
## 10 5711… 8 [(8) … 95 [(95… 0 [(0)… 1 [(1)…  6 [(6) J… 81 [(81)…  8 [(8)… 1 [(1)…
## # ℹ 6,494 more rows
## # ℹ 43 more variables: tvhrs <dbl+lbl>, sleep <dbl+lbl>, sch_skip <dbl+lbl>,
## #   suspended <dbl+lbl>, sch_focus <dbl+lbl>, sr_intel <dbl+lbl>,
## #   H1TO30 <dbl+lbl>, H1TO53 <dbl+lbl>, H1PA7 <dbl+lbl>, nghbrs <dbl+lbl>,
## #   coll_desire <dbl+lbl>, coll_likely <dbl+lbl>, S1 <dbl+lbl>, S3 <dbl+lbl>,
## #   S10A <dbl+lbl>, S10B <dbl+lbl>, S44 <dbl+lbl>, S45A <dbl+lbl>,
## #   S45B <dbl+lbl>, S45C <dbl+lbl>, S45D <dbl+lbl>, S45E <dbl+lbl>, …

#2

##Variables: 52 (the tibble has 52 columns, and each column is one variable)
##Observations: 6,504 (the tibble has 6,504 rows)

##It seems the data set covers general respondent characteristics such as sex, ethnicity, happiness, etc. The numeric codes correspond to the categories that the article describes as determining individual lifestyle.


#3

#nominal: sex
#ordinal: birth month
#interval-ratio: H1EE8 (Allowance each week)

#4
library(haven)
library(dplyr)

# Load the data into d1 so the recodes below can refer to its columns.
d1 <- read_dta("AddHealth_w1_simple.dta")

# Split tvhrs at 10 and store the result as an ordered factor
# (10 or less < more than 10). Rows with missing tvhrs match neither
# condition and stay NA.
d1 <- mutate(
  d1,
  tv10orless = factor(
    case_when(
      tvhrs <= 10 ~ "Watches 10 hours or less per week",
      tvhrs > 10 ~ "Watches more than 10 hours per week",
      TRUE ~ NA_character_
    ),
    levels = c(
      "Watches 10 hours or less per week",
      "Watches more than 10 hours per week"
    ),
    ordered = TRUE
  )
)

# Frequency table; missing values get their own cell when any are present.
table(d1$tv10orless, useNA = "ifany")

## 
##   Watches 10 hours or less per week Watches more than 10 hours per week 
##                                3221                                3256 
##                                <NA> 
##                                  27

##note, original code wasn't detecting tvhrs for some reason, this is the fixed code 


#b

library(dplyr)

# Collapse the p1_edu codes into four ordered education categories.
# Any code outside 1-7 (and any missing value) becomes NA.
# NOTE(review): the frequency table printed below shows only 34 rows in the
# code-3 category and 2,284 NA. Confirm that the value labels of p1_edu
# (e.g. haven::print_labels(d1$p1_edu)) really match this 1-7 mapping and that
# no valid codes above 7 are being dropped.
d1 <- mutate(
  d1,
  p1_edu2 = factor(
    case_when(
      p1_edu %in% c(1, 2) ~ "Less than a high school diploma",
      p1_edu %in% c(3)    ~ "High school diploma or GED",
      p1_edu %in% c(4, 5) ~ "Some postsecondary education",
      p1_edu %in% c(6, 7) ~ "College degree or higher",
      TRUE ~ NA_character_
    ),
    levels = c(
      "Less than a high school diploma",
      "High school diploma or GED",
      "Some postsecondary education",
      "College degree or higher"
    ),
    ordered = TRUE
  )
)

# Frequency table; missing values get their own cell when any are present.
table(d1$p1_edu2, useNA = "ifany")

## 
## Less than a high school diploma      High school diploma or GED 
##                             824                              34 
##    Some postsecondary education        College degree or higher 
##                            1692                            1670 
##                            <NA> 
##                            2284

##fixed code again, R was not detecting p1_edu

#5

library (dplyr)

# Age at interview, computed as interview year minus birth year. Months are
# ignored, so the value can differ from the exact age by up to one year.
d1 <- mutate(d1, age = intyear - birthyear)

summary(d1$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   15.00   16.00   16.04   17.00   21.00       3

##R is detecting the d1 variables now; not quite sure why it wasn't working earlier.

#6
# Summary statistics for PA55 (min, quartiles, mean, max, and the NA count).
summary(d1$PA55)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    22.0    40.0    47.7    60.0   999.0    1575

##summary says everything


library(haven)   
library(ggplot2)

# Start from a fresh copy of the data for the income histogram.
d1 <- read_dta("AddHealth_w1_simple.dta")

# Convert PA55 to a plain numeric vector (drops any haven labels).
d1$PA55 <- as.numeric(d1$PA55)

# Keep only non-missing, strictly positive incomes.
# NOTE(review): this also drops rows reporting 0 -- confirm that is intended.
income <- d1$PA55[d1$PA55 > 0 & !is.na(d1$PA55)]

# Mean and standard deviation of the plotted values (not used by the plot).
mean_income <- mean(income)
sd_income <- sd(income)

# Density-scaled histogram. after_stat(density) replaces the `..density..`
# notation, which was deprecated in ggplot2 3.4.0 and raised a warning.
ggplot(data.frame(income), aes(x = income)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30,
                 fill = "green",
                 color = "black") +
  labs(
    title = "Household Income (PA55)",
    x = "Household Income (PA55)",
    y = "Density"
  ) +
  theme_minimal()

##fixed code again by ai, code kept breaking
##The family income is positively skewed as shown by the histogram, signifying that while most make a decent, moderate income, there are a very select few that make substantially more than the rest of the population.


#b.

library(haven)
library(ggplot2)

# Start from a fresh copy of the data for the sleep histogram.
d1 <- read_dta("AddHealth_w1_simple.dta")

# Convert sleep to a plain numeric vector (drops any haven labels).
d1$sleep <- as.numeric(d1$sleep)

# Keep only non-missing, strictly positive values.
sleep_data <- d1$sleep[d1$sleep > 0 & !is.na(d1$sleep)]

# Mean and standard deviation of the plotted values (not used by the plot).
mean_sleep <- mean(sleep_data)
sd_sleep <- sd(sleep_data)

# Density-scaled histogram. after_stat(density) replaces the `..density..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(data.frame(sleep_data), aes(x = sleep_data)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 20,
                 fill = "blue",
                 color = "black") +
  labs(
    title = "Typical Hours of Sleep per Night",
    x = "Hours of Sleep per Night",
    y = "Density"
  ) +
  theme_minimal()

##just rewrote previous code. 
##For the typical hours of sleep per night, the data appear approximately normally distributed, signifying that the average amount of sleep is around 8 hours for most respondents (who were aged 12 to 21, not adults in general).

#c
library(haven)
library(dplyr)

d1 <- read_dta("AddHealth_w1_simple.dta")


# Age at time of interview = interview year - birth year.
# Both year columns are converted to plain numerics before subtracting.
d1 <- mutate(
  d1,
  birthyear = as.numeric(birthyear),
  intyear = as.numeric(intyear),
  age = intyear - birthyear
)

# Age summary (mean to one decimal, min, max, row count) for rows where both
# PA55 and age are present.
income_age <- summarise(
  filter(d1, !is.na(PA55), !is.na(age)),
  mean_age = round(mean(age, na.rm = TRUE), 1),
  min_age = min(age, na.rm = TRUE),
  max_age = max(age, na.rm = TRUE),
  count = n()
)

# The same summary for rows where both sleep and age are present.
sleep_age <- summarise(
  filter(d1, !is.na(sleep), !is.na(age)),
  mean_age = round(mean(age, na.rm = TRUE), 1),
  min_age = min(age, na.rm = TRUE),
  max_age = max(age, na.rm = TRUE),
  count = n()
)

# Print a heading, then the one-row age summary for rows with PA55 reported.
cat("Age distribution for respondents reporting Household Income (PA55):\n")

## Age distribution for respondents reporting Household Income (PA55):

print(income_age)

## # A tibble: 1 × 4
##   mean_age min_age max_age count
##      <dbl>   <dbl>   <dbl> <int>
## 1     15.9      12      21  4927

# Same for rows with sleep reported; the leading "\n" adds a blank line first.
cat("\nAge distribution for respondents reporting Typical Hours of Sleep (sleep):\n")

## 
## Age distribution for respondents reporting Typical Hours of Sleep (sleep):

print(sleep_age)

## # A tibble: 1 × 4
##   mean_age min_age max_age count
##      <dbl>   <dbl>   <dbl> <int>
## 1       16      12      21  6477

##most respondents were around 16 years of age when responding to this question.

#7

library(haven)
# Re-read the raw file so d1 holds the original columns again.
d1 <- read_dta("AddHealth_w1_simple.dta")

# List every column name to locate the variables needed for this question.
names(d1)

##  [1] "id"          "intmonth"    "intyear"     "sch_yr"      "sex"        
##  [6] "birthmonth"  "birthyear"   "grade"       "chores"      "tvhrs"      
## [11] "sleep"       "sch_skip"    "suspended"   "sch_focus"   "sr_intel"   
## [16] "H1TO30"      "H1TO53"      "H1PA7"       "nghbrs"      "coll_desire"
## [21] "coll_likely" "S1"          "S3"          "S10A"        "S10B"       
## [26] "S44"         "S45A"        "S45B"        "S45C"        "S45D"       
## [31] "S45E"        "S45F"        "S47"         "S49"         "S50"        
## [36] "p1_edu"      "p1_work"     "p1_happy"    "p1_benes"    "PA55"       
## [41] "AH_PVT"      "AH_RAW"      "hisp"        "yob"         "white"      
## [46] "black"       "natam"       "asian"       "race"        "milk"       
## [51] "health"      "famst8"

# Convert the suspended column to a plain numeric vector.
d1[["suspended"]] <- as.numeric(d1[["suspended"]])

# Keep the non-missing responses only.
valid_data <- d1$suspended[complete.cases(d1$suspended)]

# Share of non-missing responses coded 1, expressed as a percentage.
percent_suspended <- 100 * mean(valid_data == 1)

print(percent_suspended)

## [1] 27.75894

##percent = 27.76% -> 28% of respondents were suspended at least once during their adolescent period.

library(haven)
library(dplyr)
library(ggplot2)

d1 <- read_dta("AddHealth_w1_simple.dta")

# Raw distribution of the S10B codes, including missing values.
table(d1$S10B, useNA = "ifany")

## 
##    1    2    3    4    5 <NA> 
## 1063 1357  923  513  171 2477

# Recode S10B into grade labels through a code -> label lookup:
#   1 = A, 2 = B, 3 = C, 4 = D or F, 6 = Not in school.
# Any other code (and any missing value) becomes NA.
# NOTE(review): the table above contains code 5 (n = 171) and no code 6, so
# code 5 is silently dropped and the "Not in school" label never applies.
# Confirm the meaning of codes 5 and 6 against the S10B value labels.
d1 <- mutate(
  d1,
  math_grade = c("A", "B", "C", "D or F", "Not in school")[
    match(as.numeric(S10B), c(1, 2, 3, 4, 6))
  ]
)

# Count each grade among rows with a recoded grade and convert the counts to
# percentages rounded to one decimal place.
grade_distribution <- mutate(
  count(filter(d1, !is.na(math_grade)), math_grade),
  percentage = round(100 * n / sum(n), 1)
)

print(grade_distribution)

## # A tibble: 4 × 3
##   math_grade     n percentage
##   <chr>      <int>      <dbl>
## 1 A           1063       27.6
## 2 B           1357       35.2
## 3 C            923       23.9
## 4 D or F       513       13.3

##it appears in the distribution, that only a small portion of the population (13.3%) had poor grades, whereas the remaining (86.7%) reached passing/exemplary grades.

library(haven)
library(dplyr)
library(ggplot2)


# Read the same file used by the rest of the script; the column list printed
# by names(d1) earlier shows that it contains `suspended`.
d1 <- read_dta("AddHealth_w1_simple.dta")


# Recode the suspension indicator into readable labels.
#1 = Yes (suspended at least once)
#0 = No (never suspended)
# The earlier version recoded S10B (the variable used for the math-grade
# distribution) instead of `suspended`. S10B has no 0 code, so only its
# S10B == 1 rows survived and all 1063 of them were labelled "Suspended".
d1 <- d1 %>%
  mutate(suspended = case_when(
    suspended == 1 ~ "Suspended",
    suspended == 0 ~ "Not suspended",
    TRUE ~ NA_character_
  ))


# Drop rows whose suspension status is missing or has any other code.
d1_clean <- d1 %>%
  filter(!is.na(suspended))


# Count each status and convert the counts to percentages (one decimal).
freq_table <- d1_clean %>%
  count(suspended) %>%
  mutate(percentage = round(100 * n / sum(n), 1))


print(freq_table)

## Output not reproduced here: re-run this chunk to regenerate the table.
## If 0 and 1 are the only codes in `suspended`, the "Suspended" share should
## match percent_suspended computed earlier (about 27.8%).

# Bar chart of the two percentages, with the value printed above each bar.
ggplot(freq_table, aes(x = suspended, y = percentage, fill = suspended)) +
  geom_col(color = "black") +
  geom_text(aes(label = paste0(percentage, "%")),
            vjust = -0.5, size = 4) +
  labs(
    title = "Percentage of Students Suspended vs. Not Suspended",
    x = "Suspension Status",
    y = "Percentage of Respondents"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

##The earlier result, in which everyone appeared to have been suspended, was an artifact of recoding S10B instead of the suspended variable. With the correct variable the chart should show both groups, so the suspension results no longer conflict with the grade distribution above.

# RAssignment1Code

# me

# 2025-10-24