Set Up HW 1

Use the following dataset: NYCH2004.sas7bdat

Load libraries

  library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
 library(labelled)

Load data

 NYCH2004 <- haven::read_sas("NYCH2004.sas7bdat") 

Take a look at the data

 glimpse(NYCH2004)

For this assignment, you will need the following variables:

SP_ID, riagendr, riaageyr, race_eth, SFQ180, SMQ020, SMQ040, WHQ025L, WHQ030

Homework Questions

Question 1

Create a dataset that only has the variables listed above. Show the first 6 rows of this dataset.

 NYCH2004_sub <- NYCH2004 %>%
  select(SP_ID, riagendr, riaageyr, race_eth, SFQ180, SMQ020, SMQ040, WHQ025L, WHQ030)
  head(NYCH2004)
## # A tibble: 6 × 438
##    SP_ID TCQ010 TCQ015 TCQ020 TCQ030G TCQ030Q TCQ030U OCQ152 OCQ180 OCQ210
##    <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 100230      1     NA      2      NA      NA      NA      1     40     NA
## 2 100243      2      1     NA      NA      NA      NA      1     43     NA
## 3 100270      1     NA      9      NA      NA      NA      4     NA     NA
## 4 100597      2      1     NA      NA      NA      NA      1     50     NA
## 5 101166      1     NA      2      NA      NA      NA      4     NA     NA
## 6 101285      1     NA      2      NA      NA      NA      1     55     NA
## # ℹ 428 more variables: OCQ260 <dbl>, OCQ280 <dbl>, OCQ290G <dbl>,
## #   OCQ290Q <dbl>, OCQ380 <dbl>, SMQ020 <dbl>, SMQ030G <dbl>, SMQ030Q <dbl>,
## #   SMQ040 <dbl>, SMQ050G <dbl>, SMQ050Q <dbl>, SMQ050U <dbl>, SMQ055 <dbl>,
## #   SMQ057 <dbl>, SMQ070 <dbl>, SMQ075 <dbl>, SMQ077 <dbl>, SMQ641 <dbl>,
## #   SMQ650 <dbl>, PAQ020 <dbl>, PAQ050G <dbl>, PAQ050Q <dbl>, PAQ050U <dbl>,
## #   PAQ080G <dbl>, PAQ080Q <dbl>, PAQ080U <dbl>, PAQ180 <dbl>, PAQ206 <dbl>,
## #   PAQ280G <dbl>, PAQ280Q <dbl>, PAQ280U <dbl>, PAQ300G <dbl>, …

Question 2

A. Label variable names something that makes sense. # note: WHQ025L is a variable in the dataset but it is not in the codebook; use the codebook information for WHQ025 instead.

NYCH2004_sub <- NYCH2004_sub %>%
  rename(ID = SP_ID, 
         Gender = riagendr, 
         Respondent_Age = riaageyr, 
         Race = race_eth,  
         Marital_Status = SFQ180,
         First_smoke_age = SMQ020,
         Smoking_status = SMQ040,
         Naked_Weight_lb = WHQ025L,
         SSWeight = WHQ030)


glimpse(NYCH2004_sub)
## Rows: 1,999
## Columns: 9
## $ ID              <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender          <dbl> 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
## $ Respondent_Age  <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race            <dbl> 1, 5, 4, 1, 2, 1, 4, 1, 4, 1, 1, 4, 3, 4, 4, 4, 4, 4, …
## $ Marital_Status  <dbl> 1, 1, 2, 6, 5, 1, 1, 1, 5, 1, 4, 5, 5, 1, 1, 5, 1, 1, …
## $ First_smoke_age <dbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Smoking_status  <dbl> NA, NA, NA, 2, 2, 3, 3, 3, 3, NA, NA, NA, NA, NA, NA, …
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight        <dbl> 3, 2, 3, 1, 3, 1, 2, 1, 1, 3, 1, 1, 2, 1, 3, 1, 3, 1, …

B. Label values in variables

 NYCH2004_sub <- NYCH2004_sub %>%
  set_value_labels(Gender = c("Male" = 1, "Female" = 2), 
                   Race = c("NH White" = 1, "NH Black" = 2, "Hispanic" = 3, "Asian" = 4, "Other" = 5), 
                   Marital_Status = c("Married"=1, "Widowed"=2, "Divorced"=3, "Separated"=4, "Never_Married"=5, "Living With a       
                                      Partner"=6), 
                   First_smoke_age = c("Yes" = 1, "No" = 2), 
                   Smoking_status = c("Everyday"=1, "Some days"=2, "Not at all"=3),
                   SSWeight = c("Overweight" = 1, "Underweight" = 2, "About the right weight" = 3))

glimpse(NYCH2004_sub)                                                
## Rows: 1,999
## Columns: 9
## $ ID              <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender          <dbl+lbl> 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2,…
## $ Respondent_Age  <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race            <dbl+lbl> 1, 5, 4, 1, 2, 1, 4, 1, 4, 1, 1, 4, 3, 4, 4, 4, 4,…
## $ Marital_Status  <dbl+lbl> 1, 1, 2, 6, 5, 1, 1, 1, 5, 1, 4, 5, 5, 1, 1, 5, 1,…
## $ First_smoke_age <dbl+lbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ Smoking_status  <dbl+lbl> NA(_), NA(_), NA(_),     2,     2,     3,     3,  …
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight        <dbl+lbl> 3, 2, 3, 1, 3, 1, 2, 1, 1, 3, 1, 1, 2, 1, 3, 1, 3,…

C. Convert anything that should be a factor into a factor

NYCH2004_sub <- NYCH2004_sub %>%
  mutate_if(is.labelled, to_factor)

glimpse(NYCH2004_sub)
## Rows: 1,999
## Columns: 9
## $ ID              <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender          <fct> Male, Male, Female, Male, Female, Female, Male, Female…
## $ Respondent_Age  <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race            <fct> NH White, Other, Asian, NH White, NH Black, NH White, …
## $ Marital_Status  <fct> Married, Married, Widowed, Living With a       
##       …
## $ First_smoke_age <fct> No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, …
## $ Smoking_status  <fct> NA, NA, NA, Some days, Some days, Not at all, Not at a…
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight        <fct> About the right weight, Underweight, About the right w…

Question 3

How many females are in this dataset?

 NYCH2004_sub %>%
  filter(Gender == "Female") %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1168

ANSWER:1168

Question 4

Are there more Hispanic males or females in this dataset? Use a graph to answer this question.

 NYCH2004_sub %>%
  na.omit() %>%
  ggplot(aes(x = Gender)) +
  geom_bar() +
  facet_wrap(~Race)

Answer: More Hispanic Males

Question 5

Are there more Hispanic males or females in this dataset? Use code to answer this question.

NYCH2004_sub %>%
  filter(Gender == "Male" & Race == "Hispanic") %>%
  count() ###114
## # A tibble: 1 × 1
##       n
##   <int>
## 1   114
NYCH2004_sub %>%
  filter(Gender == "Female" & Race == "Hispanic") %>%
  count() ###146
## # A tibble: 1 × 1
##       n
##   <int>
## 1   146
near(146-114,32)
## [1] TRUE

Answer:32

Question 6

How old is the oldest hispanic male and how much does he currently weigh?

 NYCH2004_sub %>%
  filter(Respondent_Age > 60 & Race == "Hispanic" & Gender == "Male" & Naked_Weight_lb > 0)
## # A tibble: 8 × 9
##       ID Gender Respondent_Age Race     Marital_Status First_smoke_age
##    <dbl> <fct>           <dbl> <fct>    <fct>          <fct>          
## 1 125280 Male               63 Hispanic Married        Yes            
## 2 126172 Male               68 Hispanic Married        Yes            
## 3 355661 Male               64 Hispanic Married        Yes            
## 4 369465 Male               62 Hispanic Separated      Yes            
## 5 566610 Male               68 Hispanic Married        No             
## 6 745850 Male               66 Hispanic Married        No             
## 7 763579 Male               80 Hispanic Married        Yes            
## 8 909195 Male               61 Hispanic Widowed        Yes            
## # ℹ 3 more variables: Smoking_status <fct>, Naked_Weight_lb <dbl>,
## #   SSWeight <fct>

Answer:80 and weighs 132lb

Question 7

Are there more married or divorced people who think they are overweight?

 NYCH2004_sub %>%
  filter(Marital_Status == "Married" & SSWeight == "Overweight")%>%
  count() 
## # A tibble: 1 × 1
##       n
##   <int>
## 1   464
#464
  
 NYCH2004_sub %>%
  filter(Marital_Status == "Divorced" & SSWeight == "Overweight")%>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   102
 #102

Answer:More married people think they are overweight

Question 8

Make a bar graph that shows what respondents think about their current weight by gender. Remove anyone who responded that they “didn’t know” what they thought about their current weight. Order the bars so that underweight is first.

NYCH2004_sub %>%
  na.omit()%>%
  ggplot(aes(x = fct_relevel(SSWeight, "Underweight", "Overweight", "About the right weight"))) +
  geom_bar() + 
  facet_wrap(~Gender)

Question 9

See if there is anyone who answered “no” to ever smoking (smoked <100 cigs in lifetime) but answered “yes” to being a current smoker.

NYCH2004_sub %>%
  na.omit()%>%
  group_by(First_smoke_age == "No") %>%
  count(Smoking_status)
## # A tibble: 3 × 3
## # Groups:   First_smoke_age == "No" [1]
##   `First_smoke_age == "No"` Smoking_status     n
##   <lgl>                     <fct>          <int>
## 1 FALSE                     Everyday         296
## 2 FALSE                     Some days        153
## 3 FALSE                     Not at all       335
  near(296+153,449) 
## [1] TRUE

Answer:Yes, 449

Question 10

Did the oldest person in this dataset ever smoke?

  NYCH2004_sub %>%
  filter(First_smoke_age == "Yes") %>%
  arrange(desc(Respondent_Age)) %>%
  head()
## # A tibble: 6 × 9
##       ID Gender Respondent_Age Race     Marital_Status First_smoke_age
##    <dbl> <fct>           <dbl> <fct>    <fct>          <fct>          
## 1 707953 Male               89 NH Black Never_Married  Yes            
## 2 588840 Male               87 NH White Widowed        Yes            
## 3 106658 Male               86 NH Black Married        Yes            
## 4 231374 Female             86 NH White Widowed        Yes            
## 5 265679 Female             85 NH White Widowed        Yes            
## 6 643054 Female             85 NH White Widowed        Yes            
## # ℹ 3 more variables: Smoking_status <fct>, Naked_Weight_lb <dbl>,
## #   SSWeight <fct>

Answer:Yes