library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(labelled)
NYCH2004 <- haven::read_sas("NYCH2004.sas7bdat")
glimpse(NYCH2004)
SP_ID, riagendr, riaageyr, race_eth, SFQ180, SMQ020, SMQ040, WHQ025L, WHQ030
Create a dataset that only has the variables listed above. Show the first 6 rows of this dataset.
NYCH2004_sub <- NYCH2004 %>%
select(SP_ID, riagendr, riaageyr, race_eth, SFQ180, SMQ020, SMQ040, WHQ025L, WHQ030)
head(NYCH2004)
## # A tibble: 6 × 438
## SP_ID TCQ010 TCQ015 TCQ020 TCQ030G TCQ030Q TCQ030U OCQ152 OCQ180 OCQ210
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 100230 1 NA 2 NA NA NA 1 40 NA
## 2 100243 2 1 NA NA NA NA 1 43 NA
## 3 100270 1 NA 9 NA NA NA 4 NA NA
## 4 100597 2 1 NA NA NA NA 1 50 NA
## 5 101166 1 NA 2 NA NA NA 4 NA NA
## 6 101285 1 NA 2 NA NA NA 1 55 NA
## # ℹ 428 more variables: OCQ260 <dbl>, OCQ280 <dbl>, OCQ290G <dbl>,
## # OCQ290Q <dbl>, OCQ380 <dbl>, SMQ020 <dbl>, SMQ030G <dbl>, SMQ030Q <dbl>,
## # SMQ040 <dbl>, SMQ050G <dbl>, SMQ050Q <dbl>, SMQ050U <dbl>, SMQ055 <dbl>,
## # SMQ057 <dbl>, SMQ070 <dbl>, SMQ075 <dbl>, SMQ077 <dbl>, SMQ641 <dbl>,
## # SMQ650 <dbl>, PAQ020 <dbl>, PAQ050G <dbl>, PAQ050Q <dbl>, PAQ050U <dbl>,
## # PAQ080G <dbl>, PAQ080Q <dbl>, PAQ080U <dbl>, PAQ180 <dbl>, PAQ206 <dbl>,
## # PAQ280G <dbl>, PAQ280Q <dbl>, PAQ280U <dbl>, PAQ300G <dbl>, …
A. Label variable names something that makes sense. # note: WHQ025L is a variable in the dataset but it is not in the codebook; use the codebook information for WHQ025 instead.
NYCH2004_sub <- NYCH2004_sub %>%
rename(ID = SP_ID,
Gender = riagendr,
Respondent_Age = riaageyr,
Race = race_eth,
Marital_Status = SFQ180,
First_smoke_age = SMQ020,
Smoking_status = SMQ040,
Naked_Weight_lb = WHQ025L,
SSWeight = WHQ030)
glimpse(NYCH2004_sub)
## Rows: 1,999
## Columns: 9
## $ ID <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender <dbl> 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
## $ Respondent_Age <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race <dbl> 1, 5, 4, 1, 2, 1, 4, 1, 4, 1, 1, 4, 3, 4, 4, 4, 4, 4, …
## $ Marital_Status <dbl> 1, 1, 2, 6, 5, 1, 1, 1, 5, 1, 4, 5, 5, 1, 1, 5, 1, 1, …
## $ First_smoke_age <dbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Smoking_status <dbl> NA, NA, NA, 2, 2, 3, 3, 3, 3, NA, NA, NA, NA, NA, NA, …
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight <dbl> 3, 2, 3, 1, 3, 1, 2, 1, 1, 3, 1, 1, 2, 1, 3, 1, 3, 1, …
B. Label values in variables
NYCH2004_sub <- NYCH2004_sub %>%
set_value_labels(Gender = c("Male" = 1, "Female" = 2),
Race = c("NH White" = 1, "NH Black" = 2, "Hispanic" = 3, "Asian" = 4, "Other" = 5),
Marital_Status = c("Married"=1, "Widowed"=2, "Divorced"=3, "Separated"=4, "Never_Married"=5, "Living With a
Partner"=6),
First_smoke_age = c("Yes" = 1, "No" = 2),
Smoking_status = c("Everyday"=1, "Some days"=2, "Not at all"=3),
SSWeight = c("Overweight" = 1, "Underweight" = 2, "About the right weight" = 3))
glimpse(NYCH2004_sub)
## Rows: 1,999
## Columns: 9
## $ ID <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender <dbl+lbl> 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2,…
## $ Respondent_Age <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race <dbl+lbl> 1, 5, 4, 1, 2, 1, 4, 1, 4, 1, 1, 4, 3, 4, 4, 4, 4,…
## $ Marital_Status <dbl+lbl> 1, 1, 2, 6, 5, 1, 1, 1, 5, 1, 4, 5, 5, 1, 1, 5, 1,…
## $ First_smoke_age <dbl+lbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ Smoking_status <dbl+lbl> NA(_), NA(_), NA(_), 2, 2, 3, 3, …
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight <dbl+lbl> 3, 2, 3, 1, 3, 1, 2, 1, 1, 3, 1, 1, 2, 1, 3, 1, 3,…
C. Convert anything that should be a factor into a factor
NYCH2004_sub <- NYCH2004_sub %>%
mutate_if(is.labelled, to_factor)
glimpse(NYCH2004_sub)
## Rows: 1,999
## Columns: 9
## $ ID <dbl> 100230, 100243, 100270, 100597, 101166, 101285, 101720…
## $ Gender <fct> Male, Male, Female, Male, Female, Female, Male, Female…
## $ Respondent_Age <dbl> 59, 25, 76, 29, 48, 60, 23, 58, 22, 47, 60, 40, 24, 26…
## $ Race <fct> NH White, Other, Asian, NH White, NH Black, NH White, …
## $ Marital_Status <fct> Married, Married, Widowed, Living With a
## …
## $ First_smoke_age <fct> No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, …
## $ Smoking_status <fct> NA, NA, NA, Some days, Some days, Not at all, Not at a…
## $ Naked_Weight_lb <dbl> 215, 145, 222, 241, 134, NA, 120, 128, 290, 115, 220, …
## $ SSWeight <fct> About the right weight, Underweight, About the right w…
How many females are in this dataset?
NYCH2004_sub %>%
filter(Gender == "Female") %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 1168
ANSWER:1168
Are there more Hispanic males or females in this dataset? Use a graph to answer this question.
NYCH2004_sub %>%
na.omit() %>%
ggplot(aes(x = Gender)) +
geom_bar() +
facet_wrap(~Race)
Answer: More Hispanic Males
Are there more Hispanic males or females in this dataset? Use code to answer this question.
NYCH2004_sub %>%
filter(Gender == "Male" & Race == "Hispanic") %>%
count() ###114
## # A tibble: 1 × 1
## n
## <int>
## 1 114
NYCH2004_sub %>%
filter(Gender == "Female" & Race == "Hispanic") %>%
count() ###146
## # A tibble: 1 × 1
## n
## <int>
## 1 146
near(146-114,32)
## [1] TRUE
Answer:32
How old is the oldest hispanic male and how much does he currently weigh?
NYCH2004_sub %>%
filter(Respondent_Age > 60 & Race == "Hispanic" & Gender == "Male" & Naked_Weight_lb > 0)
## # A tibble: 8 × 9
## ID Gender Respondent_Age Race Marital_Status First_smoke_age
## <dbl> <fct> <dbl> <fct> <fct> <fct>
## 1 125280 Male 63 Hispanic Married Yes
## 2 126172 Male 68 Hispanic Married Yes
## 3 355661 Male 64 Hispanic Married Yes
## 4 369465 Male 62 Hispanic Separated Yes
## 5 566610 Male 68 Hispanic Married No
## 6 745850 Male 66 Hispanic Married No
## 7 763579 Male 80 Hispanic Married Yes
## 8 909195 Male 61 Hispanic Widowed Yes
## # ℹ 3 more variables: Smoking_status <fct>, Naked_Weight_lb <dbl>,
## # SSWeight <fct>
Answer:80 and weighs 132lb
Are there more married or divorced people who think they are overweight?
NYCH2004_sub %>%
filter(Marital_Status == "Married" & SSWeight == "Overweight")%>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 464
#464
NYCH2004_sub %>%
filter(Marital_Status == "Divorced" & SSWeight == "Overweight")%>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 102
#102
Answer:More married people think they are overweight
Make a bar graph that shows what respondents think about their current weight by gender. Remove anyone who responded that they “didn’t know” what they thought about their current weight. Order the bars so that underweight is first.
NYCH2004_sub %>%
na.omit()%>%
ggplot(aes(x = fct_relevel(SSWeight, "Underweight", "Overweight", "About the right weight"))) +
geom_bar() +
facet_wrap(~Gender)
See if there is anyone who answered “no” to ever smoking (smoked <100 cigs in lifetime) but answered “yes” to being a current smoker.
NYCH2004_sub %>%
na.omit()%>%
group_by(First_smoke_age == "No") %>%
count(Smoking_status)
## # A tibble: 3 × 3
## # Groups: First_smoke_age == "No" [1]
## `First_smoke_age == "No"` Smoking_status n
## <lgl> <fct> <int>
## 1 FALSE Everyday 296
## 2 FALSE Some days 153
## 3 FALSE Not at all 335
near(296+153,449)
## [1] TRUE
Answer:Yes, 449
Did the oldest person in this dataset ever smoke?
NYCH2004_sub %>%
filter(First_smoke_age == "Yes") %>%
arrange(desc(Respondent_Age)) %>%
head()
## # A tibble: 6 × 9
## ID Gender Respondent_Age Race Marital_Status First_smoke_age
## <dbl> <fct> <dbl> <fct> <fct> <fct>
## 1 707953 Male 89 NH Black Never_Married Yes
## 2 588840 Male 87 NH White Widowed Yes
## 3 106658 Male 86 NH Black Married Yes
## 4 231374 Female 86 NH White Widowed Yes
## 5 265679 Female 85 NH White Widowed Yes
## 6 643054 Female 85 NH White Widowed Yes
## # ℹ 3 more variables: Smoking_status <fct>, Naked_Weight_lb <dbl>,
## # SSWeight <fct>
Answer:Yes