dim(cdc)
## [1] 20000 9
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
cdc_var <- data.frame(variable = c("genhlth", "exerany", "hlthplan", "smoke100","height","weight","wtdesire","age","gender"), datatype = c("factor","numeric","numeric","numeric","numeric","integer","integer","integer","factor"), cat_disc = c("categorical","categorical","categorical","categorical","continuous","continuous","continuous","discrete","categorical"))
pander(cdc_var,style = 'rmarkdown')
variable | datatype | cat_disc |
---|---|---|
genhlth | factor | categorical |
exerany | numeric | categorical |
hlthplan | numeric | categorical |
smoke100 | numeric | categorical |
height | numeric | continuous |
weight | integer | continuous |
wtdesire | integer | continuous |
age | integer | discrete |
gender | factor | categorical |
height
and age
, and compute the interquartile range for each.height_summary <- summary(cdc$height)
height_summary <- t(height_summary)
height_IQR <- height_summary[5] - height_summary[2]
height_info <- data.frame(cbind(height_summary,height_IQR))
height_info
## Min. X1st.Qu. Median Mean X3rd.Qu. Max. height_IQR
## 1 48 64 67 67.1829 70 93 6
age_summary <- summary(cdc$age)
age_summary <- t(age_summary)
age_IQR <- age_summary[5] - age_summary[2]
age_info <- data.frame(cbind(age_summary,age_IQR))
age_info
## Min. X1st.Qu. Median Mean X3rd.Qu. Max. age_IQR
## 1 18 31 43 45.06825 57 99 26
gender
and exerany
. How many males are in the sample? What proportion ofthe sample reports being in excellent health?freq_table <- prop.table(table(cdc$gender, cdc$exerany))
colnames(freq_table) <- c("non-exercisers","exercisers")
freq_table
##
## non-exercisers exercisers
## m 0.10745 0.37100
## f 0.14685 0.37470
males_total <- sum(cdc$gender == "m")
male_exc_health <- sum(cdc$gender == "m" & cdc$genhlth == "excellent")
males_prop_exc_health <- round(male_exc_health / males_total,4) * 100
print(paste0("Males in sample: ", males_total))
## [1] "Males in sample: 9569"
print(paste0("Males reporting excellent health: ", male_exc_health))
## [1] "Males reporting excellent health: 2298"
print(paste0("Proportion of Males reporting excellent health: ", males_prop_exc_health, "%"))
## [1] "Proportion of Males reporting excellent health: 24.02%"
mosiac <- mosaicplot(table(cdc$gender,cdc$smoke100), main = "Smokers vs. Gender", color = T)
#### On the y-axis, the value “0” indicates the subject has smoked less than 100 cigarettes in his or her lifetime. The mosiac plot reveals that a great percentage of females can be so classified. This percentage is 57.6%, as compared to that of males of 47.5%.
non_smoker <- c((sum(!cdc$smoke100[cdc$gender == "f"])/ sum(cdc$gender == "f")),
(sum(!cdc$smoke100[cdc$gender == "m"])/ sum(cdc$gender == "m")))
non_smoker
## [1] 0.5763589 0.4751803
under23_and_smoke
that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.under23_and_smoke <- subset(cdc, cdc$age < 23 & cdc$smoke100 == 1)
head(under23_and_smoke)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 13 excellent 1 0 1 66 185 220 21 m
## 37 very good 1 0 1 70 160 140 18 f
## 96 excellent 1 1 1 74 175 200 22 m
## 180 good 1 1 1 64 190 140 20 f
## 182 very good 1 1 1 62 92 92 21 f
## 240 very good 1 0 1 64 125 115 22 f
cdc$bmi <- (cdc$weight / cdc$height^2) * 703
boxplot(cdc$bmi ~ cdc$genhlth)
exercisers_bmi <- subset(cdc, cdc$exerany == 1)
non_exercisers_bmi <- subset(cdc, cdc$exerany == 0)
exercisers_summary <- summary(exercisers_bmi$bmi)
non_exercisers_summary <- summary(non_exercisers_bmi$bmi)
x3 <- rbind(exercisers_summary, non_exercisers_summary)
x3
## Min. 1st Qu. Median Mean 3rd Qu.
## exercisers_summary 12.40045 22.65527 25.10714 25.98012 28.33963
## non_exercisers_summary 12.74954 23.02595 26.49673 27.26525 30.26681
## Max.
## exercisers_summary 73.09074
## non_exercisers_summary 62.64201
boxplot(cdc$bmi ~ cdc$exerany)
#### In this case, one would expect that people who exercise (cdc$exerany == 1) have a lower BMI on average than those who do not.
plot(x = cdc$wtdesire, y = cdc$weight, xlim=c(50,350))
reg2 <- lm(wtdesire ~ weight,data=cdc)
summary(reg2)
##
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.98 -9.32 0.08 11.51 518.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.664015 0.590782 78.99 <2e-16 ***
## weight 0.639014 0.003388 188.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.21 on 19998 degrees of freedom
## Multiple R-squared: 0.6401, Adjusted R-squared: 0.6401
## F-statistic: 3.556e+04 on 1 and 19998 DF, p-value: < 2.2e-16
wtdesire
) and current weight (weight
). Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called wdiff
.cdc$wdiff <- cdc$wtdesire - cdc$weight
reg1 <- lm(bmi ~ wdiff, data=cdc)
with(cdc,plot(bmi, wdiff))
abline(h=0)
wdiff
? If an observation wdiff
is 0, what does this mean about the person’s weight and desired weight. What if wdiff
is positive or negative?wdiff
in terms of its center, shape, and spread, including any plots you use. What does this tell us about how people feel about their current weight?class(cdc$wdiff)
## [1] "integer"
summary(cdc$wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
hist(cdc$wdiff, breaks = 80)
sd(cdc$wdiff)
## [1] 24.04586
men_wdiff_summary <- summary(cdc$wdiff[cdc$gender == "m"])
women_wdiff_summary <- summary(cdc$wdiff[cdc$gender == "f"])
wdiff_sum <- rbind(men_wdiff_summary,women_wdiff_summary)
wdiff_sum
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## men_wdiff_summary -300 -20 -5 -10.70613 0 500
## women_wdiff_summary -300 -27 -10 -18.15118 0 83
boxplot(cdc$wdiff ~ cdc$gender)
prop <- sum((cdc$weight > (mean(cdc$weight) - sd(cdc$weight))) & cdc$weight < (mean(cdc$weight) + sd(cdc$weight)))/20000
prop2 <- sum((cdc$weight > (mean(cdc$weight) - 2 * sd(cdc$weight))) & cdc$weight < (mean(cdc$weight) + 2 * sd(cdc$weight)))/20000
data2 <- list(c(mean(cdc$weight)),sd(cdc$weight), prop, prop2)
data3 <- t(data2)
colnames(data3) <- c("mean", "sd", "prop w/in 1SD", "prop w/in 2 SD")
data3
## mean sd prop w/in 1SD prop w/in 2 SD
## [1,] 169.683 40.08097 0.7076 0.9563