library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(haven)
library(ggplot2)
# Load the NHANES 2011-2012 demographics file.
# NOTE(review): this absolute path only resolves on the original author's
# machine; consider a project-relative path.
health <- read_xpt("/Volumes/NetStorage/Yunis File/Class/Fall' 21/DATA 333/Week-07/NHANES-2011-2012-Demo.xpt")

# Treat the INDHHIN2 codes 77 and 99 as missing.
# (Presumably "Refused" / "Don't know" -- confirm against the NHANES codebook.)
# NOTE(review): INDHHIN2 appears to be an income *category* code rather than a
# dollar amount, and the codebook may contain out-of-order codes (e.g. 12, 13);
# confirm before interpreting its mean.
health$INDHHIN2[health$INDHHIN2 %in% c(77, 99)] <- NA

# Keep only the complete (gender, income) pairs. The columns are named so the
# t-test below can refer to them explicitly.
health.new <- cbind(RIAGENDR = health$RIAGENDR, INDHHIN2 = health$INDHHIN2)
health.new <- na.omit(health.new)

# Two-sample t-test: does mean INDHHIN2 differ between the two RIAGENDR groups?
#   H0: the difference in group means is 0.
#   H1: the difference in group means is not 0.
# The earlier call, t.test(health.new), passed the whole two-column matrix, so
# R pooled the gender codes and the income codes into one vector and ran a
# one-sample test of "mean = 0" (hence df = 18519 = 2 * 9260 - 1). That result
# says nothing about gender differences. The formula interface compares income
# between the gender groups instead (Welch test by default).
# Interpretation: reject H0 at the 5% significance level only if the reported
# p-value is below 0.05; the 95% confidence interval is for the difference in
# group means.
t.test(INDHHIN2 ~ RIAGENDR, data = as.data.frame(health.new))
## Output not shown: the result pasted here previously came from
## t.test(health.new) on the whole matrix (df = 18519) and no longer applies.
## Re-run the script to regenerate it.
# One-way ANOVA: average annual household income code (INDHHIN2) by
# race/ethnicity (RIDRETH3).
#   H0: the mean of INDHHIN2 is the same in every RIDRETH3 group.
#   H1: at least one group's mean differs from the others.

# Group means, one row per RIDRETH3 code.
aggregate(INDHHIN2 ~ RIDRETH3, data = health, FUN = mean)
## RIDRETH3 INDHHIN2
## 1 1 7.155416
## 2 2 7.496008
## 3 3 8.656778
## 4 4 7.370298
## 5 6 10.224613
## 6 7 8.822102

# RIDRETH3 is stored as a number, so it must be wrapped in factor(). Without
# it, aov() fits a single linear slope across the codes (the earlier output
# showed Df = 1) instead of comparing the six group means.
model_race <- aov(INDHHIN2 ~ factor(RIDRETH3), data = health)
summary(model_race)
## Output not shown: the table pasted here previously came from the fit that
## treated RIDRETH3 as numeric (Df = 1, F = 189, p < 2e-16) and no longer
## applies. With six groups and 9260 complete rows the corrected table should
## report 5 and 9254 degrees of freedom. Re-run the script to regenerate it.
# Interpretation: reject H0 at the 5% significance level if Pr(>F) is below
# 0.05, i.e. conclude that at least one racial/ethnic group's mean income code
# differs from the others.
# Two-way ANOVA: INDHHIN2 by race/ethnicity (RIDRETH3), gender (RIAGENDR) and
# their interaction.
# Conclusions recorded from the earlier run, which treated both codes as
# numeric and therefore must be re-checked against the corrected table:
#   - race/ethnicity was significant (p < 2e-16);
#   - gender was weakly significant (p = 0.0118);
#   - the race-by-gender interaction was not significant (p = 0.5787).

# Cell means, one row per RIDRETH3 x RIAGENDR combination.
aggregate(INDHHIN2 ~ RIDRETH3 + RIAGENDR, data = health, FUN = mean)
## RIDRETH3 RIAGENDR INDHHIN2
## 1 1 1 7.150538
## 2 2 1 7.667355
## 3 3 1 8.786785
## 4 4 1 7.594968
## 5 6 1 10.122164
## 6 7 1 8.808743
## 7 1 2 7.160514
## 8 2 2 7.335907
## 9 3 2 8.523410
## 10 4 2 7.160606
## 11 6 2 10.324278
## 12 7 2 8.835106

# Both predictors are stored as numbers, so each is wrapped in factor().
# Without it, every term is fitted as a linear slope (the earlier output showed
# Df = 1 for all three terms) instead of as group effects.
# NOTE(review): the cell sizes are presumably unequal; aov() reports sequential
# sums of squares, so the order of the terms can matter -- confirm this is
# acceptable for the assignment.
model_twofac <- aov(
  INDHHIN2 ~ factor(RIDRETH3) * factor(RIAGENDR),
  data = health
)
summary(model_twofac)
## Output not shown: the table pasted here previously came from the
## numeric-coded fit and no longer applies. The corrected table should report
## 5, 1, 5 and 9248 degrees of freedom. Re-run the script to regenerate it.
# Chi-square test of independence: race/ethnicity vs. educational level #
# Null = race/ethnicity and educational level are independent (the distribution of educational level is the same in every racial/ethnic group) #
# Alt = race/ethnicity and educational level are associated (the distribution of educational level differs for at least one racial/ethnic group) #
# The p-value is < 2.2e-16, which is below 0.05 (and below 0.01), so at the 5% significance level we reject the null hypothesis in favor of the alternative. The chi-square test compares category counts rather than means, so the conclusion is that race/ethnicity and educational level are significantly associated.
# Treat the DMDEDUC2 codes 7 and 9 as missing.
# (Presumably "Refused" / "Don't know" -- confirm against the NHANES codebook.)
health$DMDEDUC2[health$DMDEDUC2 %in% c(7, 9)] <- NA

# Complete (race, education) pairs as a two-column matrix. It is not referenced
# again in the visible part of this script.
health.new2 <- na.omit(cbind(health$RIDRETH3, health$DMDEDUC2))

# Quick look at both variables after the recode.
summary(health$DMDEDUC2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 3.000 4.000 3.462 5.000 5.000 4201
summary(health$RIDRETH3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 3.00 3.44 4.00 7.00

# Contingency table: education level in rows, race/ethnicity in columns.
race_edu <- table(health$DMDEDUC2, health$RIDRETH3)

# The same table with row and column totals added.
addmargins(race_edu)
##
## 1 2 3 4 6 7 Sum
## 1 177 129 95 71 73 5 550
## 2 114 106 243 246 60 13 782
## 3 103 117 437 374 107 31 1169
## 4 109 138 683 492 169 66 1657
## 5 37 87 582 269 385 37 1397
## Sum 540 577 2040 1452 794 152 5555

# Row proportions: each education level's split across the race groups.
proportions(race_edu, margin = 1)
##
## 1 2 3 4 6 7
## 1 0.321818182 0.234545455 0.172727273 0.129090909 0.132727273 0.009090909
## 2 0.145780051 0.135549872 0.310741688 0.314578005 0.076726343 0.016624041
## 3 0.088109495 0.100085543 0.373823781 0.319931565 0.091531223 0.026518392
## 4 0.065781533 0.083283042 0.412190706 0.296922148 0.101991551 0.039831020
## 5 0.026485326 0.062276306 0.416607015 0.192555476 0.275590551 0.026485326

# Chi-square test ----
# Print the observed counts again, then test them.
race_edu
##
## 1 2 3 4 6 7
## 1 177 129 95 71 73 5
## 2 114 106 243 246 60 13
## 3 103 117 437 374 107 31
## 4 109 138 683 492 169 66
## 5 37 87 582 269 385 37

# The two vectors are passed directly so the printed "data:" line is unchanged.
test_race <- chisq.test(health$DMDEDUC2, health$RIDRETH3)
print(test_race)
##
## Pearson's Chi-squared test
##
## data: health$DMDEDUC2 and health$RIDRETH3
## X-squared = 955.14, df = 20, p-value < 2.2e-16