library(tibble)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(haven)
library(ggplot2)
# Read the demographics table from a SAS transport (.xpt) file. The file name
# suggests NHANES 2011-2012 demographics; the path is specific to one machine.
health <- haven::read_xpt(
  file = "/Volumes/NetStorage/Yunis File/Class/Fall' 21/DATA 333/Week-07/NHANES-2011-2012-Demo.xpt"
)

# Recode non-response sentinels in household income to NA ----
# 77 and 99 are treated as missing (presumably the "refused" / "don't know"
# codes -- confirm against the codebook). %in% never yields NA in the mask.
health$INDHHIN2[health$INDHHIN2 %in% c(77, 99)] <- NA
# Unnamed two-column matrix (gender code, income code), complete rows only.
health.new <- na.omit(cbind(health[["RIAGENDR"]], health[["INDHHIN2"]]))
# Two-sample (Welch) t-test: does the mean household income code (INDHHIN2)
# differ between the two gender groups (RIAGENDR)?
# Null = the mean of INDHHIN2 is the same in both gender groups #
# Alt  = the two group means differ #
# The earlier call, t.test(health.new), was a one-sample test of "mean = 0" on
# the gender codes and income codes pooled into one sample (df = 18519, i.e.
# 2 x 9260 values), so its interval of 4.77 to 4.90 is not a gender difference.
# NOTE(review): the group means printed elsewhere in this file (about 7 to 10)
# suggest INDHHIN2 is a bracket code rather than dollars -- confirm against the
# codebook before interpreting a difference in means.

t.test(INDHHIN2 ~ factor(RIAGENDR), data = health)
## NOTE: the output below was produced by the earlier one-sample call
## t.test(health.new); re-run the script to refresh it.
## 
##  One Sample t-test
## 
## data:  health.new
## t = 142.6, df = 18519, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  4.772311 4.905334
## sample estimates:
## mean of x 
##  4.838823
# Null = the mean annual household income is the same in every racial/ethnic group #
# Alt = at least one racial/ethnic group's mean annual household income differs from the others #
# We are testing whether mean annual household income differs across racial/ethnic groups. The p-value is below 2.2e-16, far smaller than the 0.05 significance level, so we reject the null hypothesis. We conclude, at the 5% significance level, that at least one racial/ethnic group's mean annual household income differs from the others. #

# Mean household income code within each race/ethnicity group.
aggregate(INDHHIN2 ~ RIDRETH3, data = health, FUN = mean)
##   RIDRETH3  INDHHIN2
## 1        1  7.155416
## 2        2  7.496008
## 3        3  8.656778
## 4        4  7.370298
## 5        6 10.224613
## 6        7  8.822102
# One-way ANOVA of household income code across race/ethnicity groups.
# RIDRETH3 holds numeric group codes (1, 2, 3, 4, 6, 7). Left numeric, aov()
# fits a single linear slope on the code (hence Df = 1 in the old table below)
# instead of comparing the six group means, so it is wrapped in factor().
model_race <- aov(INDHHIN2 ~ factor(RIDRETH3), data = health)
summary(model_race)
## NOTE: the table below came from the earlier fit that used RIDRETH3 as a
## numeric predictor; re-run the script to refresh it (the factor term should
## show Df = 5) and re-check any interpretation that quotes these numbers.
##               Df Sum Sq Mean Sq F value Pr(>F)    
## RIDRETH3       1   3734    3734     189 <2e-16 ***
## Residuals   9258 182925      20                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 496 observations deleted due to missingness
# race/ethnicity is significantly related to annual household income (p < 2e-16) #
# gender is also significantly related to annual household income, although the effect is small (p = 0.0118) #
# the race/ethnicity-by-gender interaction is not significant (p = 0.5787) #

# Mean household income code for every race/ethnicity x gender combination.
aggregate(INDHHIN2 ~ RIDRETH3 + RIAGENDR, data = health, FUN = mean)
##    RIDRETH3 RIAGENDR  INDHHIN2
## 1         1        1  7.150538
## 2         2        1  7.667355
## 3         3        1  8.786785
## 4         4        1  7.594968
## 5         6        1 10.122164
## 6         7        1  8.808743
## 7         1        2  7.160514
## 8         2        2  7.335907
## 9         3        2  8.523410
## 10        4        2  7.160606
## 11        6        2 10.324278
## 12        7        2  8.835106
# Two-way ANOVA of household income code on race/ethnicity, gender, and their
# interaction. Both predictors are stored as numeric codes; left numeric, each
# term is a single slope (Df = 1 everywhere in the old table below) rather
# than a comparison of group means, so both are wrapped in factor().
model_twofac <- aov(INDHHIN2 ~ factor(RIDRETH3) * factor(RIAGENDR), data = health)
summary(model_twofac)
## NOTE: the table below came from the earlier fit that used numeric
## predictors; re-run the script to refresh it (expect Df = 5, 1 and 5) and
## re-check any interpretation that quotes these numbers.
##                     Df Sum Sq Mean Sq F value Pr(>F)    
## RIDRETH3             1   3734    3734 189.099 <2e-16 ***
## RIAGENDR             1    125     125   6.348 0.0118 *  
## RIDRETH3:RIAGENDR    1      6       6   0.308 0.5787    
## Residuals         9256 182794      20                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 496 observations deleted due to missingness
# Chi-square test of independence: race/ethnicity vs. education level #
# Null = race/ethnicity and education level are independent (no association) #
# Alt = race/ethnicity and education level are associated #

# The p-value is below 2.2e-16, so at the 5% significance level (and also at p < 0.01) we reject the null hypothesis. A chi-square test compares how education level is distributed across racial/ethnic groups rather than comparing means, so we conclude that race/ethnicity and education level are significantly associated.

# Recode non-response sentinels in education level to NA ----
# 7 and 9 are treated as missing (presumably the "refused" / "don't know"
# codes -- confirm against the codebook). %in% never yields NA in the mask.
health$DMDEDUC2[health$DMDEDUC2 %in% c(7, 9)] <- NA
# Unnamed two-column matrix (race/ethnicity code, education code), complete
# rows only.
health.new2 <- na.omit(cbind(health[["RIDRETH3"]], health[["DMDEDUC2"]]))
# Numeric summaries of the two coded columns; the NA count for education
# reflects the recode of 7 and 9 to NA.
summary(health[["DMDEDUC2"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   3.000   4.000   3.462   5.000   5.000    4201
summary(health[["RIDRETH3"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    3.00    3.44    4.00    7.00
# Cross-tabulate education level (rows) by race/ethnicity (columns), then
# print the counts with row and column totals appended.
race_edu <- table(health[["DMDEDUC2"]], health[["RIDRETH3"]])
addmargins(A = race_edu)
##      
##          1    2    3    4    6    7  Sum
##   1    177  129   95   71   73    5  550
##   2    114  106  243  246   60   13  782
##   3    103  117  437  374  107   31 1169
##   4    109  138  683  492  169   66 1657
##   5     37   87  582  269  385   37 1397
##   Sum  540  577 2040 1452  794  152 5555
# Row proportions: within each education level (row), the share falling in
# each race/ethnicity group; every row sums to 1.
prop.table(race_edu, margin = 1)
##    
##               1           2           3           4           6           7
##   1 0.321818182 0.234545455 0.172727273 0.129090909 0.132727273 0.009090909
##   2 0.145780051 0.135549872 0.310741688 0.314578005 0.076726343 0.016624041
##   3 0.088109495 0.100085543 0.373823781 0.319931565 0.091531223 0.026518392
##   4 0.065781533 0.083283042 0.412190706 0.296922148 0.101991551 0.039831020
##   5 0.026485326 0.062276306 0.416607015 0.192555476 0.275590551 0.026485326
# Chi-square test ----
# Observed counts the test is based on: education level (rows) by
# race/ethnicity (columns).
table(health[["DMDEDUC2"]], health[["RIDRETH3"]])
##    
##       1   2   3   4   6   7
##   1 177 129  95  71  73   5
##   2 114 106 243 246  60  13
##   3 103 117 437 374 107  31
##   4 109 138 683 492 169  66
##   5  37  87 582 269 385  37
# Pearson chi-square test on the education-by-race/ethnicity cross-tabulation.
# The `$` expressions are kept as written because chisq.test() prints them on
# its "data:" line.
test_race <- chisq.test(x = health$DMDEDUC2, y = health$RIDRETH3)
print(test_race)
## 
##  Pearson's Chi-squared test
## 
## data:  health$DMDEDUC2 and health$RIDRETH3
## X-squared = 955.14, df = 20, p-value < 2.2e-16