Homework7

##load libraries

library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(ggplot2)
library(pander)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(haven)
library(readr)

##load data

nhis_4 <- read_stata("nhis_00004.dta.gz")
View(nhis_4)

##Recode varaibles

#poor
nhis_4$poor<-Recode(nhis_4$pooryn, recodes =" 1=1;2=0;else=NA")

##Alcohol
nhis_4$alcohol<-Recode(nhis_4$alclife, recodes ="1=1;2=0;7:9=NA;else=NA")

##smoking
nhis_4$smoke<-Recode(nhis_4$smokev, recodes ="1=1;2=0;7:9=NA; else=NA")

##usborn
nhis_4$born<-Recode(nhis_4$usborn, recodes ="20=1;10:12=0;else=NA")

##citizen
nhis_4$mycitizen<-Recode(nhis_4$citizen, recodes =" 1=1;2=0;7:9=NA; else=NA")

#Age cut into intervals
nhis_4$agec<-cut(nhis_4$age,breaks = c(18, 25, 35, 45, 55, 65, 75, 85), include.lowest = T)

##Sex
nhis_4$male<-as.factor(ifelse(nhis_4$sex==1, "Male", "Female"))

##ERVISIT
nhis_4$ervisit<-Recode(nhis_4$eryrno, recodes = "20=1;30=2;41=3;40=4;42=5;43=6;50=7;60=8;61:62=9;00=NA;97:99=NA;10=NA;else=NA")

##educ
#nhis_4$education<-Recode(nhis_4$educ, recodes="100:102='0Prim'; 103:201='1somehs'; 200:202='2hsgrad'; 300:303='3somecol'; 400='4colgrad';500:503='5masteranddoc';504:999=NA;000=NA;000=NA")



##Marital status
#nhis_4$marital<-Recode(nhis_4$marstat, recodes="10:13='married'; 30='divorced'; 20='widowed'; 40='separated'; 50='nm'; 99=NA;00=NA", as.factor=T)


#race/ethnicity
#nhis_4$race<-Recode(nhis_4$racesr, recodes="100='white'; 200='black'; else=NA", as.factor=T)

##Hispanic
#nhis_4$ethnicity<-Recode(nhis_4$hispeth, recodes="10='not hispanic'; 20:70='hispanic'; else=NA", as.factor=T)

Principal Components For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables. If you have an idea for latent construct, state what you believe this is. Report the summary statistics and correlation matrix for your data

nhis_4b<-nhis_4%>%
  filter(complete.cases(alcohol,mycitizen,born,poor,smoke,
         agec,ervisit,strata,psu, perweight))%>%
  select(alcohol,mycitizen,born,poor,smoke,
         agec,ervisit,strata,psu, perweight)%>%
 mutate_at(vars(ervisit,alcohol,born,smoke,mycitizen),scale)




nhis4.pc<-princomp(~smoke+ervisit+alcohol+born+mycitizen,
                   data=nhis_4b,
                   scores=T)


 


#Screeplot
screeplot(nhis4.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

summary(nhis4.pc)

## Importance of components:
##                          Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.348969 1.0507037 1.0048173 0.8418174 0.5967555
## Proportion of Variance 0.364080 0.2208784 0.2020072 0.1417844 0.0712501
## Cumulative Proportion  0.364080 0.5849583 0.7869655 0.9287499 1.0000000

##We see that comp.1 &2  comprises of about 36% and 58% of the variation. While the other components compose of 50% or less of  input variables, that isn't bad.We see that the last 3 components account for at least 78% and 92% and 100% of the variation which is pretty good Also, we see 3 eigenvalues of at least 1, which suggests there are 4 real components among these variables.

Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible

##Examine eignevectors/loading vectors

loadings(nhis4.pc)

## 
## Loadings:
##           Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## smoke      0.359  0.599  0.141  0.701       
## ervisit                 -0.967  0.236       
## alcohol    0.333  0.630 -0.191 -0.673       
## born      -0.623  0.311                0.714
## mycitizen  0.603 -0.384                0.698
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

#then I plot the first 2 components

scores<-data.frame(nhis4.pc$scores)
hist(scores$Comp.1)

hist(scores$Comp.2)

hist(scores$Comp.3)

hist(scores$Comp.4)

hist(scores$Comp.5)

##In terms of loadings in Comp 1we see a positive correlation with all the variables except the born variable which has a negative correlation. This shows that I do not have a good index since not all of them are loading in the same direction. In general, we see both negative and positive loadings among variables.However, the higher values in the component show an increase in mortality is associated with Comp 1,2&3, which is smoking,alcohol and ervisits.

cor(scores[,1:2])

##              Comp.1       Comp.2
## Comp.1 1.000000e+00 6.634208e-15
## Comp.2 6.634208e-15 1.000000e+00

##We see a small value in the correlation with shows that the correlation is 0


nhis_4c<-cbind(nhis_4b,scores)
names(nhis_4c)

##  [1] "alcohol"   "mycitizen" "born"      "poor"      "smoke"     "agec"     
##  [7] "ervisit"   "strata"    "psu"       "perweight" "Comp.1"    "Comp.2"   
## [13] "Comp.3"    "Comp.4"    "Comp.5"

##correlation among variables
names(nhis_4c)

##  [1] "alcohol"   "mycitizen" "born"      "poor"      "smoke"     "agec"     
##  [7] "ervisit"   "strata"    "psu"       "perweight" "Comp.1"    "Comp.2"   
## [13] "Comp.3"    "Comp.4"    "Comp.5"

#Make the survey design object
##We see variance with age and mortality due to smoking, with increased mortality aming young adults and older adults
options(survey.lonely.psu = "adjust")
nhis_4c$pc1q<-cut(nhis_4c$Comp.1,
                    breaks = quantile(nhis_4c$Comp.1,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T), 
                    include.lowest=T)
des<-svydesign(ids=~1,
               strata=~strata,
               weights=~perweight,
               data=nhis_4c)
##The first analysis will look at variation in my  index across age, and nationality level:
library(ggplot2)
ggplot(aes(x=agec, y=Comp.1),
       data=nhis_4c)+
  geom_boxplot()

nhis_4c$born<-factor(nhis_4c$born, levels(nhis_4c$born)[c(1,3,2,4,5)])

ggplot(aes(x=born, y=Comp.1),
       data=nhis_4c)+
  geom_boxplot()

If deemed appropriate, conduct some testing of your index/components/latent variables.

fit.1<-svyglm(Comp.1~agec+born
              ,
              des,
              family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = Comp.1 ~ agec + born, design = des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~perweight, data = nhis_4c)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28812    0.04243   6.790 1.41e-11 ***
## agec(25,35] -0.12543    0.06478  -1.936    0.053 .  
## agec(35,45] -0.27058    0.06020  -4.495 7.30e-06 ***
## agec(45,55] -0.44839    0.05882  -7.623 3.57e-14 ***
## agec(55,65] -0.47482    0.05567  -8.530  < 2e-16 ***
## agec(65,75] -0.40723    0.05532  -7.361 2.51e-13 ***
## agec(75,85] -0.37296    0.05658  -6.592 5.34e-11 ***
## born        -1.07960    0.02355 -45.852  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4774404)
## 
## Number of Fisher Scoring iterations: 2

##To test the hypothesis,we see lower values of smoking with increase in age

Homework7

Lydia Okabe

3/23/2021