HW7

library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)

library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")


nams<-names(tb19)
head(nams, n=10)

##  [1] "year"     "ststr"    "state"    "fmonth"   "dispcode" "orgseqno"
##  [7] "c01q01"   "fairpoor" "c02q01"   "phyng"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(tb19)<-newnames

#recode

#diabetes
tb19$dia<-Recode(tb19$m02q04, recodes = "88=0; 77=NA; 99=NA")

#drvisit
tb19$drvis<-Recode(tb19$c03q04, recodes = "1=1; 2=2; 3:4=5; 8=0; else=NA")

#chol
tb19$chol<-Recode(tb19$c05q01, recodes = "1= 0; 2=1; 3=2; 4=3; 5=5; else=NA")

#sores
tb19$sores<-Recode(tb19$m02q06, recodes = "88=0; 77=NA; 99=NA")

#mental health not good
tb19$mntl<-Recode(tb19$c02q02, recodes = "88=0; 77=NA; 99=NA")


#fruit
tb19$frt2new = as.factor(tb19$frt2new)
tb19$fruit<-Recode(tb19$frt2new, recodes = "1= 'les2'; 2='2mor'; else=NA")

#veggie
tb19$vegt3new = as.factor(tb19$vegt3new)
tb19$veggie<-Recode(tb19$vegt3new, recodes = "1= 'les3'; 2='3mor'; else=NA")

#fruitandveggies
tb19$fv5 = as.factor(tb19$fv5)
tb19$fv<-Recode(tb19$fv5, recodes = "1= 'les5'; 2='5mor'; else=NA")

#physicalactivities
tb19$c11q01 = as.factor(tb19$c11q01)
tb19$physa<-Recode(tb19$c11q01, recodes ="1=1;2=0")

#sex
tb19$sex = as.factor(tb19$sex)
tb19$sex<-Recode(tb19$sex, recodes="1='Male'; 2='Female'")

#educ
tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'")
tb19$educ<-relevel(tb19$educ, ref='hsgrad')

#ethnicity
tb19$racegr5 = as.factor(tb19$racegr5)
tb19$ethn<-Recode(tb19$racegr5, recodes="1='nhwhite'; 2='nh black'; 3='Hispanic';4='othernh'; 5='multinh'; else=NA", as.factor = T)

#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")

#employment
tb19$c08q14 = as.factor(tb19$c08q14)
tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
 tb19$employ<-relevel(tb19$employ, ref='Employed')

Homework 7: Principal Components For this homework, you are to use the technique of Principal Components Analysis (PCA) to perform a variable reduction of at least 5 variables.

If you have an idea for latent construct, state what you believe this is.

Report the summary statistics and correlation matrix for your data.

Report the results of the PCA, being sure to include the eigenvalues and corresponding vectors. Interpret your component(s) if possible If deemed appropriate, conduct some testing of your index/components/latent variables.

#The LC will be diabetes. Variables used to see if they determine diabetes will be: sex, ethnicity, insurance, education, physical activity, consumption of fruits and vegetables, cholesterol, mental health, and sores on feet.

sub19<-tb19%>%
  filter(complete.cases(llcpwt, ststr, dia,sex,ethn,employ,ins,educ,physa,fv,veggie,fruit,drvis,sores,mntl,chol))%>%
  select(llcpwt, ststr, dia,sex,ethn,employ,ins,educ,physa,fv,mntl,veggie,fruit,drvis,sores,chol)%>%
  mutate_at(vars(dia,drvis,sores,chol,mntl),scale)

sub19.pc<-princomp(~dia+drvis+sores+chol+mntl,
                   data=sub19,
                   scores=T)

#Screeplot
screeplot(sub19.pc,
          type = "l",
          main = "Scree Plot")
abline(h=1)

summary of eignevalues and varience

summary(sub19.pc)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.2287298 1.1285857 0.9618352 0.8066264 0.7979382
## Proportion of Variance 0.3021995 0.2549471 0.1851750 0.1302344 0.1274440
## Cumulative Proportion  0.3021995 0.5571466 0.7423216 0.8725560 1.0000000

#For the 5 components only components 1 and 2 have eiganvalues above 1.

Examine eignevectors/loadingvectors

loadings(sub19.pc)

## 
## Loadings:
##       Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## dia    0.557  0.361  0.163  0.307  0.662
## drvis -0.432  0.545  0.120  0.656 -0.268
## sores  0.515  0.353  0.417 -0.275 -0.600
## chol  -0.429  0.549  0.118 -0.629  0.324
## mntl   0.231  0.382 -0.878        -0.155
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

#in terms of the loadsings comp.1 variables we see a mixture of loadings, some positive, some negative. drvis and chol are negative. Varibles for comp. 2 we see positive associations with everthing. I will use Comp. 2 as an index for diabetes since all variables are loading in the same direction.

plot

scores<-data.frame(sub19.pc$scores)
hist(scores$Comp.1)

scores<-data.frame(sub19.pc$scores)
hist(scores$Comp.2)

Calculate Correlation

cor(scores[,1:5])

##               Comp.1        Comp.2        Comp.3        Comp.4        Comp.5
## Comp.1  1.000000e+00 -2.238337e-15  2.393284e-16 -2.307608e-15  1.471210e-15
## Comp.2 -2.238337e-15  1.000000e+00  2.374322e-15  4.202660e-15 -9.358111e-16
## Comp.3  2.393284e-16  2.374322e-15  1.000000e+00  8.715593e-16 -1.440974e-15
## Comp.4 -2.307608e-15  4.202660e-15  8.715593e-16  1.000000e+00 -2.031426e-15
## Comp.5  1.471210e-15 -9.358111e-16 -1.440974e-15 -2.031426e-15  1.000000e+00

sub19c<-cbind(sub19, scores)
names(sub19c)

##  [1] "llcpwt" "ststr"  "dia"    "sex"    "ethn"   "employ" "ins"    "educ"  
##  [9] "physa"  "fv"     "mntl"   "veggie" "fruit"  "drvis"  "sores"  "chol"  
## [17] "Comp.1" "Comp.2" "Comp.3" "Comp.4" "Comp.5"

names(sub19c)

##  [1] "llcpwt" "ststr"  "dia"    "sex"    "ethn"   "employ" "ins"    "educ"  
##  [9] "physa"  "fv"     "mntl"   "veggie" "fruit"  "drvis"  "sores"  "chol"  
## [17] "Comp.1" "Comp.2" "Comp.3" "Comp.4" "Comp.5"

Using PC to generate analysis

#survey design object
options(survey.lonely.psu = "adjust")
sub19c$pc1<-cut(sub19c$Comp.2,
                    breaks = quantile(sub19c$Comp.2,
                                      probs = c(0,.25,.5,.75,1) ,
                                      na.rm=T), 
                    include.lowest=T)
des<-svydesign(ids=~1,
               strata=~ststr,
               weights=~llcpwt,
               data=sub19c)

Variation in diabetes index across physical activity and sex: variation across physical activity

library(ggplot2)
ggplot(aes(x=physa, y=Comp.2),
       data=sub19c)+
  geom_boxplot()

#The mean for physical activity is lower for diabetes compared to no physical activity.

variation across sex

library(ggplot2)
ggplot(aes(x=sex, y=Comp.2),
       data=sub19c)+
  geom_boxplot()

#Regarding diabetes, males have a lower mean when compared to women.

Examine variation in diabetes index across insurance, education, and ethnicity

fit.1<-svyglm(Comp.2~physa+drvis+(sex),
              des,
              family=gaussian)
summary(fit.1)

## 
## Call:
## svyglm(formula = Comp.2 ~ physa + drvis + (sex), design = des, 
##     family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub19c)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.19373    0.07698   2.517   0.0120 *  
## physa1      -0.24913    0.10667  -2.336   0.0197 *  
## drvis        0.63221    0.10285   6.147 1.08e-09 ***
## sexMale     -0.10071    0.08724  -1.154   0.2486    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.6606112)
## 
## Number of Fisher Scoring iterations: 2

# (not sure if I am interpreting correclty...) results for fit.1. index decreases physical activity as well as for males.

HW7

Julie A. Gonzalez

3/23/2021