# Wave 4 data processing for Long -----------------------------------------
#Looking at all levels for Retirment Satisfaction
RF4<-table(HRS$R4RETSAT)
RF5<-table(HRS$R5RETSAT)
RF6<-table(HRS$R6RETSAT)
RF7<-table(HRS$R7RETSAT)
RF8<-table(HRS$R8RETSAT)
RF9<-table(HRS$R9RETSAT)
RF10<-table(HRS$R10RETSAT)
RF11<-table(HRS$R11RETSAT)
RF12<-table(HRS$R12RETSAT)
RF13<-table(HRS$R13RETSAT)
RF<-rbind(RF4,RF5,RF6, RF7, RF8, RF9, RF10, RF11, RF12, RF13)
RF<- as.data.frame(RF)
RF$year<- c(seq(1998,2016, by=2))
colnames(RF)<- c('Very','Moderately', 'NotAtAll', 'Year')
ggplot(data= RF, aes(x= Year))+
geom_line(aes(y=Very,color= 'Very'))+
geom_line(aes(y=Moderately,color='Moderately'))+
geom_line(aes(y=NotAtAll, color= 'not-at-all'))+
geom_point(aes(y=Very), color= '#619CFF')+
geom_point(aes(y=Moderately),color= '#F8766D')+
geom_point(aes(y=NotAtAll), color= '#00BA38')+
labs(title= 'Count of Retirment Satisfaction Over Time', x= 'Year', y= 'Number of People')+
scale_x_discrete(limits= c(1998,2000,2002,2004,2006,2008,2010,2012,2014,2016))+
theme_minimal()
This Graph shows the distribution of Very satisfied, moderately satisfied, and Not at all the wave times
This next section of code looks into creating flag variables as to whether a person has some difficulty in ADS, Minor physical health, and Major Health issues
#Based on the data it looks like the numbers remain constant over time with jumps in numbers
#between years 02-04 and 08-10
#I Immagin the ressesion has an affect as to the rise in not at all and moderatly compared to very
#---Rencoding difficulties----#
#--All these entries have 'Some Difficulties when attempting the following task'
#----Activites of Daily living----#
#summary(HRS$R4WALKRA) # walking
#summary(HRS$R4DRESSA) # dressing
#summary(HRS$R4BATHA) # Bath
#summary(HRS$R4EATA) # Eat
#summary(HRS$R4BEDA) # Bed
#summary(HRS$R4TOILTA) # Using the bath Room
R4ANY<- HRS %>% select(HHIDPN,R4WALKRA,R4DRESSA,R4BATHA,R4EATA,R4BEDA,R4TOILTA)%>% filter(!is.na(R4WALKRA))%>%filter(!is.na(R4DRESSA))%>%filter(!is.na(R4BATHA))%>%filter(!is.na(R4EATA))%>%filter(!is.na(R4BEDA))%>%filter(!is.na(R4TOILTA))
R4ANYDIFF<-c(rep(0,length(R4ANY$HHIDPN)))
for(i in 1:length(R4ANY$HHIDPN)){
if(R4ANY$R4WALKRA[i]== '1.Yes'||R4ANY$R4DRESSA[i]== '1.Yes'||R4ANY$R4BATHA[i]== '1.Yes'||R4ANY$R4EATA[i]== '1.Yes'||R4ANY$R4BEDA[i]=='1.Yes'||R4ANY$R4TOILTA[i]=='1.Yes') R4ANYDIFF[i]<- 1
}
summary(R4ANYDIFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1765 0.0000 1.0000
#This vector now tells us wether or not some one has some difficulty in the above tasks.
#--Instrumental Daily Livings Some difficulty--#
#summary(HRS$R4MAPA) # Using a map
#summary(HRS$R4PHONEA) # Using a phone
#summary(HRS$R4MONEYA) # managing Money
#summary(HRS$R4MEDSA) # difficulty managing medication
#summary(HRS$R4SHOPA) # shopping for grociers
#summary(HRS$R4MEALSA) # preparing a hot meal
R4IANY<- HRS %>% select(HHIDPN,R4MAPA,R4PHONEA,R4MONEYA,R4MEDSA,R4SHOPA,R4MEALSA)%>% filter(!is.na(R4MAPA))%>% filter(!is.na(R4PHONEA))%>%filter(!is.na(R4MONEYA))%>%filter(!is.na(R4MEDSA))%>%filter(!is.na(R4SHOPA))%>%filter(!is.na(R4MEALSA))
R4IANYDIFF<-c(rep(0,length(R4IANY$HHIDPN)))
for(i in 1:length(R4IANY$HHIDPN)){
if(R4IANY$R4MAPA[i]== '1.Yes'||R4IANY$R4PHONEA[i]== '1.Yes'||R4IANY$R4MONEYA[i]== '1.Yes'||R4IANY$R4MEDSA[i]== '1.Yes'||R4IANY$R4SHOPA[i]=='1.Yes'||R4IANY$R4MEALSA[i]== '1.Yes') R4IANYDIFF[i]<- 1
}
summary(R4IANYDIFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1862 0.0000 1.0000
#--New flag variable for wether some one has difficulties is their Instrumental Tasks. These are going to be highly correlated
#Note that their is other limitation variables that invloves more physical activites.
#-----------------------------#
#----Doctor Diagnosis--------#
#summary(HRS$R4HIBP) # High Blood Pressure
#summary(HRS$R4DIAB) # Reports Diabites
#summary(HRS$R4CANCR) # Cancer
#summary(HRS$R4LUNG) # Lung deseas
#summary(HRS$R4HEART) # Heart
#summary(HRS$R4STROK) # Stroke
#summary(HRS$R4PSYCH) # reports psych problem
#summary(HRS$R4ARTHR) # Reports Arthirites
#There are different levels to these in terms of their scare level...
#Med: HBP, Diabities, Arthirites, psych?!?!?!?!
#High: Cancer, Lung deseas, Heart, stroke
#MED
R4MDOCDIG<- HRS %>% select(HHIDPN,R4HIBP,R4DIAB,R4PSYCH,R4ARTHR)%>% filter(!is.na(R4HIBP))%>%filter(!is.na(R4DIAB))%>% filter(!is.na(R4PSYCH)) %>% filter(!is.na(R4ARTHR))
R4MANYDOCDIG<-c(rep(0,length(R4MDOCDIG$HHIDPN)))
for(i in 1:length(R4MDOCDIG$HHIDPN)){
if(R4MDOCDIG$R4HIBP[i]== '1.Yes'||R4MDOCDIG$R4DIAB[i]== '1.Yes'||R4MDOCDIG$R4PSYCH[i]== '1.Yes'||R4MDOCDIG$R4ARTHR[i]== '1.Yes') R4MANYDOCDIG[i]<- 1
}
summary(R4MANYDOCDIG) #about 73% of the sample has either Arth, Diabities, don't know how useful this information
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 0.753 1.000 1.000
#High
R4HDOCDIG<- HRS %>% select(HHIDPN,R4CANCR, R4LUNG, R4HEART,R4STROK)%>% filter(!is.na(R4CANCR))%>%filter(!is.na(R4LUNG))%>% filter(!is.na(R4HEART)) %>% filter(!is.na(R4STROK))
R4HANYDOCDIG<-c(rep(0,length(R4HDOCDIG$HHIDPN)))
for(i in 1:length(R4HDOCDIG$HHIDPN)){
if(R4HDOCDIG$R4CANCR[i]== '1.Yes'||R4HDOCDIG$R4LUNG[i]== '1.Yes'||R4HDOCDIG$R4HEART[i]== '1.Yes'||R4HDOCDIG$R4STROK[i]== '1.Yes') R4HANYDOCDIG[i]<- 1
}
summary(R4HANYDOCDIG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3593 1.0000 1.0000
# I belive this variable is going to be a much tigher and useful compared to the medium.
#----------------------------------------------------------------------------------------------------#
smaller Medical issues will not be useful in the regression as most of the population has a minor health issue
#-----Income------#
#only lower levels of income has a really significance
grid.arrange(
HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
geom_histogram(color= 'black',breaks= c(seq(0,100000, by=10000)))+
scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000")),
HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
geom_density(color= 'black',position = 'fill')+
scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000"), limits = c(0,100000)))
## Warning: Removed 396 rows containing non-finite values (stat_density).
#Based on the density plot I think Income plays a role...
#For now I will use 25000 as a place holder
HRS$H4ITOTB<-HRS$H4ITOT
HRS$H4ITOTB<- HRS$H4ITOTB>=25000
HRS %>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4RETSAT)) %>% ggplot(aes(x=H4ITOTB, fill=R4RETSAT))+
geom_bar()
prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)
##
## FALSE TRUE
## 1.Very 0.5056911 0.7016061
## 2.Moderately 0.3869919 0.2493660
## 3.Not at all 0.1073171 0.0490279
When I ran the regression model with Income in standalone form the model did not take kindly probably because of the skew and outliers So I broke it down as a flag variable that is over 25K or under. Another possible fix for this might be a log transformation
This next code section is a mish-mash of other variables that I thought might be interesting to play around with as predictor variable. Here I am re-coding the factors so they fit nicer into the model.
#---Non- Housing Wealth---#
#I think the Asebedo case showed that non-housing net worth is not significant at higher levels which is covered in income.
#Also wealth is going to be correlated to income
#--Poverty--#
#THis level only applies to waves 6-13...
# could build this based off of income Might do that later.....
#---EDU-----#
summary(HRS$S4EDUC)
## 1.Lt High-school 2.GED 3.High-school graduate
## 3482 651 4575
## 4.Some college 5.College and above NA's
## 2910 2734 27701
HRS$S4EDUCB<-HRS$S4EDUC
levels(HRS$S4EDUCB)<- c('LHS','HS','HS','SC','CA')
#Moving GED to HS due to lack of sample size
summary(HRS$S4EDUCB)
## LHS HS SC CA NA's
## 3482 5226 2910 2734 27701
#--Gender--#
#no Recode minimal use
#--Marital Status---#
# Single, Coupled, Widowed
summary(HRS$R4MSTAT)
## 1.Married 2.Married,spouse absent 3.Partnered
## 13820 144 551
## 4.Separated 5.Divorced 6.Separated/divorced
## 331 1711 0
## 7.Widowed 8.Never married NA's
## 4199 604 20693
HRS$R4MSTATB<- HRS$R4MSTAT
levels(HRS$R4MSTATB)<- c('Couple','Alone','Couple', 'Alone', 'Alone', 'Alone', 'Widowed', 'Alone')
prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)
##
## FALSE TRUE
## 1.Very 0.5056911 0.7016061
## 2.Moderately 0.3869919 0.2493660
## 3.Not at all 0.1073171 0.0490279
#--Religion--#
#no recode
summary(HRS$S4RELIG)
## 1.Protestant 2.Catholic 3.Jewish 4.None/no pref 5.Other
## 9124 3898 360 762 161
## NA's
## 27748
#----race------#
summary(HRS$S4RACEM)
## 1.White/Caucasian 2.Black/African American 3.Other
## 12320 1539 498
## NA's
## 27696
HRS$S4RACES<- HRS$S4RACEM
levels(HRS$S4RACES)<- c('White', 'Non-White', 'Non-White')
summary(HRS$S4RACES)
## White Non-White NA's
## 12320 2037 27696
#---Location---#
summary(HRS$R4CENREG)
## 1.Northeast 2.Midwest 3.South 4.West 5.Other NA's
## 3582 5323 8856 3606 12 20674
prop.table(table(HRS$R4CENREG,HRS$R4RETSAT),1)
##
## 1.Very 2.Moderately 3.Not at all
## 1.Northeast 0.56766642 0.32772495 0.10460863
## 2.Midwest 0.63377926 0.30992196 0.05629877
## 3.South 0.57574654 0.33831027 0.08594319
## 4.West 0.64781297 0.28506787 0.06711916
## 5.Other 0.50000000 0.25000000 0.25000000
Here I am making a flag variable that shows whether a person has a mental health issue
#MENTAL HEALTH ISSUES
#summary(HRS$R4DEPRES) Depression
#summary(HRS$R4PSYCH) DOctor diagnosed Psych issue
#summary(HRS$R4EFFORT) Everything Requires Effort
#summary(HRS$R4GOING) could not to get going
#summary(HRS$R4FSAD) felt sad
#summary(HRS$R4FLONE) felt alone
R4MENHTH<- HRS %>% select(HHIDPN,R4DEPRES,R4FSAD,R4PSYCH)%>% filter(!is.na(R4DEPRES))%>%filter(!is.na(R4FSAD))%>% filter(!is.na(R4PSYCH))
R4MENHTHI<-c(rep(0,length(R4MENHTH$HHIDPN)))
for(i in 1:length(R4MENHTH$HHIDPN)){
if(R4MENHTH$R4DEPRES[i]== '1.Yes'||R4MENHTH$R4FSAD[i]== '1.Yes'||R4MENHTH$R4PSYCH[i]== '1.Yes') R4MENHTHI[i]<- 1
}
summary(R4MENHTHI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3281 1.0000 1.0000
I ended up going with depression, sadness, and psych. Should look into adding the other related variables.
#-----Age-------#
ggplot(data = HRS, aes(x=R4AGEY_B))+
geom_histogram(binwidth = 10,color= 'black', fill= '#2cfaa8', size= 1)+
labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
scale_y_continuous(limits=c(0,8000),breaks = seq(0,8000,1000))+
scale_x_continuous(limits=c(35,105),breaks = seq(0,105,10))+
theme_minimal()
## Warning: Removed 20697 rows containing non-finite values (stat_bin).
summary(HRS$R4AGEY_B)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 25.00 58.00 65.00 65.94 74.00 105.00 20669
HRS %>% filter(R4AGEY_B>=50)%>%count()
## # A tibble: 1 x 1
## n
## <int>
## 1 20567
age.labs <- c(paste(seq(50, 70, by = 10), seq(50 + 10 - 1, 80 - 1, by = 10),
sep = "-"), paste(80, "+", sep = ""))
HRS$R4AGEY_BB <- cut(HRS$R4AGEY_B, breaks = c(seq(50, 80, by = 10), Inf), labels = age.labs, right = FALSE)
ggplot(data = HRS, aes(x=R4AGEY_BB))+
geom_bar(fill= '#2cfaa8', color= 'black')+
scale_x_discrete(limits =c('50-59','60-69','70-79','80+'))+
labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
theme_minimal()
## Warning: Removed 21486 rows containing non-finite values (stat_count).
#note the age bins of 90-99,100+ where extremly small and would lead to insignificant evidence to support a hypothesis
#-----------------------------------------#
#Collapsing Selt Retirmeent Sat
summary(HRS$R4SHLT)
## 1.Excellent 2.Very good 3.Good 4.Fair 5.Poor NA's
## 2633 5467 6541 4400 2337 20675
HRS$R4SHLTB<- HRS$R4SHLT
levels(HRS$R4SHLTB)<- c('Excel/VG','Excel/VG','Good','Fair/Poor','Fair/Poor')
Here am binning Age and collapsing self reported health. I found that little information or model accuracy is lost by doing this.
The Next code adds my flag variables back into the HRS data-set by a participants unique PIN
R4ANY<-cbind(R4ANY,R4ANYDIFF)
R4IANY<- cbind(R4IANY, R4IANYDIFF)
R4MDOCDIG<- cbind(R4MDOCDIG,R4MANYDOCDIG)
R4HDOCDIG<- cbind(R4HDOCDIG,R4HANYDOCDIG)
R4MENHTH<- cbind(R4MENHTH,R4MENHTHI)
R4ANY<- R4ANY %>% select(HHIDPN,R4ANYDIFF)
R4IANY <- R4IANY %>% select(HHIDPN, R4IANYDIFF)
R4MDOCDIG <- R4MDOCDIG %>% select(HHIDPN, R4MANYDOCDIG)
R4HDOCDIG <- R4HDOCDIG %>% select(HHIDPN, R4HANYDOCDIG)
R4MENHTH<- R4MENHTH %>% select(HHIDPN, R4MENHTHI)
HRS<- left_join(HRS,R4ANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4IANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4MDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS,R4HDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS, R4MENHTH, by = 'HHIDPN')
summary(HRS$R4ANYDIFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.176 0.000 1.000 20753
summary(HRS$R4IANYDIFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.186 0.000 1.000 26113
summary(HRS$R4MANYDOCDIG)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 1.000 0.753 1.000 1.000 20717
summary(HRS$R4HANYDOCDIG)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.359 1.000 1.000 20698
summary(HRS$R4MENHTHI)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.328 1.000 1.000 22750
Now that my variables of interest are prepped it is time to look at running the model.
#In wave 4 there are about 7239 observations that answer the Retirment satisfaction question
#I considered the following variables and
#summary(HRS$R4RETSAT) #Independent, what we be looking at
#summary(HRS$R4SHLT) # Significant
#summary(HRS$S4EDUCB) # sig, but lose about 3000 observations when applying it. Also is this the spouces education? Why did they not measure the Respondants Education level?
#summary(HRS$S4GENDER) # Not significant
#summary(HRS$S4RACES) # Comes out as significant but messes with the model
#summary(HRS$R4ANYDIFF) #sig
#summary(HRS$R4IANYDIFF) #sig, I excepct high multicolineararity with ANYDIFF so will be best to use just one, lose about 1000 samples vs using the ANY as well
#summary(HRS$R4MANYDOCDIG) #not significant not suprised as almost all the sample has a minor health diagnosis
#summary(HRS$R4HANYDOCDIG) # barley significant on the 5% level not significant in training model
#summary(HRS$H4ITOTB) #Could be interesting to try and examine the above $75K doesn't buy happines and see how it related to retirment satisfaction
#summary(HRS$S4RELIG) #Jewish People are less satisfied but not significant
#summary(HRS$R4CENREG) #Could be interesting to compare similar sub groups and measure their satisfaction based on census region.
#summary(HRS$R4AGEY_BB) #signficant, the older someone gets the higher satisfaction they become
#Model
training<- HRS %>% filter(!is.na(R4RETSAT))%>% filter(!is.na(R4SHLTB))%>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4ANYDIFF))%>% filter(!is.na(R4MENHTHI))%>% filter(!is.na(S4EDUCB))%>%
filter(!is.na(R4HANYDOCDIG))
samp<- sample(4037,405)
train<-training[-samp,]
test<-training[samp,]
odr<-clm(R4RETSAT~ R4SHLTB+H4ITOTB+R4ANYDIFF+S4EDUCB+R4AGEY_BB+R4MENHTHI+R4HANYDOCDIG, data = train)
summary(odr)
## formula:
## R4RETSAT ~ R4SHLTB + H4ITOTB + R4ANYDIFF + S4EDUCB + R4AGEY_BB + R4MENHTHI + R4HANYDOCDIG
## data: train
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 3613 -2553.01 5134.02 6(0) 1.60e-13 1.4e+02
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## R4SHLTBGood 0.57415 0.09822 5.845 5.05e-09 ***
## R4SHLTBFair/Poor 1.14626 0.10398 11.024 < 2e-16 ***
## H4ITOTBTRUE -0.51525 0.08131 -6.337 2.34e-10 ***
## R4ANYDIFF 0.65513 0.10151 6.454 1.09e-10 ***
## S4EDUCBHS -0.23530 0.09373 -2.510 0.01206 *
## S4EDUCBSC -0.30247 0.11551 -2.618 0.00883 **
## S4EDUCBCA -0.60723 0.12857 -4.723 2.33e-06 ***
## R4AGEY_BB60-69 -0.70813 0.12348 -5.735 9.77e-09 ***
## R4AGEY_BB70-79 -0.96315 0.12557 -7.670 1.72e-14 ***
## R4AGEY_BB80+ -1.25174 0.15802 -7.922 2.35e-15 ***
## R4MENHTHI 0.82934 0.08227 10.081 < 2e-16 ***
## R4HANYDOCDIG 0.18055 0.07856 2.298 0.02156 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 1.Very|2.Moderately 0.2653 0.1526 1.739
## 2.Moderately|3.Not at all 2.7771 0.1630 17.033
## (19 observations deleted due to missingness)
#Testing for Multicollinearity with VIF>10 defined as collinear
# Still confused on this part on how to check with categorical variables
# se
#train
run2<- predict(odr,train, type = 'class')
run2<- as.data.frame(run2)
table(train$R4RETSAT)
##
## 1.Very 2.Moderately 3.Not at all
## 2354 1059 219
table(run2$fit)
##
## 1.Very 2.Moderately 3.Not at all
## 2920 685 8
table(train$R4RETSAT,run2$fit)
##
## 1.Very 2.Moderately 3.Not at all
## 1.Very 2125 223 1
## 2.Moderately 725 324 3
## 3.Not at all 70 138 4
(2110+329+2)/3632 #67~68% accuracy *note that the numbers will change here due to the using different samples
## [1] 0.6720815
#test
run1<- predict(odr,test, type = 'class')
run1<- as.data.frame(run1)
table(test$R4RETSAT)
##
## 1.Very 2.Moderately 3.Not at all
## 271 114 20
table(run1$fit)
##
## 1.Very 2.Moderately 3.Not at all
## 316 86 2
table(test$R4RETSAT,run1$fit)
##
## 1.Very 2.Moderately 3.Not at all
## 1.Very 231 39 0
## 2.Moderately 77 35 2
## 3.Not at all 8 12 0
(250+28+1)/405 #68% accuracy
## [1] 0.6888889
#Note for limitations some variables that might have a significant affect but not used due to lack of sample size avaliable to chose from.
#Poor Performance on Moderatly and Not at All, It seems that the models is over chosing
Here are a few of my initial thoughts.
I also thought of a couple of hypothesis to look at.
One hypothesis that I thought about is that mental health plays a larger role in determining retirement satisfaction compared to physical health
I also saw an interesting paper that looks at how income stops affecting emotional well-being at 75K per year. It might be interesting to try and look at if this same principle also applies to retirement satisfaction. link: https://www.princeton.edu/~deaton/downloads/deaton_kahneman_high_income_improves_evaluation_August2010.pdf
Currently I plan on looking at other variables that might be beneficial to add to the model and how to implement a mixed model.