# Wave 4 data processing for Long -----------------------------------------

#Looking at all levels for Retirment Satisfaction 

RF4<-table(HRS$R4RETSAT)
RF5<-table(HRS$R5RETSAT)
RF6<-table(HRS$R6RETSAT)
RF7<-table(HRS$R7RETSAT)
RF8<-table(HRS$R8RETSAT)
RF9<-table(HRS$R9RETSAT)
RF10<-table(HRS$R10RETSAT)
RF11<-table(HRS$R11RETSAT)
RF12<-table(HRS$R12RETSAT)
RF13<-table(HRS$R13RETSAT)

RF<-rbind(RF4,RF5,RF6, RF7, RF8, RF9, RF10, RF11, RF12, RF13)
RF<- as.data.frame(RF) 
RF$year<- c(seq(1998,2016, by=2)) 
colnames(RF)<- c('Very','Moderately', 'NotAtAll', 'Year')

ggplot(data= RF, aes(x= Year))+
  geom_line(aes(y=Very,color= 'Very'))+
  geom_line(aes(y=Moderately,color='Moderately'))+
  geom_line(aes(y=NotAtAll, color= 'not-at-all'))+
  geom_point(aes(y=Very), color= '#619CFF')+
  geom_point(aes(y=Moderately),color= '#F8766D')+
  geom_point(aes(y=NotAtAll), color= '#00BA38')+
  labs(title= 'Count of Retirment Satisfaction Over Time', x= 'Year', y= 'Number of People')+
  scale_x_discrete(limits= c(1998,2000,2002,2004,2006,2008,2010,2012,2014,2016))+
  theme_minimal()

This Graph shows the distribution of Very satisfied, moderately satisfied, and Not at all the wave times

This next section of code looks into creating flag variables as to whether a person has some difficulty in ADS, Minor physical health, and Major Health issues

#Based on the data it looks like the numbers remain constant over time with jumps in numbers 
#between years 02-04 and 08-10 
#I Immagin the ressesion has an affect as to the rise in not at all and moderatly compared to very

#---Rencoding difficulties----#
#--All these entries have 'Some Difficulties when attempting the following task'

#----Activites of Daily living----#
#summary(HRS$R4WALKRA)  # walking
#summary(HRS$R4DRESSA)  # dressing
#summary(HRS$R4BATHA)   # Bath
#summary(HRS$R4EATA)    # Eat
#summary(HRS$R4BEDA)    # Bed
#summary(HRS$R4TOILTA)  # Using the bath Room 

R4ANY<- HRS %>% select(HHIDPN,R4WALKRA,R4DRESSA,R4BATHA,R4EATA,R4BEDA,R4TOILTA)%>% filter(!is.na(R4WALKRA))%>%filter(!is.na(R4DRESSA))%>%filter(!is.na(R4BATHA))%>%filter(!is.na(R4EATA))%>%filter(!is.na(R4BEDA))%>%filter(!is.na(R4TOILTA))

R4ANYDIFF<-c(rep(0,length(R4ANY$HHIDPN)))

for(i in 1:length(R4ANY$HHIDPN)){
  if(R4ANY$R4WALKRA[i]== '1.Yes'||R4ANY$R4DRESSA[i]== '1.Yes'||R4ANY$R4BATHA[i]== '1.Yes'||R4ANY$R4EATA[i]== '1.Yes'||R4ANY$R4BEDA[i]=='1.Yes'||R4ANY$R4TOILTA[i]=='1.Yes') R4ANYDIFF[i]<- 1
}

summary(R4ANYDIFF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1765  0.0000  1.0000
#This vector now tells us wether or not some one has some difficulty in the above tasks. 

#--Instrumental Daily Livings Some difficulty--#
#summary(HRS$R4MAPA)    # Using a map
#summary(HRS$R4PHONEA)  # Using a phone
#summary(HRS$R4MONEYA)  # managing Money
#summary(HRS$R4MEDSA)   # difficulty managing medication
#summary(HRS$R4SHOPA)  # shopping for grociers
#summary(HRS$R4MEALSA)  # preparing a hot meal 

R4IANY<- HRS %>% select(HHIDPN,R4MAPA,R4PHONEA,R4MONEYA,R4MEDSA,R4SHOPA,R4MEALSA)%>% filter(!is.na(R4MAPA))%>% filter(!is.na(R4PHONEA))%>%filter(!is.na(R4MONEYA))%>%filter(!is.na(R4MEDSA))%>%filter(!is.na(R4SHOPA))%>%filter(!is.na(R4MEALSA))

R4IANYDIFF<-c(rep(0,length(R4IANY$HHIDPN)))

for(i in 1:length(R4IANY$HHIDPN)){
  if(R4IANY$R4MAPA[i]== '1.Yes'||R4IANY$R4PHONEA[i]== '1.Yes'||R4IANY$R4MONEYA[i]== '1.Yes'||R4IANY$R4MEDSA[i]== '1.Yes'||R4IANY$R4SHOPA[i]=='1.Yes'||R4IANY$R4MEALSA[i]== '1.Yes') R4IANYDIFF[i]<- 1
}

summary(R4IANYDIFF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1862  0.0000  1.0000
#--New flag variable for wether some one has difficulties is their Instrumental Tasks. These are going to be highly correlated 
#Note that their is other limitation variables that invloves more physical activites.
#-----------------------------#

#----Doctor Diagnosis--------# 
#summary(HRS$R4HIBP)     # High Blood Pressure
#summary(HRS$R4DIAB)     # Reports Diabites
#summary(HRS$R4CANCR)    # Cancer
#summary(HRS$R4LUNG)     # Lung deseas
#summary(HRS$R4HEART)    # Heart
#summary(HRS$R4STROK)    # Stroke
#summary(HRS$R4PSYCH)    # reports psych problem
#summary(HRS$R4ARTHR)    # Reports Arthirites

#There are different levels to these in terms of their scare level...
#Med: HBP, Diabities, Arthirites, psych?!?!?!?!
#High: Cancer, Lung deseas, Heart, stroke

#MED
R4MDOCDIG<- HRS %>% select(HHIDPN,R4HIBP,R4DIAB,R4PSYCH,R4ARTHR)%>% filter(!is.na(R4HIBP))%>%filter(!is.na(R4DIAB))%>% filter(!is.na(R4PSYCH)) %>% filter(!is.na(R4ARTHR))

R4MANYDOCDIG<-c(rep(0,length(R4MDOCDIG$HHIDPN)))

for(i in 1:length(R4MDOCDIG$HHIDPN)){
  if(R4MDOCDIG$R4HIBP[i]== '1.Yes'||R4MDOCDIG$R4DIAB[i]== '1.Yes'||R4MDOCDIG$R4PSYCH[i]== '1.Yes'||R4MDOCDIG$R4ARTHR[i]== '1.Yes') R4MANYDOCDIG[i]<- 1
}
summary(R4MANYDOCDIG) #about 73% of the sample has either Arth, Diabities, don't know how useful this information
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   0.753   1.000   1.000
#High
R4HDOCDIG<- HRS %>% select(HHIDPN,R4CANCR, R4LUNG, R4HEART,R4STROK)%>% filter(!is.na(R4CANCR))%>%filter(!is.na(R4LUNG))%>% filter(!is.na(R4HEART)) %>% filter(!is.na(R4STROK))

R4HANYDOCDIG<-c(rep(0,length(R4HDOCDIG$HHIDPN)))

for(i in 1:length(R4HDOCDIG$HHIDPN)){
  if(R4HDOCDIG$R4CANCR[i]== '1.Yes'||R4HDOCDIG$R4LUNG[i]== '1.Yes'||R4HDOCDIG$R4HEART[i]== '1.Yes'||R4HDOCDIG$R4STROK[i]== '1.Yes') R4HANYDOCDIG[i]<- 1
}

summary(R4HANYDOCDIG) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3593  1.0000  1.0000
# I belive this variable is going to be a much tigher and useful compared to the medium. 

#----------------------------------------------------------------------------------------------------#

smaller Medical issues will not be useful in the regression as most of the population has a minor health issue

#-----Income------# 
#only lower levels of income has a really significance

grid.arrange(
  HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
    geom_histogram(color= 'black',breaks= c(seq(0,100000, by=10000)))+
    scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000")),
  
  HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
    geom_density(color= 'black',position = 'fill')+
    scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000"), limits = c(0,100000)))
## Warning: Removed 396 rows containing non-finite values (stat_density).

#Based on the density plot I think Income plays a role...
#For now I will use 25000 as a place holder 

HRS$H4ITOTB<-HRS$H4ITOT
HRS$H4ITOTB<- HRS$H4ITOTB>=25000

HRS %>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4RETSAT)) %>% ggplot(aes(x=H4ITOTB, fill=R4RETSAT))+
  geom_bar()

prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)
##               
##                    FALSE      TRUE
##   1.Very       0.5056911 0.7016061
##   2.Moderately 0.3869919 0.2493660
##   3.Not at all 0.1073171 0.0490279

When I ran the regression model with Income in standalone form the model did not take kindly probably because of the skew and outliers So I broke it down as a flag variable that is over 25K or under. Another possible fix for this might be a log transformation

This next code section is a mish-mash of other variables that I thought might be interesting to play around with as predictor variable. Here I am re-coding the factors so they fit nicer into the model.

#---Non- Housing Wealth---#

#I think the Asebedo case showed that non-housing net worth is not significant at higher levels which is covered in income.  
#Also wealth is going to be correlated to income

#--Poverty--#
#THis level only applies to waves 6-13...
# could build this based off of income Might do that later.....

#---EDU-----#
summary(HRS$S4EDUC)
##       1.Lt High-school                  2.GED 3.High-school graduate 
##                   3482                    651                   4575 
##         4.Some college    5.College and above                   NA's 
##                   2910                   2734                  27701
HRS$S4EDUCB<-HRS$S4EDUC 
levels(HRS$S4EDUCB)<- c('LHS','HS','HS','SC','CA')
#Moving GED to HS due to lack of sample size
summary(HRS$S4EDUCB)
##   LHS    HS    SC    CA  NA's 
##  3482  5226  2910  2734 27701
#--Gender--#
#no Recode minimal use

#--Marital Status---#
   # Single, Coupled, Widowed 
summary(HRS$R4MSTAT)
##               1.Married 2.Married,spouse absent             3.Partnered 
##                   13820                     144                     551 
##             4.Separated              5.Divorced    6.Separated/divorced 
##                     331                    1711                       0 
##               7.Widowed         8.Never married                    NA's 
##                    4199                     604                   20693
HRS$R4MSTATB<- HRS$R4MSTAT
levels(HRS$R4MSTATB)<- c('Couple','Alone','Couple', 'Alone', 'Alone', 'Alone', 'Widowed', 'Alone')
prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)
##               
##                    FALSE      TRUE
##   1.Very       0.5056911 0.7016061
##   2.Moderately 0.3869919 0.2493660
##   3.Not at all 0.1073171 0.0490279
#--Religion--#
#no recode 
summary(HRS$S4RELIG)
##   1.Protestant     2.Catholic       3.Jewish 4.None/no pref        5.Other 
##           9124           3898            360            762            161 
##           NA's 
##          27748
#----race------#
summary(HRS$S4RACEM)
##        1.White/Caucasian 2.Black/African American                  3.Other 
##                    12320                     1539                      498 
##                     NA's 
##                    27696
HRS$S4RACES<- HRS$S4RACEM
levels(HRS$S4RACES)<- c('White', 'Non-White', 'Non-White')
summary(HRS$S4RACES)
##     White Non-White      NA's 
##     12320      2037     27696
#---Location---#

summary(HRS$R4CENREG)
## 1.Northeast   2.Midwest     3.South      4.West     5.Other        NA's 
##        3582        5323        8856        3606          12       20674
prop.table(table(HRS$R4CENREG,HRS$R4RETSAT),1)
##              
##                   1.Very 2.Moderately 3.Not at all
##   1.Northeast 0.56766642   0.32772495   0.10460863
##   2.Midwest   0.63377926   0.30992196   0.05629877
##   3.South     0.57574654   0.33831027   0.08594319
##   4.West      0.64781297   0.28506787   0.06711916
##   5.Other     0.50000000   0.25000000   0.25000000

Here I am making a flag variable that shows whether a person has a mental health issue

#MENTAL HEALTH ISSUES 
#summary(HRS$R4DEPRES)  Depression
#summary(HRS$R4PSYCH)   DOctor diagnosed Psych issue
#summary(HRS$R4EFFORT)  Everything Requires Effort
#summary(HRS$R4GOING)   could not to get going 
#summary(HRS$R4FSAD)    felt sad
#summary(HRS$R4FLONE)   felt alone

R4MENHTH<- HRS %>% select(HHIDPN,R4DEPRES,R4FSAD,R4PSYCH)%>% filter(!is.na(R4DEPRES))%>%filter(!is.na(R4FSAD))%>% filter(!is.na(R4PSYCH))

R4MENHTHI<-c(rep(0,length(R4MENHTH$HHIDPN)))

for(i in 1:length(R4MENHTH$HHIDPN)){
  if(R4MENHTH$R4DEPRES[i]== '1.Yes'||R4MENHTH$R4FSAD[i]== '1.Yes'||R4MENHTH$R4PSYCH[i]== '1.Yes') R4MENHTHI[i]<- 1
}
summary(R4MENHTHI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3281  1.0000  1.0000

I ended up going with depression, sadness, and psych. Should look into adding the other related variables.

#-----Age-------#
ggplot(data = HRS, aes(x=R4AGEY_B))+
  geom_histogram(binwidth = 10,color= 'black', fill= '#2cfaa8', size= 1)+
  labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
  scale_y_continuous(limits=c(0,8000),breaks = seq(0,8000,1000))+
  scale_x_continuous(limits=c(35,105),breaks = seq(0,105,10))+
  theme_minimal()
## Warning: Removed 20697 rows containing non-finite values (stat_bin).

summary(HRS$R4AGEY_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   25.00   58.00   65.00   65.94   74.00  105.00   20669
HRS %>% filter(R4AGEY_B>=50)%>%count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1 20567
age.labs <- c(paste(seq(50, 70, by = 10), seq(50 + 10 - 1, 80 - 1, by = 10),
                    sep = "-"), paste(80, "+", sep = ""))
HRS$R4AGEY_BB <- cut(HRS$R4AGEY_B, breaks = c(seq(50, 80, by = 10), Inf), labels = age.labs, right = FALSE)

ggplot(data = HRS, aes(x=R4AGEY_BB))+
  geom_bar(fill= '#2cfaa8', color= 'black')+
  scale_x_discrete(limits =c('50-59','60-69','70-79','80+'))+
  labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
  theme_minimal()
## Warning: Removed 21486 rows containing non-finite values (stat_count).

#note the age bins of 90-99,100+ where extremly small and would lead to insignificant evidence to support a hypothesis
#-----------------------------------------#
#Collapsing Selt Retirmeent Sat
summary(HRS$R4SHLT)
## 1.Excellent 2.Very good      3.Good      4.Fair      5.Poor        NA's 
##        2633        5467        6541        4400        2337       20675
HRS$R4SHLTB<- HRS$R4SHLT
levels(HRS$R4SHLTB)<- c('Excel/VG','Excel/VG','Good','Fair/Poor','Fair/Poor')

Here am binning Age and collapsing self reported health. I found that little information or model accuracy is lost by doing this.

The Next code adds my flag variables back into the HRS data-set by a participants unique PIN

R4ANY<-cbind(R4ANY,R4ANYDIFF)
R4IANY<- cbind(R4IANY, R4IANYDIFF)
R4MDOCDIG<- cbind(R4MDOCDIG,R4MANYDOCDIG)
R4HDOCDIG<- cbind(R4HDOCDIG,R4HANYDOCDIG)
R4MENHTH<- cbind(R4MENHTH,R4MENHTHI)


R4ANY<- R4ANY %>% select(HHIDPN,R4ANYDIFF)
R4IANY <- R4IANY %>% select(HHIDPN, R4IANYDIFF)
R4MDOCDIG <- R4MDOCDIG %>% select(HHIDPN, R4MANYDOCDIG)
R4HDOCDIG <- R4HDOCDIG %>% select(HHIDPN, R4HANYDOCDIG)
R4MENHTH<- R4MENHTH %>% select(HHIDPN, R4MENHTHI)

HRS<- left_join(HRS,R4ANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4IANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4MDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS,R4HDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS, R4MENHTH, by = 'HHIDPN')

summary(HRS$R4ANYDIFF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.176   0.000   1.000   20753
summary(HRS$R4IANYDIFF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.186   0.000   1.000   26113
summary(HRS$R4MANYDOCDIG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   1.000   0.753   1.000   1.000   20717
summary(HRS$R4HANYDOCDIG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.359   1.000   1.000   20698
summary(HRS$R4MENHTHI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.328   1.000   1.000   22750

Now that my variables of interest are prepped it is time to look at running the model.

#In wave 4 there are about 7239 observations that answer the Retirment satisfaction question
#I considered the following variables and 
#summary(HRS$R4RETSAT) #Independent, what we be looking at
#summary(HRS$R4SHLT)   # Significant
#summary(HRS$S4EDUCB)  # sig, but lose about 3000 observations when applying it. Also is this the spouces education? Why did they not measure the Respondants Education level?
#summary(HRS$S4GENDER) # Not significant
#summary(HRS$S4RACES)  # Comes out as significant but messes with the model  
#summary(HRS$R4ANYDIFF)   #sig
#summary(HRS$R4IANYDIFF)  #sig, I excepct high multicolineararity with ANYDIFF so will be best to use just one, lose about 1000 samples vs using the ANY as well 
#summary(HRS$R4MANYDOCDIG)  #not significant not suprised as almost all the sample has a minor health diagnosis 
#summary(HRS$R4HANYDOCDIG)  # barley significant on the 5% level not significant in training model
#summary(HRS$H4ITOTB)       #Could be interesting to try and examine the above $75K doesn't buy happines and see how it related to retirment satisfaction 
#summary(HRS$S4RELIG)       #Jewish People are less satisfied but not significant
#summary(HRS$R4CENREG)      #Could be interesting to compare similar sub groups and measure their satisfaction based on census region. 
#summary(HRS$R4AGEY_BB)      #signficant, the older someone gets the higher satisfaction they become

#Model
training<- HRS %>% filter(!is.na(R4RETSAT))%>% filter(!is.na(R4SHLTB))%>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4ANYDIFF))%>% filter(!is.na(R4MENHTHI))%>% filter(!is.na(S4EDUCB))%>% 
  filter(!is.na(R4HANYDOCDIG))

samp<- sample(4037,405)

train<-training[-samp,]
test<-training[samp,]

odr<-clm(R4RETSAT~ R4SHLTB+H4ITOTB+R4ANYDIFF+S4EDUCB+R4AGEY_BB+R4MENHTHI+R4HANYDOCDIG, data = train)

summary(odr)
## formula: 
## R4RETSAT ~ R4SHLTB + H4ITOTB + R4ANYDIFF + S4EDUCB + R4AGEY_BB + R4MENHTHI + R4HANYDOCDIG
## data:    train
## 
##  link  threshold nobs logLik   AIC     niter max.grad cond.H 
##  logit flexible  3613 -2553.01 5134.02 6(0)  1.60e-13 1.4e+02
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## R4SHLTBGood       0.57415    0.09822   5.845 5.05e-09 ***
## R4SHLTBFair/Poor  1.14626    0.10398  11.024  < 2e-16 ***
## H4ITOTBTRUE      -0.51525    0.08131  -6.337 2.34e-10 ***
## R4ANYDIFF         0.65513    0.10151   6.454 1.09e-10 ***
## S4EDUCBHS        -0.23530    0.09373  -2.510  0.01206 *  
## S4EDUCBSC        -0.30247    0.11551  -2.618  0.00883 ** 
## S4EDUCBCA        -0.60723    0.12857  -4.723 2.33e-06 ***
## R4AGEY_BB60-69   -0.70813    0.12348  -5.735 9.77e-09 ***
## R4AGEY_BB70-79   -0.96315    0.12557  -7.670 1.72e-14 ***
## R4AGEY_BB80+     -1.25174    0.15802  -7.922 2.35e-15 ***
## R4MENHTHI         0.82934    0.08227  10.081  < 2e-16 ***
## R4HANYDOCDIG      0.18055    0.07856   2.298  0.02156 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##                           Estimate Std. Error z value
## 1.Very|2.Moderately         0.2653     0.1526   1.739
## 2.Moderately|3.Not at all   2.7771     0.1630  17.033
## (19 observations deleted due to missingness)
#Testing for Multicollinearity with VIF>10 defined as collinear
# Still confused on this part on how to check with categorical variables
# se

#train
run2<- predict(odr,train, type = 'class')
run2<- as.data.frame(run2)
table(train$R4RETSAT)
## 
##       1.Very 2.Moderately 3.Not at all 
##         2354         1059          219
table(run2$fit)
## 
##       1.Very 2.Moderately 3.Not at all 
##         2920          685            8
table(train$R4RETSAT,run2$fit)
##               
##                1.Very 2.Moderately 3.Not at all
##   1.Very         2125          223            1
##   2.Moderately    725          324            3
##   3.Not at all     70          138            4
(2110+329+2)/3632 #67~68% accuracy *note that the numbers will change here due to the using different samples
## [1] 0.6720815
#test
run1<- predict(odr,test, type = 'class')
run1<- as.data.frame(run1)
table(test$R4RETSAT)
## 
##       1.Very 2.Moderately 3.Not at all 
##          271          114           20
table(run1$fit)
## 
##       1.Very 2.Moderately 3.Not at all 
##          316           86            2
table(test$R4RETSAT,run1$fit)
##               
##                1.Very 2.Moderately 3.Not at all
##   1.Very          231           39            0
##   2.Moderately     77           35            2
##   3.Not at all      8           12            0
(250+28+1)/405 #68% accuracy
## [1] 0.6888889
#Note for limitations some variables that might have a significant affect but not used due to lack of sample size avaliable to chose from. 
#Poor Performance on Moderatly and Not at All, It seems that the models is over chosing 

Here are a few of my initial thoughts.

I also thought of a couple of hypothesis to look at.

One hypothesis that I thought about is that mental health plays a larger role in determining retirement satisfaction compared to physical health

I also saw an interesting paper that looks at how income stops affecting emotional well-being at 75K per year. It might be interesting to try and look at if this same principle also applies to retirement satisfaction. link: https://www.princeton.edu/~deaton/downloads/deaton_kahneman_high_income_improves_evaluation_August2010.pdf

Currently I plan on looking at other variables that might be beneficial to add to the model and how to implement a mixed model.