HRSORDIN

# Wave 4 data processing for Long -----------------------------------------

#Looking at all levels for Retirment Satisfaction 

RF4<-table(HRS$R4RETSAT)
RF5<-table(HRS$R5RETSAT)
RF6<-table(HRS$R6RETSAT)
RF7<-table(HRS$R7RETSAT)
RF8<-table(HRS$R8RETSAT)
RF9<-table(HRS$R9RETSAT)
RF10<-table(HRS$R10RETSAT)
RF11<-table(HRS$R11RETSAT)
RF12<-table(HRS$R12RETSAT)
RF13<-table(HRS$R13RETSAT)

RF<-rbind(RF4,RF5,RF6, RF7, RF8, RF9, RF10, RF11, RF12, RF13)
RF<- as.data.frame(RF) 
RF$year<- c(seq(1998,2016, by=2)) 
colnames(RF)<- c('Very','Moderately', 'NotAtAll', 'Year')

ggplot(data= RF, aes(x= Year))+
  geom_line(aes(y=Very,color= 'Very'))+
  geom_line(aes(y=Moderately,color='Moderately'))+
  geom_line(aes(y=NotAtAll, color= 'not-at-all'))+
  geom_point(aes(y=Very), color= '#619CFF')+
  geom_point(aes(y=Moderately),color= '#F8766D')+
  geom_point(aes(y=NotAtAll), color= '#00BA38')+
  labs(title= 'Count of Retirment Satisfaction Over Time', x= 'Year', y= 'Number of People')+
  scale_x_discrete(limits= c(1998,2000,2002,2004,2006,2008,2010,2012,2014,2016))+
  theme_minimal()

This Graph shows the distribution of Very satisfied, moderately satisfied, and Not at all the wave times

This next section of code looks into creating flag variables as to whether a person has some difficulty in ADS, Minor physical health, and Major Health issues

#Based on the data it looks like the numbers remain constant over time with jumps in numbers 
#between years 02-04 and 08-10 
#I Immagin the ressesion has an affect as to the rise in not at all and moderatly compared to very

#---Rencoding difficulties----#
#--All these entries have 'Some Difficulties when attempting the following task'

#----Activites of Daily living----#
#summary(HRS$R4WALKRA)  # walking
#summary(HRS$R4DRESSA)  # dressing
#summary(HRS$R4BATHA)   # Bath
#summary(HRS$R4EATA)    # Eat
#summary(HRS$R4BEDA)    # Bed
#summary(HRS$R4TOILTA)  # Using the bath Room 

R4ANY<- HRS %>% select(HHIDPN,R4WALKRA,R4DRESSA,R4BATHA,R4EATA,R4BEDA,R4TOILTA)%>% filter(!is.na(R4WALKRA))%>%filter(!is.na(R4DRESSA))%>%filter(!is.na(R4BATHA))%>%filter(!is.na(R4EATA))%>%filter(!is.na(R4BEDA))%>%filter(!is.na(R4TOILTA))

R4ANYDIFF<-c(rep(0,length(R4ANY$HHIDPN)))

for(i in 1:length(R4ANY$HHIDPN)){
  if(R4ANY$R4WALKRA[i]== '1.Yes'||R4ANY$R4DRESSA[i]== '1.Yes'||R4ANY$R4BATHA[i]== '1.Yes'||R4ANY$R4EATA[i]== '1.Yes'||R4ANY$R4BEDA[i]=='1.Yes'||R4ANY$R4TOILTA[i]=='1.Yes') R4ANYDIFF[i]<- 1
}

summary(R4ANYDIFF)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1765  0.0000  1.0000

#This vector now tells us wether or not some one has some difficulty in the above tasks. 

#--Instrumental Daily Livings Some difficulty--#
#summary(HRS$R4MAPA)    # Using a map
#summary(HRS$R4PHONEA)  # Using a phone
#summary(HRS$R4MONEYA)  # managing Money
#summary(HRS$R4MEDSA)   # difficulty managing medication
#summary(HRS$R4SHOPA)  # shopping for grociers
#summary(HRS$R4MEALSA)  # preparing a hot meal 

R4IANY<- HRS %>% select(HHIDPN,R4MAPA,R4PHONEA,R4MONEYA,R4MEDSA,R4SHOPA,R4MEALSA)%>% filter(!is.na(R4MAPA))%>% filter(!is.na(R4PHONEA))%>%filter(!is.na(R4MONEYA))%>%filter(!is.na(R4MEDSA))%>%filter(!is.na(R4SHOPA))%>%filter(!is.na(R4MEALSA))

R4IANYDIFF<-c(rep(0,length(R4IANY$HHIDPN)))

for(i in 1:length(R4IANY$HHIDPN)){
  if(R4IANY$R4MAPA[i]== '1.Yes'||R4IANY$R4PHONEA[i]== '1.Yes'||R4IANY$R4MONEYA[i]== '1.Yes'||R4IANY$R4MEDSA[i]== '1.Yes'||R4IANY$R4SHOPA[i]=='1.Yes'||R4IANY$R4MEALSA[i]== '1.Yes') R4IANYDIFF[i]<- 1
}

summary(R4IANYDIFF)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1862  0.0000  1.0000

#--New flag variable for wether some one has difficulties is their Instrumental Tasks. These are going to be highly correlated 
#Note that their is other limitation variables that invloves more physical activites.
#-----------------------------#

#----Doctor Diagnosis--------# 
#summary(HRS$R4HIBP)     # High Blood Pressure
#summary(HRS$R4DIAB)     # Reports Diabites
#summary(HRS$R4CANCR)    # Cancer
#summary(HRS$R4LUNG)     # Lung deseas
#summary(HRS$R4HEART)    # Heart
#summary(HRS$R4STROK)    # Stroke
#summary(HRS$R4PSYCH)    # reports psych problem
#summary(HRS$R4ARTHR)    # Reports Arthirites

#There are different levels to these in terms of their scare level...
#Med: HBP, Diabities, Arthirites, psych?!?!?!?!
#High: Cancer, Lung deseas, Heart, stroke

#MED
R4MDOCDIG<- HRS %>% select(HHIDPN,R4HIBP,R4DIAB,R4PSYCH,R4ARTHR)%>% filter(!is.na(R4HIBP))%>%filter(!is.na(R4DIAB))%>% filter(!is.na(R4PSYCH)) %>% filter(!is.na(R4ARTHR))

R4MANYDOCDIG<-c(rep(0,length(R4MDOCDIG$HHIDPN)))

for(i in 1:length(R4MDOCDIG$HHIDPN)){
  if(R4MDOCDIG$R4HIBP[i]== '1.Yes'||R4MDOCDIG$R4DIAB[i]== '1.Yes'||R4MDOCDIG$R4PSYCH[i]== '1.Yes'||R4MDOCDIG$R4ARTHR[i]== '1.Yes') R4MANYDOCDIG[i]<- 1
}
summary(R4MANYDOCDIG) #about 73% of the sample has either Arth, Diabities, don't know how useful this information

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   0.753   1.000   1.000

#High
R4HDOCDIG<- HRS %>% select(HHIDPN,R4CANCR, R4LUNG, R4HEART,R4STROK)%>% filter(!is.na(R4CANCR))%>%filter(!is.na(R4LUNG))%>% filter(!is.na(R4HEART)) %>% filter(!is.na(R4STROK))

R4HANYDOCDIG<-c(rep(0,length(R4HDOCDIG$HHIDPN)))

for(i in 1:length(R4HDOCDIG$HHIDPN)){
  if(R4HDOCDIG$R4CANCR[i]== '1.Yes'||R4HDOCDIG$R4LUNG[i]== '1.Yes'||R4HDOCDIG$R4HEART[i]== '1.Yes'||R4HDOCDIG$R4STROK[i]== '1.Yes') R4HANYDOCDIG[i]<- 1
}

summary(R4HANYDOCDIG)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3593  1.0000  1.0000

# I belive this variable is going to be a much tigher and useful compared to the medium. 

#----------------------------------------------------------------------------------------------------#

smaller Medical issues will not be useful in the regression as most of the population has a minor health issue

#-----Income------# 
#only lower levels of income has a really significance

grid.arrange(
  HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
    geom_histogram(color= 'black',breaks= c(seq(0,100000, by=10000)))+
    scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000")),
  
  HRS%>% filter(!is.na(H4ITOT)) %>% filter(!is.na(R4RETSAT))%>% ggplot(aes(x= H4ITOT, fill=R4RETSAT))+
    geom_density(color= 'black',position = 'fill')+
    scale_x_continuous(breaks= c(seq(0,100000, by=10000)),labels=c(seq(0,90000, by=10000), "100000"), limits = c(0,100000)))

## Warning: Removed 396 rows containing non-finite values (stat_density).

#Based on the density plot I think Income plays a role...
#For now I will use 25000 as a place holder 

HRS$H4ITOTB<-HRS$H4ITOT
HRS$H4ITOTB<- HRS$H4ITOTB>=25000

HRS %>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4RETSAT)) %>% ggplot(aes(x=H4ITOTB, fill=R4RETSAT))+
  geom_bar()

prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)

##               
##                    FALSE      TRUE
##   1.Very       0.5056911 0.7016061
##   2.Moderately 0.3869919 0.2493660
##   3.Not at all 0.1073171 0.0490279

When I ran the regression model with Income in standalone form the model did not take kindly probably because of the skew and outliers So I broke it down as a flag variable that is over 25K or under. Another possible fix for this might be a log transformation

This next code section is a mish-mash of other variables that I thought might be interesting to play around with as predictor variable. Here I am re-coding the factors so they fit nicer into the model.

#---Non- Housing Wealth---#

#I think the Asebedo case showed that non-housing net worth is not significant at higher levels which is covered in income.  
#Also wealth is going to be correlated to income

#--Poverty--#
#THis level only applies to waves 6-13...
# could build this based off of income Might do that later.....

#---EDU-----#
summary(HRS$S4EDUC)

##       1.Lt High-school                  2.GED 3.High-school graduate 
##                   3482                    651                   4575 
##         4.Some college    5.College and above                   NA's 
##                   2910                   2734                  27701

HRS$S4EDUCB<-HRS$S4EDUC 
levels(HRS$S4EDUCB)<- c('LHS','HS','HS','SC','CA')
#Moving GED to HS due to lack of sample size
summary(HRS$S4EDUCB)

##   LHS    HS    SC    CA  NA's 
##  3482  5226  2910  2734 27701

#--Gender--#
#no Recode minimal use

#--Marital Status---#
   # Single, Coupled, Widowed 
summary(HRS$R4MSTAT)

##               1.Married 2.Married,spouse absent             3.Partnered 
##                   13820                     144                     551 
##             4.Separated              5.Divorced    6.Separated/divorced 
##                     331                    1711                       0 
##               7.Widowed         8.Never married                    NA's 
##                    4199                     604                   20693

HRS$R4MSTATB<- HRS$R4MSTAT
levels(HRS$R4MSTATB)<- c('Couple','Alone','Couple', 'Alone', 'Alone', 'Alone', 'Widowed', 'Alone')
prop.table(table(HRS$R4RETSAT,HRS$H4ITOTB),2)

##               
##                    FALSE      TRUE
##   1.Very       0.5056911 0.7016061
##   2.Moderately 0.3869919 0.2493660
##   3.Not at all 0.1073171 0.0490279

#--Religion--#
#no recode 
summary(HRS$S4RELIG)

##   1.Protestant     2.Catholic       3.Jewish 4.None/no pref        5.Other 
##           9124           3898            360            762            161 
##           NA's 
##          27748

#----race------#
summary(HRS$S4RACEM)

##        1.White/Caucasian 2.Black/African American                  3.Other 
##                    12320                     1539                      498 
##                     NA's 
##                    27696

HRS$S4RACES<- HRS$S4RACEM
levels(HRS$S4RACES)<- c('White', 'Non-White', 'Non-White')
summary(HRS$S4RACES)

##     White Non-White      NA's 
##     12320      2037     27696

#---Location---#

summary(HRS$R4CENREG)

## 1.Northeast   2.Midwest     3.South      4.West     5.Other        NA's 
##        3582        5323        8856        3606          12       20674

prop.table(table(HRS$R4CENREG,HRS$R4RETSAT),1)

##              
##                   1.Very 2.Moderately 3.Not at all
##   1.Northeast 0.56766642   0.32772495   0.10460863
##   2.Midwest   0.63377926   0.30992196   0.05629877
##   3.South     0.57574654   0.33831027   0.08594319
##   4.West      0.64781297   0.28506787   0.06711916
##   5.Other     0.50000000   0.25000000   0.25000000

Here I am making a flag variable that shows whether a person has a mental health issue

#MENTAL HEALTH ISSUES 
#summary(HRS$R4DEPRES)  Depression
#summary(HRS$R4PSYCH)   DOctor diagnosed Psych issue
#summary(HRS$R4EFFORT)  Everything Requires Effort
#summary(HRS$R4GOING)   could not to get going 
#summary(HRS$R4FSAD)    felt sad
#summary(HRS$R4FLONE)   felt alone

R4MENHTH<- HRS %>% select(HHIDPN,R4DEPRES,R4FSAD,R4PSYCH)%>% filter(!is.na(R4DEPRES))%>%filter(!is.na(R4FSAD))%>% filter(!is.na(R4PSYCH))

R4MENHTHI<-c(rep(0,length(R4MENHTH$HHIDPN)))

for(i in 1:length(R4MENHTH$HHIDPN)){
  if(R4MENHTH$R4DEPRES[i]== '1.Yes'||R4MENHTH$R4FSAD[i]== '1.Yes'||R4MENHTH$R4PSYCH[i]== '1.Yes') R4MENHTHI[i]<- 1
}
summary(R4MENHTHI)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3281  1.0000  1.0000

I ended up going with depression, sadness, and psych. Should look into adding the other related variables.

#-----Age-------#
ggplot(data = HRS, aes(x=R4AGEY_B))+
  geom_histogram(binwidth = 10,color= 'black', fill= '#2cfaa8', size= 1)+
  labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
  scale_y_continuous(limits=c(0,8000),breaks = seq(0,8000,1000))+
  scale_x_continuous(limits=c(35,105),breaks = seq(0,105,10))+
  theme_minimal()

## Warning: Removed 20697 rows containing non-finite values (stat_bin).

summary(HRS$R4AGEY_B)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   25.00   58.00   65.00   65.94   74.00  105.00   20669

HRS %>% filter(R4AGEY_B>=50)%>%count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1 20567

age.labs <- c(paste(seq(50, 70, by = 10), seq(50 + 10 - 1, 80 - 1, by = 10),
                    sep = "-"), paste(80, "+", sep = ""))
HRS$R4AGEY_BB <- cut(HRS$R4AGEY_B, breaks = c(seq(50, 80, by = 10), Inf), labels = age.labs, right = FALSE)

ggplot(data = HRS, aes(x=R4AGEY_BB))+
  geom_bar(fill= '#2cfaa8', color= 'black')+
  scale_x_discrete(limits =c('50-59','60-69','70-79','80+'))+
  labs(x= 'Age', y = 'Count', title = '4th Wave Age Distribution')+
  theme_minimal()

## Warning: Removed 21486 rows containing non-finite values (stat_count).

#note the age bins of 90-99,100+ where extremly small and would lead to insignificant evidence to support a hypothesis
#-----------------------------------------#
#Collapsing Selt Retirmeent Sat
summary(HRS$R4SHLT)

## 1.Excellent 2.Very good      3.Good      4.Fair      5.Poor        NA's 
##        2633        5467        6541        4400        2337       20675

HRS$R4SHLTB<- HRS$R4SHLT
levels(HRS$R4SHLTB)<- c('Excel/VG','Excel/VG','Good','Fair/Poor','Fair/Poor')

Here am binning Age and collapsing self reported health. I found that little information or model accuracy is lost by doing this.

The Next code adds my flag variables back into the HRS data-set by a participants unique PIN

R4ANY<-cbind(R4ANY,R4ANYDIFF)
R4IANY<- cbind(R4IANY, R4IANYDIFF)
R4MDOCDIG<- cbind(R4MDOCDIG,R4MANYDOCDIG)
R4HDOCDIG<- cbind(R4HDOCDIG,R4HANYDOCDIG)
R4MENHTH<- cbind(R4MENHTH,R4MENHTHI)


R4ANY<- R4ANY %>% select(HHIDPN,R4ANYDIFF)
R4IANY <- R4IANY %>% select(HHIDPN, R4IANYDIFF)
R4MDOCDIG <- R4MDOCDIG %>% select(HHIDPN, R4MANYDOCDIG)
R4HDOCDIG <- R4HDOCDIG %>% select(HHIDPN, R4HANYDOCDIG)
R4MENHTH<- R4MENHTH %>% select(HHIDPN, R4MENHTHI)

HRS<- left_join(HRS,R4ANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4IANY, by= 'HHIDPN')
HRS<- left_join(HRS,R4MDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS,R4HDOCDIG, by= 'HHIDPN')
HRS<- left_join(HRS, R4MENHTH, by = 'HHIDPN')

summary(HRS$R4ANYDIFF)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.176   0.000   1.000   20753

summary(HRS$R4IANYDIFF)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.186   0.000   1.000   26113

summary(HRS$R4MANYDOCDIG)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   1.000   0.753   1.000   1.000   20717

summary(HRS$R4HANYDOCDIG)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.359   1.000   1.000   20698

summary(HRS$R4MENHTHI)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.328   1.000   1.000   22750

Now that my variables of interest are prepped it is time to look at running the model.

#In wave 4 there are about 7239 observations that answer the Retirment satisfaction question
#I considered the following variables and 
#summary(HRS$R4RETSAT) #Independent, what we be looking at
#summary(HRS$R4SHLT)   # Significant
#summary(HRS$S4EDUCB)  # sig, but lose about 3000 observations when applying it. Also is this the spouces education? Why did they not measure the Respondants Education level?
#summary(HRS$S4GENDER) # Not significant
#summary(HRS$S4RACES)  # Comes out as significant but messes with the model  
#summary(HRS$R4ANYDIFF)   #sig
#summary(HRS$R4IANYDIFF)  #sig, I excepct high multicolineararity with ANYDIFF so will be best to use just one, lose about 1000 samples vs using the ANY as well 
#summary(HRS$R4MANYDOCDIG)  #not significant not suprised as almost all the sample has a minor health diagnosis 
#summary(HRS$R4HANYDOCDIG)  # barley significant on the 5% level not significant in training model
#summary(HRS$H4ITOTB)       #Could be interesting to try and examine the above $75K doesn't buy happines and see how it related to retirment satisfaction 
#summary(HRS$S4RELIG)       #Jewish People are less satisfied but not significant
#summary(HRS$R4CENREG)      #Could be interesting to compare similar sub groups and measure their satisfaction based on census region. 
#summary(HRS$R4AGEY_BB)      #signficant, the older someone gets the higher satisfaction they become

#Model
training<- HRS %>% filter(!is.na(R4RETSAT))%>% filter(!is.na(R4SHLTB))%>% filter(!is.na(H4ITOTB))%>% filter(!is.na(R4ANYDIFF))%>% filter(!is.na(R4MENHTHI))%>% filter(!is.na(S4EDUCB))%>% 
  filter(!is.na(R4HANYDOCDIG))

samp<- sample(4037,405)

train<-training[-samp,]
test<-training[samp,]

odr<-clm(R4RETSAT~ R4SHLTB+H4ITOTB+R4ANYDIFF+S4EDUCB+R4AGEY_BB+R4MENHTHI+R4HANYDOCDIG, data = train)

summary(odr)

## formula: 
## R4RETSAT ~ R4SHLTB + H4ITOTB + R4ANYDIFF + S4EDUCB + R4AGEY_BB + R4MENHTHI + R4HANYDOCDIG
## data:    train
## 
##  link  threshold nobs logLik   AIC     niter max.grad cond.H 
##  logit flexible  3613 -2553.01 5134.02 6(0)  1.60e-13 1.4e+02
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## R4SHLTBGood       0.57415    0.09822   5.845 5.05e-09 ***
## R4SHLTBFair/Poor  1.14626    0.10398  11.024  < 2e-16 ***
## H4ITOTBTRUE      -0.51525    0.08131  -6.337 2.34e-10 ***
## R4ANYDIFF         0.65513    0.10151   6.454 1.09e-10 ***
## S4EDUCBHS        -0.23530    0.09373  -2.510  0.01206 *  
## S4EDUCBSC        -0.30247    0.11551  -2.618  0.00883 ** 
## S4EDUCBCA        -0.60723    0.12857  -4.723 2.33e-06 ***
## R4AGEY_BB60-69   -0.70813    0.12348  -5.735 9.77e-09 ***
## R4AGEY_BB70-79   -0.96315    0.12557  -7.670 1.72e-14 ***
## R4AGEY_BB80+     -1.25174    0.15802  -7.922 2.35e-15 ***
## R4MENHTHI         0.82934    0.08227  10.081  < 2e-16 ***
## R4HANYDOCDIG      0.18055    0.07856   2.298  0.02156 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##                           Estimate Std. Error z value
## 1.Very|2.Moderately         0.2653     0.1526   1.739
## 2.Moderately|3.Not at all   2.7771     0.1630  17.033
## (19 observations deleted due to missingness)

#Testing for Multicollinearity with VIF>10 defined as collinear
# Still confused on this part on how to check with categorical variables
# se

#train
run2<- predict(odr,train, type = 'class')
run2<- as.data.frame(run2)
table(train$R4RETSAT)

## 
##       1.Very 2.Moderately 3.Not at all 
##         2354         1059          219

table(run2$fit)

## 
##       1.Very 2.Moderately 3.Not at all 
##         2920          685            8

table(train$R4RETSAT,run2$fit)

##               
##                1.Very 2.Moderately 3.Not at all
##   1.Very         2125          223            1
##   2.Moderately    725          324            3
##   3.Not at all     70          138            4

(2110+329+2)/3632 #67~68% accuracy *note that the numbers will change here due to the using different samples

## [1] 0.6720815

#test
run1<- predict(odr,test, type = 'class')
run1<- as.data.frame(run1)
table(test$R4RETSAT)

## 
##       1.Very 2.Moderately 3.Not at all 
##          271          114           20

table(run1$fit)

## 
##       1.Very 2.Moderately 3.Not at all 
##          316           86            2

table(test$R4RETSAT,run1$fit)

##               
##                1.Very 2.Moderately 3.Not at all
##   1.Very          231           39            0
##   2.Moderately     77           35            2
##   3.Not at all      8           12            0

(250+28+1)/405 #68% accuracy

## [1] 0.6888889

#Note for limitations some variables that might have a significant affect but not used due to lack of sample size avaliable to chose from. 
#Poor Performance on Moderatly and Not at All, It seems that the models is over chosing

Here are a few of my initial thoughts.

The model is over predicting Very satisfied and preforming poorly when predicting moderately/ not at all. This is something to try and figure out
The preprocess code that I wrote to prepare this data should be able to replicate easily and applied to the other waves
The package I used for ordinal regression also has ability to create a mixed model based on the data but I need do some more looking into on the basic of mixed models
Multicolinarity and Proportional odds assumption have not looked at.
Based on the accuracy of the test and training results there does not seem to be over fitting.

I also thought of a couple of hypothesis to look at.

One hypothesis that I thought about is that mental health plays a larger role in determining retirement satisfaction compared to physical health

looking at the summary of the model it seems that predictor indicating whether people who have cancer, stroke, or lung disease is insignificant
Might be worth creating similar groups based on income,age,edu and compare satisfaction level among people with metal health vs serious physical health issues.

I also saw an interesting paper that looks at how income stops affecting emotional well-being at 75K per year. It might be interesting to try and look at if this same principle also applies to retirement satisfaction. link: https://www.princeton.edu/~deaton/downloads/deaton_kahneman_high_income_improves_evaluation_August2010.pdf

Currently I plan on looking at other variables that might be beneficial to add to the model and how to implement a mixed model.

HRSORDIN

Chris Beland

February 3, 2020