nhis=nhis[,c(-1:-12,-17:-19,-21:-41,-43:-66,-68:-114,-116:-117,-119:-127)]
## FSRUNOUT is dependent variable
## Recodding FSRUNOUT
nhis$FSRUNOUT = recode(nhis$FSRUNOUT, "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA")
FSRUNOUT=nhis$FSRUNOUT
## Making dummy variables
v1=rep(1,dim(nhis)[1])
v2=rep(0,dim(nhis)[1])
nhis$INCGRP5 = recode(nhis$INCGRP5, "'1'=1;'2'=2;'3'=3;'4'=4;'96'=NA;'99'=NA")
nhis$NIlessthan35k=ifelse(nhis$INCGRP5==1,v1,v2)
nhis$NI35kto74.9k=ifelse(nhis$INCGRP5==2,v1,v2)
nhis$NI75kto99k=ifelse(nhis$INCGRP5==3,v1,v2)
nhis$NIabove100k=ifelse(nhis$INCGRP5==4,v1,v2)
nhis$FLNGINTV = recode(nhis$FLNGINTV, "'1'=1;'2'=2;'3'=3;'4'=4;'8'=NA")
nhis$Englishspeaking=ifelse(nhis$FLNGINTV==1,v1,v2)
nhis$FM_EDUC1 = recode(nhis$FM_EDUC1, "'1'=1;'2'=1;'3'=2;'4'=2;'5'=2;'6'=3;'7'=3;'8'=4;'9'=4;'98'=NA;'99'=NA")
nhis$Nohighschool=ifelse(nhis$FM_EDUC1==1,v1,v2)
nhis$Highschooldegree=ifelse(nhis$FM_EDUC1==2,v1,v2)
nhis$Twoyrdegree=ifelse(nhis$FM_EDUC1==3,v1,v2)
nhis$Fourormoreyrcollegedegree=ifelse(nhis$FM_EDUC1==4,v1,v2)
nhis$HOUSEOWN = recode(nhis$HOUSEOWN, "'1'=1;'2'=2;'3'=3;'7'=NA;'8'=NA;'9'=NA")
nhis$Ownhome=ifelse(nhis$HOUSEOWN==1,v1,v2)
nhis$Renthome=ifelse(nhis$HOUSEOWN==2,v1,v2)
nhis$Otherlivingarrangement=ifelse(nhis$HOUSEOWN==3,v1,v2)
nhis=nhis[,c(-1,-5,-8,-9)]
## linear regression
m1=lm(FSRUNOUT~.,data = nhis)
summary(m1)
##
## Call:
## lm(formula = FSRUNOUT ~ ., data = nhis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65119 -0.22937 -0.08462 0.01944 1.08251
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.053837 0.053405 -1.008 0.3134
## FM_SIZE 0.053581 0.002970 18.043 < 2e-16 ***
## FM_KIDS -0.016623 0.003274 -5.078 3.83e-07 ***
## FM_ELDR -0.050590 0.003102 -16.310 < 2e-16 ***
## FHICOVCT -0.014547 0.002521 -5.769 8.01e-09 ***
## NIlessthan35k 0.238149 0.006338 37.574 < 2e-16 ***
## NI35kto74.9k 0.079847 0.005635 14.169 < 2e-16 ***
## NI75kto99k 0.026547 0.006749 3.933 8.39e-05 ***
## NIabove100k NA NA NA NA
## Englishspeaking 0.038487 0.007383 5.213 1.87e-07 ***
## Nohighschool 0.090607 0.051555 1.757 0.0788 .
## Highschooldegree 0.027022 0.051310 0.527 0.5984
## Twoyrdegree 0.011173 0.051477 0.217 0.8282
## Fourormoreyrcollegedegree -0.035673 0.051362 -0.695 0.4873
## Ownhome -0.046480 0.011242 -4.135 3.56e-05 ***
## Renthome 0.021120 0.011168 1.891 0.0586 .
## Otherlivingarrangement NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3474 on 40108 degrees of freedom
## (5474 observations deleted due to missingness)
## Multiple R-squared: 0.1401, Adjusted R-squared: 0.1398
## F-statistic: 466.6 on 14 and 40108 DF, p-value: < 2.2e-16
Since the dependent variable is categorical, linear regression is imappropliate.
So logistic regression is more appropriate.
## logistic regression
n=length(nhis$FSRUNOUT)
n1=floor(n*(0.6))
n2=n-n1
train=sample(1:n,n1)
xx = nhis[-4]
xtrain <-xx[train,]
xnew <- xx[-train,]
ytrain <- FSRUNOUT[train]
ynew <- FSRUNOUT[-train]
m2=glm(FSRUNOUT~.,family=binomial,data=data.frame(FSRUNOUT=ytrain,xtrain))
summary(m2)
##
## Call:
## glm(formula = FSRUNOUT ~ ., family = binomial, data = data.frame(FSRUNOUT = ytrain,
## xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6709 -0.6570 -0.3710 -0.1703 3.1399
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.95239 0.58859 -8.414 < 2e-16 ***
## FM_SIZE 0.37817 0.02834 13.342 < 2e-16 ***
## FM_KIDS -0.18737 0.03331 -5.624 1.86e-08 ***
## FM_ELDR -0.54239 0.04136 -13.114 < 2e-16 ***
## FHICOVCT -0.03732 0.02253 -1.656 0.09765 .
## NIlessthan35k 2.86949 0.11981 23.950 < 2e-16 ***
## NI35kto74.9k 1.73815 0.11824 14.700 < 2e-16 ***
## NI75kto99k 0.94204 0.14069 6.696 2.15e-11 ***
## NIabove100k NA NA NA NA
## Englishspeaking 0.31561 0.06705 4.707 2.52e-06 ***
## Nohighschool 0.88916 0.56451 1.575 0.11523
## Highschooldegree 0.58165 0.56299 1.033 0.30154
## Twoyrdegree 0.52439 0.56482 0.928 0.35319
## Fourormoreyrcollegedegree -0.09619 0.56466 -0.170 0.86473
## Ownhome -0.32851 0.10057 -3.266 0.00109 **
## Renthome 0.09926 0.09799 1.013 0.31103
## Otherlivingarrangement NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21821 on 24066 degrees of freedom
## Residual deviance: 18097 on 24052 degrees of freedom
## (3291 observations deleted due to missingness)
## AIC: 18127
##
## Number of Fisher Scoring iterations: 6
## prediction
ptest <- predict(m2,newdata=data.frame(xnew),type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 1 0 0.40114426
## 2 0 0.20059945
## 3 0 0.08829054
## 4 0 0.11213339
## 5 0 0.15242574
## 6 0 0.12305523
## 7 1 0.39208123
## 8 0 0.02796397
## 9 0 0.06650559
## 10 0 0.07526081
hist(ptest, main = "Prediction of Running out food")

xbar=mean(ynew)
## Computing error
g=floor(ptest+0.4) ## prob .4 or higher
t=table(ynew,g)
t
## g
## ynew 0 1
## 0 13302 33
## 1 2689 32
error=(t[1,2]+t[2,1])/n2
error
## [1] 0.1492406
The percent of error of the prediction is about 15%.
bb=cbind(ptest,ynew)
bb1=bb[order(ptest,decreasing=TRUE),]
## making graph
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]
## ay ax
## 2137 0.8595803 0 0 NA
## 2707 0.8183782 1 1 NA
## 3034 0.7918495 0 1 NA
## 1341 0.7469251 0 1 NA
## 14489 0.7463810 0 1 NA
## 5569 0.7175286 0 1 NA
## 7465 0.7039764 0 1 NA
## 17531 0.7021572 0 1 NA
## 296 0.6899322 1 2 NA
## 17296 0.6838089 0 2 NA
## 379 0.6773092 1 3 NA
## 8160 0.6773092 0 3 NA
## 795 0.6755189 0 3 NA
## 14639 0.6755189 0 3 NA
## 9456 0.6690987 1 4 NA
## 16395 0.6689767 1 5 NA
## 6595 0.6665218 1 6 NA
## 14366 0.6615537 1 7 NA
## 14615 0.6598435 1 8 NA
## 2699 0.6505114 1 9 NA
## 3361 0.6503396 0 9 NA
## 5773 0.6503396 1 10 NA
## 818 0.6497297 0 10 NA
## 16304 0.6486218 1 11 NA
## 6465 0.6484963 1 12 NA
## 1923 0.6435576 0 12 NA
## 1841 0.6419795 0 12 NA
## 14640 0.6419795 1 13 NA
## 10784 0.6391503 0 13 NA
## 6032 0.6372097 1 14 NA
## 14432 0.6350788 1 15 NA
## 3890 0.6342820 1 16 NA
## 12765 0.6342820 1 17 NA
## 13593 0.6341066 1 18 NA
## 11312 0.6323826 0 18 NA
## 12727 0.6322740 0 18 NA
## 13665 0.6322740 1 19 NA
## 15747 0.6312987 0 19 NA
## 2708 0.6293540 0 19 NA
## 5464 0.6263864 0 19 NA
## 6227 0.6263864 0 19 NA
## 7585 0.6263864 1 20 NA
## 8827 0.6263864 1 21 NA
## 2972 0.6262576 0 21 NA
## 11912 0.6262576 1 22 NA
## 812 0.6244703 1 23 NA
## 2013 0.6244703 0 23 NA
## 13664 0.6242956 1 24 NA
## 667 0.6225699 0 24 NA
## 6817 0.6184230 0 24 NA
## 9520 0.6145751 1 25 NA
## 2918 0.6119507 1 26 NA
## 3436 0.6118200 1 27 NA
## 9340 0.6101330 1 28 NA
## 12424 0.6080789 0 28 NA
## 2587 0.6077607 0 28 NA
## 10038 0.6076295 0 28 NA
## 13185 0.6076295 0 28 NA
## 11793 0.6068099 1 29 NA
## 12076 0.6068099 1 30 NA
## 12069 0.6066295 1 31 NA
## 6705 0.6059896 1 32 NA
## 6299 0.6056976 0 32 NA
## 9254 0.6056975 0 32 NA
## 13747 0.6047450 0 32 NA
## 9804 0.5988287 0 32 NA
## 17956 0.5988287 1 33 NA
## 6603 0.5986965 0 33 NA
## 8279 0.5978705 1 34 NA
## 3701 0.5959226 0 34 NA
## 4929 0.5949126 0 34 NA
## 9782 0.5947800 1 35 NA
## 7940 0.5938973 1 36 NA
## 12569 0.5912258 0 36 NA
## 8143 0.5906618 1 37 NA
## 164 0.5898306 1 38 NA
## 232 0.5898306 1 39 NA
## 3183 0.5898306 0 39 NA
## 5305 0.5898306 0 39 NA
## 7148 0.5898306 0 39 NA
## 13348 0.5898306 0 39 NA
## 13483 0.5898306 1 40 NA
## 13964 0.5898306 0 40 NA
## 3385 0.5896974 0 40 NA
## 3480 0.5878503 0 40 NA
## 12041 0.5878503 0 40 NA
## 16096 0.5877372 0 40 NA
## 8508 0.5870175 1 41 NA
## 10806 0.5870175 1 42 NA
## 1020 0.5858875 1 43 NA
## 1747 0.5858875 0 43 NA
## 7670 0.5848699 1 44 NA
## 658 0.5840306 0 44 NA
## 16172 0.5826469 0 44 NA
## 8488 0.5816084 0 44 NA
## 13499 0.5816084 1 45 NA
## 7919 0.5809225 0 45 NA
## 3589 0.5789134 1 46 NA
## 9836 0.5787792 0 46 NA
## 11056 0.5787792 0 46 NA
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

The most influenciable factor that determine food insecurity in this model is the income. If the individual annually receive $35k, the probability of getting food insecurity would increase.