Chopped A

nhis=nhis[,c(-1:-12,-17:-19,-21:-41,-43:-66,-68:-114,-116:-117,-119:-127)]
## FSRUNOUT is dependent variable
## Recodding FSRUNOUT
nhis$FSRUNOUT = recode(nhis$FSRUNOUT, "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA")
FSRUNOUT=nhis$FSRUNOUT

## Making dummy variables
v1=rep(1,dim(nhis)[1])
v2=rep(0,dim(nhis)[1])
nhis$INCGRP5 = recode(nhis$INCGRP5, "'1'=1;'2'=2;'3'=3;'4'=4;'96'=NA;'99'=NA")
nhis$NIlessthan35k=ifelse(nhis$INCGRP5==1,v1,v2)
nhis$NI35kto74.9k=ifelse(nhis$INCGRP5==2,v1,v2)
nhis$NI75kto99k=ifelse(nhis$INCGRP5==3,v1,v2)
nhis$NIabove100k=ifelse(nhis$INCGRP5==4,v1,v2)
nhis$FLNGINTV = recode(nhis$FLNGINTV, "'1'=1;'2'=2;'3'=3;'4'=4;'8'=NA")
nhis$Englishspeaking=ifelse(nhis$FLNGINTV==1,v1,v2)
nhis$FM_EDUC1 = recode(nhis$FM_EDUC1, "'1'=1;'2'=1;'3'=2;'4'=2;'5'=2;'6'=3;'7'=3;'8'=4;'9'=4;'98'=NA;'99'=NA")
nhis$Nohighschool=ifelse(nhis$FM_EDUC1==1,v1,v2)
nhis$Highschooldegree=ifelse(nhis$FM_EDUC1==2,v1,v2)
nhis$Twoyrdegree=ifelse(nhis$FM_EDUC1==3,v1,v2)
nhis$Fourormoreyrcollegedegree=ifelse(nhis$FM_EDUC1==4,v1,v2)
nhis$HOUSEOWN = recode(nhis$HOUSEOWN, "'1'=1;'2'=2;'3'=3;'7'=NA;'8'=NA;'9'=NA")
nhis$Ownhome=ifelse(nhis$HOUSEOWN==1,v1,v2)
nhis$Renthome=ifelse(nhis$HOUSEOWN==2,v1,v2)
nhis$Otherlivingarrangement=ifelse(nhis$HOUSEOWN==3,v1,v2)
nhis=nhis[,c(-1,-5,-8,-9)]

## linear regression
m1=lm(FSRUNOUT~.,data = nhis)
summary(m1)

## 
## Call:
## lm(formula = FSRUNOUT ~ ., data = nhis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65119 -0.22937 -0.08462  0.01944  1.08251 
## 
## Coefficients: (2 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.053837   0.053405  -1.008   0.3134    
## FM_SIZE                    0.053581   0.002970  18.043  < 2e-16 ***
## FM_KIDS                   -0.016623   0.003274  -5.078 3.83e-07 ***
## FM_ELDR                   -0.050590   0.003102 -16.310  < 2e-16 ***
## FHICOVCT                  -0.014547   0.002521  -5.769 8.01e-09 ***
## NIlessthan35k              0.238149   0.006338  37.574  < 2e-16 ***
## NI35kto74.9k               0.079847   0.005635  14.169  < 2e-16 ***
## NI75kto99k                 0.026547   0.006749   3.933 8.39e-05 ***
## NIabove100k                      NA         NA      NA       NA    
## Englishspeaking            0.038487   0.007383   5.213 1.87e-07 ***
## Nohighschool               0.090607   0.051555   1.757   0.0788 .  
## Highschooldegree           0.027022   0.051310   0.527   0.5984    
## Twoyrdegree                0.011173   0.051477   0.217   0.8282    
## Fourormoreyrcollegedegree -0.035673   0.051362  -0.695   0.4873    
## Ownhome                   -0.046480   0.011242  -4.135 3.56e-05 ***
## Renthome                   0.021120   0.011168   1.891   0.0586 .  
## Otherlivingarrangement           NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3474 on 40108 degrees of freedom
##   (5474 observations deleted due to missingness)
## Multiple R-squared:  0.1401, Adjusted R-squared:  0.1398 
## F-statistic: 466.6 on 14 and 40108 DF,  p-value: < 2.2e-16

Since the dependent variable is categorical, linear regression is imappropliate.

So logistic regression is more appropriate.

## logistic regression
n=length(nhis$FSRUNOUT)

n1=floor(n*(0.6))

n2=n-n1

train=sample(1:n,n1)


xx = nhis[-4]
xtrain <-xx[train,]
xnew <- xx[-train,]
ytrain <- FSRUNOUT[train]
ynew <- FSRUNOUT[-train]

m2=glm(FSRUNOUT~.,family=binomial,data=data.frame(FSRUNOUT=ytrain,xtrain))
summary(m2)

## 
## Call:
## glm(formula = FSRUNOUT ~ ., family = binomial, data = data.frame(FSRUNOUT = ytrain, 
##     xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6709  -0.6570  -0.3710  -0.1703   3.1399  
## 
## Coefficients: (2 not defined because of singularities)
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -4.95239    0.58859  -8.414  < 2e-16 ***
## FM_SIZE                    0.37817    0.02834  13.342  < 2e-16 ***
## FM_KIDS                   -0.18737    0.03331  -5.624 1.86e-08 ***
## FM_ELDR                   -0.54239    0.04136 -13.114  < 2e-16 ***
## FHICOVCT                  -0.03732    0.02253  -1.656  0.09765 .  
## NIlessthan35k              2.86949    0.11981  23.950  < 2e-16 ***
## NI35kto74.9k               1.73815    0.11824  14.700  < 2e-16 ***
## NI75kto99k                 0.94204    0.14069   6.696 2.15e-11 ***
## NIabove100k                     NA         NA      NA       NA    
## Englishspeaking            0.31561    0.06705   4.707 2.52e-06 ***
## Nohighschool               0.88916    0.56451   1.575  0.11523    
## Highschooldegree           0.58165    0.56299   1.033  0.30154    
## Twoyrdegree                0.52439    0.56482   0.928  0.35319    
## Fourormoreyrcollegedegree -0.09619    0.56466  -0.170  0.86473    
## Ownhome                   -0.32851    0.10057  -3.266  0.00109 ** 
## Renthome                   0.09926    0.09799   1.013  0.31103    
## Otherlivingarrangement          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21821  on 24066  degrees of freedom
## Residual deviance: 18097  on 24052  degrees of freedom
##   (3291 observations deleted due to missingness)
## AIC: 18127
## 
## Number of Fisher Scoring iterations: 6

## prediction
ptest <- predict(m2,newdata=data.frame(xnew),type="response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

data.frame(ynew,ptest)[1:10,]

##    ynew      ptest
## 1     0 0.40114426
## 2     0 0.20059945
## 3     0 0.08829054
## 4     0 0.11213339
## 5     0 0.15242574
## 6     0 0.12305523
## 7     1 0.39208123
## 8     0 0.02796397
## 9     0 0.06650559
## 10    0 0.07526081

hist(ptest, main = "Prediction of Running out food")

xbar=mean(ynew)


## Computing error
g=floor(ptest+0.4) ## prob .4 or higher
t=table(ynew,g)
t

##     g
## ynew     0     1
##    0 13302    33
##    1  2689    32

error=(t[1,2]+t[2,1])/n2
error

## [1] 0.1492406

The percent of error of the prediction is about 15%.

bb=cbind(ptest,ynew)

bb1=bb[order(ptest,decreasing=TRUE),]

## making graph
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)
aaa[1:100,]

##                   ay ax
## 2137  0.8595803 0  0 NA
## 2707  0.8183782 1  1 NA
## 3034  0.7918495 0  1 NA
## 1341  0.7469251 0  1 NA
## 14489 0.7463810 0  1 NA
## 5569  0.7175286 0  1 NA
## 7465  0.7039764 0  1 NA
## 17531 0.7021572 0  1 NA
## 296   0.6899322 1  2 NA
## 17296 0.6838089 0  2 NA
## 379   0.6773092 1  3 NA
## 8160  0.6773092 0  3 NA
## 795   0.6755189 0  3 NA
## 14639 0.6755189 0  3 NA
## 9456  0.6690987 1  4 NA
## 16395 0.6689767 1  5 NA
## 6595  0.6665218 1  6 NA
## 14366 0.6615537 1  7 NA
## 14615 0.6598435 1  8 NA
## 2699  0.6505114 1  9 NA
## 3361  0.6503396 0  9 NA
## 5773  0.6503396 1 10 NA
## 818   0.6497297 0 10 NA
## 16304 0.6486218 1 11 NA
## 6465  0.6484963 1 12 NA
## 1923  0.6435576 0 12 NA
## 1841  0.6419795 0 12 NA
## 14640 0.6419795 1 13 NA
## 10784 0.6391503 0 13 NA
## 6032  0.6372097 1 14 NA
## 14432 0.6350788 1 15 NA
## 3890  0.6342820 1 16 NA
## 12765 0.6342820 1 17 NA
## 13593 0.6341066 1 18 NA
## 11312 0.6323826 0 18 NA
## 12727 0.6322740 0 18 NA
## 13665 0.6322740 1 19 NA
## 15747 0.6312987 0 19 NA
## 2708  0.6293540 0 19 NA
## 5464  0.6263864 0 19 NA
## 6227  0.6263864 0 19 NA
## 7585  0.6263864 1 20 NA
## 8827  0.6263864 1 21 NA
## 2972  0.6262576 0 21 NA
## 11912 0.6262576 1 22 NA
## 812   0.6244703 1 23 NA
## 2013  0.6244703 0 23 NA
## 13664 0.6242956 1 24 NA
## 667   0.6225699 0 24 NA
## 6817  0.6184230 0 24 NA
## 9520  0.6145751 1 25 NA
## 2918  0.6119507 1 26 NA
## 3436  0.6118200 1 27 NA
## 9340  0.6101330 1 28 NA
## 12424 0.6080789 0 28 NA
## 2587  0.6077607 0 28 NA
## 10038 0.6076295 0 28 NA
## 13185 0.6076295 0 28 NA
## 11793 0.6068099 1 29 NA
## 12076 0.6068099 1 30 NA
## 12069 0.6066295 1 31 NA
## 6705  0.6059896 1 32 NA
## 6299  0.6056976 0 32 NA
## 9254  0.6056975 0 32 NA
## 13747 0.6047450 0 32 NA
## 9804  0.5988287 0 32 NA
## 17956 0.5988287 1 33 NA
## 6603  0.5986965 0 33 NA
## 8279  0.5978705 1 34 NA
## 3701  0.5959226 0 34 NA
## 4929  0.5949126 0 34 NA
## 9782  0.5947800 1 35 NA
## 7940  0.5938973 1 36 NA
## 12569 0.5912258 0 36 NA
## 8143  0.5906618 1 37 NA
## 164   0.5898306 1 38 NA
## 232   0.5898306 1 39 NA
## 3183  0.5898306 0 39 NA
## 5305  0.5898306 0 39 NA
## 7148  0.5898306 0 39 NA
## 13348 0.5898306 0 39 NA
## 13483 0.5898306 1 40 NA
## 13964 0.5898306 0 40 NA
## 3385  0.5896974 0 40 NA
## 3480  0.5878503 0 40 NA
## 12041 0.5878503 0 40 NA
## 16096 0.5877372 0 40 NA
## 8508  0.5870175 1 41 NA
## 10806 0.5870175 1 42 NA
## 1020  0.5858875 1 43 NA
## 1747  0.5858875 0 43 NA
## 7670  0.5848699 1 44 NA
## 658   0.5840306 0 44 NA
## 16172 0.5826469 0 44 NA
## 8488  0.5816084 0 44 NA
## 13499 0.5816084 1 45 NA
## 7919  0.5809225 0 45 NA
## 3589  0.5789134 1 46 NA
## 9836  0.5787792 0 46 NA
## 11056 0.5787792 0 46 NA

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

The most influenciable factor that determine food insecurity in this model is the income. If the individual annually receive $35k, the probability of getting food insecurity would increase.

Chopped A

Shunya

2017年10月25日

Since the dependent variable is categorical, linear regression is imappropliate.

So logistic regression is more appropriate.

The percent of error of the prediction is about 15%.

The most influenciable factor that determine food insecurity in this model is the income. If the individual annually receive $35k, the probability of getting food insecurity would increase.