HOMEPRICE ANALYSIS

knitr::kable(head(house))
HomeID Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
1 114300 1790 2 2 2 No East
2 114200 2030 4 2 3 No East
3 114800 1740 3 2 1 No East
4 94700 1980 3 2 3 No East
5 119800 2130 3 3 3 No East
6 114600 1780 3 2 2 No North
n=length(house$Neighborhood)
nt=100

a1 = rep(1,length(house$Neighborhood))
a2 = rep(0,length(house$Neighborhood))

house$BrickYes = ifelse(house$Brick == "Yes",a1,a2)

train <- sample(1:n,nt)

x <- scale(house[,c(2,3,4,5,6,9)])
x[1:3]
## [1] -0.6002263 -0.6039481 -0.5816174
mean(x)
## [1] 6.38084e-18
sd(x)
## [1] 0.9967352
x=house[,c(2,3,4,5,6,9)]

for (j in 1:6) {
  x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}


knitr::kable(x[1:3,])
Price SqFt Bedrooms Bathrooms Offers BrickYes
-0.6002263 -0.9969990 -1.4097879 -0.8655378 -0.5406451 -0.6961011
-0.6039481 0.1373643 1.3452175 -0.8655378 0.3945248 -0.6961011
-0.5816174 -1.2333247 -0.0322852 -0.8655378 -1.4758150 -0.6961011
nearest1 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=5)
knitr::kable(head(data.frame(house$Neighborhood[-train],nearest1,nearest5)))
house.Neighborhood..train. nearest1 nearest5
East North North
East East North
West West West
East North North
East East East
West West West
pcorr=dim(10)
neighbors=dim(10)  ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
  pred=knn.cv(x,cl=house$Neighborhood,k)
  pcorr[k]=100*sum(house$Neighborhood==pred)/1000
  neighbors[k]=k   ## Populates that count variable with k each time through
}
pcorr
##  [1] 7.8 7.4 6.7 7.0 7.3 7.4 7.7 7.5 7.5 7.8
neighbors
##  [1]  1  2  3  4  5  6  7  8  9 10
## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct) 
accTable<-rbind(pcorr,neighbors)
knitr::kable(accTable)
pcorr 7.8 7.4 6.7 7 7.3 7.4 7.7 7.5 7.5 7.8
neighbors 1.0 2.0 3.0 4 5.0 6.0 7.0 8.0 9.0 10.0
kBest=neighbors[pcorr==maxAcc]
kBest
## [1]  1 10

Best number of k neighbors is 1 in this model, thus use k=1.

near1<- data.frame(truetype=house$Neighborhood[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=house$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     2    0
##      North    7     4    0
##      West     2     0    8
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6071         
##                  95% CI : (0.4058, 0.785)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 0.1725         
##                                          
##                   Kappa : 0.4296         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.3571       0.6667      1.0000
## Specificity               0.8571       0.6818      0.9000
## Pos Pred Value            0.7143       0.3636      0.8000
## Neg Pred Value            0.5714       0.8824      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1786       0.1429      0.2857
## Detection Prevalence      0.2500       0.3929      0.3571
## Balanced Accuracy         0.6071       0.6742      0.9500

This model’s prediction accuracy is about 60%, which is not criticaly accurate. But it is beter than predictiong without any informations.

FOOD INSECURITY ANALYSIS

nhis = ReducedFoodInsec

nhis$FSRUNOUT = recode(nhis$FSRUNOUT, "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA")

nhis2 <-data.frame(nhis$INCGRP5,nhis$FLNGINTV,nhis$FM_EDUC1,nhis$HOUSEOWN,
             nhis$FM_SIZE,nhis$FM_KIDS,nhis$FM_ELDR,nhis$FSRUNOUT)

head(nhis2)
##   nhis.INCGRP5 nhis.FLNGINTV nhis.FM_EDUC1 nhis.HOUSEOWN nhis.FM_SIZE
## 1            2             1             8             2            3
## 2            2             1             5             1           12
## 3            3             1             6             1            3
## 4            2             1             8             1            6
## 5            4             1             8             1            4
## 6            4             1             6             1            4
##   nhis.FM_KIDS nhis.FM_ELDR nhis.FSRUNOUT
## 1            1            0             0
## 2            8            1             1
## 3            1            0             1
## 4            2            0             1
## 5            1            0             0
## 6            2            0             0
dim(nhis2)
## [1] 1168    8
n2=length(nhis2[,1])
nt2=900
train <- sample(1:n2,nt2)
x2 <- scale(nhis2[,c(1:7)])
x2[1:3]
## [1] -0.3922275 -0.3922275 -0.3590254
mean(x2)
## [1] 3.812677e-17
sd(x2)
## [1] 0.999633
for (j in 1:6) {
  x2[,j]=(x2[,j]-mean(x2[,j]))/sd(x2[,j])
}

x2 = nhis2[,c(1:7)]

near1 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=1)
near5 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=5)
near10 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=10)
knitr::kable(head(data.frame(nhis2$nhis.FSRUNOUT[-train],near1,near5,near10)))
nhis2.nhis.FSRUNOUT..train. near1 near5 near10
0 1 0 0
1 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
pcorrn1=100*sum(nhis2$nhis.FSRUNOUT[-train]==near1)/(n-nt)
pcorrn5=100*sum(nhis2$nhis.FSRUNOUT[-train]==near5)/(n-nt)
pcorrn1
## [1] 839.2857
pcorrn5
## [1] 867.8571
pcorr=dim(10)
neighbors=dim(10)  ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
  pred=knn.cv(x2,cl=nhis2$nhis.FSRUNOUT,k)
  pcorr[k]=100*sum(nhis2$nhis.FSRUNOUT==pred)/n2
  neighbors[k]=k   ## Populates that count variable with k each time through
}
pcorr
##  [1] 86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
##  [8] 89.46918 89.72603 89.89726
neighbors
##  [1]  1  2  3  4  5  6  7  8  9 10
## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct) 
accTable<-rbind(pcorr,neighbors)
accTable
##               [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## pcorr     86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
## neighbors  1.00000  2.00000  3.00000  4.00000  5.00000  6.00000  7.00000
##               [,8]     [,9]    [,10]
## pcorr     89.46918 89.72603 89.89726
## neighbors  8.00000  9.00000 10.00000
kBest=neighbors[pcorr==maxAcc]
kBest
## [1] 10

Best number of neighbor is 10 in this model

fsnear10<- data.frame(truetype=nhis2$nhis.FSRUNOUT[-train],predtype=near10)
confusionMatrix(data=near10,reference=nhis2$nhis.FSRUNOUT[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 240  25
##          1   2   1
##                                           
##                Accuracy : 0.8993          
##                  95% CI : (0.8568, 0.9326)
##     No Information Rate : 0.903           
##     P-Value [Acc > NIR] : 0.631           
##                                           
##                   Kappa : 0.0499          
##  Mcnemar's Test P-Value : 2.297e-05       
##                                           
##             Sensitivity : 0.99174         
##             Specificity : 0.03846         
##          Pos Pred Value : 0.90566         
##          Neg Pred Value : 0.33333         
##              Prevalence : 0.90299         
##          Detection Rate : 0.89552         
##    Detection Prevalence : 0.98881         
##       Balanced Accuracy : 0.51510         
##                                           
##        'Positive' Class : 0               
## 

Its accuracy is about 90%, which is well predicted model.