HOMEPRICE ANALYSIS
knitr::kable(head(house))
| 1 |
114300 |
1790 |
2 |
2 |
2 |
No |
East |
| 2 |
114200 |
2030 |
4 |
2 |
3 |
No |
East |
| 3 |
114800 |
1740 |
3 |
2 |
1 |
No |
East |
| 4 |
94700 |
1980 |
3 |
2 |
3 |
No |
East |
| 5 |
119800 |
2130 |
3 |
3 |
3 |
No |
East |
| 6 |
114600 |
1780 |
3 |
2 |
2 |
No |
North |
n=length(house$Neighborhood)
nt=100
a1 = rep(1,length(house$Neighborhood))
a2 = rep(0,length(house$Neighborhood))
house$BrickYes = ifelse(house$Brick == "Yes",a1,a2)
train <- sample(1:n,nt)
x <- scale(house[,c(2,3,4,5,6,9)])
x[1:3]
## [1] -0.6002263 -0.6039481 -0.5816174
mean(x)
## [1] 6.38084e-18
sd(x)
## [1] 0.9967352
x=house[,c(2,3,4,5,6,9)]
for (j in 1:6) {
x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}
knitr::kable(x[1:3,])
| -0.6002263 |
-0.9969990 |
-1.4097879 |
-0.8655378 |
-0.5406451 |
-0.6961011 |
| -0.6039481 |
0.1373643 |
1.3452175 |
-0.8655378 |
0.3945248 |
-0.6961011 |
| -0.5816174 |
-1.2333247 |
-0.0322852 |
-0.8655378 |
-1.4758150 |
-0.6961011 |
nearest1 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=5)
knitr::kable(head(data.frame(house$Neighborhood[-train],nearest1,nearest5)))
| East |
North |
North |
| East |
East |
North |
| West |
West |
West |
| East |
North |
North |
| East |
East |
East |
| West |
West |
West |
pcorr=dim(10)
neighbors=dim(10) ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
pred=knn.cv(x,cl=house$Neighborhood,k)
pcorr[k]=100*sum(house$Neighborhood==pred)/1000
neighbors[k]=k ## Populates that count variable with k each time through
}
pcorr
## [1] 7.8 7.4 6.7 7.0 7.3 7.4 7.7 7.5 7.5 7.8
neighbors
## [1] 1 2 3 4 5 6 7 8 9 10
## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct)
accTable<-rbind(pcorr,neighbors)
knitr::kable(accTable)
| pcorr |
7.8 |
7.4 |
6.7 |
7 |
7.3 |
7.4 |
7.7 |
7.5 |
7.5 |
7.8 |
| neighbors |
1.0 |
2.0 |
3.0 |
4 |
5.0 |
6.0 |
7.0 |
8.0 |
9.0 |
10.0 |
kBest=neighbors[pcorr==maxAcc]
kBest
## [1] 1 10
Best number of k neighbors is 1 in this model, thus use k=1.
near1<- data.frame(truetype=house$Neighborhood[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=house$Neighborhood[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 5 2 0
## North 7 4 0
## West 2 0 8
##
## Overall Statistics
##
## Accuracy : 0.6071
## 95% CI : (0.4058, 0.785)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.1725
##
## Kappa : 0.4296
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.3571 0.6667 1.0000
## Specificity 0.8571 0.6818 0.9000
## Pos Pred Value 0.7143 0.3636 0.8000
## Neg Pred Value 0.5714 0.8824 1.0000
## Prevalence 0.5000 0.2143 0.2857
## Detection Rate 0.1786 0.1429 0.2857
## Detection Prevalence 0.2500 0.3929 0.3571
## Balanced Accuracy 0.6071 0.6742 0.9500
This model’s prediction accuracy is about 60%, which is not criticaly accurate. But it is beter than predictiong without any informations.
FOOD INSECURITY ANALYSIS
nhis = ReducedFoodInsec
nhis$FSRUNOUT = recode(nhis$FSRUNOUT, "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA")
nhis2 <-data.frame(nhis$INCGRP5,nhis$FLNGINTV,nhis$FM_EDUC1,nhis$HOUSEOWN,
nhis$FM_SIZE,nhis$FM_KIDS,nhis$FM_ELDR,nhis$FSRUNOUT)
head(nhis2)
## nhis.INCGRP5 nhis.FLNGINTV nhis.FM_EDUC1 nhis.HOUSEOWN nhis.FM_SIZE
## 1 2 1 8 2 3
## 2 2 1 5 1 12
## 3 3 1 6 1 3
## 4 2 1 8 1 6
## 5 4 1 8 1 4
## 6 4 1 6 1 4
## nhis.FM_KIDS nhis.FM_ELDR nhis.FSRUNOUT
## 1 1 0 0
## 2 8 1 1
## 3 1 0 1
## 4 2 0 1
## 5 1 0 0
## 6 2 0 0
dim(nhis2)
## [1] 1168 8
n2=length(nhis2[,1])
nt2=900
train <- sample(1:n2,nt2)
x2 <- scale(nhis2[,c(1:7)])
x2[1:3]
## [1] -0.3922275 -0.3922275 -0.3590254
mean(x2)
## [1] 3.812677e-17
sd(x2)
## [1] 0.999633
for (j in 1:6) {
x2[,j]=(x2[,j]-mean(x2[,j]))/sd(x2[,j])
}
x2 = nhis2[,c(1:7)]
near1 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=1)
near5 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=5)
near10 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=10)
knitr::kable(head(data.frame(nhis2$nhis.FSRUNOUT[-train],near1,near5,near10)))
| 0 |
1 |
0 |
0 |
| 1 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
pcorrn1=100*sum(nhis2$nhis.FSRUNOUT[-train]==near1)/(n-nt)
pcorrn5=100*sum(nhis2$nhis.FSRUNOUT[-train]==near5)/(n-nt)
pcorrn1
## [1] 839.2857
pcorrn5
## [1] 867.8571
pcorr=dim(10)
neighbors=dim(10) ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
pred=knn.cv(x2,cl=nhis2$nhis.FSRUNOUT,k)
pcorr[k]=100*sum(nhis2$nhis.FSRUNOUT==pred)/n2
neighbors[k]=k ## Populates that count variable with k each time through
}
pcorr
## [1] 86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
## [8] 89.46918 89.72603 89.89726
neighbors
## [1] 1 2 3 4 5 6 7 8 9 10
## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct)
accTable<-rbind(pcorr,neighbors)
accTable
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## pcorr 86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
## neighbors 1.00000 2.00000 3.00000 4.00000 5.00000 6.00000 7.00000
## [,8] [,9] [,10]
## pcorr 89.46918 89.72603 89.89726
## neighbors 8.00000 9.00000 10.00000
kBest=neighbors[pcorr==maxAcc]
kBest
## [1] 10
Best number of neighbor is 10 in this model
fsnear10<- data.frame(truetype=nhis2$nhis.FSRUNOUT[-train],predtype=near10)
confusionMatrix(data=near10,reference=nhis2$nhis.FSRUNOUT[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 240 25
## 1 2 1
##
## Accuracy : 0.8993
## 95% CI : (0.8568, 0.9326)
## No Information Rate : 0.903
## P-Value [Acc > NIR] : 0.631
##
## Kappa : 0.0499
## Mcnemar's Test P-Value : 2.297e-05
##
## Sensitivity : 0.99174
## Specificity : 0.03846
## Pos Pred Value : 0.90566
## Neg Pred Value : 0.33333
## Prevalence : 0.90299
## Detection Rate : 0.89552
## Detection Prevalence : 0.98881
## Balanced Accuracy : 0.51510
##
## 'Positive' Class : 0
##
Its accuracy is about 90%, which is well predicted model.