# Try to save a local copy of the house-price data. The recorded run below
# shows this curl download failing; nothing in the visible script reads the
# local file, so the analysis continues from the URL either way.
download.file("https://www.biz.uiowa.edu/faculty/jledolter/datamining/HousePrices.csv", "HousePrices.csv",method="curl")
## Warning in download.file("https://www.biz.uiowa.edu/faculty/jledolter/
## datamining/HousePrices.csv", : download had nonzero exit status
# Read the data straight from the URL. stringsAsFactors = TRUE keeps the text
# columns (Brick, Neighborhood) as factors on R >= 4.0 as well, where the
# default became FALSE. The summary counts, plot colours, levels() call and
# confusionMatrix() further down all rely on Neighborhood being a factor.
hp <- read.csv(
  "https://www.biz.uiowa.edu/faculty/jledolter/datamining/HousePrices.csv",
  stringsAsFactors = TRUE
)
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
summary(hp)
## HomeID Price SqFt Bedrooms
## Min. : 1.00 Min. : 69100 Min. :1450 Min. :2.000
## 1st Qu.: 32.75 1st Qu.:111325 1st Qu.:1880 1st Qu.:3.000
## Median : 64.50 Median :125950 Median :2000 Median :3.000
## Mean : 64.50 Mean :130427 Mean :2001 Mean :3.023
## 3rd Qu.: 96.25 3rd Qu.:148250 3rd Qu.:2140 3rd Qu.:3.000
## Max. :128.00 Max. :211200 Max. :2590 Max. :5.000
## Bathrooms Offers Brick Neighborhood
## Min. :2.000 Min. :1.000 No :86 East :45
## 1st Qu.:2.000 1st Qu.:2.000 Yes:42 North:44
## Median :2.000 Median :3.000 West :39
## Mean :2.445 Mean :2.578
## 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :4.000 Max. :6.000
# Drop the first column (HomeID in the summary above); presumably it is only
# a row identifier - confirm before reusing this on other data.
hp <- hp[-1]
# Recode Brick from "Yes"/"No" to a numeric 1/0 indicator. The comparison is
# already vectorised, so no helper vectors of ones and zeros are needed.
hp$Brick <- as.numeric(hp$Brick == "Yes")
head(hp)
## Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 114300 1790 2 2 2 0 East
## 2 114200 2030 4 2 3 0 East
## 3 114800 1740 3 2 1 0 East
## 4 94700 1980 3 2 3 0 East
## 5 119800 2130 3 3 3 0 East
## 6 114600 1780 3 2 2 0 North
dim(hp)
## [1] 128 7
table(hp$Neighborhood)
##
## East North West
## 45 44 39
The purpose of this analysis is to create a model with knn to predict neighborhood. First, we have to recode the brick category to binary variables.
Based on these plots, there are generally significant differences between neighborhoods in house prices. Otherwise, the three neighborhoods consistently overlap.
# Hold-out design: nt of the n houses are drawn at random for training and the
# remaining n - nt form the test set. The seed makes the draw repeatable.
n <- nrow(hp)
nt <- 115
set.seed(1)
train <- sample(seq_len(n), size = nt)
# Standardise the two predictors used by the first model (columns 1 and 2 of
# hp, printed below as Price and SqFt) and show the first three rows.
x <- scale(hp[, 1:2])
x[1:3, ]
## Price SqFt
## [1,] -0.6002263 -0.9969990
## [2,] -0.6039481 0.1373643
## [3,] -0.5816174 -1.2333247
##check to make sure mean=0 and sd=1
# NOTE: mean(x) and sd(x) pool every entry of the matrix instead of checking
# each column. That is why sd prints 0.998 rather than exactly 1: each column
# has sd 1 with an n - 1 divisor, but the pooled vector of 2 * n values uses a
# 2 * n - 1 divisor. colMeans(x) and apply(x, 2, sd) give the per-column check.
mean(x)
## [1] 6.675467e-18
sd(x)
## [1] 0.9980373
Because there are large differences between the minimums and maximums of the price and square feet variables, it is best to normalize for the model.
library(class)
# Fit 1-NN and 5-NN classifiers of Neighborhood on the training rows and
# predict the held-out rows; the inputs are shared, only k differs.
train_x <- x[train, ]
test_x <- x[-train, ]
train_cl <- hp$Neighborhood[train]
nearest1 <- knn(train = train_x, test = test_x, cl = train_cl, k = 1)
nearest5 <- knn(train = train_x, test = test_x, cl = train_cl, k = 5)
# Actual neighbourhood next to both predictions for every test house.
data.frame(hp$Neighborhood[-train],nearest1,nearest5)
## hp.Neighborhood..train. nearest1 nearest5
## 1 East North North
## 2 East North North
## 3 North North North
## 4 East East East
## 5 North North East
## 6 East North North
## 7 East East North
## 8 East West West
## 9 North North East
## 10 West West West
## 11 West West West
## 12 West West West
## 13 East North North
In this table, the predictions of the KNN 1 and KNN 5 models are shown against the actual outcomes to determine each model’s success rate. Based on this table, it seems that each model is fairly accurate. However, we will plot them to see how they worked and calculate the proportion correct.
# Two side-by-side panels, one per classifier. Training houses are drawn with
# the default plotting symbol, coloured by their true neighbourhood; test
# houses are drawn as larger filled circles whose fill is the predicted
# neighbourhood.
par(mfrow = c(1, 2))
panels <- list(
  "1-nearest neighbor" = nearest1,
  "5-nearest neighbors" = nearest5
)
for (panel_title in names(panels)) {
  plot(x[train, ], col = hp$Neighborhood[train], cex = 0.8, main = panel_title)
  points(x[-train, ], bg = panels[[panel_title]], pch = 21, col = grey(0.9),
         cex = 1.25)
}
# The legend goes on the last panel drawn; fill colours 1..k follow the factor
# level order that col/bg use above.
legend("topright", legend = levels(hp$Neighborhood),
       fill = seq_along(levels(hp$Neighborhood)), bty = "n", cex = 0.75)
These plots show how each model classified each testing point. There are small differences between the two. Both generally predicted the same points to be “west” but there were some differences in where they classified “north” and “east”.
# Percentage of the n - nt hold-out houses whose neighbourhood is predicted
# correctly by the 1-NN and 5-NN models.
pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt)##calculate base classification correctness
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1##higher=better
## [1] 61.53846
pcorrn5##lower than pcorrn1 in the recorded run, so 1-NN does better on this hold-out set
## [1] 38.46154
The first value is the accuracy percentage of the KNN 1 model and the second is the accuracy percentage of the KNN 5 model. Looking at these values, the KNN 1 model is better because it was correct 61% of the time compared to 38%.
qchisq(.95,2)
## [1] 5.991465
# Press' Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number of
# cases that were actually classified, n_correct the number classified
# correctly and K the number of groups (3 neighbourhoods here). The accuracies
# above were measured on the n - nt hold-out houses, so N must be the hold-out
# size; plugging in the full n inflates Q by a factor of n / (n - nt).
n_test <- n - nt
n_groups <- 3
numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ1
## [1] 4.653846
numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ5
## [1] 0.1538462
The first value is the chi^2 value for the specific number of groups in this model (3). This is the value that the next two Press’ Q values need to be greater than. Press’ Q is computed on the 13 hold-out houses that were actually classified. The next value is the Press’ Q value for KNN 1, which is 4.65, and the last is the value for KNN 5, which is only 0.15. KNN 1 comes much closer to 5.99 than KNN 5, but with such a small test set neither value exceeds it, so neither model is shown to be better than chance by this test.
# Leave-one-out cross-validated accuracy (in percent) for k = 1..10 on the
# two-predictor matrix x. dim(10) evaluates to NULL, so the original never
# preallocated the result vector; numeric() does what was intended.
k_values <- 1:10
pcorr <- numeric(length(k_values))
for (k in k_values) {
  pred <- knn.cv(x, hp$Neighborhood, k = k)
  pcorr[k] <- 100 * sum(hp$Neighborhood == pred) / n
}
pcorr
## [1] 50.78125 48.43750 49.21875 57.81250 51.56250 57.03125 54.68750
## [8] 54.68750 56.25000 58.59375
These values are the percentages of correct classifications for each KNN model (k = 1 through 10). The highest percentage is associated with KNN 10. Therefore, further models should use a KNN of 10.
Now we normalize each variable.
# Second model: use all six predictors (columns 1-6 of hp), each centred on
# its mean and divided by its standard deviation.
standardise <- function(v) (v - mean(v)) / sd(v)
x <- hp[, 1:6]
x[] <- lapply(x, standardise)
# Refit the 1-NN and 5-NN classifiers on the same train/test split.
train_x <- x[train, ]
test_x <- x[-train, ]
train_cl <- hp$Neighborhood[train]
nearest1 <- knn(train = train_x, test = test_x, cl = train_cl, k = 1)
nearest5 <- knn(train = train_x, test = test_x, cl = train_cl, k = 5)
# Actual neighbourhood next to both predictions for every test house.
data.frame(hp$Neighborhood[-train],nearest1,nearest5)
## hp.Neighborhood..train. nearest1 nearest5
## 1 East North North
## 2 East East East
## 3 North East East
## 4 East East East
## 5 North East East
## 6 East North North
## 7 East North North
## 8 East East East
## 9 North North North
## 10 West West West
## 11 West West West
## 12 West West West
## 13 East North North
This new table has the same layout as the last one, but the models now use all six normalized predictors instead of only price and square footage. Looking at this table, the two models make the same prediction for every test house, even when that prediction is wrong.
# Hold-out accuracy (percent of the n - nt test houses classified correctly)
# for the 1-NN and 5-NN models refitted on all six standardised predictors.
pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 53.84615
pcorrn5
## [1] 53.84615
According to these two values, both models have the same proportion of correctness. That may mean that something is wrong with the normalized variables.
qchisq(.95,2)
## [1] 5.991465
# Press' Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number of
# cases that were actually classified, n_correct the number classified
# correctly and K the number of groups (3 neighbourhoods here). The accuracies
# were measured on the n - nt hold-out houses, so N must be the hold-out size;
# plugging in the full n inflates Q by a factor of n / (n - nt).
n_test <- n - nt
n_groups <- 3
numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ1
## [1] 2.461538
numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ5
## [1] 2.461538
These show the chi^2 value followed by the two Press’ Q values. Again, they are the same so normalizing the variables may have been unnecessary for this data set.
# Leave-one-out cross-validated accuracy (in percent) for k = 1..10 on the
# six standardised predictors in x. dim(10) evaluates to NULL, so the original
# never preallocated the result vector; numeric() does what was intended.
k_values <- 1:10
pcorr <- numeric(length(k_values))
for (k in k_values) {
  pred <- knn.cv(x, hp$Neighborhood, k = k)
  pcorr[k] <- 100 * sum(hp$Neighborhood == pred) / n
}
pcorr
## [1] 60.93750 53.90625 52.34375 54.68750 58.59375 57.03125 60.93750
## [8] 60.15625 58.59375 55.46875
According to the values listed in this cross-validation table, the best model to predict neighborhood would be KNN 1 and KNN 7.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Pair the true and the 1-NN predicted neighbourhood of each test house, then
# cross-tabulate the two columns with caret's accuracy statistics.
near1 <- data.frame(
  truetype = hp$Neighborhood[-train],
  predtype = nearest1
)
confusionMatrix(data = near1$predtype, reference = near1$truetype)
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 3 2 0
## North 4 1 0
## West 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.5385
## 95% CI : (0.2513, 0.8078)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 0.6115
##
## Kappa : 0.2909
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.4286 0.33333 1.0000
## Specificity 0.6667 0.60000 1.0000
## Pos Pred Value 0.6000 0.20000 1.0000
## Neg Pred Value 0.5000 0.75000 1.0000
## Prevalence 0.5385 0.23077 0.2308
## Detection Rate 0.2308 0.07692 0.2308
## Detection Prevalence 0.3846 0.38462 0.2308
## Balanced Accuracy 0.5476 0.46667 1.0000
This confusion matrix shows that the KNN 1 correctly predicted west every time but incorrectly categorized north as east very often. The sensitivity and specificity values also show how the model incorrectly categorizes north and east. The Kappa value, however, is only .29 and it is better if that value is close to 1. Therefore, the model is good at predicting west, but otherwise it is not very accurate.
In this dataset, we will use specific variables to try to predict food insecurity. Food security is marked by the value 0 and food insecurity is marked by the value 1. To do this, we first need to recode the variables: the codes that stand for missing answers are replaced with a substantive value, and the columns are given readable names.
fam<-read.csv("ReducedFoodInsec(cleaned).csv")
head(fam)
## TELCELN FM_SIZE FM_KIDS FM_ELDR FM_EDUC1 FSRUNOUT FSBALANC FNMEDYN
## 1 1 3 1 0 8 3 3 2
## 2 1 12 8 1 5 2 3 1
## 3 1 3 1 0 6 2 2 2
## 4 1 6 2 0 8 2 3 2
## 5 1 4 1 0 8 3 3 2
## 6 1 4 2 0 6 3 3 2
## F10DVYN FSALYN FSSRRYN FTANFYN INCGRP5 FSNAP FWICYN
## 1 2 1 2 2 2 2 2
## 2 2 1 2 2 2 1 1
## 3 2 1 2 2 3 2 2
## 4 2 1 1 2 2 1 1
## 5 2 1 2 1 4 1 1
## 6 2 1 2 2 4 2 2
summary(fam)
## TELCELN FM_SIZE FM_KIDS FM_ELDR
## Min. :1 Min. : 2.000 Min. :1.000 Min. :0.00000
## 1st Qu.:1 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.:0.00000
## Median :1 Median : 4.000 Median :2.000 Median :0.00000
## Mean :1 Mean : 4.537 Mean :2.166 Mean :0.09932
## 3rd Qu.:1 3rd Qu.: 5.000 3rd Qu.:3.000 3rd Qu.:0.00000
## Max. :1 Max. :12.000 Max. :9.000 Max. :2.00000
## FM_EDUC1 FSRUNOUT FSBALANC FNMEDYN
## Min. : 1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 5.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000
## Median : 8.000 Median :3.000 Median :3.000 Median :2.000
## Mean : 7.256 Mean :2.874 Mean :2.927 Mean :1.935
## 3rd Qu.: 9.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :99.000 Max. :3.000 Max. :3.000 Max. :2.000
## F10DVYN FSALYN FSSRRYN FTANFYN
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :2.000 Median :2.000
## Mean :1.781 Mean :1.139 Mean :2.015 Mean :2.076
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## INCGRP5 FSNAP FWICYN
## Min. : 1.00 Min. :1.000 Min. :1.000
## 1st Qu.: 2.00 1st Qu.:2.000 1st Qu.:2.000
## Median : 4.00 Median :2.000 Median :2.000
## Mean :13.81 Mean :1.988 Mean :1.942
## 3rd Qu.: 4.00 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :99.00 Max. :9.000 Max. :9.000
set.seed(1)
library(car)
library(norm)
library(lattice)
# Build the analysis data set from the raw survey columns in one step.
#
# Each variable gets a single car::recode() call instead of a chain of
# one-code-at-a-time calls. recode() tests every rule against the original
# values, so the combined rules give the same result as the chain did.
# The new frame is built by name, so it no longer depends on the raw file
# having exactly 15 columns (the old fam[, 16:30] selection did).
#
# Codes 7/8/9 (97/98/99 for education, 96/99 for income) are folded into a
# substantive category, exactly as before. Presumably these are the survey's
# "refused / not ascertained / don't know" codes - confirm against the codebook.
special_to_2 <- "c(7,8,9)=2"
fam <- data.frame(
  # 1 = food insecure (FSRUNOUT 1 or 2); 0 = FSRUNOUT 3 or a 7/8/9 code.
  FoodInsecurity   = car::recode(fam$FSRUNOUT, "2=1; c(3,7,8,9)=0"),
  Unbalanced_Meals = car::recode(fam$FSBALANC, special_to_2),
  Inc.Grp          = car::recode(fam$INCGRP5, "c(96,99)=2"),
  No_Care          = car::recode(fam$FNMEDYN, special_to_2),
  ten_Med_Visits   = car::recode(fam$F10DVYN, special_to_2),
  MemIncome        = car::recode(fam$FSALYN, special_to_2),
  Cell             = car::recode(fam$TELCELN, special_to_2),
  FamSz            = fam$FM_SIZE,
  SNAP             = car::recode(fam$FSNAP, special_to_2),
  EduLevl          = car::recode(fam$FM_EDUC1, "c(97,98,99)=4"),
  SSIMem           = car::recode(fam$FSSRRYN, special_to_2),
  WelfareMem       = car::recode(fam$FTANFYN, special_to_2),
  Num_of_kids      = fam$FM_KIDS,
  Num_of_Elders    = fam$FM_ELDR,
  WIC              = car::recode(fam$FWICYN, special_to_2)
)
head(fam)
## FoodInsecurity Unbalanced_Meals Inc.Grp No_Care ten_Med_Visits MemIncome
## 1 0 3 2 2 2 1
## 2 1 3 2 1 2 1
## 3 1 2 3 2 2 1
## 4 1 3 2 2 2 1
## 5 0 3 4 2 2 1
## 6 0 3 4 2 2 1
## Cell FamSz SNAP EduLevl SSIMem WelfareMem Num_of_kids Num_of_Elders WIC
## 1 1 3 2 8 2 2 1 0 2
## 2 1 12 1 5 2 2 8 1 1
## 3 1 3 2 6 2 2 1 0 2
## 4 1 6 1 8 1 2 2 0 1
## 5 1 4 1 8 2 1 1 0 1
## 6 1 4 2 6 2 2 2 0 2
table(fam$FoodInsecurity,fam$Unbalanced_Meals)
##
## 1 2 3
## 0 3 13 1035
## 1 10 46 61
table(fam$FoodInsecurity,fam$FamSz)
##
## 2 3 4 5 6 7 8 9 10 11 12
## 0 9 221 397 239 105 46 15 11 4 2 2
## 1 3 18 33 29 17 9 4 1 0 1 2
table(fam$FoodInsecurity,fam$Num_of_Elders)
##
## 0 1 2
## 0 967 66 18
## 1 103 14 0
table(fam$FoodInsecurity,fam$Num_of_kids)
##
## 1 2 3 4 5 6 7 8 9
## 0 318 427 205 67 21 4 6 2 1
## 1 30 43 27 10 4 1 0 2 0
table(fam$FoodInsecurity,fam$EduLevl)
##
## 1 2 3 4 5 6 7 8 9
## 0 6 17 6 93 120 72 56 339 342
## 1 1 8 3 25 35 11 7 16 11
table(fam$FoodInsecurity,fam$SSIMem)
##
## 1 2
## 0 80 971
## 1 19 98
table(fam$FoodInsecurity,fam$WelfareMem)
##
## 1 2
## 0 19 1032
## 1 4 113
table(fam$FoodInsecurity,fam$SNAP)
##
## 1 2
## 0 89 962
## 1 45 72
table(fam$FoodInsecurity,fam$MemIncome)
##
## 1 2
## 0 1000 51
## 1 111 6
table(fam$FoodInsecurity,fam$ten_Med_Visits)
##
## 1 2
## 0 230 821
## 1 33 84
table(fam$FoodInsecurity,fam$No_Care)
##
## 1 2
## 0 46 1005
## 1 30 87
table(fam$FoodInsecurity,fam$Inc.Grp)
##
## 1 2 3 4
## 0 46 389 176 440
## 1 31 61 13 12
table(fam$FoodInsecurity,fam$WIC)
##
## 1 2
## 0 161 890
## 1 44 73
These tables show the distribution of food insecurity among the different variables. For example, in the family income group table, there is a general trend that, as income increases, food insecurity levels decrease. However, because they are all split up into tables, it can be difficult to discern trends without a model.
library(textir)
library(MASS)
# Hold-out design for the food-insecurity data: nt of the n households are
# drawn at random for training, the remaining n - nt form the test set.
n <- nrow(fam)
nt <- 1100
set.seed(1)
train <- sample(seq_len(n), size = nt)
# Standardise the eight predictors used by the model (columns 2-6, 8, 9 and
# 13 of fam; their names are printed below) and show the first three rows.
x <- scale(fam[, c(2:6, 8, 9, 13)])
x[1:3, ]
## Unbalanced_Meals Inc.Grp No_Care ten_Med_Visits MemIncome
## [1,] 0.2428295 -0.8613364 0.2636996 0.5388496 -0.2264094
## [2,] 0.2428295 -0.8613364 -3.7889470 0.5388496 -0.2264094
## [3,] -3.0939334 0.1288614 0.2636996 0.5388496 -0.2264094
## FamSz SNAP Num_of_kids
## [1,] -1.08243 0.3598373 -1.027254
## [2,] 5.25657 -2.7766547 5.139288
## [3,] -1.08243 0.3598373 -1.027254
# Sanity check on the standardisation. NOTE: both calls pool all entries of
# the matrix rather than checking each column, which is why sd prints 0.9996
# instead of exactly 1 (the pooled divisor is the total count minus 1, not
# n - 1 per column). colMeans(x) and apply(x, 2, sd) give the per-column check.
mean(x)
## [1] 7.938838e-17
sd(x)
## [1] 0.9996253
Here, we normalize some of the variables that have larger differences between the minimum and the maximum to help the model.
library(class)
# Fit 1-NN and 5-NN classifiers of FoodInsecurity on the training rows and
# predict the held-out rows; the inputs are shared, only k differs.
train_x <- x[train, ]
test_x <- x[-train, ]
train_cl <- fam$FoodInsecurity[train]
nearest1 <- knn(train = train_x, test = test_x, cl = train_cl, k = 1)
nearest5 <- knn(train = train_x, test = test_x, cl = train_cl, k = 5)
# Actual status next to both predictions for every test household.
data.frame(fam$FoodInsecurity[-train],nearest1,nearest5)
## fam.FoodInsecurity..train. nearest1 nearest5
## 1 0 0 0
## 2 0 0 1
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0
## 8 1 0 0
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 1 0
## 13 0 0 0
## 14 1 0 0
## 15 0 0 0
## 16 0 0 0
## 17 0 0 0
## 18 0 0 0
## 19 0 0 0
## 20 0 0 0
## 21 0 0 0
## 22 0 0 0
## 23 0 0 0
## 24 0 0 0
## 25 0 0 0
## 26 0 0 0
## 27 1 1 1
## 28 0 0 0
## 29 0 0 0
## 30 0 0 0
## 31 0 0 0
## 32 0 0 0
## 33 0 0 0
## 34 0 0 0
## 35 0 1 0
## 36 0 0 0
## 37 0 0 0
## 38 0 0 0
## 39 0 0 0
## 40 0 0 0
## 41 1 1 1
## 42 0 0 0
## 43 0 0 0
## 44 0 0 0
## 45 0 0 0
## 46 0 0 0
## 47 0 0 0
## 48 0 0 0
## 49 0 0 0
## 50 0 0 0
## 51 0 0 0
## 52 0 0 0
## 53 0 0 0
## 54 0 0 0
## 55 0 0 0
## 56 0 0 0
## 57 0 0 0
## 58 0 0 0
## 59 0 0 0
## 60 0 0 0
## 61 0 0 0
## 62 0 0 0
## 63 0 0 0
## 64 0 0 0
## 65 0 0 0
## 66 0 0 0
## 67 1 0 0
## 68 0 0 0
# Two side-by-side panels, one per classifier. x has eight columns, and plot()
# on a matrix draws only the first two (Unbalanced_Meals against Inc.Grp).
par(mfrow = c(1, 2))
# FoodInsecurity is numeric 0/1. Used directly as a colour, 0 is the
# background colour, so every food-secure training point was invisible.
# Adding 1 maps the classes to colours 1 and 2, the same codes the factor
# predictions (levels "0", "1") use for the fill of the test points.
train_col <- fam$FoodInsecurity[train] + 1
plot(x[train, ], col = train_col, cex = 0.8, main = "1-nearest neighbor")
points(x[-train, ], bg = nearest1, pch = 21, col = grey(0.9), cex = 1.25)
plot(x[train, ], col = train_col, cex = 0.8, main = "5-nearest neighbors")
points(x[-train, ], bg = nearest5, pch = 21, col = grey(0.9), cex = 1.25)
This table shows the actual values in the test set and the predicted values from both of the models. There are a total of 68 test cases, so it is difficult to see exactly how each model is doing. So we will calculate the proportion of correct classifications. We will not plot each model because the models are on a binary grid.
# Percentage of the n - nt hold-out households classified correctly by the
# 1-NN and 5-NN models.
# NOTE: per the confusion matrix later in this file, 63 of the 68 hold-out
# households are food secure, so always predicting 0 would already score
# about 92.6%.
pcorrn1=100*sum(fam$FoodInsecurity[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(fam$FoodInsecurity[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 92.64706
pcorrn5
## [1] 94.11765
These values are the percentages of correct classifications for KNN 1 and KNN 5, respectively. As shown by the values, KNN 5 has the better proportion of correctness. However, both have very high values. This is due to the fact that there were a large number of 0s correctly predicted.
qchisq(.95,1)
## [1] 3.841459
# Press' Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number of
# cases that were actually classified, n_correct the number classified
# correctly and K the number of groups (2 here). The accuracies were measured
# on the n - nt hold-out households, so N must be the hold-out size; plugging
# in the full n inflates Q by a factor of n / (n - nt).
n_test <- n - nt
n_groups <- 2
numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ1
## [1] 49.47059
numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ5
## [1] 52.94118
First value is the chi^2 value for a data set with two groups. This value, 3.84 is far below the Press’ Q values of the models. This means that the models are both very good. Again, this is due to the large amount of 0s that both models correctly predicted.
# Leave-one-out cross-validated accuracy (in percent) for k = 1..10 on the
# standardised predictors in x. dim(10) evaluates to NULL, so the original
# never preallocated the result vector; numeric() does what was intended.
k_values <- 1:10
pcorr <- numeric(length(k_values))
for (k in k_values) {
  pred <- knn.cv(x, fam$FoodInsecurity, k = k)
  pcorr[k] <- 100 * sum(fam$FoodInsecurity == pred) / n
}
pcorr
## [1] 91.01027 92.20890 92.12329 92.20890 92.46575 93.15068 93.40753
## [8] 93.32192 93.40753 93.32192
These ten values are all extremely high, so any knn value would be good. However, the best one would be 7 or 9.
# Repeats the hold-out accuracy computation from earlier in the file.
# nearest1 and nearest5 are not refitted in between in the visible source, so
# the values are unchanged.
pcorrn1=100*sum(fam$FoodInsecurity[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(fam$FoodInsecurity[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 92.64706
pcorrn5
## [1] 94.11765
Even though the last table showed that neither model is good at predicting food insecurity, both models still have large proportions of correctness.
qchisq(.95,1)
## [1] 3.841459
# Press' Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number of
# cases that were actually classified, n_correct the number classified
# correctly and K the number of groups (2 here). The accuracies were measured
# on the n - nt hold-out households, so N must be the hold-out size; plugging
# in the full n inflates Q by a factor of n / (n - nt).
n_test <- n - nt
n_groups <- 2
numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ1
## [1] 49.47059
numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_groups)^2 / (n_test * (n_groups - 1))
PressQ5
## [1] 52.94118
Like before, the first value is the chi^2 value and the next two are the KNN 1 and KNN 5 respectively. Both are still way above the chi^2 value necessary.
library(caret)
# True and 5-NN predicted status of each test household, kept side by side.
near5 <- data.frame(truetype = fam$FoodInsecurity[-train], predtype = nearest5)
# knn() returns a factor, while FoodInsecurity is numeric 0/1. Current caret
# releases reject a non-factor reference, so the truth is converted to a
# factor with the same levels as the predictions before cross-tabulating.
# The recorded output below reports "0" (food secure) as the positive class,
# so its sensitivity describes food-secure households.
truth <- factor(fam$FoodInsecurity[-train], levels = levels(nearest5))
confusionMatrix(data = nearest5, reference = truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 62 3
## 1 1 2
##
## Accuracy : 0.9412
## 95% CI : (0.8562, 0.9837)
## No Information Rate : 0.9265
## P-Value [Acc > NIR] : 0.4338
##
## Kappa : 0.4708
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9841
## Specificity : 0.4000
## Pos Pred Value : 0.9538
## Neg Pred Value : 0.6667
## Prevalence : 0.9265
## Detection Rate : 0.9118
## Detection Prevalence : 0.9559
## Balanced Accuracy : 0.6921
##
## 'Positive' Class : 0
##
This classification matrix shows that the model has a high accuracy rate, as evidenced by the high accuracy statistic and the table. However, the Kappa value is still not close to 1 and the p-value is also high, neither of which is good. Overall, the model is not good at predicting food insecurity, which is what is prioritized.