Our first data set is the house prices data again. We will use K-Nearest Neighbors (KNN) to classify the Neighborhood variable.
library(nutshell)
## Loading required package: nutshell.bbdb
## Loading required package: nutshell.audioscrobbler
library(MASS)
library(class)
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
library(textir)
## Warning: package 'textir' was built under R version 3.4.2
## Loading required package: distrom
## Warning: package 'distrom' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: gamlr
## Warning: package 'gamlr' was built under R version 3.4.2
## Loading required package: parallel
HP <- read.csv("~/Business Analytics/HousePrices.csv")
table(HP$Neighborhood)
##
## East North West
## 45 44 39
n = length(HP$Neighborhood)
nt = 114
set.seed(1)
train <- sample(1:n,nt)
x <- scale(HP[,c(4,1)])
mean(x)
## [1] 3.873482e-18
sd(x)
## [1] 0.9980373
nearest1 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=5)
data.frame(HP$Neighborhood[-train],nearest1,nearest5)
## HP.Neighborhood..train. nearest1 nearest5
## 1 East East North
## 2 East East East
## 3 North North North
## 4 East East North
## 5 North West West
## 6 East East North
## 7 East North North
## 8 East North North
## 9 North North North
## 10 West North West
## 11 West West West
## 12 West West West
## 13 West North West
## 14 East East East
This output shows how our predictions turned out using the 1 nearest neighbor and the 5 nearest neighbors, alongside the actual category for each test observation.
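Rather than scanning the whole data frame, a quick cross-tabulation summarizes how each set of predictions lines up with the actual neighborhoods; this is only a convenience check using the objects already created above.
table(actual = HP$Neighborhood[-train], predicted = nearest1)
table(actual = HP$Neighborhood[-train], predicted = nearest5)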
pcorrn1=100*sum(HP$Neighborhood[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(HP$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 64.28571
pcorrn5
## [1] 57.14286
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*6))^2)/(n*5)
PressQ1
## [1] 208.9796
qchisq(.95,5) ##critical value for chi-square with alpha = .05, k-1 df where k=6
## [1] 11.0705
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*6))^2)/(n*5)
PressQ5
## [1] 150.9878
These are the proportions of correct classifications and the Press's Q statistics for k = 1 and k = 5.
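For reference, Press's Q compares the number of correct classifications against chance: Q = (N - nK)^2 / (N(K - 1)), where N is the number of cases classified, n is the number classified correctly, and K is the number of groups. A small helper along these lines makes the formula explicit (press_q is a hypothetical name, not from any package):
press_q <- function(N, n_correct, K) (N - n_correct * K)^2 / (N * (K - 1))
press_q(N = n - nt, n_correct = sum(HP$Neighborhood[-train] == nearest1), K = 3) ##applied to the hold-out sample, K = 3 neighborhoods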
pcorr = numeric(10)
for (k in 1:10) {
  pred = knn.cv(x, HP$Neighborhood, k)
  pcorr[k] = 100 * sum(HP$Neighborhood == pred) / n
}
pcorr
## [1] 47.65625 39.84375 44.53125 38.28125 45.31250 40.62500 42.18750
## [8] 44.53125 42.18750 42.18750
This is our cross-validation; we will choose 1 nearest neighbor because it gives the highest percentage of correct predictions.
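Note that knn.cv() performs leave-one-out cross-validation: each observation is classified using all of the remaining observations as the training set. To pick k programmatically instead of reading the vector by eye, one line suffices (using the pcorr vector from the loop above):
which.max(pcorr) ##value of k with the highest cross-validated percent correct
max(pcorr) ##the corresponding percent correct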
closest <- data.frame(truetype=HP$Neighborhood[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=HP$Neighborhood[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 5 0 0
## North 2 2 2
## West 0 1 2
##
## Overall Statistics
##
## Accuracy : 0.6429
## 95% CI : (0.3514, 0.8724)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.212
##
## Kappa : 0.4656
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.7143 0.6667 0.5000
## Specificity 1.0000 0.6364 0.9000
## Pos Pred Value 1.0000 0.3333 0.6667
## Neg Pred Value 0.7778 0.8750 0.8182
## Prevalence 0.5000 0.2143 0.2857
## Detection Rate 0.3571 0.1429 0.1429
## Detection Prevalence 0.3571 0.4286 0.2143
## Balanced Accuracy 0.8571 0.6515 0.7000
Then there is the confusion matrix, which shows the accuracy (about 64%) is decent but not fantastic.
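For example, the by-class numbers for East follow directly from the matrix: 5 of the 7 East houses in the test set are predicted East (sensitivity 5/7 ≈ 0.714), none of the 7 non-East houses are predicted East (specificity 7/7 = 1.000), and the balanced accuracy is the average of the two, (0.714 + 1.000)/2 ≈ 0.857.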
Our second data set includes many variables that could be used to predict food insecurity. FSRUNOUT will be used as the dependent variable.
library(car)
## Warning: package 'car' was built under R version 3.4.2
RFI<- read.csv("~/Business Analytics/ReducedFoodInsec.csv")
table(RFI$FSRUNOUT)
##
## 1 2 3
## 30 87 1051
RFI$FSRUNOUT = recode(RFI$FSRUNOUT, "'1'=1;'2'=0;'3'=0;'7'=NA;'8'=NA;'9'=NA")
table(RFI$FSRUNOUT)
##
## 0 1
## 1138 30
To start, we recoded the FSRUNOUT variable so that only chronic food insecurity is coded 1, the other responses are coded 0, and the codes 7, 8, and 9 are set to missing.
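If car were not available, the same recoding could be written in base R; this is just an equivalent sketch of what recode() does above, not an additional step.
RFI$FSRUNOUT <- ifelse(RFI$FSRUNOUT %in% c(7, 8, 9), NA, ifelse(RFI$FSRUNOUT == 1, 1, 0))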
RFI2 <- data.frame(RFI$FM_SIZE, RFI$FM_EDUC1, RFI$HOUSEOWN, RFI$FMEDBILL, RFI$FANYLCT, RFI$FSRUNOUT, RFI$RAT_CAT4)
head(RFI2)
## RFI.FM_SIZE RFI.FM_EDUC1 RFI.HOUSEOWN RFI.FMEDBILL RFI.FANYLCT
## 1 3 8 2 1 0
## 2 12 5 1 1 2
## 3 3 6 1 1 0
## 4 6 8 1 1 1
## 5 4 8 1 2 1
## 6 4 6 1 2 1
## RFI.FSRUNOUT RFI.RAT_CAT4
## 1 0 8
## 2 0 3
## 3 0 13
## 4 0 7
## 5 0 14
## 6 0 14
dim(RFI2)
## [1] 1168 7
n = length(RFI2[,1])
nt = 900
set.seed(1) ##to make calculations reproducible in repeated runs
train <- sample(1:n,nt)
x <- scale(RFI2[,c(4,1)])
x[1:3,]
## RFI.FMEDBILL RFI.FM_SIZE
## [1,] -1.805882 -1.08243
## [2,] -1.805882 5.25657
## [3,] -1.805882 -1.08243
mean(x)
## [1] 1.509512e-16
sd(x)
## [1] 0.9997858
nearest1 <- knn(train=x[train,],test=x[-train,],cl=RFI2$RFI.FSRUNOUT[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=RFI2$RFI.FSRUNOUT[train],k=5)
data.frame(RFI2$RFI.FSRUNOUT[-train],nearest1,nearest5)
## RFI2.RFI.FSRUNOUT..train. nearest1 nearest5
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
## 15 0 0 0
## 16 0 0 0
## 17 0 0 0
## 18 0 0 0
## 19 1 0 0
## 20 0 0 0
## 21 0 0 0
## 22 0 0 0
## 23 0 0 0
## 24 0 0 0
## 25 0 0 0
## 26 0 0 0
## 27 0 0 0
## 28 0 0 0
## 29 0 0 0
## 30 0 0 0
## 31 0 0 0
## 32 0 0 0
## 33 0 0 0
## 34 0 0 0
## 35 0 0 0
## 36 0 0 0
## 37 0 0 0
## 38 0 0 0
## 39 0 0 0
## 40 1 0 0
## 41 0 0 0
## 42 0 0 0
## 43 0 0 0
## 44 0 0 0
## 45 0 0 0
## 46 0 0 0
## 47 0 0 0
## 48 0 0 0
## 49 1 0 0
## 50 0 0 0
## 51 0 0 0
## 52 0 0 0
## 53 0 0 0
## 54 0 0 0
## 55 0 0 0
## 56 0 0 0
## 57 0 0 0
## 58 0 0 0
## 59 0 0 0
## 60 0 0 0
## 61 0 0 0
## 62 0 0 0
## 63 0 0 0
## 64 0 0 0
## 65 0 0 0
## 66 0 0 0
## 67 0 0 0
## 68 0 0 0
## 69 0 0 0
## 70 0 0 0
## 71 0 0 0
## 72 0 0 0
## 73 0 0 0
## 74 0 0 0
## 75 0 0 0
## 76 0 0 0
## 77 0 0 0
## 78 0 0 0
## 79 0 0 0
## 80 0 0 0
## 81 0 0 0
## 82 0 0 0
## 83 0 0 0
## 84 0 0 0
## 85 0 0 0
## 86 0 0 0
## 87 0 0 0
## 88 0 0 0
## 89 0 0 0
## 90 0 0 0
## 91 0 0 0
## 92 0 0 0
## 93 0 0 0
## 94 0 0 0
## 95 0 0 0
## 96 0 0 0
## 97 0 0 0
## 98 0 0 0
## 99 0 0 0
## 100 0 0 0
## 101 0 0 0
## 102 0 0 0
## 103 1 0 0
## 104 0 0 0
## 105 0 0 0
## 106 0 0 0
## 107 0 0 0
## 108 0 0 0
## 109 0 0 0
## 110 0 0 0
## 111 0 0 0
## 112 1 0 0
## 113 0 0 0
## 114 0 0 0
## 115 0 0 0
## 116 0 0 0
## 117 0 0 0
## 118 0 0 0
## 119 0 0 0
## 120 0 0 0
## 121 0 0 0
## 122 0 0 0
## 123 0 0 0
## 124 0 0 0
## 125 0 0 0
## 126 0 0 0
## 127 0 0 0
## 128 0 0 0
## 129 0 0 0
## 130 0 0 0
## 131 0 0 0
## 132 0 0 0
## 133 0 0 0
## 134 0 0 0
## 135 0 0 0
## 136 0 0 0
## 137 0 0 0
## 138 0 0 0
## 139 0 0 0
## 140 0 0 0
## 141 0 0 0
## 142 0 0 0
## 143 0 0 0
## 144 0 0 0
## 145 0 0 0
## 146 0 0 0
## 147 0 0 0
## 148 0 0 0
## 149 0 0 0
## 150 0 0 0
## 151 0 0 0
## 152 0 0 0
## 153 0 0 0
## 154 0 0 0
## 155 0 0 0
## 156 0 0 0
## 157 0 0 0
## 158 0 0 0
## 159 0 0 0
## 160 0 0 0
## 161 0 0 0
## 162 0 0 0
## 163 0 0 0
## 164 0 0 0
## 165 0 0 0
## 166 0 0 0
## 167 0 0 0
## 168 0 0 0
## 169 0 0 0
## 170 0 0 0
## 171 0 0 0
## 172 0 0 0
## 173 0 0 0
## 174 0 0 0
## 175 0 0 0
## 176 0 0 0
## 177 0 0 0
## 178 0 0 0
## 179 0 0 0
## 180 0 0 0
## 181 0 0 0
## 182 0 0 0
## 183 0 0 0
## 184 0 0 0
## 185 0 0 0
## 186 0 0 0
## 187 0 0 0
## 188 0 0 0
## 189 0 0 0
## 190 0 0 0
## 191 0 0 0
## 192 0 0 0
## 193 0 0 0
## 194 0 0 0
## 195 0 0 0
## 196 0 0 0
## 197 0 0 0
## 198 0 0 0
## 199 0 0 0
## 200 0 0 0
## 201 0 0 0
## 202 0 0 0
## 203 0 0 0
## 204 0 0 0
## 205 0 0 0
## 206 0 0 0
## 207 0 0 0
## 208 0 0 0
## 209 0 0 0
## 210 0 0 0
## 211 0 0 0
## 212 0 0 0
## 213 0 0 0
## 214 0 0 0
## 215 0 0 0
## 216 0 0 0
## 217 0 0 0
## 218 0 0 0
## 219 0 0 0
## 220 0 0 0
## 221 0 0 0
## 222 0 0 0
## 223 0 0 0
## 224 0 0 0
## 225 0 0 0
## 226 0 0 0
## 227 0 0 0
## 228 0 0 0
## 229 0 0 0
## 230 0 0 0
## 231 0 0 0
## 232 0 0 0
## 233 0 0 0
## 234 0 0 0
## 235 1 0 0
## 236 0 0 0
## 237 0 0 0
## 238 0 0 0
## 239 0 0 0
## 240 1 0 0
## 241 0 0 0
## 242 0 0 0
## 243 0 0 0
## 244 0 0 0
## 245 0 0 0
## 246 0 0 0
## 247 0 0 0
## 248 0 0 0
## 249 0 0 0
## 250 0 0 0
## 251 0 0 0
## 252 0 0 0
## 253 0 0 0
## 254 0 0 0
## 255 0 0 0
## 256 0 0 0
## 257 0 0 0
## 258 0 0 0
## 259 0 0 0
## 260 0 0 0
## 261 0 0 0
## 262 0 0 0
## 263 0 1 0
## 264 0 0 0
## 265 0 0 0
## 266 0 0 0
## 267 0 0 0
## 268 0 0 0
Next, I chose a subset of variables and built a new data frame from them. Using two of those predictors, FMEDBILL and FM_SIZE (scaled), I ran KNN with k = 1 and k = 5; the output above compares those predictions to the actual values. Mostly zeros are predicted because the large majority of the data set consists of zeros.
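We can confirm that imbalance directly from the recoded variable; roughly 97% of responses are coded 0, so a model that always predicts 0 will already look very accurate.
prop.table(table(RFI2$RFI.FSRUNOUT)) ##share of 0's and 1's in the full data set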
pcorrn1=100*sum(RFI2$RFI.FSRUNOUT[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(RFI2$RFI.FSRUNOUT[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 97.01493
pcorrn5
## [1] 97.38806
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*6))^2)/(n*5)
PressQ1
## [1] 5429.105
qchisq(.95,5)
## [1] 11.0705
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*6))^2)/(n*5)
PressQ5
## [1] 5479.648
These are our proportions correct and Press's Q values for k = 1 and k = 5.
pcorr = numeric(10)
for (k in 1:10) {
  pred = knn.cv(x, RFI2$RFI.FSRUNOUT, k)
  pcorr[k] = 100 * sum(RFI2$RFI.FSRUNOUT == pred) / n
}
pcorr
## [1] 97.26027 97.26027 97.26027 97.43151 97.43151 97.43151 97.43151
## [8] 97.43151 97.43151 97.43151
closest <- data.frame(truetype=RFI2$RFI.FSRUNOUT[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=RFI2$RFI.FSRUNOUT[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 260 7
## 1 1 0
##
## Accuracy : 0.9701
## 95% CI : (0.942, 0.987)
## No Information Rate : 0.9739
## P-Value [Acc > NIR] : 0.7308
##
## Kappa : -0.0066
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9962
## Specificity : 0.0000
## Pos Pred Value : 0.9738
## Neg Pred Value : 0.0000
## Prevalence : 0.9739
## Detection Rate : 0.9701
## Detection Prevalence : 0.9963
## Balanced Accuracy : 0.4981
##
## 'Positive' Class : 0
##
Finally, the confusion matrix gives an accuracy of about 97%, which looks great, but the matrix shows we failed to predict chronic food insecurity in all 7 of the test cases where it occurred. This suggests we could choose better variables for our subset to get real predictive power.
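In fact, the accuracy is slightly below the no-information rate: 261 of the 268 test cases are 0, so always predicting 0 scores 261/268 ≈ 0.974, while the k = 1 model scores 260/268 ≈ 0.970; the negative Kappa reflects the same problem.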