House Prices

Our first data set is the house prices one again. We will look at the K-Nearest Neighbors for the neighborhood variable.

library(nutshell)
## Loading required package: nutshell.bbdb
## Loading required package: nutshell.audioscrobbler
library(MASS)
library(class)
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
library(textir)
## Warning: package 'textir' was built under R version 3.4.2
## Loading required package: distrom
## Warning: package 'distrom' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: gamlr
## Warning: package 'gamlr' was built under R version 3.4.2
## Loading required package: parallel
HP <- read.csv("~/Business Analytics/HousePrices.csv")
table(HP$Neighborhood)
## 
##  East North  West 
##    45    44    39
n = length(HP$Neighborhood)
nt = 114
set.seed(1)
train <- sample(1:n,nt)

x <- scale(HP[,c(4,1)])
mean(x)
## [1] 3.873482e-18
sd(x)
## [1] 0.9980373
nearest1 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=5)
data.frame(HP$Neighborhood[-train],nearest1,nearest5)
##    HP.Neighborhood..train. nearest1 nearest5
## 1                     East     East    North
## 2                     East     East     East
## 3                    North    North    North
## 4                     East     East    North
## 5                    North     West     West
## 6                     East     East    North
## 7                     East    North    North
## 8                     East    North    North
## 9                    North    North    North
## 10                    West    North     West
## 11                    West     West     West
## 12                    West     West     West
## 13                    West    North     West
## 14                    East     East     East

This output shows us how our predictions went use the 1 nearest neighbor and the 5 nearest neighbors along with the actual category.

pcorrn1=100*sum(HP$Neighborhood[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(HP$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 64.28571
pcorrn5
## [1] 57.14286
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*6))^2)/(n*5)
PressQ1
## [1] 208.9796
qchisq(.95,5) ##critical value for chi-square with alpha = .05, k-1 df where k=6
## [1] 11.0705
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*6))^2)/(n*5)
PressQ5
## [1] 150.9878

These are the proportion of correct classifications and the Press’s Q for both k=1 and 5.

pcorr=dim(10)
for (k in 1:10) {
  pred=knn.cv(x,HP$Neighborhood,k)
  pcorr[k]=100*sum(HP$Neighborhood==pred)/n
}
pcorr
##  [1] 47.65625 39.84375 44.53125 38.28125 45.31250 40.62500 42.18750
##  [8] 44.53125 42.18750 42.18750

This is our cross validation and we will chose 1 nearest neighbor because it is better at predicting.

closest <- data.frame(truetype=HP$Neighborhood[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=HP$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     0    0
##      North    2     2    2
##      West     0     1    2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.3514, 0.8724)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.212           
##                                           
##                   Kappa : 0.4656          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.7143       0.6667      0.5000
## Specificity               1.0000       0.6364      0.9000
## Pos Pred Value            1.0000       0.3333      0.6667
## Neg Pred Value            0.7778       0.8750      0.8182
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.3571       0.1429      0.1429
## Detection Prevalence      0.3571       0.4286      0.2143
## Balanced Accuracy         0.8571       0.6515      0.7000

Then there is the confusion matrix which shows the accuracy is decent but not fantastic.

Food Insecurity

This data set includes many variables trying to predict food insecurity. FSRUNOUT will be used as the dependent variable.

library(car)
## Warning: package 'car' was built under R version 3.4.2
RFI<- read.csv("~/Business Analytics/ReducedFoodInsec.csv")
table(RFI$FSRUNOUT)
## 
##    1    2    3 
##   30   87 1051
RFI$FSRUNOUT = recode(RFI$FSRUNOUT, "'1'=1;'2'=0;'3'=0;'7'=NA;'8'=NA;'9'=NA")
table(RFI$FSRUNOUT)
## 
##    0    1 
## 1138   30

To start we recoded our FSRUNOUT variable to be 1’s for only chronic food insecurity and the rest as 0’s.

RFI2 <- data.frame(RFI$FM_SIZE, RFI$FM_EDUC1, RFI$HOUSEOWN, RFI$FMEDBILL, RFI$FANYLCT, RFI$FSRUNOUT, RFI$RAT_CAT4)
head(RFI2)
##   RFI.FM_SIZE RFI.FM_EDUC1 RFI.HOUSEOWN RFI.FMEDBILL RFI.FANYLCT
## 1           3            8            2            1           0
## 2          12            5            1            1           2
## 3           3            6            1            1           0
## 4           6            8            1            1           1
## 5           4            8            1            2           1
## 6           4            6            1            2           1
##   RFI.FSRUNOUT RFI.RAT_CAT4
## 1            0            8
## 2            0            3
## 3            0           13
## 4            0            7
## 5            0           14
## 6            0           14
dim(RFI2)
## [1] 1168    7
n = length(RFI2[,1])
nt = 900
set.seed(1) ##to make calculations reproducible in repeated runs
train <- sample(1:n,nt)

x <- scale(RFI2[,c(4,1)])
x[1:3,]
##      RFI.FMEDBILL RFI.FM_SIZE
## [1,]    -1.805882    -1.08243
## [2,]    -1.805882     5.25657
## [3,]    -1.805882    -1.08243
mean(x)
## [1] 1.509512e-16
sd(x)
## [1] 0.9997858
nearest1 <- knn(train=x[train,],test=x[-train,],cl=RFI2$RFI.FSRUNOUT[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=RFI2$RFI.FSRUNOUT[train],k=5)
data.frame(RFI2$RFI.FSRUNOUT[-train],nearest1,nearest5)
##     RFI2.RFI.FSRUNOUT..train. nearest1 nearest5
## 1                           0        0        0
## 2                           0        0        0
## 3                           0        0        0
## 4                           0        0        0
## 5                           0        0        0
## 6                           0        0        0
## 7                           0        0        0
## 8                           0        0        0
## 9                           0        0        0
## 10                          0        0        0
## 11                          0        0        0
## 12                          0        0        0
## 13                          0        0        0
## 14                          0        0        0
## 15                          0        0        0
## 16                          0        0        0
## 17                          0        0        0
## 18                          0        0        0
## 19                          1        0        0
## 20                          0        0        0
## 21                          0        0        0
## 22                          0        0        0
## 23                          0        0        0
## 24                          0        0        0
## 25                          0        0        0
## 26                          0        0        0
## 27                          0        0        0
## 28                          0        0        0
## 29                          0        0        0
## 30                          0        0        0
## 31                          0        0        0
## 32                          0        0        0
## 33                          0        0        0
## 34                          0        0        0
## 35                          0        0        0
## 36                          0        0        0
## 37                          0        0        0
## 38                          0        0        0
## 39                          0        0        0
## 40                          1        0        0
## 41                          0        0        0
## 42                          0        0        0
## 43                          0        0        0
## 44                          0        0        0
## 45                          0        0        0
## 46                          0        0        0
## 47                          0        0        0
## 48                          0        0        0
## 49                          1        0        0
## 50                          0        0        0
## 51                          0        0        0
## 52                          0        0        0
## 53                          0        0        0
## 54                          0        0        0
## 55                          0        0        0
## 56                          0        0        0
## 57                          0        0        0
## 58                          0        0        0
## 59                          0        0        0
## 60                          0        0        0
## 61                          0        0        0
## 62                          0        0        0
## 63                          0        0        0
## 64                          0        0        0
## 65                          0        0        0
## 66                          0        0        0
## 67                          0        0        0
## 68                          0        0        0
## 69                          0        0        0
## 70                          0        0        0
## 71                          0        0        0
## 72                          0        0        0
## 73                          0        0        0
## 74                          0        0        0
## 75                          0        0        0
## 76                          0        0        0
## 77                          0        0        0
## 78                          0        0        0
## 79                          0        0        0
## 80                          0        0        0
## 81                          0        0        0
## 82                          0        0        0
## 83                          0        0        0
## 84                          0        0        0
## 85                          0        0        0
## 86                          0        0        0
## 87                          0        0        0
## 88                          0        0        0
## 89                          0        0        0
## 90                          0        0        0
## 91                          0        0        0
## 92                          0        0        0
## 93                          0        0        0
## 94                          0        0        0
## 95                          0        0        0
## 96                          0        0        0
## 97                          0        0        0
## 98                          0        0        0
## 99                          0        0        0
## 100                         0        0        0
## 101                         0        0        0
## 102                         0        0        0
## 103                         1        0        0
## 104                         0        0        0
## 105                         0        0        0
## 106                         0        0        0
## 107                         0        0        0
## 108                         0        0        0
## 109                         0        0        0
## 110                         0        0        0
## 111                         0        0        0
## 112                         1        0        0
## 113                         0        0        0
## 114                         0        0        0
## 115                         0        0        0
## 116                         0        0        0
## 117                         0        0        0
## 118                         0        0        0
## 119                         0        0        0
## 120                         0        0        0
## 121                         0        0        0
## 122                         0        0        0
## 123                         0        0        0
## 124                         0        0        0
## 125                         0        0        0
## 126                         0        0        0
## 127                         0        0        0
## 128                         0        0        0
## 129                         0        0        0
## 130                         0        0        0
## 131                         0        0        0
## 132                         0        0        0
## 133                         0        0        0
## 134                         0        0        0
## 135                         0        0        0
## 136                         0        0        0
## 137                         0        0        0
## 138                         0        0        0
## 139                         0        0        0
## 140                         0        0        0
## 141                         0        0        0
## 142                         0        0        0
## 143                         0        0        0
## 144                         0        0        0
## 145                         0        0        0
## 146                         0        0        0
## 147                         0        0        0
## 148                         0        0        0
## 149                         0        0        0
## 150                         0        0        0
## 151                         0        0        0
## 152                         0        0        0
## 153                         0        0        0
## 154                         0        0        0
## 155                         0        0        0
## 156                         0        0        0
## 157                         0        0        0
## 158                         0        0        0
## 159                         0        0        0
## 160                         0        0        0
## 161                         0        0        0
## 162                         0        0        0
## 163                         0        0        0
## 164                         0        0        0
## 165                         0        0        0
## 166                         0        0        0
## 167                         0        0        0
## 168                         0        0        0
## 169                         0        0        0
## 170                         0        0        0
## 171                         0        0        0
## 172                         0        0        0
## 173                         0        0        0
## 174                         0        0        0
## 175                         0        0        0
## 176                         0        0        0
## 177                         0        0        0
## 178                         0        0        0
## 179                         0        0        0
## 180                         0        0        0
## 181                         0        0        0
## 182                         0        0        0
## 183                         0        0        0
## 184                         0        0        0
## 185                         0        0        0
## 186                         0        0        0
## 187                         0        0        0
## 188                         0        0        0
## 189                         0        0        0
## 190                         0        0        0
## 191                         0        0        0
## 192                         0        0        0
## 193                         0        0        0
## 194                         0        0        0
## 195                         0        0        0
## 196                         0        0        0
## 197                         0        0        0
## 198                         0        0        0
## 199                         0        0        0
## 200                         0        0        0
## 201                         0        0        0
## 202                         0        0        0
## 203                         0        0        0
## 204                         0        0        0
## 205                         0        0        0
## 206                         0        0        0
## 207                         0        0        0
## 208                         0        0        0
## 209                         0        0        0
## 210                         0        0        0
## 211                         0        0        0
## 212                         0        0        0
## 213                         0        0        0
## 214                         0        0        0
## 215                         0        0        0
## 216                         0        0        0
## 217                         0        0        0
## 218                         0        0        0
## 219                         0        0        0
## 220                         0        0        0
## 221                         0        0        0
## 222                         0        0        0
## 223                         0        0        0
## 224                         0        0        0
## 225                         0        0        0
## 226                         0        0        0
## 227                         0        0        0
## 228                         0        0        0
## 229                         0        0        0
## 230                         0        0        0
## 231                         0        0        0
## 232                         0        0        0
## 233                         0        0        0
## 234                         0        0        0
## 235                         1        0        0
## 236                         0        0        0
## 237                         0        0        0
## 238                         0        0        0
## 239                         0        0        0
## 240                         1        0        0
## 241                         0        0        0
## 242                         0        0        0
## 243                         0        0        0
## 244                         0        0        0
## 245                         0        0        0
## 246                         0        0        0
## 247                         0        0        0
## 248                         0        0        0
## 249                         0        0        0
## 250                         0        0        0
## 251                         0        0        0
## 252                         0        0        0
## 253                         0        0        0
## 254                         0        0        0
## 255                         0        0        0
## 256                         0        0        0
## 257                         0        0        0
## 258                         0        0        0
## 259                         0        0        0
## 260                         0        0        0
## 261                         0        0        0
## 262                         0        0        0
## 263                         0        1        0
## 264                         0        0        0
## 265                         0        0        0
## 266                         0        0        0
## 267                         0        0        0
## 268                         0        0        0

Then I chose a subset of variables to use and made a new data set using those variables. Then Using that data set ran KNN using k=1 and 5, as you can see in the chart how our predictions looked. As you can see, mostly zero’s are predicted because a large majority of the data set contains zero’s.

pcorrn1=100*sum(RFI2$RFI.FSRUNOUT[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(RFI2$RFI.FSRUNOUT[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 97.01493
pcorrn5
## [1] 97.38806
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*6))^2)/(n*5)
PressQ1
## [1] 5429.105
qchisq(.95,5)
## [1] 11.0705
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*6))^2)/(n*5)
PressQ5
## [1] 5479.648

These are our proportion correct and our Press’s Q values.

pcorr=dim(10)
for (k in 1:10) {
  pred=knn.cv(x,RFI2$RFI.FSRUNOUT,k)
  pcorr[k]=100*sum(RFI2$RFI.FSRUNOUT==pred)/n
}
pcorr
##  [1] 97.26027 97.26027 97.26027 97.43151 97.43151 97.43151 97.43151
##  [8] 97.43151 97.43151 97.43151
closest <- data.frame(truetype=RFI2$RFI.FSRUNOUT[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=RFI2$RFI.FSRUNOUT[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 260   7
##          1   1   0
##                                         
##                Accuracy : 0.9701        
##                  95% CI : (0.942, 0.987)
##     No Information Rate : 0.9739        
##     P-Value [Acc > NIR] : 0.7308        
##                                         
##                   Kappa : -0.0066       
##  Mcnemar's Test P-Value : 0.0771        
##                                         
##             Sensitivity : 0.9962        
##             Specificity : 0.0000        
##          Pos Pred Value : 0.9738        
##          Neg Pred Value : 0.0000        
##              Prevalence : 0.9739        
##          Detection Rate : 0.9701        
##    Detection Prevalence : 0.9963        
##       Balanced Accuracy : 0.4981        
##                                         
##        'Positive' Class : 0             
## 

Then finally after running our confusion matrix we got an accuracy of 97% which is great but if you look at the matrix, we failed to predict when someone had severe food insecurity all 7 instances. This shows that maybe we could choose better variables for our subset to have better predictive power.