House Prices is a data set that has 128 observations (houses) and includes prices and characteristics of those houses in a major US metro area.
library (car)
library(lattice)
library(leaps)
library(nutshell)
## Loading required package: nutshell.bbdb
## Loading required package: nutshell.audioscrobbler
library(caret)
## Loading required package: ggplot2
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
library(class)
# Load the house-price data and draw a random training sample; the rows not
# drawn serve as the test set.
HP <- read.csv("~/DataMining/Data/HousePrices.csv")
n <- nrow(HP)   # number of houses
nt <- 114       # training-set size, leaving n - nt test rows
set.seed(1)     # fix the RNG so repeated runs draw the same split
train <- sample(seq_len(n), nt)

# Standardize the two predictor columns (columns 4 and 1 of HP) so that both
# contribute on a comparable scale to the distance used by knn.
# NOTE(review): column 1 prints as HomeID below, which looks like a row
# identifier rather than a house characteristic - confirm it is an intended
# predictor.
x <- scale(HP[, c(4, 1)])
x[1:3, ]
## Bedrooms HomeID
## [1,] -1.40978793 -1.711845
## [2,] 1.34521749 -1.684887
## [3,] -0.03228522 -1.657929
mean(x)
## [1] 3.873482e-18
sd(x)
## [1] 0.9980373

# Classify each test house by Neighborhood using its 1 and its 5 nearest
# training houses.
nearest1 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = HP$Neighborhood[train],
  k = 1
)
nearest5 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = HP$Neighborhood[train],
  k = 5
)

# Show the true neighborhood next to both sets of predictions.
head(data.frame(HP$Neighborhood[-train], nearest1, nearest5))
## HP.Neighborhood..train. nearest1 nearest5
## 1 East East North
## 2 East East East
## 3 North North North
## 4 East East North
## 5 North West West
## 6 East East North
Now we are going to calculate the percentage of correct classifications on the held-out test set for this one training/test split.
# Percentage of test houses whose predicted neighborhood matches the true one,
# for k = 1 and k = 5; n - nt is the number of test rows.
pcorrn1 <- 100 * sum(HP$Neighborhood[-train] == nearest1) / (n - nt)
pcorrn5 <- 100 * sum(HP$Neighborhood[-train] == nearest5) / (n - nt)
pcorrn1
## [1] 64.28571
pcorrn5
## [1] 57.14286
Now we compute Press's Q statistic, which tests whether the classification accuracy is better than chance.
# Press's Q tests whether hold-out classification accuracy beats chance:
#   Q = (N - n_correct * K)^2 / (N * (K - 1))
# where N is the number of cases classified, n_correct the number classified
# correctly and K the number of groups. Q is compared with the chi-square
# critical value on 1 degree of freedom.
nTest <- n - nt                              # N: only the test rows were classified
nGroups <- length(unique(HP$Neighborhood))   # K: number of neighborhoods

# k = 1
numCorrn1 <- sum(HP$Neighborhood[-train] == nearest1)
PressQ1 <- (nTest - numCorrn1 * nGroups)^2 / (nTest * (nGroups - 1))
PressQ1
## [1] 6.035714
qchisq(.95, 1)
## [1] 3.841459

# k = 5
numCorrn5 <- sum(HP$Neighborhood[-train] == nearest5)
PressQ5 <- (nTest - numCorrn5 * nGroups)^2 / (nTest * (nGroups - 1))
PressQ5
## [1] 3.571429
Next we use leave-one-out cross-validation to compare k = 1 through 10 neighbors.
# Leave-one-out cross-validation: for each k from 1 to 10, classify every house
# from the remaining houses and record the percentage classified correctly.
pcorr <- numeric(10)   # one slot per candidate k, allocated up front
for (k in seq_along(pcorr)) {
  pred <- knn.cv(x, HP$Neighborhood, k)
  pcorr[k] <- 100 * sum(HP$Neighborhood == pred) / n
}
pcorr
## [1] 47.65625 39.84375 44.53125 38.28125 45.31250 40.62500 42.18750
## [8] 44.53125 42.18750 42.18750
Based upon these results, k=1 gives the highest cross-validated accuracy, so that is the number of neighbors we will use. Now we will build a confusion matrix to assess the accuracy of our predictions.
# Pair the true and predicted neighborhoods for the test houses. The truth is
# converted to a factor with the same levels as the predictions, because
# confusionMatrix() needs both inputs to be factors with matching levels.
closest <- data.frame(
  truetype = factor(HP$Neighborhood[-train], levels = levels(nearest1)),
  predtype = nearest1
)
# Cross-tabulate the k = 1 predictions against the truth and report accuracy
# statistics.
confusionMatrix(data = closest$predtype, reference = closest$truetype)
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 5 0 0
## North 2 2 2
## West 0 1 2
##
## Overall Statistics
##
## Accuracy : 0.6429
## 95% CI : (0.3514, 0.8724)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.212
##
## Kappa : 0.4656
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.7143 0.6667 0.5000
## Specificity 1.0000 0.6364 0.9000
## Pos Pred Value 1.0000 0.3333 0.6667
## Neg Pred Value 0.7778 0.8750 0.8182
## Prevalence 0.5000 0.2143 0.2857
## Detection Rate 0.3571 0.1429 0.1429
## Detection Prevalence 0.3571 0.4286 0.2143
## Balanced Accuracy 0.8571 0.6515 0.7000
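Based upon our confusion matrix, the accuracy of this model is 64.3%. This is neither great nor bad: it is above the no-information rate of 50%, but with only 14 test houses the difference is not statistically significant (p = 0.212).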
# Read the reduced food-insecurity data.
food <- read.csv("~/DataMining/Data/ReducedFoodInsec.csv")
# Collapse FSRUNOUT to a binary outcome: codes 1 and 2 become 1, code 3
# becomes 0, and codes 7, 8 and 9 become missing.
food$FSRUNOUT <- recode(
  food$FSRUNOUT,
  "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA"
)
Now we are going to create a subset of variables to run our knn analysis on.
# Keep only the columns used in this analysis; data.frame() names them
# food.<column>, and those names are used below.
food2 <- data.frame(
  food$FM_SIZE,
  food$FM_EDUC1,
  food$HOUSEOWN,
  food$FMEDBILL,
  food$FSLAST,
  food$FSRUNOUT
)
head(food2)
## food.FM_SIZE food.FM_EDUC1 food.HOUSEOWN food.FMEDBILL food.FSLAST
## 1 3 8 2 1 3
## 2 12 5 1 1 3
## 3 3 6 1 1 2
## 4 6 8 1 1 3
## 5 4 8 1 2 3
## 6 4 6 1 2 3
## food.FSRUNOUT
## 1 0
## 2 1
## 3 1
## 4 1
## 5 0
## 6 0

# Draw a random training sample of 900 rows; the rest form the test set.
n <- nrow(food2)
nt <- 900
set.seed(1)   # same seed as before so the split is reproducible
train <- sample(seq_len(n), nt)

# Standardize the two predictors (columns 4 and 1 of food2).
x <- scale(food2[, c(4, 1)])
x[1:3, ]
## food.FMEDBILL food.FM_SIZE
## [1,] -1.805882 -1.08243
## [2,] -1.805882 5.25657
## [3,] -1.805882 -1.08243
mean(x)
## [1] 1.509512e-16
sd(x)
## [1] 0.9997858

# Classify the test rows on FSRUNOUT with 1 and with 5 nearest neighbors.
nearest1 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = food2$food.FSRUNOUT[train],
  k = 1
)
nearest5 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = food2$food.FSRUNOUT[train],
  k = 5
)

# Show the true outcome next to both sets of predictions.
head((data.frame(food2$food.FSRUNOUT[-train], nearest1, nearest5)))
## food2.food.FSRUNOUT..train. nearest1 nearest5
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
Once again we will calculate the percentage of correct classifications on the held-out test set for this specific training/test split.
# Percentage of test rows whose predicted FSRUNOUT matches the recorded value,
# for k = 1 and k = 5; n - nt is the number of test rows.
pcorrn1 <- 100 * sum(food2$food.FSRUNOUT[-train] == nearest1) / (n - nt)
pcorrn5 <- 100 * sum(food2$food.FSRUNOUT[-train] == nearest5) / (n - nt)
pcorrn1
## [1] 89.55224
pcorrn5
## [1] 90.29851
Now we compute Press's Q statistic, which tests whether the classification accuracy is better than chance.
# Press's Q tests whether hold-out classification accuracy beats chance:
#   Q = (N - n_correct * K)^2 / (N * (K - 1))
# where N is the number of cases classified, n_correct the number classified
# correctly and K the number of groups. Q is compared with the chi-square
# critical value on 1 degree of freedom.
nTest <- n - nt                                          # N: only the test rows were classified
nGroups <- length(unique(na.omit(food2$food.FSRUNOUT)))  # K: number of outcome classes

# k = 1
numCorrn1 <- sum(food2$food.FSRUNOUT[-train] == nearest1)
PressQ1 <- (nTest - numCorrn1 * nGroups)^2 / (nTest * (nGroups - 1))
PressQ1
## [1] 167.7015
qchisq(.95, 1)
## [1] 3.841459

# k = 5
numCorrn5 <- sum(food2$food.FSRUNOUT[-train] == nearest5)
PressQ5 <- (nTest - numCorrn5 * nGroups)^2 / (nTest * (nGroups - 1))
PressQ5
## [1] 174.0896
Next we again use leave-one-out cross-validation to compare k = 1 through 10 neighbors.
# Leave-one-out cross-validation: for each k from 1 to 10, classify every row
# from the remaining rows and record the percentage classified correctly.
pcorr <- numeric(10)   # one slot per candidate k, allocated up front
for (k in seq_along(pcorr)) {
  pred <- knn.cv(x, food2$food.FSRUNOUT, k)
  pcorr[k] <- 100 * sum(food2$food.FSRUNOUT == pred) / n
}
pcorr
## [1] 89.64041 89.81164 89.81164 89.98288 89.98288 89.98288 89.98288
## [8] 89.98288 89.98288 89.98288
Lastly we will work to test the accuracy of our model.
# Pair the true and predicted FSRUNOUT values for the test rows. The truth is
# converted to a factor with the same levels as the predictions, because
# confusionMatrix() needs both inputs to be factors with matching levels.
closest <- data.frame(
  truetype = factor(food2$food.FSRUNOUT[-train], levels = levels(nearest1)),
  predtype = nearest1
)
# Cross-tabulate the k = 1 predictions against the truth and report accuracy
# statistics.
confusionMatrix(data = closest$predtype, reference = closest$truetype)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 240 26
## 1 2 0
##
## Accuracy : 0.8955
## 95% CI : (0.8525, 0.9294)
## No Information Rate : 0.903
## P-Value [Acc > NIR] : 0.7041
##
## Kappa : -0.0141
## Mcnemar's Test P-Value : 1.383e-05
##
## Sensitivity : 0.9917
## Specificity : 0.0000
## Pos Pred Value : 0.9023
## Neg Pred Value : 0.0000
## Prevalence : 0.9030
## Detection Rate : 0.8955
## Detection Prevalence : 0.9925
## Balanced Accuracy : 0.4959
##
## 'Positive' Class : 0
##
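Based upon our confusion matrix, the accuracy of our model is 89.55%. Although this looks very accurate, it is slightly below the no-information rate of 90.3%, and the specificity of 0 shows that the model never correctly predicts class 1, so the high accuracy mostly reflects how common class 0 is.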