Data Set: House Prices

This data set contains prices and characteristics of 128 houses in a major US metropolitan area. The plots below can suggest which characteristics are associated with the price of a home.

hp<-read.csv("c:/users/abbey/Desktop/Data Mining/HousePrices.csv")
hp[1:3,]
##   HomeID  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1      1 114300 1790        2         2      2    No         East
## 2      2 114200 2030        4         2      3    No         East
## 3      3 114800 1740        3         2      1    No         East
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
dim(hp)
## [1] 128   8
table(hp$Neighborhood)
## 
##  East North  West 
##    45    44    39
## dummy-code Brick as a 0/1 indicator (1 = brick house)
a1 = rep(1,length(hp$Neighborhood))
a2 = rep(0,length(hp$Neighborhood))
hp$BrickYes = ifelse(hp$Brick == "Yes",a1,a2)
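An equivalent, more compact way to build the same 0/1 indicator, as a sketch using only base R:

hp$BrickYes <- as.integer(hp$Brick == "Yes") ## 1 if brick, 0 otherwise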

The data set consists of 128 rows and 8 variables.

Here are illustrative box plots of the features stratified by Neighborhood.

par(mfrow=c(3,3), mai=c(.3,.6,.1,.1))
plot(HomeID ~ Neighborhood, data=hp, col=c(grey(.2),2:6)) ## HomeID is just an identifier, so this panel is not informative
plot(Price ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(SqFt ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Bedrooms ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Bathrooms ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Offers ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Brick ~ Neighborhood, data=hp, col=c(grey(.2),2:6))

We use nt = 100 training cases to find the nearest neighbors for the remaining 28 cases. These 28 cases become the evaluation (test, hold-out) cases.

n=length(hp$Neighborhood)
nt=100
set.seed(1) ## to make the calculations reproducible in repeated runs
train <- sample(1:n,nt)
x<-scale(hp[,c(2,3,4,5,6,9)]) ## standardize the six features (HomeID excluded)
x[1:3,]
##           Price       SqFt    Bedrooms  Bathrooms     Offers   BrickYes
## [1,] -0.6002263 -0.9969990 -1.40978793 -0.8655378 -0.5406451 -0.6961011
## [2,] -0.6039481  0.1373643  1.34521749 -0.8655378  0.3945248 -0.6961011
## [3,] -0.5816174 -1.2333247 -0.03228522 -0.8655378 -1.4758150 -0.6961011
## scale() has already standardized each column, so this loop is redundant;
## it re-centers and re-scales columns that are already mean 0, sd 1
for (j in 1:6) {
  x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}

## overall mean and standard deviation, pooled over all entries of x
mean(x)
## [1] 5.002153e-18
sd(x)
## [1] 0.9967352
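mean(x) and sd(x) pool all 128 x 6 entries into one sample. To check each column individually, a minimal sketch with base R's apply:

round(apply(x, 2, mean), 10) ## per-column means: all ~0 after scale()
apply(x, 2, sd)              ## per-column standard deviations: all exactly 1

The pooled sd of 0.9967 is sqrt(6*127/767), slightly below 1, because sd(x) treats all 768 entries as a single sample around the overall mean.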
library(class)  
nearest1 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=5)
knitr::kable(head(data.frame(hp$Neighborhood[-train],nearest1,nearest5)))
hp.Neighborhood..train.   nearest1   nearest5
East                      North      North
East                      East       North
West                      West       West
East                      North      North
East                      East       East
West                      West       West
On these first six test cases, 1-nearest-neighbor classified 4 of 6 correctly, while 5-nearest-neighbors classified only 3 of 6.
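These counts can be verified directly, as a quick sketch on the six displayed cases:

sum(head(hp$Neighborhood[-train] == nearest1, 6)) ## 4 correct for k=1
sum(head(hp$Neighborhood[-train] == nearest5, 6)) ## 3 correct for k=5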
par(mfrow=c(1,2))
## plot for k=1 (single) nearest neighbor
plot(x[train,],col=hp$Neighborhood[train],cex=.8,main="1-nearest neighbor")
points(x[-train,],bg=nearest1,pch=21,col=grey(.9),cex=1.25)
## plot for k=5 nearest neighbors
plot(x[train,],col=hp$Neighborhood[train],cex=.8,main="5-nearest neighbors")
points(x[-train,],bg=nearest5,pch=21,col=grey(.9),cex=1.25)
legend("topright",legend=levels(hp$Neighborhood),fill=1:6,bty="n",cex=.75)

In these graphs (plotted in the first two standardized features, Price and SqFt) we can see how each model classified each test point (the filled-in points). The two panels are broadly similar, but they differ in how some points are classified as North versus East. Next we calculate the proportion of correct classifications on the hold-out set for k = 1 and k = 5.

pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt) 
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 60.71429
pcorrn5
## [1] 46.42857

The proportion correct using 1-NN was better than using 5-NN. Calculating Press’ Q: Q = (N - nK)^2 / (N(K-1)), where N is the number of cases classified, n is the number classified correctly, and K is the number of groups (here K = 3).
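The statistic can be wrapped in a small helper, a sketch (pressQ is a hypothetical name, not from any package):

pressQ <- function(N, n, K) (N - n*K)^2 / (N*(K-1)) ## N cases, n correct, K groups
pressQ(128, (60.71429/100)*128, 3) ## reproduces PressQ1 below, ~43.18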

numCorrn1=(pcorrn1/100)*n ## hold-out accuracy rate applied to all n = 128 cases
PressQ1=((n-(numCorrn1*3))^2)/(n*2) ## K = 3 groups, so K-1 = 2
PressQ1
## [1] 43.18367
qchisq(.95,2) ## critical value for chi-square with alpha = 0.05 (use .95 in formula), d.f. = K-1 = 2 where K = 3
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 9.877551

Press’ Q for 1-NN exceeds the chi-square critical value (5.99), so it classifies significantly better than chance; it is also higher than Press’ Q for 5-NN, again favoring 1-NN here.
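For reference, an upper-tail p-value for the observed Q comes straight from the same chi-square distribution, a one-line sketch:

pchisq(PressQ1, df=2, lower.tail=FALSE) ## p-value for Q = 43.18 on 2 d.f.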

Leave-one-out cross-validation for k = 1 through 10:

pcorr=numeric(10) ## percent correct for each k
for (k in 1:10) {
  pred=knn.cv(x,hp$Neighborhood,k) ## leave-one-out CV predictions
  pcorr[k]=100*sum(hp$Neighborhood==pred)/n
}
pcorr
##  [1] 60.93750 57.81250 52.34375 54.68750 57.03125 57.81250 60.15625
##  [8] 58.59375 58.59375 60.93750

We would use k = 1 because it has the highest percentage in cross-validation (60.94%); k = 10 ties it, so it would also be a reasonable choice.
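To pick k programmatically, a one-line sketch (which.max returns the first index of the maximum, so ties resolve to the smallest k):

which.max(pcorr) ## first k with the highest CV percentage; here 1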

x<-scale(hp[,c(1,2,3,4,5,6,9)]) ## note: column 1 (HomeID) is now included as a feature
nearest1 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=5)
knitr::kable(head(data.frame(hp$Neighborhood[-train],nearest1,nearest5)))
hp.Neighborhood..train.   nearest1   nearest5
East                      North      East
East                      North      North
West                      West       West
East                      North      East
East                      East       East
West                      West       West
This turned out to be worse for 1-NN: with HomeID included as a feature, it classifies only 3 of these 6 test cases correctly (down from 4), while 5-NN now gets 5 of 6.
pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 57.14286
pcorrn5
## [1] 64.28571

This time the proportion correct using 5-NN (64.3%) was better than using 1-NN (57.1%).
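The same proportions drop out of one-liners with mean(), a minimal equivalent sketch:

100*mean(hp$Neighborhood[-train]==nearest1) ## 57.14, matches pcorrn1
100*mean(hp$Neighborhood[-train]==nearest5) ## 64.29, matches pcorrn5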

Press’ Q

numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*3))^2)/(n*2)
PressQ1
## [1] 32.65306
qchisq(.95,2)
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 55.18367

Both Press’ Q values are above the chi-square critical value, which means both models classify significantly better than chance. Another leave-one-out cross-validation:

pcorr=numeric(10)
for (k in 1:10) {
  pred=knn.cv(x,hp$Neighborhood,k)
  pcorr[k]=100*sum(hp$Neighborhood==pred)/n
}
pcorr
##  [1] 50.78125 50.00000 57.03125 53.12500 53.90625 50.00000 60.93750
##  [8] 63.28125 58.59375 60.15625

k = 8 has the highest percentage here (63.28%). Next we evaluate the 1-NN predictions with a confusion matrix.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)

near1<-data.frame(truetype=hp$Neighborhood[-train],predtype=nearest1) ## true vs. predicted labels, side by side
confusionMatrix(data=nearest1,reference=hp$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     4     2    0
##      North    8     4    0
##      West     2     0    8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5714          
##                  95% CI : (0.3718, 0.7554)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.2858          
##                                           
##                   Kappa : 0.3869          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.2857       0.6667      1.0000
## Specificity               0.8571       0.6364      0.9000
## Pos Pred Value            0.6667       0.3333      0.8000
## Neg Pred Value            0.5455       0.8750      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1429       0.1429      0.2857
## Detection Prevalence      0.2143       0.4286      0.3571
## Balanced Accuracy         0.5714       0.6515      0.9500

The same confusion matrix using 5-NN:

near5<-data.frame(truetype=hp$Neighborhood[-train],predtype=nearest5) ## true vs. predicted labels, side by side
confusionMatrix(data=nearest5,reference=hp$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     1    0
##      North    8     5    0
##      West     1     0    8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.4407, 0.8136)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.09247         
##                                           
##                   Kappa : 0.4909          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.3571       0.8333      1.0000
## Specificity               0.9286       0.6364      0.9500
## Pos Pred Value            0.8333       0.3846      0.8889
## Neg Pred Value            0.5909       0.9333      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1786       0.1786      0.2857
## Detection Prevalence      0.2143       0.4643      0.3214
## Balanced Accuracy         0.6429       0.7348      0.9750

The 5-NN model performs reasonably well: accuracy is 64.3%, above the 50% no-information rate. Its p-value (0.092) is not significant at the 5% level, but it is smaller than the corresponding p-value for 1-NN (0.286).
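These headline numbers can also be pulled out of the caret object directly, a small sketch (cm5 is just a local name):

cm5 <- confusionMatrix(data=nearest5, reference=hp$Neighborhood[-train])
cm5$overall["Accuracy"]       ## 0.6429
cm5$overall["AccuracyPValue"] ## P-Value [Acc > NIR], 0.0925
cm5$overall["Kappa"]          ## 0.4909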