Data Set: House Prices

This data set contains prices and characteristics of 128 houses in a major US metropolitan area. The plots below can suggest which characteristics are associated with the price of a home.

hp<-read.csv("c:/users/abbey/Desktop/Data Mining/HousePrices.csv")
hp[1:3,]
##   HomeID  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1      1 114300 1790        2         2      2    No         East
## 2      2 114200 2030        4         2      3    No         East
## 3      3 114800 1740        3         2      1    No         East
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
dim(hp)
## [1] 128   8
table(hp$Neighborhood)
## 
##  East North  West 
##    45    44    39
## dummy-code Brick as a 0/1 indicator (1 = brick house)
a1 = rep(1,length(hp$Neighborhood))
a2 = rep(0,length(hp$Neighborhood))
hp$BrickYes = ifelse(hp$Brick == "Yes",a1,a2)
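An equivalent, more compact way to build the same 0/1 indicator, as a sketch using only base R:

hp$BrickYes <- as.integer(hp$Brick == "Yes") ## 1 if brick, 0 otherwise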

The data set consists of 128 rows and 8 variables.

Here are illustrative box plots of the features stratified by Neighborhood.

par(mfrow=c(3,3), mai=c(.3,.6,.1,.1))
plot(HomeID ~ Neighborhood, data=hp, col=c(grey(.2),2:6)) ## HomeID is just an identifier, so this panel is not informative
plot(Price ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(SqFt ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Bedrooms ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Bathrooms ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Offers ~ Neighborhood, data=hp, col=c(grey(.2),2:6))
plot(Brick ~ Neighborhood, data=hp, col=c(grey(.2),2:6))

We use nt = 100 training cases to find the nearest neighbors for the remaining 28 cases. These 28 cases become the evaluation (test, hold-out) cases.

n=length(hp$Neighborhood)
nt=100
set.seed(1) ## to make the calculations reproducible in repeated runs
train <- sample(1:n,nt)
x<-scale(hp[,c(2,3,4,5,6,9)]) ## standardize the six features (HomeID excluded)
x[1:3,]
##           Price       SqFt    Bedrooms  Bathrooms     Offers   BrickYes
## [1,] -0.6002263 -0.9969990 -1.40978793 -0.8655378 -0.5406451 -0.6961011
## [2,] -0.6039481  0.1373643  1.34521749 -0.8655378  0.3945248 -0.6961011
## [3,] -0.5816174 -1.2333247 -0.03228522 -0.8655378 -1.4758150 -0.6961011
## scale() has already standardized each column, so this loop is redundant;
## it re-centers and re-scales columns that are already mean 0, sd 1
for (j in 1:6) {
  x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}

## overall mean and standard deviation, pooled over all entries of x
mean(x)
## [1] 5.002153e-18
sd(x)
## [1] 0.9967352
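mean(x) and sd(x) pool all 128 x 6 entries into one sample. To check each column individually, a minimal sketch with base R's apply:

round(apply(x, 2, mean), 10) ## per-column means: all ~0 after scale()
apply(x, 2, sd)              ## per-column standard deviations: all exactly 1

The pooled sd of 0.9967 is sqrt(6*127/767), slightly below 1, because sd(x) treats all 768 entries as a single sample around the overall mean.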
library(class)  
nearest1 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=5)
knitr::kable(head(data.frame(hp$Neighborhood[-train],nearest1,nearest5)))
hp.Neighborhood..train.   nearest1   nearest5
East                      North      North
East                      East       North
West                      West       West
East                      North      North
East                      East       East
West                      West       West
On these first six test cases, 1-nearest-neighbor classified 4 of 6 correctly, while 5-nearest-neighbors classified only 3 of 6.
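These counts can be verified directly, as a quick sketch on the six displayed cases:

sum(head(hp$Neighborhood[-train] == nearest1, 6)) ## 4 correct for k=1
sum(head(hp$Neighborhood[-train] == nearest5, 6)) ## 3 correct for k=5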
par(mfrow=c(1,2))
## plot for k=1 (single) nearest neighbor
plot(x[train,],col=hp$Neighborhood[train],cex=.8,main="1-nearest neighbor")
points(x[-train,],bg=nearest1,pch=21,col=grey(.9),cex=1.25)
## plot for k=5 nearest neighbors
plot(x[train,],col=hp$Neighborhood[train],cex=.8,main="5-nearest neighbors")
points(x[-train,],bg=nearest5,pch=21,col=grey(.9),cex=1.25)
legend("topright",legend=levels(hp$Neighborhood),fill=1:6,bty="n",cex=.75)

In these graphs (plotted in the first two standardized features, Price and SqFt) we can see how each model classified each test point (the filled-in points). The two panels are broadly similar, but they differ in how some points are classified as North versus East. Next we calculate the proportion of correct classifications on the hold-out set for k = 1 and k = 5.

pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt) 
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 60.71429
pcorrn5
## [1] 46.42857

The proportion correct using 1-NN was better than using 5-NN. Calculating Press’ Q: Q = (N - nK)^2 / (N(K-1)), where N is the number of cases classified, n is the number classified correctly, and K is the number of groups (here K = 3).
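The statistic can be wrapped in a small helper, a sketch (pressQ is a hypothetical name, not from any package):

pressQ <- function(N, n, K) (N - n*K)^2 / (N*(K-1)) ## N cases, n correct, K groups
pressQ(128, (60.71429/100)*128, 3) ## reproduces PressQ1 below, ~43.18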

numCorrn1=(pcorrn1/100)*n ## hold-out accuracy rate applied to all n = 128 cases
PressQ1=((n-(numCorrn1*3))^2)/(n*2) ## K = 3 groups, so K-1 = 2
PressQ1
## [1] 43.18367
qchisq(.95,2) ## critical value for chi-square with alpha = 0.05 (use .95 in formula), d.f. = K-1 = 2 where K = 3
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 9.877551

Press’ Q for 1-NN exceeds the chi-square critical value (5.99), so it classifies significantly better than chance; it is also higher than Press’ Q for 5-NN, again favoring 1-NN here.
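For reference, an upper-tail p-value for the observed Q comes straight from the same chi-square distribution, a one-line sketch:

pchisq(PressQ1, df=2, lower.tail=FALSE) ## p-value for Q = 43.18 on 2 d.f.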

Leave-one-out cross-validation for k = 1 through 10:

pcorr=numeric(10) ## percent correct for each k
for (k in 1:10) {
  pred=knn.cv(x,hp$Neighborhood,k) ## leave-one-out CV predictions
  pcorr[k]=100*sum(hp$Neighborhood==pred)/n
}
pcorr
##  [1] 60.93750 57.81250 52.34375 54.68750 57.03125 57.81250 60.15625
##  [8] 58.59375 58.59375 60.93750

We would use k = 1 because it has the highest percentage in cross-validation (60.94%); k = 10 ties it, so it would also be a reasonable choice.
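To pick k programmatically, a one-line sketch (which.max returns the first index of the maximum, so ties resolve to the smallest k):

which.max(pcorr) ## first k with the highest CV percentage; here 1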

x<-scale(hp[,c(1,2,3,4,5,6,9)]) ## note: column 1 (HomeID) is now included as a feature
nearest1 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=hp$Neighborhood[train],k=5)
knitr::kable(head(data.frame(hp$Neighborhood[-train],nearest1,nearest5)))
hp.Neighborhood..train.   nearest1   nearest5
East                      North      East
East                      North      North
West                      West       West
East                      North      East
East                      East       East
West                      West       West
This turned out to be worse for 1-NN: with HomeID included as a feature, it classifies only 3 of these 6 test cases correctly (down from 4), while 5-NN now gets 5 of 6.
pcorrn1=100*sum(hp$Neighborhood[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(hp$Neighborhood[-train]==nearest5)/(n-nt)
pcorrn1
## [1] 57.14286
pcorrn5
## [1] 64.28571

This time the proportion correct using 5-NN (64.3%) was better than using 1-NN (57.1%).
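The same proportions drop out of one-liners with mean(), a minimal equivalent sketch:

100*mean(hp$Neighborhood[-train]==nearest1) ## 57.14, matches pcorrn1
100*mean(hp$Neighborhood[-train]==nearest5) ## 64.29, matches pcorrn5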

Press’ Q

numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*3))^2)/(n*2)
PressQ1
## [1] 32.65306
qchisq(.95,2)
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 55.18367

Both Press’ Q values are above the chi-square critical value, which means both models classify significantly better than chance. Another leave-one-out cross-validation:

pcorr=numeric(10)
for (k in 1:10) {
  pred=knn.cv(x,hp$Neighborhood,k)
  pcorr[k]=100*sum(hp$Neighborhood==pred)/n
}
pcorr
##  [1] 50.78125 50.00000 57.03125 53.12500 53.90625 50.00000 60.93750
##  [8] 63.28125 58.59375 60.15625

k = 8 has the highest percentage here (63.28%). Next we evaluate the 1-NN predictions with a confusion matrix.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)

near1<-data.frame(truetype=hp$Neighborhood[-train],predtype=nearest1) ## true vs. predicted labels, side by side
confusionMatrix(data=nearest1,reference=hp$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     4     2    0
##      North    8     4    0
##      West     2     0    8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5714          
##                  95% CI : (0.3718, 0.7554)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.2858          
##                                           
##                   Kappa : 0.3869          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.2857       0.6667      1.0000
## Specificity               0.8571       0.6364      0.9000
## Pos Pred Value            0.6667       0.3333      0.8000
## Neg Pred Value            0.5455       0.8750      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1429       0.1429      0.2857
## Detection Prevalence      0.2143       0.4286      0.3571
## Balanced Accuracy         0.5714       0.6515      0.9500

The same confusion matrix using 5-NN:

near5<-data.frame(truetype=hp$Neighborhood[-train],predtype=nearest5) ## true vs. predicted labels, side by side
confusionMatrix(data=nearest5,reference=hp$Neighborhood[-train])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     1    0
##      North    8     5    0
##      West     1     0    8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.4407, 0.8136)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.09247         
##                                           
##                   Kappa : 0.4909          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.3571       0.8333      1.0000
## Specificity               0.9286       0.6364      0.9500
## Pos Pred Value            0.8333       0.3846      0.8889
## Neg Pred Value            0.5909       0.9333      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1786       0.1786      0.2857
## Detection Prevalence      0.2143       0.4643      0.3214
## Balanced Accuracy         0.6429       0.7348      0.9750

The 5-NN model performs reasonably well: accuracy is 64.3%, above the 50% no-information rate. Its p-value (0.092) is not significant at the 5% level, but it is smaller than the corresponding p-value for 1-NN (0.286).
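These headline numbers can also be pulled out of the caret object directly, a small sketch (cm5 is just a local name):

cm5 <- confusionMatrix(data=nearest5, reference=hp$Neighborhood[-train])
cm5$overall["Accuracy"]       ## 0.6429
cm5$overall["AccuracyPValue"] ## P-Value [Acc > NIR], 0.0925
cm5$overall["Kappa"]          ## 0.4909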