Homework 4

House Prices Analysis

House Prices is a data set that has 128 observations (houses) and includes prices and characteristics of those houses in a major US metro area.

library (car)
library(lattice)
library(leaps)
library(nutshell)

## Loading required package: nutshell.bbdb

## Loading required package: nutshell.audioscrobbler

library(caret)

## Loading required package: ggplot2

library(textir)

## Loading required package: distrom

## Loading required package: Matrix

## Loading required package: gamlr

## Loading required package: parallel

library(MASS)
library(class)
# Load the house-price data and hold out a random test set.
HP <- read.csv("~/DataMining/Data/HousePrices.csv")
n <- nrow(HP)
nt <- 114  # rows used for training; the remaining n - nt rows form the test set
set.seed(1)  # fixed seed so the split is reproducible in repeated runs
train <- sample(seq_len(n), size = nt)

# Standardize the two predictors taken from columns 4 and 1.
# NOTE(review): the printed column names show that column 1 is HomeID, which
# looks like a row identifier rather than a house characteristic -- confirm
# that it is really meant to be a predictor.
x <- scale(HP[, c(4, 1)])
x[1:3, ]

##         Bedrooms    HomeID
## [1,] -1.40978793 -1.711845
## [2,]  1.34521749 -1.684887
## [3,] -0.03228522 -1.657929

# Mean and sd pooled over every entry of the scaled matrix (not per column).
mean(x)

## [1] 3.873482e-18

sd(x)

## [1] 0.9980373

# Classify the held-out houses by neighborhood with 1 and 5 nearest neighbors.
nearest1 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = HP$Neighborhood[train],
  k = 1
)
nearest5 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = HP$Neighborhood[train],
  k = 5
)
# Side-by-side view of the true label and both predictions.
head(data.frame(HP$Neighborhood[-train], nearest1, nearest5))

##   HP.Neighborhood..train. nearest1 nearest5
## 1                    East     East    North
## 2                    East     East     East
## 3                   North    North    North
## 4                    East     East    North
## 5                   North     West     West
## 6                    East     East    North

Now we are going to calculate the proportion of correct classifications on this one training set.

# Percentage of the n - nt held-out houses whose neighborhood was predicted
# correctly, for k = 1 and k = 5.
pcorrn1 <- 100 * sum(HP$Neighborhood[-train] == nearest1) / (n - nt)
pcorrn5 <- 100 * sum(HP$Neighborhood[-train] == nearest5) / (n - nt)
pcorrn1

## [1] 64.28571

pcorrn5

## [1] 57.14286

Next we compute Press's Q statistic to test whether the classification accuracy on the test set is better than chance.

# Press's Q: Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number
# of classified cases, n_correct the number classified correctly, and K the
# number of classes. Q is compared with a chi-square critical value on 1 df.
#
# The accuracy percentages were measured on the n - nt held-out cases, so N is
# the test-set size, and K is taken from the levels of the knn prediction
# (3 neighborhoods here) instead of a hard-coded 6.
n_test <- n - nt
n_classes <- nlevels(nearest1)

numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_classes)^2 / (n_test * (n_classes - 1))
PressQ1

## [1] 6.035714

# Critical value at the 5% level: chi-square with 1 degree of freedom.
qchisq(.95, 1)

## [1] 3.841459

numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_classes)^2 / (n_test * (n_classes - 1))
PressQ5

## [1] 3.571429

Next we use leave-one-out cross-validation to compare the accuracy for k = 1 through 10 neighbors.

# Leave-one-out cross-validated accuracy (%) for k = 1, ..., 10 neighbors.
# `dim(10)` evaluates to NULL, so it never allocated anything; preallocate a
# numeric vector of the intended length instead.
pcorr <- numeric(10)
for (k in seq_along(pcorr)) {
  # knn.cv predicts each row from all the other rows.
  pred <- knn.cv(x, HP$Neighborhood, k)
  pcorr[k] <- 100 * sum(HP$Neighborhood == pred) / n
}
pcorr

##  [1] 47.65625 39.84375 44.53125 38.28125 45.31250 40.62500 42.18750
##  [8] 44.53125 42.18750 42.18750

Based upon the results of this, k=1 is our best number of neighbors so that is the one we will want to use. Now we will run a confusion matrix to test the accuracy of our prediction.

# Pair the true and predicted neighborhoods for the held-out houses.
# confusionMatrix() in current caret requires both inputs to be factors with
# the same levels, and read.csv() returns character columns in R >= 4.0, so
# the reference is coerced explicitly to the levels of the knn prediction.
closest <- data.frame(
  truetype = factor(HP$Neighborhood[-train], levels = levels(nearest1)),
  predtype = nearest1
)
confusionMatrix(data = closest$predtype, reference = closest$truetype)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     0    0
##      North    2     2    2
##      West     0     1    2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.3514, 0.8724)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.212           
##                                           
##                   Kappa : 0.4656          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.7143       0.6667      0.5000
## Specificity               1.0000       0.6364      0.9000
## Pos Pred Value            1.0000       0.3333      0.6667
## Neg Pred Value            0.7778       0.8750      0.8182
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.3571       0.1429      0.1429
## Detection Prevalence      0.3571       0.4286      0.2143
## Balanced Accuracy         0.8571       0.6515      0.7000

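Based upon our confusion matrix, the accuracy of this model is 64.3% on the 14 held-out houses. This is only modestly better than the 50% no-information rate, and with such a small test set the improvement is not statistically significant (p = 0.212), so the accuracy is neither great nor bad.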

Reduced Food Insecurities

# Load the reduced food-insecurity data.
food <- read.csv("~/DataMining/Data/ReducedFoodInsec.csv")
# Collapse FSRUNOUT to a 0/1 indicator: codes 1 and 2 become 1, code 3 becomes
# 0, and codes 7, 8 and 9 become NA.
food$FSRUNOUT <- recode(
  food$FSRUNOUT,
  "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA"
)

Now we are going to create a subset of variables to run our knn analysis on.

# Keep only the variables used in the knn analysis. The columns are passed
# unnamed so that data.frame() derives the `food.<NAME>` column names that
# the later code refers to.
food2 <- data.frame(
  food$FM_SIZE,
  food$FM_EDUC1,
  food$HOUSEOWN,
  food$FMEDBILL,
  food$FSLAST,
  food$FSRUNOUT
)
head(food2)

##   food.FM_SIZE food.FM_EDUC1 food.HOUSEOWN food.FMEDBILL food.FSLAST
## 1            3             8             2             1           3
## 2           12             5             1             1           3
## 3            3             6             1             1           2
## 4            6             8             1             1           3
## 5            4             8             1             2           3
## 6            4             6             1             2           3
##   food.FSRUNOUT
## 1             0
## 2             1
## 3             1
## 4             1
## 5             0
## 6             0

# Hold out a random test set from the food data.
n <- nrow(food2)
nt <- 900  # rows used for training; the remaining n - nt rows form the test set
set.seed(1)  # fixed seed so the split is reproducible
train <- sample(seq_len(n), size = nt)

# Standardize the two predictors taken from columns 4 and 1 of food2
# (FMEDBILL and FM_SIZE, per the printed column names).
x <- scale(food2[, c(4, 1)])
x[1:3, ]

##      food.FMEDBILL food.FM_SIZE
## [1,]     -1.805882     -1.08243
## [2,]     -1.805882      5.25657
## [3,]     -1.805882     -1.08243

# Mean and sd pooled over every entry of the scaled matrix (not per column).
mean(x)

## [1] 1.509512e-16

sd(x)

## [1] 0.9997858

# Classify the held-out households on FSRUNOUT with 1 and 5 nearest neighbors.
nearest1 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = food2$food.FSRUNOUT[train],
  k = 1
)
nearest5 <- knn(
  train = x[train, ],
  test = x[-train, ],
  cl = food2$food.FSRUNOUT[train],
  k = 5
)
# Side-by-side view of the true label and both predictions.
head(data.frame(food2$food.FSRUNOUT[-train], nearest1, nearest5))

##   food2.food.FSRUNOUT..train. nearest1 nearest5
## 1                           0        0        0
## 2                           0        0        0
## 3                           0        0        0
## 4                           0        0        0
## 5                           0        0        0
## 6                           0        0        0

Once again we will calculate the proportion of correct classifications on this specific training set.

# Percentage of the n - nt held-out households whose FSRUNOUT value was
# predicted correctly, for k = 1 and k = 5.
pcorrn1 <- 100 * sum(food2$food.FSRUNOUT[-train] == nearest1) / (n - nt)
pcorrn5 <- 100 * sum(food2$food.FSRUNOUT[-train] == nearest5) / (n - nt)
pcorrn1

## [1] 89.55224

pcorrn5

## [1] 90.29851

Next we compute Press's Q statistic to test whether the classification accuracy on the test set is better than chance.

# Press's Q: Q = (N - n_correct * K)^2 / (N * (K - 1)), where N is the number
# of classified cases, n_correct the number classified correctly, and K the
# number of classes. Q is compared with a chi-square critical value on 1 df.
#
# The accuracy percentages were measured on the n - nt held-out cases, so N is
# the test-set size, and K is taken from the levels of the knn prediction
# (2 classes here, 0 and 1) instead of a hard-coded 6.
n_test <- n - nt
n_classes <- nlevels(nearest1)

numCorrn1 <- (pcorrn1 / 100) * n_test
PressQ1 <- (n_test - numCorrn1 * n_classes)^2 / (n_test * (n_classes - 1))
PressQ1

## [1] 167.7015

# Critical value at the 5% level: chi-square with 1 degree of freedom.
qchisq(.95, 1)

## [1] 3.841459

numCorrn5 <- (pcorrn5 / 100) * n_test
PressQ5 <- (n_test - numCorrn5 * n_classes)^2 / (n_test * (n_classes - 1))
PressQ5

## [1] 174.0896

Cross validation now commences:

# Leave-one-out cross-validated accuracy (%) for k = 1, ..., 10 neighbors.
# `dim(10)` evaluates to NULL, so it never allocated anything; preallocate a
# numeric vector of the intended length instead.
pcorr <- numeric(10)
for (k in seq_along(pcorr)) {
  # knn.cv predicts each row from all the other rows.
  pred <- knn.cv(x, food2$food.FSRUNOUT, k)
  pcorr[k] <- 100 * sum(food2$food.FSRUNOUT == pred) / n
}
pcorr

##  [1] 89.64041 89.81164 89.81164 89.98288 89.98288 89.98288 89.98288
##  [8] 89.98288 89.98288 89.98288

Lastly we will work to test the accuracy of our model.

# Pair the true and predicted FSRUNOUT values for the held-out households.
# FSRUNOUT is stored as numeric 0/1, while confusionMatrix() in current caret
# requires both inputs to be factors with the same levels, so the reference is
# coerced explicitly to the levels of the knn prediction.
closest <- data.frame(
  truetype = factor(food2$food.FSRUNOUT[-train], levels = levels(nearest1)),
  predtype = nearest1
)
confusionMatrix(data = closest$predtype, reference = closest$truetype)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 240  26
##          1   2   0
##                                           
##                Accuracy : 0.8955          
##                  95% CI : (0.8525, 0.9294)
##     No Information Rate : 0.903           
##     P-Value [Acc > NIR] : 0.7041          
##                                           
##                   Kappa : -0.0141         
##  Mcnemar's Test P-Value : 1.383e-05       
##                                           
##             Sensitivity : 0.9917          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9023          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9030          
##          Detection Rate : 0.8955          
##    Detection Prevalence : 0.9925          
##       Balanced Accuracy : 0.4959          
##                                           
##        'Positive' Class : 0               
##

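Based upon our confusion matrix, the accuracy of our model is 89.55%. Although this looks high, it is slightly below the no-information rate of 90.3%, Kappa is essentially zero (-0.0141), and none of the 26 test households with FSRUNOUT = 1 were classified correctly (specificity 0). The model therefore does no better than always predicting class 0.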

Homework 4

Jones

November 8, 2017

House Prices Analysis

Reduced Food Insecurities