knitr::opts_chunk$set(echo = TRUE)
library(class)
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
HP <- read.csv("/Users/hannahpeterson/Documents/R stuff/HousePrices.csv")
head(HP)
## HomeID Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
table(HP$Neighborhood)
##
## East North West
## 45 44 39
par(mfrow=c(3,3), mai=c(.3,.6,.1,.1)) ## 3x3 grid of panels for the six plots below
plot(Price ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(SqFt ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Bedrooms ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Bathrooms ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Offers ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Brick ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
n=length(HP$Neighborhood)
nt=50
set.seed(1)
train <- sample(1:n,nt)
x <-scale(HP[,c(2,3,4,5,6)])
x[1:3,]
## Price SqFt Bedrooms Bathrooms Offers
## [1,] -0.6002263 -0.9969990 -1.40978793 -0.8655378 -0.5406451
## [2,] -0.6039481 0.1373643 1.34521749 -0.8655378 0.3945248
## [3,] -0.5816174 -1.2333247 -0.03228522 -0.8655378 -1.4758150
mean(x)
## [1] 3.858066e-18
sd(x)
## [1] 0.9968652
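Scaling keeps Price, which is measured in dollars, from dominating the Euclidean distances that kNN uses. The mean and sd above pool all columns; to check that each predictor was standardized individually, a quick sketch using apply() on the x matrix defined above:
## column-wise check: each mean should be near 0 and each sd near 1
apply(x, 2, mean)
apply(x, 2, sd)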
nearest3 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=3)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=7)
data.frame(HP$Neighborhood[-train],nearest3,nearest7)
## HP.Neighborhood..train. nearest3 nearest7
## 1 East North North
## 2 East North North
## 3 East North North
## 4 East North East
## 5 North East North
## 6 West East West
## 7 East North East
## 8 East East East
## 9 East North North
## 10 North North North
## 11 West West West
## 12 West West West
## 13 East North East
## 14 East North North
## 15 West West West
## 16 North North North
## 17 North North North
## 18 West East East
## 19 North North North
## 20 West West West
## 21 West West West
## 22 North North North
## 23 East East East
## 24 North North North
## 25 North North North
## 26 West West East
## 27 East North North
## 28 East North North
## 29 East West West
## 30 East North North
## 31 North North West
## 32 East North North
## 33 North North North
## 34 East West East
## 35 North North North
## 36 North North North
## 37 North East East
## 38 East North North
## 39 West West East
## 40 West West West
## 41 West West West
## 42 North North North
## 43 East North East
## 44 West East East
## 45 North East North
## 46 North North North
## 47 West West West
## 48 West West West
## 49 West East East
## 50 West North East
## 51 North East East
## 52 West West West
## 53 West East East
## 54 East East West
## 55 West West West
## 56 North West West
## 57 West West West
## 58 East North East
## 59 West East East
## 60 East West West
## 61 West East West
## 62 West East West
## 63 West West West
## 64 East East East
## 65 West West East
## 66 East North North
## 67 West East East
## 68 North North North
## 69 East East North
## 70 East East East
## 71 North North North
## 72 West West East
## 73 North North East
## 74 North East East
## 75 North North North
## 76 East North North
## 77 East East North
## 78 East North West
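The row-by-row listing above is hard to scan; a cross-tabulation of true versus predicted neighborhood summarizes the same information more compactly. A minimal sketch, using the objects already defined:
## rows = true neighborhood on the hold-out set, columns = kNN prediction
table(truth = HP$Neighborhood[-train], pred3 = nearest3)
table(truth = HP$Neighborhood[-train], pred7 = nearest7)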
## calculate the proportion of correct classifications on the
## hold-out (test) set
pcorrn3=100*sum(HP$Neighborhood[-train]==nearest3)/(n-nt)
pcorrn7=100*sum(HP$Neighborhood[-train]==nearest7)/(n-nt)
pcorrn3
## [1] 52.5641
pcorrn7
## [1] 55.12821
numCorrn3=(pcorrn3/100)*n
PressQ3=((n-(numCorrn3*6))^2)/(n*5)
PressQ3
## [1] 118.7598
## Critical value for chi-square with alpha=.05
qchisq(.95,5)
## [1] 11.0705
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*6))^2)/(n*5)
PressQ7
## [1] 136.3314
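For reference, the textbook form of Press's Q is Q = (N - c*K)^2 / (N*(K - 1)), where N is the number of cases classified, c the number classified correctly, and K the number of groups. The sketch below applies that form directly to the K = 3 neighborhoods on the 78 hold-out cases; press_q is a hypothetical helper used only for illustration, not a replacement for the calculation above.
## Press's Q on the hold-out set with K = 3 neighborhoods
## (press_q is a hypothetical helper, not from any package)
press_q <- function(N, ncorrect, K) ((N - ncorrect * K)^2) / (N * (K - 1))
press_q(N = n - nt, ncorrect = sum(HP$Neighborhood[-train] == nearest3), K = 3)
press_q(N = n - nt, ncorrect = sum(HP$Neighborhood[-train] == nearest7), K = 3)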
## cross-validation (leave one out)
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,HP$Neighborhood,k)
pcorr[k]=100*sum(HP$Neighborhood==pred)/n
}
pcorr
## [1] 50.78125 55.46875 47.65625 53.90625 53.90625 52.34375 50.00000
## [8] 53.90625 51.56250 53.12500
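The vector above holds the leave-one-out accuracy for k = 1 through 10. A quick way to pick the best-performing k, reusing the pcorr vector just computed:
## k with the highest leave-one-out accuracy, and that accuracy
which.max(pcorr)
max(pcorr)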
x <-scale(HP[,c(2:6)])
nearest3 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=3)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=7)
data.frame(HP$Neighborhood[-train],nearest3,nearest7)
## HP.Neighborhood..train. nearest3 nearest7
## 1 East North North
## 2 East North North
## 3 East North North
## 4 East North East
## 5 North East North
## 6 West East East
## 7 East North East
## 8 East East East
## 9 East North North
## 10 North North North
## 11 West East West
## 12 West West West
## 13 East North East
## 14 East North North
## 15 West West West
## 16 North North North
## 17 North North North
## 18 West East East
## 19 North North North
## 20 West West West
## 21 West West West
## 22 North North North
## 23 East East East
## 24 North North North
## 25 North North North
## 26 West West East
## 27 East North North
## 28 East North North
## 29 East West West
## 30 East North North
## 31 North West West
## 32 East North North
## 33 North North North
## 34 East West East
## 35 North North North
## 36 North North North
## 37 North East East
## 38 East North North
## 39 West West West
## 40 West West West
## 41 West West West
## 42 North North North
## 43 East North North
## 44 West East East
## 45 North East North
## 46 North North North
## 47 West West West
## 48 West West West
## 49 West East East
## 50 West North North
## 51 North East East
## 52 West West West
## 53 West East East
## 54 East East East
## 55 West West West
## 56 North West West
## 57 West West West
## 58 East North East
## 59 West East East
## 60 East West West
## 61 West North East
## 62 West East West
## 63 West West West
## 64 East East East
## 65 West West East
## 66 East North North
## 67 West East East
## 68 North North North
## 69 East East North
## 70 East East East
## 71 North North North
## 72 West West West
## 73 North North East
## 74 North East East
## 75 North North North
## 76 East North North
## 77 East East East
## 78 East North West
pcorrn3=100*sum(HP$Neighborhood[-train]==nearest3)/(n-nt)
pcorrn7=100*sum(HP$Neighborhood[-train]==nearest7)/(n-nt)
pcorrn3
## [1] 50
pcorrn7
## [1] 56.41026
numCorrn3=(pcorrn3/100)*n
PressQ3=((n-(numCorrn3*6))^2)/(n*5)
PressQ3
## [1] 102.4
##Critical value for chi-square with alpha=.05
qchisq(.95,5)
## [1] 11.0705
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*6))^2)/(n*5)
PressQ7
## [1] 145.5716
Cross-validation (leave one out)
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,HP$Neighborhood,k)
pcorr[k]=100*sum(HP$Neighborhood==pred)/n
}
pcorr
## [1] 50.78125 54.68750 46.87500 53.90625 52.34375 56.25000 50.00000
## [8] 57.03125 55.46875 57.03125
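Plotting accuracy against k makes the comparison across neighborhood sizes easier to read. A minimal sketch, assuming the pcorr vector above:
## leave-one-out accuracy as a function of k
plot(1:10, pcorr, type = "b", xlab = "k", ylab = "% correctly classified")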
Confusion Matrix
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
near7<-data.frame(truetype=HP$Neighborhood[-train],predtype=nearest7)
confusionMatrix(data=nearest7,reference =HP$Neighborhood[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 11 4 10
## North 14 17 1
## West 3 2 16
##
## Overall Statistics
##
## Accuracy : 0.5641
## 95% CI : (0.447, 0.6761)
## No Information Rate : 0.359
## P-Value [Acc > NIR] : 0.0001756
##
## Kappa : 0.3502
## Mcnemar's Test P-Value : 0.0217074
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.3929 0.7391 0.5926
## Specificity 0.7200 0.7273 0.9020
## Pos Pred Value 0.4400 0.5312 0.7619
## Neg Pred Value 0.6792 0.8696 0.8070
## Prevalence 0.3590 0.2949 0.3462
## Detection Rate 0.1410 0.2179 0.2051
## Detection Prevalence 0.3205 0.4103 0.2692
## Balanced Accuracy 0.5564 0.7332 0.7473
The goal of this model is to predict a home’s neighborhood from its other characteristics (price, square footage, bedrooms, bathrooms, and offers). In this confusion matrix we end up with about 56% accuracy. That is better than the no-information rate of about 36%, but still not strong. On the accuracy percentage alone the model might be considered an acceptable predictor; however, the Kappa of about 0.35 is quite low, which makes the model less reliable.
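The Kappa reported above can be reproduced from the confusion table itself: it compares the observed agreement with the agreement expected from the row and column totals alone. A minimal sketch, recomputing it from the predictions above:
## Cohen's kappa from the confusion table: (observed - expected) / (1 - expected)
tab <- table(nearest7, HP$Neighborhood[-train])
p_obs <- sum(diag(tab)) / sum(tab)
p_exp <- sum(rowSums(tab) * colSums(tab)) / sum(tab)^2
(p_obs - p_exp) / (1 - p_exp)  ## roughly 0.35, in line with the caret output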
library(class)
library(textir)
library(MASS)
foodinsec<- read.csv("/Users/hannahpeterson/Documents/R stuff/ReducedFoodInsec.csv")
food2=foodinsec[,c(-1:-7,-9:-11,-13,-14,-16:-23,-25:-31,-33:-36,-41:-98,-100:-107,-109,-110)]
head(food2)
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSRUNOUT FSLAST FSBALANC
## 1 1 4 8 2 3 3 3 3
## 2 1 4 5 1 0 2 3 3
## 3 1 4 6 2 0 2 2 2
## 4 1 4 8 1 6 2 3 3
## 5 1 4 8 1 3 3 3 3
## 6 1 4 6 2 4 3 3 3
## FDMEDYN INCGRP4 FSNAP
## 1 2 2 2
## 2 2 2 1
## 3 1 4 2
## 4 2 3 1
## 5 2 5 1
## 6 2 5 2
table(food2$FSRUNOUT)
##
## 1 2 3
## 30 87 1051
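The outcome is heavily imbalanced: most respondents fall in category 3. The share of the majority class is the accuracy a model would get by always predicting 3, which is worth keeping in mind when judging the kNN results below. A quick sketch:
## proportion of each FSRUNOUT response; the largest is the majority-class baseline
prop.table(table(food2$FSRUNOUT))
max(prop.table(table(food2$FSRUNOUT)))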
n=length(food2$FSRUNOUT)
nt=1100
set.seed(1)
train <- sample(1:n,nt)
x<-scale(food2[,c(1,2,3,4,5,7,8,9,10,11)])
x[1:3,]
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSLAST
## [1,] -0.1960748 0.1833017 0.1752954 0.2818056 0.2958104 0.2693467
## [2,] -0.1960748 0.1833017 -0.5315345 -3.5455075 -1.1418036 0.2693467
## [3,] -0.1960748 0.1833017 -0.2959245 0.2818056 -1.1418036 -2.9739212
## FSBALANC FDMEDYN INCGRP4 FSNAP
## [1,] 0.2428295 0.3303306 -0.3942581 0.0139804
## [2,] 0.2428295 0.3303306 -0.3942581 -1.1523840
## [3,] -3.0939334 -3.0246789 -0.3219274 0.0139804
for (j in 1:6) {
x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}
mean(x)
## [1] 8.82116e-17
sd(x)
## [1] 0.9996146
nearest1 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=5)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=7)
data.frame(food2$FSRUNOUT[-train],nearest1,nearest5,nearest7)[1:10,]
## food2.FSRUNOUT..train. nearest1 nearest5 nearest7
## 1 3 3 3 3
## 2 3 3 3 3
## 3 3 3 3 3
## 4 3 3 3 3
## 5 3 3 3 3
## 6 3 3 3 3
## 7 3 3 3 3
## 8 2 3 3 3
## 9 3 3 3 3
## 10 3 3 3 3
pcorrn1=100*sum(food2$FSRUNOUT[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(food2$FSRUNOUT[-train]==nearest5)/(n-nt)
pcorrn7=100*sum(food2$FSRUNOUT[-train]==nearest7)/(n-nt)
pcorrn1
## [1] 95.58824
pcorrn5
## [1] 97.05882
pcorrn7
## [1] 97.05882
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*3))^2)/(n*2)
PressQ1
## [1] 2037.054
qchisq(.95,2)
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 2134.429
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*3))^2)/(n*2)
PressQ7
## [1] 2134.429
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,food2$FSRUNOUT,k)
pcorr[k]=100*sum(food2$FSRUNOUT==pred)/n
}
pcorr
## [1] 91.60959 91.52397 93.23630 93.49315 93.40753 93.66438 94.09247
## [8] 93.92123 93.75000 93.75000
near5<-data.frame(truetype=food2$FSRUNOUT[-train],predtype=nearest5)
confusionMatrix(data=nearest5,reference=food2$FSRUNOUT[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 1 0 0
## 2 0 2 0
## 3 0 2 63
##
## Overall Statistics
##
## Accuracy : 0.9706
## 95% CI : (0.8978, 0.9964)
## No Information Rate : 0.9265
## P-Value [Acc > NIR] : 0.1152
##
## Kappa : 0.7385
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.00000 0.50000 1.0000
## Specificity 1.00000 1.00000 0.6000
## Pos Pred Value 1.00000 1.00000 0.9692
## Neg Pred Value 1.00000 0.96970 1.0000
## Prevalence 0.01471 0.05882 0.9265
## Detection Rate 0.01471 0.02941 0.9265
## Detection Prevalence 0.01471 0.02941 0.9559
## Balanced Accuracy 1.00000 0.75000 0.8000
In this model we reach about 97% accuracy on the hold-out set, with a Kappa of about 0.74, which suggests it is a good model to use.
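For context, the No Information Rate of 0.9265 in the output above is simply the share of the majority class (3) among the 68 hold-out cases, and the P-Value [Acc > NIR] of 0.1152 is described in caret's documentation as a one-sided test of whether the accuracy beats that rate. A minimal sketch of the same comparison with a binomial test, reusing the objects above (correct and nir are just local helpers):
## one-sided binomial test of hold-out accuracy against the no-information rate
correct <- sum(food2$FSRUNOUT[-train] == nearest5)
nir <- max(prop.table(table(food2$FSRUNOUT[-train])))
binom.test(correct, n - nt, p = nir, alternative = "greater")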