knitr::opts_chunk$set(echo = TRUE)
library(class)
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
HP <- read.csv("/Users/hannahpeterson/Documents/R stuff/HousePrices.csv")
head(HP)
## HomeID Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
table(HP$Neighborhood)
##
## East North West
## 45 44 39
par(mfrow=c(3,3), mai=c(.3,.6,.1,.1)) ## 3x3 grid of panels for the six plots below
plot(Price ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(SqFt ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Bedrooms ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Bathrooms ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Offers ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
plot(Brick ~ Neighborhood, data=HP, col=c(grey(.2),2:6))
n=length(HP$Neighborhood)
nt=50
set.seed(1)
train <- sample(1:n,nt)
x <-scale(HP[,c(2,3,4,5,6)])
x[1:3,]
## Price SqFt Bedrooms Bathrooms Offers
## [1,] -0.6002263 -0.9969990 -1.40978793 -0.8655378 -0.5406451
## [2,] -0.6039481 0.1373643 1.34521749 -0.8655378 0.3945248
## [3,] -0.5816174 -1.2333247 -0.03228522 -0.8655378 -1.4758150
mean(x)
## [1] 3.858066e-18
sd(x)
## [1] 0.9968652
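Scaling keeps Price, which is measured in dollars, from dominating the Euclidean distances that kNN uses. The mean and sd above pool all columns; to check that each predictor was standardized individually, a quick sketch using apply() on the x matrix defined above:
## column-wise check: each mean should be near 0 and each sd near 1
apply(x, 2, mean)
apply(x, 2, sd)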
nearest3 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=3)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=7)
data.frame(HP$Neighborhood[-train],nearest3,nearest7)
## HP.Neighborhood..train. nearest3 nearest7
## 1 East North North
## 2 East North North
## 3 East North North
## 4 East North East
## 5 North East North
## 6 West East West
## 7 East North East
## 8 East East East
## 9 East North North
## 10 North North North
## 11 West West West
## 12 West West West
## 13 East North East
## 14 East North North
## 15 West West West
## 16 North North North
## 17 North North North
## 18 West East East
## 19 North North North
## 20 West West West
## 21 West West West
## 22 North North North
## 23 East East East
## 24 North North North
## 25 North North North
## 26 West West East
## 27 East North North
## 28 East North North
## 29 East West West
## 30 East North North
## 31 North North West
## 32 East North North
## 33 North North North
## 34 East West East
## 35 North North North
## 36 North North North
## 37 North East East
## 38 East North North
## 39 West West East
## 40 West West West
## 41 West West West
## 42 North North North
## 43 East North East
## 44 West East East
## 45 North East North
## 46 North North North
## 47 West West West
## 48 West West West
## 49 West East East
## 50 West North East
## 51 North East East
## 52 West West West
## 53 West East East
## 54 East East West
## 55 West West West
## 56 North West West
## 57 West West West
## 58 East North East
## 59 West East East
## 60 East West West
## 61 West East West
## 62 West East West
## 63 West West West
## 64 East East East
## 65 West West East
## 66 East North North
## 67 West East East
## 68 North North North
## 69 East East North
## 70 East East East
## 71 North North North
## 72 West West East
## 73 North North East
## 74 North East East
## 75 North North North
## 76 East North North
## 77 East East North
## 78 East North West
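The row-by-row listing above is hard to scan; a cross-tabulation of true versus predicted neighborhood summarizes the same information more compactly. A minimal sketch, using the objects already defined:
## rows = true neighborhood on the hold-out set, columns = kNN prediction
table(truth = HP$Neighborhood[-train], pred3 = nearest3)
table(truth = HP$Neighborhood[-train], pred7 = nearest7)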
## calculate the proportion of correct classifications on the
## hold-out (test) set
pcorrn3=100*sum(HP$Neighborhood[-train]==nearest3)/(n-nt)
pcorrn7=100*sum(HP$Neighborhood[-train]==nearest7)/(n-nt)
pcorrn3
## [1] 52.5641
pcorrn7
## [1] 55.12821
numCorrn3=(pcorrn3/100)*n
PressQ3=((n-(numCorrn3*6))^2)/(n*5)
PressQ3
## [1] 118.7598
## Critical value for chi-square with alpha=.05
qchisq(.95,5)
## [1] 11.0705
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*6))^2)/(n*5)
PressQ7
## [1] 136.3314
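For reference, the textbook form of Press's Q is Q = (N - c*K)^2 / (N*(K - 1)), where N is the number of cases classified, c the number classified correctly, and K the number of groups. The sketch below applies that form directly to the K = 3 neighborhoods on the 78 hold-out cases; press_q is a hypothetical helper used only for illustration, not a replacement for the calculation above.
## Press's Q on the hold-out set with K = 3 neighborhoods
## (press_q is a hypothetical helper, not from any package)
press_q <- function(N, ncorrect, K) ((N - ncorrect * K)^2) / (N * (K - 1))
press_q(N = n - nt, ncorrect = sum(HP$Neighborhood[-train] == nearest3), K = 3)
press_q(N = n - nt, ncorrect = sum(HP$Neighborhood[-train] == nearest7), K = 3)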
## cross-validation (leave one out)
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,HP$Neighborhood,k)
pcorr[k]=100*sum(HP$Neighborhood==pred)/n
}
pcorr
## [1] 50.78125 55.46875 47.65625 53.90625 53.90625 52.34375 50.00000
## [8] 53.90625 51.56250 53.12500
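The vector above holds the leave-one-out accuracy for k = 1 through 10. A quick way to pick the best-performing k, reusing the pcorr vector just computed:
## k with the highest leave-one-out accuracy, and that accuracy
which.max(pcorr)
max(pcorr)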
x <-scale(HP[,c(2:6)])
nearest3 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=3)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=HP$Neighborhood[train],k=7)
data.frame(HP$Neighborhood[-train],nearest3,nearest7)
## HP.Neighborhood..train. nearest3 nearest7
## 1 East North North
## 2 East North North
## 3 East North North
## 4 East North East
## 5 North East North
## 6 West East East
## 7 East North East
## 8 East East East
## 9 East North North
## 10 North North North
## 11 West East West
## 12 West West West
## 13 East North East
## 14 East North North
## 15 West West West
## 16 North North North
## 17 North North North
## 18 West East East
## 19 North North North
## 20 West West West
## 21 West West West
## 22 North North North
## 23 East East East
## 24 North North North
## 25 North North North
## 26 West West East
## 27 East North North
## 28 East North North
## 29 East West West
## 30 East North North
## 31 North West West
## 32 East North North
## 33 North North North
## 34 East West East
## 35 North North North
## 36 North North North
## 37 North East East
## 38 East North North
## 39 West West West
## 40 West West West
## 41 West West West
## 42 North North North
## 43 East North North
## 44 West East East
## 45 North East North
## 46 North North North
## 47 West West West
## 48 West West West
## 49 West East East
## 50 West North North
## 51 North East East
## 52 West West West
## 53 West East East
## 54 East East East
## 55 West West West
## 56 North West West
## 57 West West West
## 58 East North East
## 59 West East East
## 60 East West West
## 61 West North East
## 62 West East West
## 63 West West West
## 64 East East East
## 65 West West East
## 66 East North North
## 67 West East East
## 68 North North North
## 69 East East North
## 70 East East East
## 71 North North North
## 72 West West West
## 73 North North East
## 74 North East East
## 75 North North North
## 76 East North North
## 77 East East East
## 78 East North West
pcorrn3=100*sum(HP$Neighborhood[-train]==nearest3)/(n-nt)
pcorrn7=100*sum(HP$Neighborhood[-train]==nearest7)/(n-nt)
pcorrn3
## [1] 50
pcorrn7
## [1] 56.41026
numCorrn3=(pcorrn3/100)*n
PressQ3=((n-(numCorrn3*6))^2)/(n*5)
PressQ3
## [1] 102.4
##Critical value for chi-square with alpha=.05
qchisq(.95,5)
## [1] 11.0705
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*6))^2)/(n*5)
PressQ7
## [1] 145.5716
Cross-validation (leave one out)
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,HP$Neighborhood,k)
pcorr[k]=100*sum(HP$Neighborhood==pred)/n
}
pcorr
## [1] 50.78125 54.68750 46.87500 53.90625 52.34375 56.25000 50.00000
## [8] 57.03125 55.46875 57.03125
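Plotting accuracy against k makes the comparison across neighborhood sizes easier to read. A minimal sketch, assuming the pcorr vector above:
## leave-one-out accuracy as a function of k
plot(1:10, pcorr, type = "b", xlab = "k", ylab = "% correctly classified")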
Confusion Matrix
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
near7<-data.frame(truetype=HP$Neighborhood[-train],predtype=nearest7)
confusionMatrix(data=nearest7,reference =HP$Neighborhood[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction East North West
## East 11 4 10
## North 14 17 1
## West 3 2 16
##
## Overall Statistics
##
## Accuracy : 0.5641
## 95% CI : (0.447, 0.6761)
## No Information Rate : 0.359
## P-Value [Acc > NIR] : 0.0001756
##
## Kappa : 0.3502
## Mcnemar's Test P-Value : 0.0217074
##
## Statistics by Class:
##
## Class: East Class: North Class: West
## Sensitivity 0.3929 0.7391 0.5926
## Specificity 0.7200 0.7273 0.9020
## Pos Pred Value 0.4400 0.5312 0.7619
## Neg Pred Value 0.6792 0.8696 0.8070
## Prevalence 0.3590 0.2949 0.3462
## Detection Rate 0.1410 0.2179 0.2051
## Detection Prevalence 0.3205 0.4103 0.2692
## Balanced Accuracy 0.5564 0.7332 0.7473
The goal of this model is to predict a home’s neighborhood from its other characteristics (price, square footage, bedrooms, bathrooms, and offers). In this confusion matrix we end up with about 56% accuracy. That is better than the no-information rate of about 36%, but still not strong. On the accuracy percentage alone the model might be considered an acceptable predictor; however, the Kappa of about 0.35 is quite low, which makes the model less reliable.
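The Kappa reported above can be reproduced from the confusion table itself: it compares the observed agreement with the agreement expected from the row and column totals alone. A minimal sketch, recomputing it from the predictions above:
## Cohen's kappa from the confusion table: (observed - expected) / (1 - expected)
tab <- table(nearest7, HP$Neighborhood[-train])
p_obs <- sum(diag(tab)) / sum(tab)
p_exp <- sum(rowSums(tab) * colSums(tab)) / sum(tab)^2
(p_obs - p_exp) / (1 - p_exp)  ## roughly 0.35, in line with the caret output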
library(class)
library(textir)
library(MASS)
foodinsec<- read.csv("/Users/hannahpeterson/Documents/R stuff/ReducedFoodInsec.csv")
food2=foodinsec[,c(-1:-7,-9:-11,-13,-14,-16:-23,-25:-31,-33:-36,-41:-98,-100:-107,-109,-110)]
head(food2)
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSRUNOUT FSLAST FSBALANC
## 1 1 4 8 2 3 3 3 3
## 2 1 4 5 1 0 2 3 3
## 3 1 4 6 2 0 2 2 2
## 4 1 4 8 1 6 2 3 3
## 5 1 4 8 1 3 3 3 3
## 6 1 4 6 2 4 3 3 3
## FDMEDYN INCGRP4 FSNAP
## 1 2 2 2
## 2 2 2 1
## 3 1 4 2
## 4 2 3 1
## 5 2 5 1
## 6 2 5 2
table(food2$FSRUNOUT)
##
## 1 2 3
## 30 87 1051
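The outcome is heavily imbalanced: most respondents fall in category 3. The share of the majority class is the accuracy a model would get by always predicting 3, which is worth keeping in mind when judging the kNN results below. A quick sketch:
## proportion of each FSRUNOUT response; the largest is the majority-class baseline
prop.table(table(food2$FSRUNOUT))
max(prop.table(table(food2$FSRUNOUT)))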
n=length(food2$FSRUNOUT)
nt=1100
set.seed(1)
train <- sample(1:n,nt)
x<-scale(food2[,c(1,2,3,4,5,7,8,9,10,11)])
x[1:3,]
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSLAST
## [1,] -0.1960748 0.1833017 0.1752954 0.2818056 0.2958104 0.2693467
## [2,] -0.1960748 0.1833017 -0.5315345 -3.5455075 -1.1418036 0.2693467
## [3,] -0.1960748 0.1833017 -0.2959245 0.2818056 -1.1418036 -2.9739212
## FSBALANC FDMEDYN INCGRP4 FSNAP
## [1,] 0.2428295 0.3303306 -0.3942581 0.0139804
## [2,] 0.2428295 0.3303306 -0.3942581 -1.1523840
## [3,] -3.0939334 -3.0246789 -0.3219274 0.0139804
for (j in 1:6) {
x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}
mean(x)
## [1] 8.82116e-17
sd(x)
## [1] 0.9996146
nearest1 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=5)
nearest7 <- knn(train=x[train,],test=x[-train,],cl=food2$FSRUNOUT[train],k=7)
data.frame(food2$FSRUNOUT[-train],nearest1,nearest5,nearest7)[1:10,]
## food2.FSRUNOUT..train. nearest1 nearest5 nearest7
## 1 3 3 3 3
## 2 3 3 3 3
## 3 3 3 3 3
## 4 3 3 3 3
## 5 3 3 3 3
## 6 3 3 3 3
## 7 3 3 3 3
## 8 2 3 3 3
## 9 3 3 3 3
## 10 3 3 3 3
pcorrn1=100*sum(food2$FSRUNOUT[-train]==nearest1)/(n-nt)
pcorrn5=100*sum(food2$FSRUNOUT[-train]==nearest5)/(n-nt)
pcorrn7=100*sum(food2$FSRUNOUT[-train]==nearest7)/(n-nt)
pcorrn1
## [1] 95.58824
pcorrn5
## [1] 97.05882
pcorrn7
## [1] 97.05882
numCorrn1=(pcorrn1/100)*n
PressQ1=((n-(numCorrn1*3))^2)/(n*2)
PressQ1
## [1] 2037.054
qchisq(.95,2)
## [1] 5.991465
numCorrn5=(pcorrn5/100)*n
PressQ5=((n-(numCorrn5*3))^2)/(n*2)
PressQ5
## [1] 2134.429
numCorrn7=(pcorrn7/100)*n
PressQ7=((n-(numCorrn7*3))^2)/(n*2)
PressQ7
## [1] 2134.429
pcorr=numeric(10)
for (k in 1:10) {
pred=knn.cv(x,food2$FSRUNOUT,k)
pcorr[k]=100*sum(food2$FSRUNOUT==pred)/n
}
pcorr
## [1] 91.60959 91.52397 93.23630 93.49315 93.40753 93.66438 94.09247
## [8] 93.92123 93.75000 93.75000
near5<-data.frame(truetype=food2$FSRUNOUT[-train],predtype=nearest5)
confusionMatrix(data=nearest5,reference=food2$FSRUNOUT[-train])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 1 0 0
## 2 0 2 0
## 3 0 2 63
##
## Overall Statistics
##
## Accuracy : 0.9706
## 95% CI : (0.8978, 0.9964)
## No Information Rate : 0.9265
## P-Value [Acc > NIR] : 0.1152
##
## Kappa : 0.7385
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.00000 0.50000 1.0000
## Specificity 1.00000 1.00000 0.6000
## Pos Pred Value 1.00000 1.00000 0.9692
## Neg Pred Value 1.00000 0.96970 1.0000
## Prevalence 0.01471 0.05882 0.9265
## Detection Rate 0.01471 0.02941 0.9265
## Detection Prevalence 0.01471 0.02941 0.9559
## Balanced Accuracy 1.00000 0.75000 0.8000
In this model we reach about 97% accuracy on the hold-out set, with a Kappa of about 0.74, which suggests it is a good model to use.
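For context, the No Information Rate of 0.9265 in the output above is simply the share of the majority class (3) among the 68 hold-out cases, and the P-Value [Acc > NIR] of 0.1152 is described in caret's documentation as a one-sided test of whether the accuracy beats that rate. A minimal sketch of the same comparison with a binomial test, reusing the objects above (correct and nir are just local helpers):
## one-sided binomial test of hold-out accuracy against the no-information rate
correct <- sum(food2$FSRUNOUT[-train] == nearest5)
nir <- max(prop.table(table(food2$FSRUNOUT[-train])))
binom.test(correct, n - nt, p = nir, alternative = "greater")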