Homework#4

HOMEPRICE ANALYSIS

knitr::kable(head(house))

HomeID	Price	SqFt	Bedrooms	Bathrooms	Offers	Brick	Neighborhood
1	114300	1790	2	2	2	No	East
2	114200	2030	4	2	3	No	East
3	114800	1740	3	2	1	No	East
4	94700	1980	3	2	3	No	East
5	119800	2130	3	3	3	No	East
6	114600	1780	3	2	2	No	North

n=length(house$Neighborhood)
nt=100

a1 = rep(1,length(house$Neighborhood))
a2 = rep(0,length(house$Neighborhood))

house$BrickYes = ifelse(house$Brick == "Yes",a1,a2)

train <- sample(1:n,nt)

x <- scale(house[,c(2,3,4,5,6,9)])
x[1:3]

## [1] -0.6002263 -0.6039481 -0.5816174

mean(x)

## [1] 6.38084e-18

sd(x)

## [1] 0.9967352

x=house[,c(2,3,4,5,6,9)]

for (j in 1:6) {
  x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j])
}


knitr::kable(x[1:3,])

Price	SqFt	Bedrooms	Bathrooms	Offers	BrickYes
-0.6002263	-0.9969990	-1.4097879	-0.8655378	-0.5406451	-0.6961011
-0.6039481	0.1373643	1.3452175	-0.8655378	0.3945248	-0.6961011
-0.5816174	-1.2333247	-0.0322852	-0.8655378	-1.4758150	-0.6961011

nearest1 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=1)
nearest5 <- knn(train=x[train,],test=x[-train,],cl=house$Neighborhood[train],k=5)
knitr::kable(head(data.frame(house$Neighborhood[-train],nearest1,nearest5)))

house.Neighborhood..train.	nearest1	nearest5
East	North	North
East	East	North
West	West	West
East	North	North
East	East	East
West	West	West

pcorr=dim(10)
neighbors=dim(10)  ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
  pred=knn.cv(x,cl=house$Neighborhood,k)
  pcorr[k]=100*sum(house$Neighborhood==pred)/1000
  neighbors[k]=k   ## Populates that count variable with k each time through
}
pcorr

##  [1] 7.8 7.4 6.7 7.0 7.3 7.4 7.7 7.5 7.5 7.8

neighbors

##  [1]  1  2  3  4  5  6  7  8  9 10

## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct) 
accTable<-rbind(pcorr,neighbors)
knitr::kable(accTable)

pcorr	7.8	7.4	6.7	7	7.3	7.4	7.7	7.5	7.5	7.8
neighbors	1.0	2.0	3.0	4	5.0	6.0	7.0	8.0	9.0	10.0

kBest=neighbors[pcorr==maxAcc]
kBest

## [1]  1 10

Best number of k neighbors is 1 in this model, thus use k=1.

near1<- data.frame(truetype=house$Neighborhood[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=house$Neighborhood[-train])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction East North West
##      East     5     2    0
##      North    7     4    0
##      West     2     0    8
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6071         
##                  95% CI : (0.4058, 0.785)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 0.1725         
##                                          
##                   Kappa : 0.4296         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: East Class: North Class: West
## Sensitivity               0.3571       0.6667      1.0000
## Specificity               0.8571       0.6818      0.9000
## Pos Pred Value            0.7143       0.3636      0.8000
## Neg Pred Value            0.5714       0.8824      1.0000
## Prevalence                0.5000       0.2143      0.2857
## Detection Rate            0.1786       0.1429      0.2857
## Detection Prevalence      0.2500       0.3929      0.3571
## Balanced Accuracy         0.6071       0.6742      0.9500

This model’s prediction accuracy is about 60%, which is not criticaly accurate. But it is beter than predictiong without any informations.

FOOD INSECURITY ANALYSIS

nhis = ReducedFoodInsec

nhis$FSRUNOUT = recode(nhis$FSRUNOUT, "'1'=1;'2'=1;'3'=0;'7'=NA;'8'=NA;'9'=NA")

nhis2 <-data.frame(nhis$INCGRP5,nhis$FLNGINTV,nhis$FM_EDUC1,nhis$HOUSEOWN,
             nhis$FM_SIZE,nhis$FM_KIDS,nhis$FM_ELDR,nhis$FSRUNOUT)

head(nhis2)

##   nhis.INCGRP5 nhis.FLNGINTV nhis.FM_EDUC1 nhis.HOUSEOWN nhis.FM_SIZE
## 1            2             1             8             2            3
## 2            2             1             5             1           12
## 3            3             1             6             1            3
## 4            2             1             8             1            6
## 5            4             1             8             1            4
## 6            4             1             6             1            4
##   nhis.FM_KIDS nhis.FM_ELDR nhis.FSRUNOUT
## 1            1            0             0
## 2            8            1             1
## 3            1            0             1
## 4            2            0             1
## 5            1            0             0
## 6            2            0             0

dim(nhis2)

## [1] 1168    8

n2=length(nhis2[,1])
nt2=900
train <- sample(1:n2,nt2)
x2 <- scale(nhis2[,c(1:7)])
x2[1:3]

## [1] -0.3922275 -0.3922275 -0.3590254

mean(x2)

## [1] 3.812677e-17

sd(x2)

## [1] 0.999633

for (j in 1:6) {
  x2[,j]=(x2[,j]-mean(x2[,j]))/sd(x2[,j])
}

x2 = nhis2[,c(1:7)]

near1 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=1)
near5 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=5)
near10 <- knn(train=x2[train,],test=x2[-train,],cl=nhis2$nhis.FSRUNOUT[train],k=10)
knitr::kable(head(data.frame(nhis2$nhis.FSRUNOUT[-train],near1,near5,near10)))

nhis2.nhis.FSRUNOUT..train.	near1	near5	near10
0	1	0	0
1	0	0	0
0	0	0	0
0	0	0	0
0	0	0	0
0	0	0	0

pcorrn1=100*sum(nhis2$nhis.FSRUNOUT[-train]==near1)/(n-nt)
pcorrn5=100*sum(nhis2$nhis.FSRUNOUT[-train]==near5)/(n-nt)
pcorrn1

## [1] 839.2857

pcorrn5

## [1] 867.8571

pcorr=dim(10)
neighbors=dim(10)  ## Creates a variable to count how many neighbors are being used
for (k in 1:10) {
  pred=knn.cv(x2,cl=nhis2$nhis.FSRUNOUT,k)
  pcorr[k]=100*sum(nhis2$nhis.FSRUNOUT==pred)/n2
  neighbors[k]=k   ## Populates that count variable with k each time through
}
pcorr

##  [1] 86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
##  [8] 89.46918 89.72603 89.89726

neighbors

##  [1]  1  2  3  4  5  6  7  8  9 10

## New code to get it to give you the best number of neighbors to use
maxAcc<-max(pcorr) ## Maximum accuracy (percent correct) 
accTable<-rbind(pcorr,neighbors)
accTable

##               [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## pcorr     86.81507 87.67123 88.69863 88.78425 89.46918 89.46918 89.38356
## neighbors  1.00000  2.00000  3.00000  4.00000  5.00000  6.00000  7.00000
##               [,8]     [,9]    [,10]
## pcorr     89.46918 89.72603 89.89726
## neighbors  8.00000  9.00000 10.00000

kBest=neighbors[pcorr==maxAcc]
kBest

## [1] 10

Best number of neighbor is 10 in this model

fsnear10<- data.frame(truetype=nhis2$nhis.FSRUNOUT[-train],predtype=near10)
confusionMatrix(data=near10,reference=nhis2$nhis.FSRUNOUT[-train])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 240  25
##          1   2   1
##                                           
##                Accuracy : 0.8993          
##                  95% CI : (0.8568, 0.9326)
##     No Information Rate : 0.903           
##     P-Value [Acc > NIR] : 0.631           
##                                           
##                   Kappa : 0.0499          
##  Mcnemar's Test P-Value : 2.297e-05       
##                                           
##             Sensitivity : 0.99174         
##             Specificity : 0.03846         
##          Pos Pred Value : 0.90566         
##          Neg Pred Value : 0.33333         
##              Prevalence : 0.90299         
##          Detection Rate : 0.89552         
##    Detection Prevalence : 0.98881         
##       Balanced Accuracy : 0.51510         
##                                           
##        'Positive' Class : 0               
##

Homework#4

Shunya

2017年11月5日

HOMEPRICE ANALYSIS

Best number of k neighbors is 1 in this model, thus use k=1.

This model’s prediction accuracy is about 60%, which is not criticaly accurate. But it is beter than predictiong without any informations.

FOOD INSECURITY ANALYSIS

Best number of neighbor is 10 in this model

Its accuracy is about 90%, which is well predicted model.