Homework #4b

# Read the ReducedFoodInsec.csv file into a data frame and show its first three rows.
Insec <- read.csv("c:/users/abbey/Desktop/Data Mining/ReducedFoodInsec.csv")
Insec[seq_len(3), ]

##   WTFA_FAM FINT_M_P TELN_FLG CURWRKN TELCELN WRKCELN PHONEUSE FLNGINTV
## 1     2016        4        1       1       1       2        2        2
## 2     2460        4        1       1       1       6        2        2
## 3      882        6        1       1       1       1        3        1
##   FM_SIZE FM_KIDS FM_ELDR FM_TYPE FM_STRCP FM_STRP FM_EDUC1 FCHLMYN
## 1       5       3       0       4       42      43        2       2
## 2       7       4       1       4       44      44        2       2
## 3       6       3       1       4       44      44        2       2
##   FCHLMCT FSPEDYN FSPEDCT FLAADLYN FLAADLCT FLIADLYN FLIADLCT FWKLIMYN
## 1       0       1       1        2        0        2        0        2
## 2       0       1       1        1        1        2        0        2
## 3       0       2       0        2        0        2        0        2
##   FWKLIMCT FWALKYN FWALKCT FREMEMYN FREMEMCT FANYLYN FANYLCT FHSTATEX
## 1        0       2       0        2        0       1       1        0
## 2        0       1       1        2        0       1       2        0
## 3        0       2       0        2        0       2       0        5
##   FHSTATVG FHSTATG FHSTATFR FHSTATPR FSRUNOUT FSLAST FSBALANC FDMEDYN
## 1        1       3        1        0        2      2        2       1
## 2        0       6        0        1        2      2        2       1
## 3        0       1        0        0        2      2        2       1
##   FDMEDCT FNMEDYN FNMEDCT FHOSP2YN FHOSP2CT FHCHMYN FHCHMCT FHCPHRYN
## 1       1       2       0        2        0       2       0        2
## 2       1       1       1        1        1       1       1        2
## 3       1       1       1        1        2       2       0        2
##   FHCPHRCT FHCDVYN FHCDVCT F10DVYN F10DVCT FHICOVYN FHICOVCT FHIPRVCT
## 1        0       2       0       2       0        1        3        2
## 2        0       2       0       2       0        1        7        2
## 3        0       1       1       2       0        1        6        1
##   FHIEXCT FHISINCT FHICARCT FHICADCT FHICHPCT FHIMILCT FHIIHSCT FHIPUBCT
## 1       0        0        0        1        0        0        0        0
## 2       0        0        0        5        0        0        0        0
## 3       0        0        1        4        0        0        0        0
##   FHIOGVCT FPRCOOH FHIEBCCT FHICOST FMEDBILL FMEDBPAY FSAF FHDSTCT
## 1        0       2        2       1        1        1    2       2
## 2        0       2        1       2        1        1    2       3
## 3        0       2        1       2        1        1    2       2
##   FDGLWCT1 FDGLWCT2 FSALYN FSALCT FSEINCYN FSEINCCT FSSRRYN FSSRRCT
## 1        1        0      1      1        2        0       2       0
## 2        2        0      2      0        2        0       2       0
## 3        2        0      1      3        2        0       1       1
##   FPENSYN FPENSCT FOPENSYN FOPENSCT FSSIYN FSSICT FTANFYN FTANFCT FOWBENYN
## 1       2       0        2        0      2      0       2       0        2
## 2       1       1        2        0      2      0       2       0        2
## 3       2       0        2        0      2      0       2       0        2
##   FOWBENCT FINTR1YN FINTR1CT FDIVDYN FDIVDCT FCHSPYN FCHSPCT FINCOTYN
## 1        0        2        0       2       0       2       0        2
## 2        0        2        0       2       0       1       3        2
## 3        0        2        0       2       0       2       0        2
##   FINCOTCT INCGRP4 INCGRP5 RAT_CAT4 RAT_CAT5 HOUSEOWN FSSAPLYN FSSAPLCT
## 1        0       2       2        6        6        1        2        0
## 2        0       2       2        4        4        2        1        1
## 3        0       3       2        6        6        2        2        0
##   FSDAPLYN FSDAPLCT FSNAP FWICYN FWICCT
## 1        2        0     2      1      1
## 2        2        0     1      1      1
## 3        2        0     1      2      0

library(textir)

## Loading required package: distrom

## Loading required package: Matrix

## Loading required package: gamlr

## Loading required package: parallel

library(MASS)
library(class)

Next I recreated the data set with only the variables I had chosen previously.

# Keep only the 11 previously chosen variables. They are selected by name
# (instead of dropping ~100 columns by position) so the selection is readable
# and does not silently pick different columns if the CSV layout changes.
keep_vars <- c(
  "FLNGINTV", "FM_TYPE", "FM_EDUC1", "FWKLIMYN", "FHSTATEX", "FSRUNOUT",
  "FSLAST", "FSBALANC", "FDMEDYN", "INCGRP4", "FSNAP"
)
foodins <- Insec[, keep_vars]
head(foodins)

##   FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSRUNOUT FSLAST FSBALANC
## 1        2       4        2        2        0        2      2        2
## 2        2       4        2        2        0        2      2        2
## 3        1       4        2        2        5        2      2        2
## 4        1       4        2        2        0        3      3        3
## 5        1       4        3        2        0        1      2        3
## 6        1       4        4        1        2        2      2        2
##   FDMEDYN INCGRP4 FSNAP
## 1       1       2     2
## 2       1       2     1
## 3       1       3     1
## 4       1       1     1
## 5       1       6     1
## 6       1       1     2

# Report the size of the reduced data set as (rows, columns).
dim(foodins)

## [1] 1168   11

There are 1168 rows and 11 variables in this data set.

# Count how many households fall into each FSRUNOUT response code.
table(foodins$FSRUNOUT)

## 
##    1    2    3 
##   30   87 1051

In this data set, 30 households were often afraid of running out of food before they could buy more, 87 were sometimes worried, and 1051 were never worried about running out of food before they were able to buy more.

# Turn the numeric FSRUNOUT codes into a labelled factor:
# 1 = "Often", 2 = "Sometimes", 3 = "Never".
foodins$FSRUNOUT <- factor(
  foodins$FSRUNOUT,
  levels = c("1", "2", "3"),
  labels = c("Often", "Sometimes", "Never")
)

# Draw one plot of each predictor against the FSRUNOUT groups.
# The first nine fill the 3x3 grid; FSNAP, the tenth, starts a new page.
par(mfrow = c(3, 3), mai = c(.3, .6, .1, .1))
box_cols <- c(grey(.2), 2:6)
plot_vars <- c(
  "FDMEDYN", "FSBALANC", "FSLAST", "INCGRP4", "FHSTATEX",
  "FWKLIMYN", "FM_EDUC1", "FM_TYPE", "FLNGINTV", "FSNAP"
)
for (plot_var in plot_vars) {
  # Builds the formula `<plot_var> ~ FSRUNOUT`, so axis labels match the variable names.
  plot(reformulate("FSRUNOUT", response = plot_var), data = foodins, col = box_cols)
}

# Draw a reproducible random training sample of nt rows; the remaining
# n - nt rows are used as the holdout set.
n <- length(foodins$FSRUNOUT)
nt <- 1100
set.seed(1) # fixed seed so repeated runs pick the same training rows
train <- sample(seq_len(n), nt)

# Standardize the predictor columns (every column except column 6) so each
# has mean 0 and standard deviation 1, then preview the first three rows.
predictor_cols <- c(1:5, 7:11)
x <- scale(foodins[, predictor_cols])
x[seq_len(3), ]

##        FLNGINTV   FM_TYPE  FM_EDUC1  FWKLIMYN  FHSTATEX    FSLAST
## [1,]  2.7781504 0.1833017 -1.238364 0.2818056 -1.141804 -2.973921
## [2,]  2.7781504 0.1833017 -1.238364 0.2818056 -1.141804 -2.973921
## [3,] -0.1960748 0.1833017 -1.238364 0.2818056  1.254220 -2.973921
##       FSBALANC   FDMEDYN    INCGRP4      FSNAP
## [1,] -3.093933 -3.024679 -0.3942581  0.0139804
## [2,] -3.093933 -3.024679 -0.3942581 -1.1523840
## [3,] -3.093933 -3.024679 -0.3580927 -1.1523840

# Re-standardize the first six columns of x by hand.
# NOTE(review): x already comes from scale(), so this is effectively a no-op,
# and it only touches 6 of the 10 columns — confirm whether it is still needed.
for (col_idx in seq_len(6)) {
  col_vals <- x[, col_idx]
  x[, col_idx] <- (col_vals - mean(col_vals)) / sd(col_vals)
}

# Mean and standard deviation of the standardized predictors.
# Both calls pool every entry of the matrix x into one vector, so these are
# overall values (not per-column ones); per-column checks would need
# colMeans(x) and apply(x, 2, sd).
mean(x)

## [1] 8.921712e-17

sd(x)

## [1] 0.9996146

library(class)
# Split the standardized predictors and the response into training and holdout parts.
x_train <- x[train, ]
x_holdout <- x[-train, ]
cl_train <- foodins$FSRUNOUT[train]
# Classify the holdout rows with 1, 7 and 9 nearest neighbours. The call order
# is kept fixed because knn() breaks ties at random.
nearest1 <- knn(train = x_train, test = x_holdout, cl = cl_train, k = 1)
nearest7 <- knn(train = x_train, test = x_holdout, cl = cl_train, k = 7)
nearest9 <- knn(train = x_train, test = x_holdout, cl = cl_train, k = 9)
# Show the true class next to the three predictions for the first ten holdout rows.
data.frame(foodins$FSRUNOUT[-train], nearest1, nearest7, nearest9)[seq_len(10), ]

##    foodins.FSRUNOUT..train.  nearest1  nearest7  nearest9
## 1                     Never Sometimes Sometimes Sometimes
## 2                 Sometimes Sometimes Sometimes Sometimes
## 3                     Never     Never     Never     Never
## 4                     Never     Never     Never     Never
## 5                     Never     Never     Never     Never
## 6                     Never     Never     Never     Never
## 7                     Never     Never     Never     Never
## 8                     Never     Never     Never     Never
## 9                     Never     Never     Never     Never
## 10                    Often Sometimes Sometimes Sometimes

For the first 10 holdout cases shown above, the model predicted 8 out of 10 correctly, whether using k = 1, k = 7, or k = 9 nearest neighbors.

# Percentage of holdout cases classified correctly for k = 1, 7 and 9.
holdout_truth <- foodins$FSRUNOUT[-train]
n_holdout <- n - nt
pcorrn1 <- 100 * sum(holdout_truth == nearest1) / n_holdout
pcorrn7 <- 100 * sum(holdout_truth == nearest7) / n_holdout
pcorrn9 <- 100 * sum(holdout_truth == nearest9) / n_holdout
pcorrn1

## [1] 97.05882

pcorrn7

## [1] 97.05882

pcorrn9

## [1] 97.05882

The percentage of holdout cases classified correctly is the same for all three values of k: 97.06%.

# Press's Q for each classifier: Q = (N - n_correct * K)^2 / (N * (K - 1)),
# where N is the number of cases that were classified and K the number of groups.
# The accuracy percentages were measured on the holdout set, so N must be the
# holdout size (n - nt), not the full sample size n.
press_q <- function(n_correct, n_cases, n_groups) {
  (n_cases - n_correct * n_groups)^2 / (n_cases * (n_groups - 1))
}
n_holdout_cases <- n - nt
n_groups <- 3 # Often / Sometimes / Never

numCorrn1 <- (pcorrn1 / 100) * n_holdout_cases
PressQ1 <- press_q(numCorrn1, n_holdout_cases, n_groups)
PressQ1

## [1] 124.2647

# Critical chi-square value for alpha = 0.05 (use .95 in qchisq), d.f. = k - 1 where k = 3.
# NOTE(review): some references compare Press's Q against chi-square with 1 d.f. — confirm against the course text.
qchisq(.95, n_groups - 1)

## [1] 5.991465

numCorrn7 <- (pcorrn7 / 100) * n_holdout_cases
PressQ7 <- press_q(numCorrn7, n_holdout_cases, n_groups)
PressQ7

## [1] 124.2647

numCorrn9 <- (pcorrn9 / 100) * n_holdout_cases
PressQ9 <- press_q(numCorrn9, n_holdout_cases, n_groups)
PressQ9

## [1] 124.2647

Press's Q for k = 1, 7, and 9 is in every case far above the chi-square critical value of 5.99, meaning each classifier predicts significantly better than chance.

Cross-validation (leaving one out)

# Leave-one-out cross-validation: percentage classified correctly for k = 1..10.
# The result vector is preallocated with numeric(10); `dim(10)` only evaluates
# to NULL, which forced the vector to grow on every iteration.
pcorr <- numeric(10)
for (k in seq_along(pcorr)) {
  pred <- knn.cv(x, foodins$FSRUNOUT, k = k)
  pcorr[k] <- 100 * sum(foodins$FSRUNOUT == pred) / n
}
pcorr

##  [1] 91.52397 92.03767 93.40753 93.23630 93.32192 93.57877 94.17808
##  [8] 93.83562 93.83562 93.83562

We should use k = 7 because it has the highest percentage of correct classifications (94.18%).

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(e1071)

# Pair the true holdout classes with the k = 7 predictions, then build the
# confusion matrix and accuracy statistics from that pairing.
near7 <- data.frame(truetype = foodins$FSRUNOUT[-train], predtype = nearest7)
confusionMatrix(data = near7$predtype, reference = near7$truetype)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Often Sometimes Never
##   Often         0         0     0
##   Sometimes     1         3     1
##   Never         0         0    63
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9706          
##                  95% CI : (0.8978, 0.9964)
##     No Information Rate : 0.9412          
##     P-Value [Acc > NIR] : 0.2293          
##                                           
##                   Kappa : 0.7643          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Often Class: Sometimes Class: Never
## Sensitivity               0.00000          1.00000       0.9844
## Specificity               1.00000          0.96923       1.0000
## Pos Pred Value                NaN          0.60000       1.0000
## Neg Pred Value            0.98529          1.00000       0.8000
## Prevalence                0.01471          0.04412       0.9412
## Detection Rate            0.00000          0.04412       0.9265
## Detection Prevalence      0.00000          0.07353       0.9265
## Balanced Accuracy         0.50000          0.98462       0.9922

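The confusion matrix shows that the k = 7 model classified 66 of the 68 holdout cases correctly (3 "Sometimes" and 63 "Never"). The accuracy is high at 97%, but the p-value for accuracy exceeding the no-information rate (0.2293) is not significant, so the model is not demonstrably better than always predicting the most common class, "Never" (94.1% of the holdout cases).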