# Read the reduced food-insecurity extract into a data frame and show its first three rows.
Insec <- read.csv(file = "c:/users/abbey/Desktop/Data Mining/ReducedFoodInsec.csv")
Insec[seq_len(3), ]
## WTFA_FAM FINT_M_P TELN_FLG CURWRKN TELCELN WRKCELN PHONEUSE FLNGINTV
## 1 2016 4 1 1 1 2 2 2
## 2 2460 4 1 1 1 6 2 2
## 3 882 6 1 1 1 1 3 1
## FM_SIZE FM_KIDS FM_ELDR FM_TYPE FM_STRCP FM_STRP FM_EDUC1 FCHLMYN
## 1 5 3 0 4 42 43 2 2
## 2 7 4 1 4 44 44 2 2
## 3 6 3 1 4 44 44 2 2
## FCHLMCT FSPEDYN FSPEDCT FLAADLYN FLAADLCT FLIADLYN FLIADLCT FWKLIMYN
## 1 0 1 1 2 0 2 0 2
## 2 0 1 1 1 1 2 0 2
## 3 0 2 0 2 0 2 0 2
## FWKLIMCT FWALKYN FWALKCT FREMEMYN FREMEMCT FANYLYN FANYLCT FHSTATEX
## 1 0 2 0 2 0 1 1 0
## 2 0 1 1 2 0 1 2 0
## 3 0 2 0 2 0 2 0 5
## FHSTATVG FHSTATG FHSTATFR FHSTATPR FSRUNOUT FSLAST FSBALANC FDMEDYN
## 1 1 3 1 0 2 2 2 1
## 2 0 6 0 1 2 2 2 1
## 3 0 1 0 0 2 2 2 1
## FDMEDCT FNMEDYN FNMEDCT FHOSP2YN FHOSP2CT FHCHMYN FHCHMCT FHCPHRYN
## 1 1 2 0 2 0 2 0 2
## 2 1 1 1 1 1 1 1 2
## 3 1 1 1 1 2 2 0 2
## FHCPHRCT FHCDVYN FHCDVCT F10DVYN F10DVCT FHICOVYN FHICOVCT FHIPRVCT
## 1 0 2 0 2 0 1 3 2
## 2 0 2 0 2 0 1 7 2
## 3 0 1 1 2 0 1 6 1
## FHIEXCT FHISINCT FHICARCT FHICADCT FHICHPCT FHIMILCT FHIIHSCT FHIPUBCT
## 1 0 0 0 1 0 0 0 0
## 2 0 0 0 5 0 0 0 0
## 3 0 0 1 4 0 0 0 0
## FHIOGVCT FPRCOOH FHIEBCCT FHICOST FMEDBILL FMEDBPAY FSAF FHDSTCT
## 1 0 2 2 1 1 1 2 2
## 2 0 2 1 2 1 1 2 3
## 3 0 2 1 2 1 1 2 2
## FDGLWCT1 FDGLWCT2 FSALYN FSALCT FSEINCYN FSEINCCT FSSRRYN FSSRRCT
## 1 1 0 1 1 2 0 2 0
## 2 2 0 2 0 2 0 2 0
## 3 2 0 1 3 2 0 1 1
## FPENSYN FPENSCT FOPENSYN FOPENSCT FSSIYN FSSICT FTANFYN FTANFCT FOWBENYN
## 1 2 0 2 0 2 0 2 0 2
## 2 1 1 2 0 2 0 2 0 2
## 3 2 0 2 0 2 0 2 0 2
## FOWBENCT FINTR1YN FINTR1CT FDIVDYN FDIVDCT FCHSPYN FCHSPCT FINCOTYN
## 1 0 2 0 2 0 2 0 2
## 2 0 2 0 2 0 1 3 2
## 3 0 2 0 2 0 2 0 2
## FINCOTCT INCGRP4 INCGRP5 RAT_CAT4 RAT_CAT5 HOUSEOWN FSSAPLYN FSSAPLCT
## 1 0 2 2 6 6 1 2 0
## 2 0 2 2 4 4 2 1 1
## 3 0 3 2 6 6 2 2 0
## FSDAPLYN FSDAPLCT FSNAP FWICYN FWICCT
## 1 2 0 2 1 1
## 2 2 0 1 1 1
## 3 2 0 1 2 0
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(MASS)
library(class)
Next I recreated the data set with only the variables I had chosen previously.
# Drop every column position listed below; the columns that remain are the
# variables retained for the analysis.
foodins <- Insec[, -c(
  1:7, 9:11, 13, 14, 16:23, 25:31, 33:36,
  41:98, 100:107, 109, 110
)]
# Preview the first rows of the reduced data frame.
head(foodins)
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSRUNOUT FSLAST FSBALANC
## 1 2 4 2 2 0 2 2 2
## 2 2 4 2 2 0 2 2 2
## 3 1 4 2 2 5 2 2 2
## 4 1 4 2 2 0 3 3 3
## 5 1 4 3 2 0 1 2 3
## 6 1 4 4 1 2 2 2 2
## FDMEDYN INCGRP4 FSNAP
## 1 1 2 2
## 2 1 2 1
## 3 1 3 1
## 4 1 1 1
## 5 1 6 1
## 6 1 1 2
# Report the number of rows and columns of the reduced data frame.
c(nrow(foodins), ncol(foodins))
## [1] 1168 11
This data set has 1168 rows and 11 variables.
# Count how many households fall into each FSRUNOUT response code.
table(foodins[["FSRUNOUT"]])
##
## 1 2 3
## 30 87 1051
In this data set, 30 households were often afraid of running out of food before they could buy more, and 87 were sometimes worried. The remaining 1051 households were never worried about running out of food before they were able to buy more.
# Turn the response into a factor: values 1/2/3 become Often/Sometimes/Never.
foodins$FSRUNOUT <- factor(
  foodins$FSRUNOUT,
  levels = c("1", "2", "3"),
  labels = c("Often", "Sometimes", "Never")
)

# Plot each remaining variable against FSRUNOUT on a 3 x 3 panel layout.
# Ten plots are drawn, so the last one falls outside the first nine panels.
par(mfrow = c(3, 3), mai = c(0.3, 0.6, 0.1, 0.1))
box_cols <- c(grey(0.2), 2:6)
for (resp in c(
  "FDMEDYN", "FSBALANC", "FSLAST", "INCGRP4", "FHSTATEX",
  "FWKLIMYN", "FM_EDUC1", "FM_TYPE", "FLNGINTV", "FSNAP"
)) {
  plot(reformulate("FSRUNOUT", response = resp), data = foodins, col = box_cols)
}
# Split the cases: nt rows are drawn at random for training, the other
# n - nt rows are held out for testing.
n <- nrow(foodins)
nt <- 1100
set.seed(1) # fixed seed so repeated runs draw the same training rows
train <- sample(seq_len(n), size = nt)

# Standardize all predictors, i.e. every column except FSRUNOUT (column 6).
x <- scale(foodins[, -6])
x[1:3, ]
## FLNGINTV FM_TYPE FM_EDUC1 FWKLIMYN FHSTATEX FSLAST
## [1,] 2.7781504 0.1833017 -1.238364 0.2818056 -1.141804 -2.973921
## [2,] 2.7781504 0.1833017 -1.238364 0.2818056 -1.141804 -2.973921
## [3,] -0.1960748 0.1833017 -1.238364 0.2818056 1.254220 -2.973921
## FSBALANC FDMEDYN INCGRP4 FSNAP
## [1,] -3.093933 -3.024679 -0.3942581 0.0139804
## [2,] -3.093933 -3.024679 -0.3942581 -1.1523840
## [3,] -3.093933 -3.024679 -0.3580927 -1.1523840
# Standardize columns 1 to 6 of x (subtract the column mean, divide by the
# column standard deviation), one column at a time.
# NOTE(review): x was already produced by scale(), so this pass looks
# redundant and it only touches 6 of the 10 columns - confirm intent.
x[, 1:6] <- apply(x[, 1:6], 2, function(v) (v - mean(v)) / sd(v))
## overall mean and standard deviation of the standardized matrix
mean(x)
## [1] 8.921712e-17
sd(x)
## [1] 0.9996146
library(class)
# Classify the holdout rows by k-nearest-neighbours for k = 1, 7 and 9,
# using the training rows and their FSRUNOUT labels as the reference set.
train_x <- x[train, ]
test_x <- x[-train, ]
train_cl <- foodins$FSRUNOUT[train]
nearest1 <- knn(train = train_x, test = test_x, cl = train_cl, k = 1)
nearest7 <- knn(train = train_x, test = test_x, cl = train_cl, k = 7)
nearest9 <- knn(train = train_x, test = test_x, cl = train_cl, k = 9)
# Show the actual class next to the three predictions for the first ten holdout rows.
data.frame(foodins$FSRUNOUT[-train], nearest1, nearest7, nearest9)[1:10, ]
## foodins.FSRUNOUT..train. nearest1 nearest7 nearest9
## 1 Never Sometimes Sometimes Sometimes
## 2 Sometimes Sometimes Sometimes Sometimes
## 3 Never Never Never Never
## 4 Never Never Never Never
## 5 Never Never Never Never
## 6 Never Never Never Never
## 7 Never Never Never Never
## 8 Never Never Never Never
## 9 Never Never Never Never
## 10 Often Sometimes Sometimes Sometimes
For k = 1, 7, and 9 nearest neighbors alike, the model predicted 8 of the first 10 holdout cases correctly.
# Percentage of the n - nt holdout cases whose predicted class equals the
# actual class, computed for each of the three k values.
holdout_truth <- foodins$FSRUNOUT[-train]
pct_correct <- function(pred) 100 * sum(holdout_truth == pred) / (n - nt)
pcorrn1 <- pct_correct(nearest1)
pcorrn7 <- pct_correct(nearest7)
pcorrn9 <- pct_correct(nearest9)
pcorrn1
## [1] 97.05882
pcorrn7
## [1] 97.05882
pcorrn9
## [1] 97.05882
The percentage of holdout cases classified correctly is the same for all three values of k: 97.06%.
# Press's Q tests whether classification accuracy is better than chance:
#   Q = (N - n_correct * K)^2 / (N * (K - 1))
# N is the number of cases that were classified, n_correct the number
# classified correctly, and K the number of groups. The predictions being
# evaluated were made on the n - nt holdout cases only, so N must be the
# holdout size. Using the full-sample n here would inflate Q.
n_test <- n - nt
n_groups <- nlevels(foodins$FSRUNOUT)
press_q <- function(n_correct, n_cases, k_groups) {
  (n_cases - n_correct * k_groups)^2 / (n_cases * (k_groups - 1))
}

# Count correct holdout predictions directly rather than back-computing
# them from a percentage.
numCorrn1 <- sum(foodins$FSRUNOUT[-train] == nearest1)
PressQ1 <- press_q(numCorrn1, n_test, n_groups)
PressQ1
## [1] 124.2647
qchisq(0.95, df = n_groups - 1) ## critical value for chi-square with alpha = 0.05 (use .95 in formula), d.f. = k - 1 where k = 3
## [1] 5.991465
numCorrn7 <- sum(foodins$FSRUNOUT[-train] == nearest7)
PressQ7 <- press_q(numCorrn7, n_test, n_groups)
PressQ7
## [1] 124.2647
numCorrn9 <- sum(foodins$FSRUNOUT[-train] == nearest9)
PressQ9 <- press_q(numCorrn9, n_test, n_groups)
PressQ9
## [1] 124.2647
Press's Q for k = 1, 7, and 9 is in every case above the chi-square critical value (5.99), meaning the classifier predicts significantly better than chance.
Cross-validation (leaving one out)
# Leave-one-out cross-validation over the whole data set: for each k from
# 1 to 10, record the percentage of cases whose cross-validated prediction
# matches the actual class.
# The result vector is preallocated with numeric(10); dim(10) evaluates to
# NULL, which left the vector to be grown one element at a time in the loop.
pcorr <- numeric(10)
for (k in seq_len(10)) {
  pred <- knn.cv(x, foodins$FSRUNOUT, k = k)
  pcorr[k] <- 100 * sum(foodins$FSRUNOUT == pred) / n
}
pcorr
## [1] 91.52397 92.03767 93.40753 93.23630 93.32192 93.57877 94.17808
## [8] 93.83562 93.83562 93.83562
We should use k = 7 because it has the highest percentage of correct classifications (94.18%).
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
# Pair the actual holdout classes with the k = 7 predictions.
near7 <- data.frame(
  truetype = foodins$FSRUNOUT[-train],
  predtype = nearest7
)
# Cross-tabulate predicted against actual classes and print the accuracy statistics.
confusionMatrix(data = near7$predtype, reference = near7$truetype)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Often Sometimes Never
## Often 0 0 0
## Sometimes 1 3 1
## Never 0 0 63
##
## Overall Statistics
##
## Accuracy : 0.9706
## 95% CI : (0.8978, 0.9964)
## No Information Rate : 0.9412
## P-Value [Acc > NIR] : 0.2293
##
## Kappa : 0.7643
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Often Class: Sometimes Class: Never
## Sensitivity 0.00000 1.00000 0.9844
## Specificity 1.00000 0.96923 1.0000
## Pos Pred Value NaN 0.60000 1.0000
## Neg Pred Value 0.98529 1.00000 0.8000
## Prevalence 0.01471 0.04412 0.9412
## Detection Rate 0.00000 0.04412 0.9265
## Detection Prevalence 0.00000 0.07353 0.9265
## Balanced Accuracy 0.50000 0.98462 0.9922
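The confusion matrix shows that the model classified 66 of the 68 holdout cases correctly (0 + 3 + 63 on the diagonal). This looks like a good model because it predicted with 97% accuracy, but the p-value (0.2293) is not significant, so the accuracy is not significantly better than the no-information rate of 94.1%.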