fish<-read.csv("~/Desktop/碩一下/多變量/Fishdata.csv")
library(rpart)
fish.control<-rpart.control(minisplit=10,minbucket=3,xval=0)
fish.treeorig<-rpart(Species~
Weight+L1+L2+L3+Height+Width,data=fish,method="class",control=fish.control)
Let’s now plot the tree:
# 第一種畫法
plot(fish.treeorig)
text(fish.treeorig)
# 第二種畫法
require(rpart.plot)
## Loading required package: rpart.plot
prp(fish.treeorig, # 模型
faclen=0, # 呈現的變數不要縮寫
fallen.leaves=TRUE, # 讓樹枝以垂直方式呈現
shadow.col="gray", # 最下面的節點塗上陰影
# number of correct classifications / number of observations in that node
extra=2)
Also check out the complexity parameter (CP):
printcp(fish.treeorig)
##
## Classification tree:
## rpart(formula = Species ~ Weight + L1 + L2 + L3 + Height + Width,
## data = fish, method = "class", control = fish.control)
##
## Variables actually used in tree construction:
## [1] Height L1 L3 Weight Width
##
## Root node error: 94/148 = 0.63514
##
## n= 148
##
## CP nsplit rel error
## 1 0.351064 0 1.00000
## 2 0.170213 1 0.64894
## 3 0.127660 2 0.47872
## 4 0.106383 3 0.35106
## 5 0.053191 4 0.24468
## 6 0.031915 5 0.19149
## 7 0.010638 6 0.15957
## 8 0.010000 10 0.11702
summary(fish.treeorig)
## Call:
## rpart(formula = Species ~ Weight + L1 + L2 + L3 + Height + Width,
## data = fish, method = "class", control = fish.control)
## n= 148
##
## CP nsplit rel error
## 1 0.35106383 0 1.0000000
## 2 0.17021277 1 0.6489362
## 3 0.12765957 2 0.4787234
## 4 0.10638298 3 0.3510638
## 5 0.05319149 4 0.2446809
## 6 0.03191489 5 0.1914894
## 7 0.01063830 6 0.1595745
## 8 0.01000000 10 0.1170213
##
## Variable importance
## Height L3 L2 L1 Weight Width
## 26 16 15 15 15 13
##
## Node number 1: 148 observations, complexity param=0.3510638
## predicted class=perch expected loss=0.6351351 P(node) =1
## class counts: 33 10 54 16 18 12 5
## probabilities: 0.223 0.068 0.365 0.108 0.122 0.081 0.034
## left son=2 (43 obs) right son=3 (105 obs)
## Primary splits:
## Height < 33.9 to the right, improve=29.75863, (0 missing)
## Width < 11.85 to the right, improve=17.98385, (0 missing)
## L3 < 29.7 to the right, improve=13.80398, (0 missing)
## L2 < 28.85 to the right, improve=12.96300, (0 missing)
## L1 < 26.1 to the right, improve=12.56245, (0 missing)
##
## Node number 2: 43 observations, complexity param=0.106383
## predicted class=bream expected loss=0.2325581 P(node) =0.2905405
## class counts: 33 10 0 0 0 0 0
## probabilities: 0.767 0.233 0.000 0.000 0.000 0.000 0.000
## left son=4 (33 obs) right son=5 (10 obs)
## Primary splits:
## L3 < 29.5 to the right, improve=15.348840, (0 missing)
## L2 < 26.15 to the right, improve=13.530660, (0 missing)
## L1 < 23.1 to the right, improve=13.407660, (0 missing)
## Weight < 331.5 to the right, improve=12.015500, (0 missing)
## Width < 14.85 to the right, improve= 1.063123, (0 missing)
## Surrogate splits:
## L1 < 23.1 to the right, agree=0.977, adj=0.9, (0 split)
## L2 < 25.2 to the right, agree=0.977, adj=0.9, (0 split)
## Weight < 221 to the right, agree=0.953, adj=0.8, (0 split)
##
## Node number 3: 105 observations, complexity param=0.1702128
## predicted class=perch expected loss=0.4857143 P(node) =0.7094595
## class counts: 0 0 54 16 18 12 5
## probabilities: 0.000 0.000 0.514 0.152 0.171 0.114 0.048
## left son=6 (77 obs) right son=7 (28 obs)
## Primary splits:
## Height < 20.1 to the right, improve=21.78355, (0 missing)
## Width < 12.45 to the right, improve=20.93000, (0 missing)
## Weight < 25.95 to the right, improve=13.35778, (0 missing)
## L3 < 15.6 to the right, improve=10.68888, (0 missing)
## L1 < 12.3 to the right, improve=10.63876, (0 missing)
## Surrogate splits:
## Width < 12.45 to the right, agree=0.990, adj=0.964, (0 split)
## Weight < 25.95 to the right, agree=0.838, adj=0.393, (0 split)
## L1 < 12.3 to the right, agree=0.819, adj=0.321, (0 split)
## L2 < 13.35 to the right, agree=0.819, adj=0.321, (0 split)
## L3 < 14.25 to the right, agree=0.819, adj=0.321, (0 split)
##
## Node number 4: 33 observations
## predicted class=bream expected loss=0 P(node) =0.222973
## class counts: 33 0 0 0 0 0 0
## probabilities: 1.000 0.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 5: 10 observations
## predicted class=parki expected loss=0 P(node) =0.06756757
## class counts: 0 10 0 0 0 0 0
## probabilities: 0.000 1.000 0.000 0.000 0.000 0.000 0.000
##
## Node number 6: 77 observations, complexity param=0.05319149
## predicted class=perch expected loss=0.2987013 P(node) =0.5202703
## class counts: 0 0 54 0 18 0 5
## probabilities: 0.000 0.000 0.701 0.000 0.234 0.000 0.065
## left son=12 (64 obs) right son=13 (13 obs)
## Primary splits:
## Width < 14.4 to the right, improve=5.777691, (0 missing)
## Height < 25.25 to the left, improve=4.275974, (0 missing)
## L1 < 25.1 to the right, improve=2.872913, (0 missing)
## L2 < 27.15 to the right, improve=2.872913, (0 missing)
## Weight < 548 to the right, improve=2.448383, (0 missing)
## Surrogate splits:
## L1 < 13.35 to the right, agree=0.844, adj=0.077, (0 split)
## L2 < 14.55 to the right, agree=0.844, adj=0.077, (0 split)
##
## Node number 7: 28 observations, complexity param=0.1276596
## predicted class=pike expected loss=0.4285714 P(node) =0.1891892
## class counts: 0 0 0 16 0 12 0
## probabilities: 0.000 0.000 0.000 0.571 0.000 0.429 0.000
## left son=14 (16 obs) right son=15 (12 obs)
## Primary splits:
## Weight < 109.95 to the right, improve=13.714290, (0 missing)
## L1 < 21.9 to the right, improve=13.714290, (0 missing)
## L2 < 23.65 to the right, improve=13.714290, (0 missing)
## L3 < 25.5 to the right, improve=13.714290, (0 missing)
## Height < 16.05 to the left, improve= 4.571429, (0 missing)
## Surrogate splits:
## L1 < 21.9 to the right, agree=1.000, adj=1.000, (0 split)
## L2 < 23.65 to the right, agree=1.000, adj=1.000, (0 split)
## L3 < 25.5 to the right, agree=1.000, adj=1.000, (0 split)
## Height < 16.05 to the left, agree=0.786, adj=0.500, (0 split)
## Width < 9.45 to the right, agree=0.714, adj=0.333, (0 split)
##
## Node number 12: 64 observations, complexity param=0.0106383
## predicted class=perch expected loss=0.21875 P(node) =0.4324324
## class counts: 0 0 50 0 9 0 5
## probabilities: 0.000 0.000 0.781 0.000 0.141 0.000 0.078
## left son=24 (40 obs) right son=25 (24 obs)
## Primary splits:
## Height < 27.55 to the left, improve=3.314583, (0 missing)
## Width < 15.65 to the right, improve=1.557526, (0 missing)
## L1 < 30 to the right, improve=1.174970, (0 missing)
## L2 < 32.25 to the right, improve=1.174970, (0 missing)
## Weight < 548 to the right, improve=1.058472, (0 missing)
## Surrogate splits:
## Width < 17.4 to the left, agree=0.719, adj=0.250, (0 split)
## Weight < 267.5 to the left, agree=0.688, adj=0.167, (0 split)
## L3 < 29.05 to the left, agree=0.656, adj=0.083, (0 split)
##
## Node number 13: 13 observations, complexity param=0.03191489
## predicted class=roach expected loss=0.3076923 P(node) =0.08783784
## class counts: 0 0 4 0 9 0 0
## probabilities: 0.000 0.000 0.308 0.000 0.692 0.000 0.000
## left son=26 (3 obs) right son=27 (10 obs)
## Primary splits:
## Height < 24.8 to the left, improve=3.7384620, (0 missing)
## Weight < 174.5 to the right, improve=1.0051280, (0 missing)
## L1 < 22.5 to the right, improve=1.0051280, (0 missing)
## L2 < 24.5 to the right, improve=1.0051280, (0 missing)
## L3 < 21.1 to the left, improve=0.4273504, (0 missing)
##
## Node number 14: 16 observations
## predicted class=pike expected loss=0 P(node) =0.1081081
## class counts: 0 0 0 16 0 0 0
## probabilities: 0.000 0.000 0.000 1.000 0.000 0.000 0.000
##
## Node number 15: 12 observations
## predicted class=smelt expected loss=0 P(node) =0.08108108
## class counts: 0 0 0 0 0 12 0
## probabilities: 0.000 0.000 0.000 0.000 0.000 1.000 0.000
##
## Node number 24: 40 observations
## predicted class=perch expected loss=0.075 P(node) =0.2702703
## class counts: 0 0 37 0 3 0 0
## probabilities: 0.000 0.000 0.925 0.000 0.075 0.000 0.000
##
## Node number 25: 24 observations, complexity param=0.0106383
## predicted class=perch expected loss=0.4583333 P(node) =0.1621622
## class counts: 0 0 13 0 6 0 5
## probabilities: 0.000 0.000 0.542 0.000 0.250 0.000 0.208
## left son=50 (9 obs) right son=51 (15 obs)
## Primary splits:
## L1 < 29.5 to the right, improve=2.772222, (0 missing)
## L2 < 31.9 to the right, improve=2.772222, (0 missing)
## Width < 16.45 to the right, improve=2.772222, (0 missing)
## Weight < 295 to the right, improve=2.583333, (0 missing)
## L3 < 32.4 to the right, improve=2.216667, (0 missing)
## Surrogate splits:
## L2 < 31.9 to the right, agree=1.000, adj=1.000, (0 split)
## Weight < 410 to the right, agree=0.958, adj=0.889, (0 split)
## L3 < 32.4 to the right, agree=0.958, adj=0.889, (0 split)
## Width < 16.45 to the right, agree=0.833, adj=0.556, (0 split)
## Height < 29.35 to the right, agree=0.667, adj=0.111, (0 split)
##
## Node number 26: 3 observations
## predicted class=perch expected loss=0 P(node) =0.02027027
## class counts: 0 0 3 0 0 0 0
## probabilities: 0.000 0.000 1.000 0.000 0.000 0.000 0.000
##
## Node number 27: 10 observations
## predicted class=roach expected loss=0.1 P(node) =0.06756757
## class counts: 0 0 1 0 9 0 0
## probabilities: 0.000 0.000 0.100 0.000 0.900 0.000 0.000
##
## Node number 50: 9 observations
## predicted class=perch expected loss=0.1111111 P(node) =0.06081081
## class counts: 0 0 8 0 0 0 1
## probabilities: 0.000 0.000 0.889 0.000 0.000 0.000 0.111
##
## Node number 51: 15 observations, complexity param=0.0106383
## predicted class=roach expected loss=0.6 P(node) =0.1013514
## class counts: 0 0 5 0 6 0 4
## probabilities: 0.000 0.000 0.333 0.000 0.400 0.000 0.267
## left son=102 (11 obs) right son=103 (4 obs)
## Primary splits:
## L3 < 29.25 to the left, improve=2.003030, (0 missing)
## Weight < 247.5 to the left, improve=1.866667, (0 missing)
## L1 < 22.85 to the left, improve=1.866667, (0 missing)
## L2 < 25 to the left, improve=1.866667, (0 missing)
## Height < 28.45 to the left, improve=1.088889, (0 missing)
## Surrogate splits:
## L1 < 24.05 to the left, agree=0.933, adj=0.75, (0 split)
## L2 < 26.25 to the left, agree=0.933, adj=0.75, (0 split)
## Weight < 303 to the left, agree=0.867, adj=0.50, (0 split)
##
## Node number 102: 11 observations, complexity param=0.0106383
## predicted class=perch expected loss=0.5454545 P(node) =0.07432432
## class counts: 0 0 5 0 5 0 1
## probabilities: 0.000 0.000 0.455 0.000 0.455 0.000 0.091
## left son=204 (7 obs) right son=205 (4 obs)
## Primary splits:
## Weight < 212.5 to the left, improve=0.4350649, (0 missing)
## L1 < 22.05 to the left, improve=0.4350649, (0 missing)
## L2 < 23.75 to the left, improve=0.4350649, (0 missing)
## L3 < 26.15 to the left, improve=0.4350649, (0 missing)
## Height < 28.5 to the left, improve=0.4350649, (0 missing)
## Surrogate splits:
## L2 < 23.75 to the left, agree=1.000, adj=1.00, (0 split)
## Height < 28.5 to the left, agree=1.000, adj=1.00, (0 split)
## L1 < 21.25 to the left, agree=0.909, adj=0.75, (0 split)
## L3 < 25.4 to the left, agree=0.909, adj=0.75, (0 split)
## Width < 14.95 to the right, agree=0.727, adj=0.25, (0 split)
##
## Node number 103: 4 observations
## predicted class=white expected loss=0.25 P(node) =0.02702703
## class counts: 0 0 0 0 1 0 3
## probabilities: 0.000 0.000 0.000 0.000 0.250 0.000 0.750
##
## Node number 204: 7 observations
## predicted class=roach expected loss=0.4285714 P(node) =0.0472973
## class counts: 0 0 3 0 4 0 0
## probabilities: 0.000 0.000 0.429 0.000 0.571 0.000 0.000
##
## Node number 205: 4 observations
## predicted class=perch expected loss=0.5 P(node) =0.02702703
## class counts: 0 0 2 0 1 0 1
## probabilities: 0.000 0.000 0.500 0.000 0.250 0.000 0.250
fish.prunetree<-prune.rpart(fish.treeorig,cp=0.02)
# 第一種畫法
plot(fish.prunetree)
text(fish.prunetree)
# 第二種畫法
require(rpart.plot)
prp(fish.prunetree, # 模型
faclen=0, # 呈現的變數不要縮寫
fallen.leaves=TRUE, # 讓樹枝以垂直方式呈現
shadow.col="gray", # 最下面的節點塗上陰影
# number of correct classifications / number of observations in that node
extra=2)
L21<-fish$L2-fish$L1
L32<-fish$L3-fish$L2
L31<-fish$L3-fish$L1
newfish<-cbind(fish,L21,L32,L31)
newfish.treenew<-rpart(Species~., data=newfish,method="class",parms=list(split="information"),control=fish.control)
printcp(newfish.treenew)
##
## Classification tree:
## rpart(formula = Species ~ ., data = newfish, method = "class",
## parms = list(split = "information"), control = fish.control)
##
## Variables actually used in tree construction:
## [1] Height L21 L3 L32 Weight
##
## Root node error: 94/148 = 0.63514
##
## n= 148
##
## CP nsplit rel error
## 1 0.351064 0 1.000000
## 2 0.170213 1 0.648936
## 3 0.127660 2 0.478723
## 4 0.106383 3 0.351064
## 5 0.095745 4 0.244681
## 6 0.053191 5 0.148936
## 7 0.047872 6 0.095745
## 8 0.010000 8 0.000000
# 第一種畫法
plot(newfish.treenew)
text(newfish.treenew)
# 第二種畫法
require(rpart.plot)
prp(newfish.treenew, # 模型
faclen=0, # 呈現的變數不要縮寫
fallen.leaves=TRUE, # 讓樹枝以垂直方式呈現
shadow.col="gray", # 最下面的節點塗上陰影
# number of correct classifications / number of observations in that node
extra=2)
fish.control<-rpart.control(minbucket=3,minsplit=10,xval=148)
newfish.treenewcv<- rpart(Species~., data=newfish,method="class",
parms=list(split='information'),control=fish.control)
printcp(newfish.treenewcv)
##
## Classification tree:
## rpart(formula = Species ~ ., data = newfish, method = "class",
## parms = list(split = "information"), control = fish.control)
##
## Variables actually used in tree construction:
## [1] Height L21 L3 L32 Weight
##
## Root node error: 94/148 = 0.63514
##
## n= 148
##
## CP nsplit rel error xerror xstd
## 1 0.351064 0 1.000000 1.000000 0.062302
## 2 0.170213 1 0.648936 0.648936 0.063704
## 3 0.127660 2 0.478723 0.478723 0.059534
## 4 0.106383 3 0.351064 0.351064 0.053870
## 5 0.095745 4 0.244681 0.361702 0.054442
## 6 0.053191 5 0.148936 0.170213 0.040187
## 7 0.047872 6 0.095745 0.180851 0.041267
## 8 0.010000 8 0.000000 0.031915 0.018238
newfish.test<-read.csv("~/Desktop/碩一下/多變量/Fish_test_data.csv")
L31<-newfish.test$L3- newfish.test$L1
L32<-newfish.test$L3- newfish.test$L2
L21<-newfish.test$L2- newfish.test$L1
newfish.test<-cbind(newfish.test,L21,L32,L31)
newfish.tpred<-predict(newfish.treenewcv,newfish.test)
newfish.tpred
## bream parki perch pike roach smelt white
## 1 1 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0
## 3 0 0 1 0 0 0 0
## 4 0 0 1 0 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 0 0 0 1 0
## 7 0 0 0 0 0 1 0
## 8 0 1 0 0 0 0 0
## 9 0 0 0 0 1 0 0
## 10 0 0 0 0 1 0 0
## 11 0 0 0 0 0 0 1
library(MASS)
newfish.lda<-lda(Species~.,data=newfish)
# 有共線性存在
newfish.lda<-lda(Species~Weight+L1+Height+Width+L21+L32,data=newfish)
newfish.lda
## Call:
## lda(Species ~ Weight + L1 + Height + Width + L21 + L32, data = newfish)
##
## Prior probabilities of groups:
## bream parki perch pike roach smelt
## 0.22297297 0.06756757 0.36486486 0.10810811 0.12162162 0.08108108
## white
## 0.03378378
##
## Group means:
## Weight L1 Height Width L21 L32
## bream 636.1818 30.60606 39.52727 14.10000 2.8060606 5.272727
## parki 155.8000 18.62000 39.20000 14.18000 1.6100000 2.430000
## perch 360.9333 25.31852 26.17778 15.78519 2.1259259 1.650000
## pike 742.0625 42.88125 15.85625 10.48125 3.0375000 3.281250
## roach 159.1111 20.66667 26.88333 14.57222 1.6388889 2.716667
## smelt 11.5000 11.34167 16.99167 10.21667 0.6916667 1.091667
## white 477.2000 27.82000 29.10000 15.76000 2.4800000 2.960000
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4 LD5
## Weight 0.000911022 -0.002710071 0.007553399 0.001688806 0.006182751
## L1 0.132200166 0.036926540 -0.259794107 -0.235599786 -0.330471903
## Height -0.618519868 -0.332732865 -0.053863042 -0.330737436 -0.029226039
## Width 0.464670922 -0.341184928 -0.353062958 0.842951264 -0.201141743
## L21 -0.114071841 0.712452136 -2.278059990 0.277900320 2.700516892
## L32 -2.311243186 2.141452146 0.539501848 1.803654269 -0.461925634
## LD6
## Weight -0.003600115
## L1 -0.119589009
## Height -0.019796935
## Width -0.159484049
## L21 2.813216431
## L32 -0.080912628
##
## Proportion of trace:
## LD1 LD2 LD3 LD4 LD5 LD6
## 0.7998 0.1327 0.0473 0.0167 0.0035 0.0000
newfish.ldapred<-predict(newfish.lda,newfish[,-1])
table(newfish$Species,newfish.ldapred$class)
##
## bream parki perch pike roach smelt white
## bream 33 0 0 0 0 0 0
## parki 0 10 0 0 0 0 0
## perch 0 0 54 0 0 0 0
## pike 0 0 0 16 0 0 0
## roach 0 0 0 0 18 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 0 0 1 0 4
newfish.ldacv<-lda(Species~Weight+L1+Height+Width+L21+L32,data=newfish,CV=T)
table(newfish$Species,newfish.ldacv$class)
##
## bream parki perch pike roach smelt white
## bream 33 0 0 0 0 0 0
## parki 0 10 0 0 0 0 0
## perch 0 0 54 0 0 0 0
## pike 0 0 0 16 0 0 0
## roach 0 0 0 0 18 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 0 0 1 0 4
eqscplot(newfish.ldapred$x,type='n',xlab="1st LD",ylab="2nd LD")
fish.species<-c(rep("B",33),rep("W",5),rep("R",18),rep("Pa",10),rep("S",12),rep("Pi",16),rep("Pe",54))
fish.colors<-c(rep(1,33),rep(2,5),rep(3,18),rep(4,10),rep(5,12),rep(6,16),rep(7,54))
text(newfish.ldapred$x[,1:2],fish.species,col=fish.colors)
newfish.ldatest<-predict(newfish.lda,newfish.test)
newfish.ldatest$class
## [1] bream bream perch perch pike smelt smelt parki roach roach white
## Levels: bream parki perch pike roach smelt white
Let us examine how to apply QDA to this dataset. Let us pretend that the multivariate normal assumption is valid (you can check for this by using the R package “MVN”, as shown in the lab of CCA)
Check Normailty
library(MVN)
## sROC 0.1-2 loaded
fish1 <- newfish[,2:6]
mvn(fish1,mvnTest = "hz")
## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 5.571187 0 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Weight 0.8814 <0.001 NO
## 2 Shapiro-Wilk L1 0.9696 0.0023 NO
## 3 Shapiro-Wilk L2 0.9720 0.004 NO
## 4 Shapiro-Wilk L3 0.9728 0.0048 NO
## 5 Shapiro-Wilk Height 0.9171 <0.001 NO
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Weight 148 400.69865 358.364429 272.50 5.9 1650.0 120.000 650.00
## L1 148 26.32905 10.037659 25.30 7.5 59.0 19.000 32.55
## L2 148 28.50676 10.749091 27.40 8.4 63.4 21.000 35.00
## L3 148 31.32230 11.671320 29.35 8.8 68.0 23.025 39.55
## Height 148 28.35811 8.251992 26.90 14.5 44.5 24.300 37.50
## Skew Kurtosis
## Weight 1.1088548 0.87680061
## L1 0.6170594 0.41022666
## L2 0.5731405 0.37843479
## L3 0.4232795 0.03046029
## Height 0.1267317 -1.01732434
newfish.qda<-qda(Species~.,data=newfish)
newfish.q<-read.csv("~/Desktop/碩一下/多變量/QDA.csv")
newfish.qda<-qda(Species~.,data=newfish.q)
newfish.qda<-qda(Species~Weight+L1+Height+Width+L21+L32,data=newfish.q)
newfish.qdapred<-predict(newfish.qda,newfish.q)
table(newfish.q$Species,newfish.qdapred$class)
##
## bream parki perch pike roach smelt
## bream 33 0 0 0 0 0
## parki 0 10 0 0 0 0
## perch 0 0 54 0 0 0
## pike 0 0 0 16 0 0
## roach 0 0 0 0 18 0
## smelt 0 0 0 0 0 12
predict(newfish.qda,newfish.test)$class
## [1] bream bream perch perch pike smelt smelt parki roach roach perch
## Levels: bream parki perch pike roach smelt
newfish.qda<-qda(Species~Weight+L1+Height+Width+L21+L32,data=newfish.q,CV=T)
table(newfish.q$Species,newfish.qda$class)
##
## bream parki perch pike roach smelt
## bream 33 0 0 0 0 0
## parki 0 10 0 0 0 0
## perch 0 0 54 0 0 0
## pike 0 0 0 16 0 0
## roach 0 0 1 0 17 0
## smelt 0 0 1 0 0 11
k = 3
library(class)
newfish.knn<-knn(newfish[,2:10],newfish[,2:10],newfish[,"Species"],k=3,prob=T)
table(newfish$Species,newfish.knn)
## newfish.knn
## bream parki perch pike roach smelt white
## bream 30 1 2 0 0 0 0
## parki 0 6 2 0 2 0 0
## perch 4 0 47 0 2 1 0
## pike 1 0 2 11 1 0 1
## roach 1 0 9 0 7 0 1
## smelt 0 0 0 0 0 12 0
## white 0 0 2 1 0 0 2
k = 2
newfish.knn<-knn(newfish[,2:10],newfish[,2:10],newfish[,"Species"],k=2,prob=T)
table(newfish$Species,newfish.knn)
## newfish.knn
## bream parki perch pike roach smelt white
## bream 31 0 1 0 0 0 1
## parki 0 7 0 0 3 0 0
## perch 1 0 51 0 2 0 0
## pike 0 0 5 11 0 0 0
## roach 1 0 4 0 13 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 3 0 0 0 2
k = 1
newfish.knn<-knn(newfish[,2:10],newfish[,2:10],newfish[,"Species"],k=1,prob=T)
table(newfish$Species,newfish.knn)
## newfish.knn
## bream parki perch pike roach smelt white
## bream 33 0 0 0 0 0 0
## parki 0 10 0 0 0 0 0
## perch 0 0 54 0 0 0 0
## pike 0 0 0 16 0 0 0
## roach 0 0 0 0 18 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 0 0 0 0 5
newfish1<-newfish[,c(1,2,3,6,8,9)]
newfish.knncv<-knn.cv(newfish1[,2:6],newfish1[,"Species"],k=1,prob=T)
table(newfish1$Species,newfish.knncv)
## newfish.knncv
## bream parki perch pike roach smelt white
## bream 26 0 4 0 2 0 1
## parki 1 4 0 0 4 0 1
## perch 3 0 37 0 11 1 2
## pike 2 0 4 9 0 0 1
## roach 2 0 10 0 5 0 1
## smelt 0 0 0 0 0 12 0
## white 0 0 3 0 0 0 2
newfish1.test<-newfish.test[,c(1,2,5,7,8)]
newfish.knntest<-knn(newfish1[,2:6],newfish1.test,newfish1[,"Species"],k=1,prob=T)
newfish.knntest
## [1] bream bream perch white perch smelt smelt parki perch perch perch
## attr(,"prob")
## [1] 1 1 1 1 1 1 1 1 1 1 1
## Levels: bream parki perch pike roach smelt white
Again, let us pretend that all assumptions (including “multivariate normal” and “common covariance matrix” over all classes) are valid (need to check that in practice). In order to use this method in R, we need another package.
library(nnet)
newfish.logd<-multinom(Species~.,data=newfish,maxit=250)
## # weights: 77 (60 variable)
## initial value 287.994702
## iter 10 value 189.100680
## iter 20 value 82.739762
## iter 30 value 15.668415
## iter 40 value 0.165377
## iter 50 value 0.003851
## final value 0.000000
## converged
newfish.logd
## Call:
## multinom(formula = Species ~ ., data = newfish, maxit = 250)
##
## Coefficients:
## (Intercept) Weight L1 L2 L3 Height
## parki -29.45533 0.02917110 6.349592 17.8259067 -23.500970 9.645257
## perch -80.11405 0.16021628 3.267803 56.6489219 -53.765483 6.684178
## pike 15.22567 -0.05874368 8.093673 0.9753102 -3.095179 -13.084687
## roach -277.16410 -0.51539078 54.195310 -43.6844449 4.362472 -2.952463
## smelt 455.64639 0.18459382 29.363751 -20.5072505 -10.290211 -13.228223
## white -57.01255 0.19991067 -17.467222 31.7667561 -20.454096 -4.118171
## Width L21 L32 L31
## parki 3.247584 11.476314 -41.326877 -29.850563
## perch 21.052273 53.381119 -110.414404 -57.033286
## pike 21.652958 -7.118363 -4.070489 -11.188852
## roach 40.080837 -97.879755 48.046917 -49.832838
## smelt 18.368009 -49.871001 10.217040 -39.653961
## white 26.549556 49.233978 -52.220852 -2.986874
##
## Residual Deviance: 2.009681e-11
## AIC: 84
table(newfish$Species,predict(newfish.logd,newfish))
##
## bream parki perch pike roach smelt white
## bream 33 0 0 0 0 0 0
## parki 0 10 0 0 0 0 0
## perch 0 0 54 0 0 0 0
## pike 0 0 0 16 0 0 0
## roach 0 0 0 0 18 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 0 0 0 0 5
Now we examine the true error rate by cross-validation. To do this, we use another package called “glmnet”:
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
x <- as.matrix(newfish[,-1])
y <- newfish$Species
cvfit <- cv.glmnet(x, y, family='multinomial', type.measure='class', nfolds=148)
predict.value <- predict(cvfit, x, s = "lambda.min", type = "class")
table(predict.value,newfish$Species)
##
## predict.value bream parki perch pike roach smelt white
## bream 33 0 0 0 0 0 0
## parki 0 10 0 0 0 0 0
## perch 0 0 54 0 0 0 0
## pike 0 0 0 16 0 0 0
## roach 0 0 0 0 18 0 0
## smelt 0 0 0 0 0 12 0
## white 0 0 0 0 0 0 5
predict(newfish.logd,newfish.test)
## [1] bream bream perch perch pike smelt smelt parki roach roach white
## Levels: bream parki perch pike roach smelt white