Algorithm Evaluation : Accuracy and Kappa

library(readr)
kappa <- read_csv("~/Proposed papers/kappa.csv")

## Parsed with column specification:
## cols(
##   seed = col_double(),
##   algorithm = col_character(),
##   statistic = col_character(),
##   value = col_double()
## )

View(kappa)
attach(kappa)

library(ggplot2)
library(extrafontdb)
library(MASS)
library(pscl)

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(gridExtra)
library(repr) ### adjusting the length and width of your plot
library(beanplot)
library("devtools")

## Warning: package 'devtools' was built under R version 3.6.1

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 3.6.1

library("yarrr")

## Loading required package: jpeg

## Loading required package: BayesFactor

## Loading required package: coda

## Warning: package 'coda' was built under R version 3.6.1

## Loading required package: Matrix

## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************

## Loading required package: circlize

## ========================================
## circlize version 0.4.6
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization 
##   in R. Bioinformatics 2014.
## ========================================

## yarrr v0.1.5. Citation info at citation('yarrr'). Package guide at yarrr.guide()

## Email me at Nathaniel.D.Phillips.is@gmail.com

## 
## Attaching package: 'yarrr'

## The following object is masked from 'package:ggplot2':
## 
##     diamonds

library(agricolae)
library(easynls)
library(MVN)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## sROC 0.1-2 loaded

library(lme4)

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

library(ggsignif)
library(ggpubr)

## Warning: package 'ggpubr' was built under R version 3.6.1

## Loading required package: magrittr

library(multcomp)

## Loading required package: mvtnorm

## Loading required package: survival

## Loading required package: TH.data

## 
## Attaching package: 'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

library(multcompView)
library(PerformanceAnalytics)

## Warning: package 'PerformanceAnalytics' was built under R version 3.6.1

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## 
## Attaching package: 'PerformanceAnalytics'

## The following objects are masked from 'package:agricolae':
## 
##     kurtosis, skewness

## The following object is masked from 'package:graphics':
## 
##     legend

algorithm=as.factor(algorithm)
statistic=as.factor(statistic)

md<-aov(value~algorithm+statistic,data=kappa)
summary(md)

##              Df Sum Sq Mean Sq  F value Pr(>F)    
## algorithm     5  0.132   0.026    34.43 <2e-16 ***
## statistic     1 12.205  12.205 15950.67 <2e-16 ***
## Residuals   197  0.151   0.001                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###################################################

x<- LSD.test(md,"algorithm",alpha=0.05,group=FALSE)
diffograph(x,cex.axis=0.8,xlab="Accuracy",ylab="Accuracy]")

### HSD grouping of means
h<-HSD.test(md,"algorithm",group =TRUE)
h

## $statistics
##        MSerror  Df      Mean       CV        MSD
##   0.0007651991 197 0.3119951 8.866239 0.01930859
## 
## $parameters
##    test    name.t ntr StudentizedRange alpha
##   Tukey algorithm   6         4.070079  0.05
## 
## $means
##          value       std  r   Min   Max    Q25    Q50     Q75
## CART 0.3058529 0.2435119 34 0.044 0.596 0.0660 0.3000 0.53950
## KNN  0.3255588 0.2553190 34 0.040 0.604 0.0730 0.3220 0.57725
## LDA  0.3275294 0.2634035 34 0.038 0.611 0.0660 0.3380 0.58325
## LR   0.3266176 0.2594426 34 0.035 0.598 0.0675 0.3320 0.58200
## NB   0.3285000 0.2663417 34 0.046 0.613 0.0630 0.3355 0.58825
## SVM  0.2579118 0.2057043 34 0.034 0.511 0.0560 0.2500 0.46000
## 
## $comparison
## NULL
## 
## $groups
##          value groups
## NB   0.3285000      a
## LDA  0.3275294      a
## LR   0.3266176      a
## KNN  0.3255588      a
## CART 0.3058529      b
## SVM  0.2579118      c
## 
## attr(,"class")
## [1] "group"

acc<-subset(kappa,statistic=='accuracy')

md<-aov(value~algorithm,data=acc)
summary(md)

##             Df  Sum Sq Mean Sq F value Pr(>F)    
## algorithm    5 0.21396 0.04279   115.4 <2e-16 ***
## Residuals   96 0.03559 0.00037                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

###################################################

x<- LSD.test(md,"algorithm",alpha=0.05,group=FALSE)
diffograph(x,cex.axis=0.8,xlab="Accuracy",ylab="Accuracy]")

### HSD grouping of means
h<-HSD.test(md,"algorithm",group =TRUE)
h

## $statistics
##        MSerror Df     Mean       CV        MSD
##   0.0003707243 96 0.556598 3.459265 0.01920542
## 
## $parameters
##    test    name.t ntr StudentizedRange alpha
##   Tukey algorithm   6         4.112659  0.05
## 
## $means
##          value        std  r   Min   Max   Q25   Q50   Q75
## CART 0.5450588 0.02268664 17 0.504 0.596 0.533 0.540 0.557
## KNN  0.5762941 0.02302923 17 0.538 0.604 0.562 0.579 0.596
## LDA  0.5864706 0.01398713 17 0.567 0.611 0.574 0.584 0.599
## LR   0.5817647 0.01001580 17 0.555 0.598 0.579 0.582 0.586
## NB   0.5903529 0.01625554 17 0.560 0.613 0.579 0.589 0.606
## SVM  0.4596471 0.02488207 17 0.418 0.511 0.448 0.460 0.477
## 
## $comparison
## NULL
## 
## $groups
##          value groups
## NB   0.5903529      a
## LDA  0.5864706      a
## LR   0.5817647      a
## KNN  0.5762941      a
## CART 0.5450588      b
## SVM  0.4596471      c
## 
## attr(,"class")
## [1] "group"

####
stat_box_data <- function(value, upper_limit = max(value)*1.5) {
  return( 
    data.frame(
      y =0.65,
      label = paste('N =', length(value), '\n',
                    'Mean =', round(mean(value), 3), '\n')
    )
  )
}

theme_set(theme_gray(base_size = 10))
ggplot(acc, aes(x =algorithm, y = value))+ geom_boxplot(size=0.7,varwidth = FALSE,outlier.colour = "red",outlier.shape = 2, shape=2,fill=c("grey","grey","grey","grey","grey","grey")) + geom_smooth(method=lm)+ ylab("Accuracy") + xlab("Algorithm")+stat_summary(fun.y=mean, geom="point", shape=10, size=2, color="blue", fill="red")+stat_summary(
    fun.data = stat_box_data, 
    geom = "text", 
    hjust = 0.5,
    vjust = 0.9,
    size=2,
    color="blue"
  )+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

ggplot(acc, aes(x =seed , y = value,fill=algorithm)) +  
  geom_point(size=3,aes(shape=algorithm)) + 
  geom_smooth(method=lm, position = "jitter", level = 0.95,aes(color=algorithm), formula = y ~ x + I(x^2)+I(x^3)+I(x^4))+ 
  ylim(0.4,0.65) + xlim(0,20)+ylab("Accuracy") + 
  xlab("seed")+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

## Warning: Removed 5 rows containing missing values (geom_smooth).

p <- ggplot(data = kappa, 
    aes(x=seed, y=value, color=algorithm,shape=statistic))+
    geom_point(size=3)+ylab("Accuracy") + geom_line(shape=algorithm)+
  xlab("Pseudorandom number (seed)")+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

## Warning: Ignoring unknown parameters: shape

qplot(data = kappa, x =seed, y = value, colour = statistic) + 
  stat_ellipse(geom = "polygon", alpha = 1/2, aes(fill = algorithm))

ggplot(kappa, aes(seed, value, color = statistic > 3)) +
  geom_point() +
  stat_ellipse(type = "norm", linetype = 2) +
  stat_ellipse(type = "t")

ggplot(kappa, aes(seed, value, color = algorithm )) +
  geom_point(size=3,shape=algorithm) +
  stat_ellipse(type = "norm", linetype = 2) +
  stat_ellipse(type = "t")+
  xlab("Pseudorandom number (seed)")+ylab("Accuracy")+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

evaluation based on cross validation

library(readr)
vali <- read_csv("~/Proposed papers/vali.csv")

## Parsed with column specification:
## cols(
##   validation = col_double(),
##   algorithm = col_character(),
##   accuracy = col_double(),
##   kappa = col_double()
## )

View(vali)
attach(vali)

## The following objects are masked _by_ .GlobalEnv:
## 
##     algorithm, kappa

## The following object is masked from kappa:
## 
##     algorithm

algorithm=as.factor(algorithm)

p <- ggplot(data = vali, 
    aes(x=validation, y=accuracy, color=algorithm,shape=algorithm))+
    geom_point(size=3)+ylab("Accuracy") + geom_line(shape=algorithm)+
  xlab("Cross Validation")+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

## Warning: Ignoring unknown parameters: shape

################################ kappa

p <- ggplot(data = vali, 
    aes(x=validation, y=kappa, color=algorithm,shape=algorithm))+
    geom_point(size=3)+ylab("Kappa") + geom_line(shape=algorithm)+
  xlab("Cross Validation")+ theme_classic(base_size = 14)+ theme(axis.text.x = element_text(face="bold", color="black", 
                           size=10),
          axis.text.y = element_text(face="bold", color="black", 
                           size=10))

## Warning: Ignoring unknown parameters: shape

library(readr)
machine_py <- read_csv("~/Proposed papers/machine_py.csv")

## Parsed with column specification:
## cols(
##   Conv = col_double(),
##   CA_rotation = col_double(),
##   CA_sole = col_double(),
##   CA_intercrop = col_double(),
##   agroecoregion = col_character()
## )

#View(machine_py)
attach(machine_py)
agroecoregion=as.factor(agroecoregion)

co<-data.frame(Conv,CA_rotation,CA_sole,CA_intercrop)
chart.Correlation(co)

describeBy(co,agroecoregion)

## 
##  Descriptive statistics by group 
## group: Highlands
##              vars   n    mean      sd  median trimmed     mad    min
## Conv            1 257 2508.78 1819.49 2220.00 2332.06 1749.47 106.67
## CA_rotation     2 257 3545.08 1975.66 3315.00 3452.58 2289.91 199.00
## CA_sole         3 257 3654.53 1847.33 3603.00 3568.23 2077.12 148.00
## CA_intercrop    4 257 2754.88 2091.53 2133.32 2484.24 1976.79 110.00
##                  max   range skew kurtosis     se
## Conv         9000.00 8893.33 0.90     0.67 113.50
## CA_rotation  8773.59 8574.59 0.38    -0.61 123.24
## CA_sole      8931.00 8783.00 0.41    -0.26 115.23
## CA_intercrop 8900.00 8790.00 1.04     0.48 130.47
## -------------------------------------------------------- 
## group: Lowlands
##              vars   n    mean      sd  median trimmed     mad    min
## Conv            1 257 2421.01 1649.87 2166.92 2270.86 1730.07 110.00
## CA_rotation     2 257 3209.30 1826.53 2895.76 3065.44 1790.78 280.00
## CA_sole         3 257 3072.35 1694.56 2735.80 2917.76 1484.11 231.82
## CA_intercrop    4 257 3061.76 1668.72 2866.99 2943.00 1728.50 120.00
##                  max   range skew kurtosis     se
## Conv         8814.60 8704.60 0.88     0.70 102.92
## CA_rotation  8585.56 8305.56 0.67     0.04 113.94
## CA_sole      8819.20 8587.38 0.88     0.45 105.70
## CA_intercrop 7910.00 7790.00 0.62    -0.13 104.09

t.test(Conv~agroecoregion,data = machine_py)

## 
##  Welch Two Sample t-test
## 
## data:  Conv by agroecoregion
## t = 0.57285, df = 507.17, p-value = 0.567
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -213.2375  388.7699
## sample estimates:
## mean in group Highlands  mean in group Lowlands 
##                2508.776                2421.010

t.test(CA_rotation~agroecoregion,data = machine_py)

## 
##  Welch Two Sample t-test
## 
## data:  CA_rotation by agroecoregion
## t = 2.0006, df = 508.88, p-value = 0.04596
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##    6.040142 665.514744
## sample estimates:
## mean in group Highlands  mean in group Lowlands 
##                3545.080                3209.302

t.test(CA_sole~agroecoregion,data = machine_py)

## 
##  Welch Two Sample t-test
## 
## data:  CA_sole by agroecoregion
## t = 3.723, df = 508.23, p-value = 0.0002189
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  274.9612 889.3895
## sample estimates:
## mean in group Highlands  mean in group Lowlands 
##                3654.530                3072.355

t.test(CA_intercrop~agroecoregion,data = machine_py)

## 
##  Welch Two Sample t-test
## 
## data:  CA_intercrop by agroecoregion
## t = -1.8386, df = 487.94, p-value = 0.06658
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -634.81104   21.06287
## sample estimates:
## mean in group Highlands  mean in group Lowlands 
##                2754.882                3061.756

library(rpart)


library(party)

## Warning: package 'party' was built under R version 3.6.1

## Loading required package: grid

## Loading required package: modeltools

## Loading required package: stats4

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:lme4':
## 
##     refit

## The following object is masked from 'package:BayesFactor':
## 
##     posterior

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 3.6.1

## Loading required package: sandwich

library(tree)

## Warning: package 'tree' was built under R version 3.6.1

## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli

library(partykit)

## Warning: package 'partykit' was built under R version 3.6.1

## Loading required package: libcoin

## Warning: package 'libcoin' was built under R version 3.6.1

## 
## Attaching package: 'partykit'

## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal, varimp

### grow a tree

fit <- rpart(agroecoregion~Conv+CA_rotation+CA_sole+CA_intercrop,
   method="class", data=machine_py)
printcp(fit)

## 
## Classification tree:
## rpart(formula = agroecoregion ~ Conv + CA_rotation + CA_sole + 
##     CA_intercrop, data = machine_py, method = "class")
## 
## Variables actually used in tree construction:
## [1] CA_intercrop CA_rotation  CA_sole      Conv        
## 
## Root node error: 257/514 = 0.5
## 
## n= 514 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.182879      0   1.00000 1.12451 0.043765
## 2 0.101167      1   0.81712 0.85992 0.043673
## 3 0.027237      2   0.71595 0.74708 0.042674
## 4 0.013619      5   0.63424 0.72763 0.042440
## 5 0.011673      7   0.60700 0.76265 0.042848
## 6 0.010000      8   0.59533 0.73541 0.042536

plotcp(fit)

# visualize cross-validation results 
summary(fit) # detailed summary of splits

## Call:
## rpart(formula = agroecoregion ~ Conv + CA_rotation + CA_sole + 
##     CA_intercrop, data = machine_py, method = "class")
##   n= 514 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.18287938      0 1.0000000 1.1245136 0.04376486
## 2 0.10116732      1 0.8171206 0.8599222 0.04367323
## 3 0.02723735      2 0.7159533 0.7470817 0.04267405
## 4 0.01361868      5 0.6342412 0.7276265 0.04244045
## 5 0.01167315      7 0.6070039 0.7626459 0.04284764
## 6 0.01000000      8 0.5953307 0.7354086 0.04253612
## 
## Variable importance
## CA_intercrop      CA_sole         Conv  CA_rotation 
##           45           29           14           12 
## 
## Node number 1: 514 observations,    complexity param=0.1828794
##   predicted class=Highlands  expected loss=0.5  P(node) =1
##     class counts:   257   257
##    probabilities: 0.500 0.500 
##   left son=2 (117 obs) right son=3 (397 obs)
##   Primary splits:
##       CA_intercrop < 1261.024 to the left,  improve=12.222290, (0 missing)
##       CA_sole      < 3498.759 to the right, improve=11.933030, (0 missing)
##       CA_rotation  < 5022.483 to the right, improve= 4.425696, (0 missing)
##       Conv         < 309.1429 to the left,  improve= 3.470042, (0 missing)
##   Surrogate splits:
##       CA_rotation < 276.5    to the left,  agree=0.776, adj=0.017, (0 split)
##       CA_sole     < 8872.1   to the right, agree=0.776, adj=0.017, (0 split)
## 
## Node number 2: 117 observations
##   predicted class=Highlands  expected loss=0.2991453  P(node) =0.2276265
##     class counts:    82    35
##    probabilities: 0.701 0.299 
## 
## Node number 3: 397 observations,    complexity param=0.1011673
##   predicted class=Lowlands   expected loss=0.440806  P(node) =0.7723735
##     class counts:   175   222
##    probabilities: 0.441 0.559 
##   left son=6 (162 obs) right son=7 (235 obs)
##   Primary splits:
##       CA_sole      < 3498.759 to the right, improve=10.642600, (0 missing)
##       CA_intercrop < 7955     to the right, improve= 5.759121, (0 missing)
##       CA_rotation  < 4912.688 to the right, improve= 3.957297, (0 missing)
##       Conv         < 4410.106 to the right, improve= 3.625561, (0 missing)
##   Surrogate splits:
##       CA_rotation  < 4912.688 to the right, agree=0.650, adj=0.142, (0 split)
##       Conv         < 2770.3   to the right, agree=0.610, adj=0.043, (0 split)
##       CA_intercrop < 6317.155 to the right, agree=0.602, adj=0.025, (0 split)
## 
## Node number 6: 162 observations,    complexity param=0.02723735
##   predicted class=Highlands  expected loss=0.4197531  P(node) =0.3151751
##     class counts:    94    68
##    probabilities: 0.580 0.420 
##   left son=12 (62 obs) right son=13 (100 obs)
##   Primary splits:
##       CA_intercrop < 3751.362 to the right, improve=2.578742, (0 missing)
##       CA_rotation  < 797.0455 to the right, improve=1.674107, (0 missing)
##       CA_sole      < 4553.112 to the left,  improve=1.629697, (0 missing)
##       Conv         < 4384.598 to the right, improve=1.623725, (0 missing)
##   Surrogate splits:
##       Conv        < 4531.065 to the right, agree=0.667, adj=0.129, (0 split)
##       CA_sole     < 3545.556 to the left,  agree=0.630, adj=0.032, (0 split)
##       CA_rotation < 6675.363 to the right, agree=0.623, adj=0.016, (0 split)
## 
## Node number 7: 235 observations,    complexity param=0.02723735
##   predicted class=Lowlands   expected loss=0.3446809  P(node) =0.4571984
##     class counts:    81   154
##    probabilities: 0.345 0.655 
##   left son=14 (7 obs) right son=15 (228 obs)
##   Primary splits:
##       CA_intercrop < 7260.057 to the right, improve=6.196790, (0 missing)
##       CA_rotation  < 4526.093 to the right, improve=2.910365, (0 missing)
##       CA_sole      < 3331.514 to the left,  improve=2.742059, (0 missing)
##       Conv         < 4269.529 to the right, improve=2.503498, (0 missing)
##   Surrogate splits:
##       Conv < 8130.172 to the right, agree=0.983, adj=0.429, (0 split)
## 
## Node number 12: 62 observations
##   predicted class=Highlands  expected loss=0.3064516  P(node) =0.1206226
##     class counts:    43    19
##    probabilities: 0.694 0.306 
## 
## Node number 13: 100 observations,    complexity param=0.02723735
##   predicted class=Highlands  expected loss=0.49  P(node) =0.1945525
##     class counts:    51    49
##    probabilities: 0.510 0.490 
##   left son=26 (44 obs) right son=27 (56 obs)
##   Primary splits:
##       CA_sole      < 4532.199 to the left,  improve=4.639091, (0 missing)
##       CA_intercrop < 1940     to the left,  improve=3.245873, (0 missing)
##       Conv         < 804.4232 to the left,  improve=2.475017, (0 missing)
##       CA_rotation  < 6347.129 to the left,  improve=1.638120, (0 missing)
##   Surrogate splits:
##       CA_intercrop < 1989.548 to the left,  agree=0.64, adj=0.182, (0 split)
##       Conv         < 1026.515 to the left,  agree=0.63, adj=0.159, (0 split)
##       CA_rotation  < 2099.317 to the left,  agree=0.59, adj=0.068, (0 split)
## 
## Node number 14: 7 observations
##   predicted class=Highlands  expected loss=0  P(node) =0.01361868
##     class counts:     7     0
##    probabilities: 1.000 0.000 
## 
## Node number 15: 228 observations,    complexity param=0.01361868
##   predicted class=Lowlands   expected loss=0.3245614  P(node) =0.4435798
##     class counts:    74   154
##    probabilities: 0.325 0.675 
##   left son=30 (47 obs) right son=31 (181 obs)
##   Primary splits:
##       CA_rotation  < 4526.093 to the right, improve=3.215882, (0 missing)
##       CA_sole      < 3331.514 to the left,  improve=2.434958, (0 missing)
##       CA_intercrop < 4016.424 to the left,  improve=2.247850, (0 missing)
##       Conv         < 4058.735 to the right, improve=1.375439, (0 missing)
##   Surrogate splits:
##       CA_sole      < 3489.13  to the right, agree=0.803, adj=0.043, (0 split)
##       CA_intercrop < 6116.648 to the right, agree=0.798, adj=0.021, (0 split)
## 
## Node number 26: 44 observations
##   predicted class=Highlands  expected loss=0.3181818  P(node) =0.08560311
##     class counts:    30    14
##    probabilities: 0.682 0.318 
## 
## Node number 27: 56 observations
##   predicted class=Lowlands   expected loss=0.375  P(node) =0.1089494
##     class counts:    21    35
##    probabilities: 0.375 0.625 
## 
## Node number 30: 47 observations,    complexity param=0.01361868
##   predicted class=Lowlands   expected loss=0.4893617  P(node) =0.09143969
##     class counts:    23    24
##    probabilities: 0.489 0.511 
##   left son=60 (33 obs) right son=61 (14 obs)
##   Primary splits:
##       Conv         < 3065     to the left,  improve=3.0175000, (0 missing)
##       CA_intercrop < 5065.28  to the left,  improve=2.7166340, (0 missing)
##       CA_rotation  < 6995.561 to the left,  improve=1.9750760, (0 missing)
##       CA_sole      < 1817.72  to the left,  improve=0.9395782, (0 missing)
##   Surrogate splits:
##       CA_rotation  < 7854.061 to the left,  agree=0.766, adj=0.214, (0 split)
##       CA_intercrop < 6182.448 to the left,  agree=0.723, adj=0.071, (0 split)
## 
## Node number 31: 181 observations
##   predicted class=Lowlands   expected loss=0.281768  P(node) =0.3521401
##     class counts:    51   130
##    probabilities: 0.282 0.718 
## 
## Node number 60: 33 observations,    complexity param=0.01167315
##   predicted class=Highlands  expected loss=0.3939394  P(node) =0.06420233
##     class counts:    20    13
##    probabilities: 0.606 0.394 
##   left son=120 (24 obs) right son=121 (9 obs)
##   Primary splits:
##       CA_intercrop < 4212.765 to the left,  improve=1.8409090, (0 missing)
##       CA_rotation  < 5422.03  to the right, improve=0.6987522, (0 missing)
##       CA_sole      < 1817.72  to the left,  improve=0.4848485, (0 missing)
##       Conv         < 605.9165 to the left,  improve=0.4375758, (0 missing)
##   Surrogate splits:
##       CA_rotation < 4640.931 to the right, agree=0.788, adj=0.222, (0 split)
##       Conv        < 717.6149 to the right, agree=0.758, adj=0.111, (0 split)
## 
## Node number 61: 14 observations
##   predicted class=Lowlands   expected loss=0.2142857  P(node) =0.02723735
##     class counts:     3    11
##    probabilities: 0.214 0.786 
## 
## Node number 120: 24 observations
##   predicted class=Highlands  expected loss=0.2916667  P(node) =0.04669261
##     class counts:    17     7
##    probabilities: 0.708 0.292 
## 
## Node number 121: 9 observations
##   predicted class=Lowlands   expected loss=0.3333333  P(node) =0.01750973
##     class counts:     3     6
##    probabilities: 0.333 0.667

plot(fit)

# plot tree 
plot(fit, uniform=TRUE, 
   main="Regression Tree for yield based on Agro-ecology ")
text(fit, use.n=TRUE, all=TRUE, cex=.6)

###################################################################

rpart.tree <- rpart(agroecoregion~Conv+CA_rotation+CA_sole+CA_intercrop, data=machine_py)

rparty.tree <- as.party(rpart.tree)
rparty.tree

## 
## Model formula:
## agroecoregion ~ Conv + CA_rotation + CA_sole + CA_intercrop
## 
## Fitted party:
## [1] root
## |   [2] CA_intercrop < 1261.02413: Highlands (n = 117, err = 29.9%)
## |   [3] CA_intercrop >= 1261.02413
## |   |   [4] CA_sole >= 3498.75942
## |   |   |   [5] CA_intercrop >= 3751.36174: Highlands (n = 62, err = 30.6%)
## |   |   |   [6] CA_intercrop < 3751.36174
## |   |   |   |   [7] CA_sole < 4532.19876: Highlands (n = 44, err = 31.8%)
## |   |   |   |   [8] CA_sole >= 4532.19876: Lowlands (n = 56, err = 37.5%)
## |   |   [9] CA_sole < 3498.75942
## |   |   |   [10] CA_intercrop >= 7260.05747: Highlands (n = 7, err = 0.0%)
## |   |   |   [11] CA_intercrop < 7260.05747
## |   |   |   |   [12] CA_rotation >= 4526.09261
## |   |   |   |   |   [13] Conv < 3065
## |   |   |   |   |   |   [14] CA_intercrop < 4212.76525: Highlands (n = 24, err = 29.2%)
## |   |   |   |   |   |   [15] CA_intercrop >= 4212.76525: Lowlands (n = 9, err = 33.3%)
## |   |   |   |   |   [16] Conv >= 3065: Lowlands (n = 14, err = 21.4%)
## |   |   |   |   [17] CA_rotation < 4526.09261: Lowlands (n = 181, err = 28.2%)
## 
## Number of inner nodes:    8
## Number of terminal nodes: 9

plot(rparty.tree,cex=.01,width=16, height=11, pointsize=5)

# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(rpart.tree) # visualize cross-validation results

## 
## Classification tree:
## rpart(formula = agroecoregion ~ Conv + CA_rotation + CA_sole + 
##     CA_intercrop, data = machine_py)
## 
## Variables actually used in tree construction:
## [1] CA_intercrop CA_rotation  CA_sole      Conv        
## 
## Root node error: 257/514 = 0.5
## 
## n= 514 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.182879      0   1.00000 1.04280 0.044068
## 2 0.101167      1   0.81712 0.91829 0.043961
## 3 0.027237      2   0.71595 0.79377 0.043160
## 4 0.013619      5   0.63424 0.80934 0.043299
## 5 0.011673      7   0.60700 0.81712 0.043364
## 6 0.010000      8   0.59533 0.80934 0.043299

## Warning in rsq.rpart(rpart.tree): may not be applicable for this method

rpart.tree <- rpart(Conv~agroecoregion, data=machine_py)

rparty.tree <- as.party(rpart.tree)
rparty.tree

## 
## Model formula:
## Conv ~ agroecoregion
## 
## Fitted party:
## [1] root: 2464.893 (n = 514, err = 1545338258.1) 
## 
## Number of inner nodes:    0
## Number of terminal nodes: 1

plot(rparty.tree,cex=.01,width=16, height=11, pointsize=5)

# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(rpart.tree) #

## 
## Regression tree:
## rpart(formula = Conv ~ agroecoregion, data = machine_py)
## 
## Variables actually used in tree construction:
## character(0)
## 
## Root node error: 1545338258/514 = 3006495
## 
## n= 514 
## 
##           CP nsplit rel error xerror xstd
## 1 0.00064052      0         1      0    0

Algorithm Evaluation : Accuracy and Kappa

Lovemore Chipindu [lovemore.datascience@gmail.com], [0778796212]

6/8/2019