Helper functions
rules<-function(data,formula_string){
data<-analysis(data)
tree<-rpart::rpart(formula(formula_string),data=data, control = rpart.control(minsplit = 10))
rpart.rules(tree)
}
nrules<-function(data,formula_string){
data<-analysis(data)
tree<-rpart::rpart(formula(formula_string),data=data, control = rpart.control(minsplit = 10))
#tree<-prune(tree,cp=0.010)
rpart.rules(tree) %>% nrow()
#preds<-predict(tree,retropostion_data,type = 'class')
#caret::confusionMatrix(preds,as.factor(retropostion_data$N_POSITION))
}
plot_ranges <- function(resamples,formula_s) {
resamples$trees <- map(.x=resamples$splits,formula_string=formula_s, rules)
resamples$trees <-
map(resamples$trees, function(x)
if (x %>% ncol() == 8)
x)
resamples$trees <-
map(resamples$trees, function(x)
if (!is.null(x)) {
names(x) <- 1:8
x
})
fullset_trees <- do.call(rbind, resamples$trees)
fullset_trees <- fullset_trees %>% as.data.frame()
ggplot(fullset_trees) +
facet_wrap( ~ `1` + `5`,ncol = 2) +
geom_boxplot(aes(x = 1, y = fullset_trees[['8']] %>% as.double()), color =
'red') +
geom_boxplot(aes(x = 2, y = fullset_trees[['6']] %>% as.double()), color =
'blue') +
#geom_line(aes(x = fullset_trees[['8']] %>% as.double()), stat = "density", adjust = 1.25,color='red') +
#geom_line(aes(x = fullset_trees[['6']] %>% as.double()), stat = "density", adjust = 1.25,color='blue') +
xlab("CART: POSITION ranges: Upper limit in red, Lower limit in blue.") +
ylab("values") +
theme_bw()
}
plot_nrules <- function(resamples,formula_s) {
nrules <- map(.x=resamples$splits,formula_string=formula_s, nrules)
fullset_nrules <- do.call(rbind, nrules)
ggplot(fullset_nrules %>% as.data.frame(), aes(x = V1)) +
geom_line(stat = "density", adjust = 1.25) +
#geom_boxplot(x = as.integer(1), color = 'red') +
xlab("CART: number of rules for POSITION") +
#ylab("rules")+
theme_bw()
}
formula_s="T_POSITION~NORM_T_RETROPOSITION"
plot_rules_overlaping<-function(resamples,formula_s){
resamples$trees <- map(.x=resamples$splits,formula_string=formula_s , rules)
resamples$trees <-
map(resamples$trees, function(x)
if (x %>% ncol() == 8)
x)
resamples$trees <-
map(resamples$trees, function(x)
if (!is.null(x)) {
names(x) <- 1:8
x
})
fullset_trees <- do.call(rbind, resamples$trees)
fullset_trees <- fullset_trees %>% as.data.frame()
fullset_trees_ranges<-fullset_trees %>% as.data.frame() %>% select(c(1,5,6,8))
names(fullset_trees_ranges)<-c("POSITION","condition","lower_limit","upper_limit")
fullset_trees_ranges[['lower_limit']]<- fullset_trees_ranges[['lower_limit']] %>% as.double()
fullset_trees_ranges[['upper_limit']]<- fullset_trees_ranges[['upper_limit']] %>% as.double()
fullset_trees_ranges %>% mutate(ll=ifelse(condition==">=",upper_limit,lower_limit))%>% mutate(ul=ifelse(is.na(lower_limit),1,upper_limit)) %>%
mutate(ul=ifelse(condition=="< ",lower_limit,ul)) %>%
mutate(ll=ifelse(is.na(upper_limit),0,ll)) %>% select(POSITION,ll,ul) %>%
ggplot()+
#geom_line(aes(x=0:0.34,y=POSITION,color=POSITION),size=5)
geom_segment(aes(y=POSITION %>% factor,
yend=POSITION %>% factor,
x=ul,
xend=ll,
color=POSITION %>% factor),
size=5,alpha=0.01
)+
xlim(0,1)+
ylab("POSITION")+xlab("Normalized Range")+
theme_bw()+theme(legend.position = "none")
}
NASAL POSITION
CART model
tree <-
rpart::rpart(
N_POSITION ~ NORM_N_RETROPOSITION,
data = retropostion_data,
control = rpart.control(minsplit = 10, xval = 10)
)
rpart.plot(
tree,
type = 1,
extra = 101,
box.palette = "GnBu",
branch.lty = 3,
shadow.col = "gray",
nn = TRUE
)

printcp(tree)
Classification tree:
rpart::rpart(formula = N_POSITION ~ NORM_N_RETROPOSITION, data = retropostion_data,
control = rpart.control(minsplit = 10, xval = 10))
Variables actually used in tree construction:
[1] NORM_N_RETROPOSITION
Root node error: 16/63 = 0.25397
n= 63
CP nsplit rel error xerror xstd
1 0.375 0 1.000 1.000 0.21593
2 0.125 1 0.625 0.625 0.18128
3 0.010 2 0.500 0.625 0.18128
rpart.rules(tree)
preds <- predict(tree, retropostion_data, type = 'class')
cm <-
caret::confusionMatrix(preds, as.factor(retropostion_data$N_POSITION))
cm
Confusion Matrix and Statistics
Reference
Prediction CA CP S
CA 4 0 2
CP 6 47 0
S 0 0 4
Overall Statistics
Accuracy : 0.873
95% CI : (0.765, 0.9435)
No Information Rate : 0.746
P-Value [Acc > NIR] : 0.01097
Kappa : 0.6385
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: CA Class: CP Class: S
Sensitivity 0.40000 1.0000 0.66667
Specificity 0.96226 0.6250 1.00000
Pos Pred Value 0.66667 0.8868 1.00000
Neg Pred Value 0.89474 1.0000 0.96610
Prevalence 0.15873 0.7460 0.09524
Detection Rate 0.06349 0.7460 0.06349
Detection Prevalence 0.09524 0.8413 0.06349
Balanced Accuracy 0.68113 0.8125 0.83333
Calculating confidence intervals using one-tailed binomial test
The binomial test is an exact test of the statistical significance of deviations from a theoretically expected distribution of observations into two categories. for this case we consider each category/class against the rest. The clasical one-vs-all approach for dealing with multiclasses situations
Sulcus
binom.test(cm$table[3,3] ,cm$table[1:3,3] %>% sum())
Exact binomial test
data: cm$table[3, 3] and cm$table[1:3, 3] %>% sum()
number of successes = 4, number of trials = 6, p-value = 0.6875
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.2227781 0.9567281
sample estimates:
probability of success
0.6666667
Ciliary Body Posterior
binom.test(cm$table[2,2] ,cm$table[1:3,2] %>% sum())
Exact binomial test
data: cm$table[2, 2] and cm$table[1:3, 2] %>% sum()
number of successes = 47, number of trials = 47, p-value = 1.421e-14
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.9245143 1.0000000
sample estimates:
probability of success
1
Ciliary Body Anterior
binom.test(cm$table[1,1] ,cm$table[1:3,1] %>% sum())
Exact binomial test
data: cm$table[1, 1] and cm$table[1:3, 1] %>% sum()
number of successes = 4, number of trials = 10, p-value = 0.7539
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.1215523 0.7376219
sample estimates:
probability of success
0.4
Analysis of rules found by CART
LOOCV
Similar to CV with K = N, where N is the size of the dataset. Ideally, this approach has a low variance in the results.
Number of rules per model distribution
resamples<-loo_cv(retropostion_data)
plot_nrules(resamples,"N_POSITION~NORM_N_RETROPOSITION")

Range variation for models with rulset size = 3
Rules can have an upper limit (>=), lower limit(<) or both (is)
plot_ranges(resamples,"N_POSITION~NORM_N_RETROPOSITION")

Range Overlaping for ruleset size = 3
plot_rules_overlaping(resamples,"N_POSITION~NORM_N_RETROPOSITION")

BOOTSTRAP
We generate 2000 new samples with replacement and we build 2000 new CARTs.
Number of rules per model distribution
resamples<-bootstraps(retropostion_data,strata = N_POSITION,times = 2000 )
plot_nrules(resamples,"N_POSITION~NORM_N_RETROPOSITION")

Range variation for models with rulset size = 3
Rules can have an upper limit (>), lower limit(<) or both (is)
plot_ranges(resamples,"N_POSITION~NORM_N_RETROPOSITION")

Range Overlaping for ruleset size = 3
plot_rules_overlaping(resamples,"N_POSITION~NORM_N_RETROPOSITION")

TEMPORAL POSITION
CART model
tree <-
rpart::rpart(
T_POSITION ~ NORM_T_RETROPOSITION,
data = retropostion_data,
control = rpart.control(minsplit = 10, xval = 10)
)
rpart.plot(
tree,
type = 1,
extra = 101,
box.palette = "GnBu",
branch.lty = 3,
shadow.col = "gray",
nn = TRUE
)

printcp(tree)
Classification tree:
rpart::rpart(formula = T_POSITION ~ NORM_T_RETROPOSITION, data = retropostion_data,
control = rpart.control(minsplit = 10, xval = 10))
Variables actually used in tree construction:
[1] NORM_T_RETROPOSITION
Root node error: 14/63 = 0.22222
n= 63
CP nsplit rel error xerror xstd
1 0.500000 0 1.00000 1.00000 0.23570
2 0.035714 1 0.50000 0.57143 0.18877
3 0.010000 3 0.42857 0.57143 0.18877
rpart.rules(tree)
preds <- predict(tree, retropostion_data, type = 'class')
cm <-
caret::confusionMatrix(preds, as.factor(retropostion_data$T_POSITION))
cm
Confusion Matrix and Statistics
Reference
Prediction CA CP S
CA 2 1 0
CP 2 48 1
S 2 0 7
Overall Statistics
Accuracy : 0.9048
95% CI : (0.8041, 0.9642)
No Information Rate : 0.7778
P-Value [Acc > NIR] : 0.007371
Kappa : 0.7261
Mcnemar's Test P-Value : 0.343030
Statistics by Class:
Class: CA Class: CP Class: S
Sensitivity 0.33333 0.9796 0.8750
Specificity 0.98246 0.7857 0.9636
Pos Pred Value 0.66667 0.9412 0.7778
Neg Pred Value 0.93333 0.9167 0.9815
Prevalence 0.09524 0.7778 0.1270
Detection Rate 0.03175 0.7619 0.1111
Detection Prevalence 0.04762 0.8095 0.1429
Balanced Accuracy 0.65789 0.8827 0.9193
Calculating confidence intervals using one-tailed binomial test
Sulcus
binom.test(cm$table[3,3] ,cm$table[1:3,3] %>% sum())
Exact binomial test
data: cm$table[3, 3] and cm$table[1:3, 3] %>% sum()
number of successes = 7, number of trials = 8, p-value = 0.07031
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.4734903 0.9968403
sample estimates:
probability of success
0.875
Ciliary Body Posterior
binom.test(cm$table[2,2] ,cm$table[1:3,2] %>% sum())
Exact binomial test
data: cm$table[2, 2] and cm$table[1:3, 2] %>% sum()
number of successes = 48, number of trials = 49, p-value = 1.776e-13
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.8914582 0.9994834
sample estimates:
probability of success
0.9795918
Ciliary Body Anterior
binom.test(cm$table[1,1] ,cm$table[1:3,1] %>% sum())
Exact binomial test
data: cm$table[1, 1] and cm$table[1:3, 1] %>% sum()
number of successes = 2, number of trials = 6, p-value = 0.6875
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.04327187 0.77722190
sample estimates:
probability of success
0.3333333
Analysis of rules found by CART
LOOCV
Similar to CV with K = N, where N is the size of the dataset. Ideally, this approach has a low variance in the results.
Number of rules per model distribution
resamples<-loo_cv(retropostion_data)
plot_nrules(resamples,"T_POSITION~NORM_T_RETROPOSITION")

Range variation for models with rulset
Rules can have an upper limit (>=), lower limit(<) or both (is) (*) Ruleset should include at least one rule with upper and lower limit
plot_ranges(resamples,"T_POSITION~NORM_T_RETROPOSITION")

Range Overlaping for ruleset
plot_rules_overlaping(resamples,"T_POSITION~NORM_T_RETROPOSITION")

BOOTSTRAP
We generate 2000 new samples with replacement and we build 2000 new CARTs.
Number of rules per model distribution
resamples<-bootstraps(retropostion_data,strata = N_POSITION,times = 2000 )
plot_nrules(resamples,"T_POSITION~NORM_T_RETROPOSITION")

Range variation for models with ruleset
Rules can have an upper limit (>), lower limit(<) or both (is) (*) Ruleset should include at least one rule with upper and lower limit
plot_ranges(resamples,"T_POSITION~NORM_T_RETROPOSITION")

Range Overlaping for ruleset
plot_rules_overlaping(resamples,"T_POSITION~NORM_T_RETROPOSITION")

---
title: "IZ ICL POSITION ANALYSIS (RESAMPLING)"
output: 
  html_notebook: 
    code_folding: hide
    toc: true
    toc_float: true
    toc_collapsed: true
---


```{r message=FALSE, warning=FALSE, include=FALSE}
library(skimr)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(VIM)
library(readr)
library(rpart.plot)
library(caret)
library(rsample)
library(purrr)
library(ggstance)
```

## Reading and preprocesing datataset

```{r message=FALSE, warning=FALSE, include=FALSE}
postop_data <-
  readr::read_csv(file = "postop_data_cleaned_normalized-version5-retest.csv")
retropostion_data <- postop_data %>% select(
  NAME,
  NORM_N_RETROPOSITION,
  NORM_T_RETROPOSITION,
  POSITION,
  N_RETROPOSITION_MM,
  T_RETROPOSITION_MM,
  RETEST_POSITION,
  NORM_T_RETRO_HALF_IOL,
  NORM_N_RETRO_HALF_IOL,
  NORM_T_RETRO_IOL,
  NORM_N_RETRO_IOL
  
  
) %>% filter(!is.na(RETEST_POSITION)) %>% separate(RETEST_POSITION,
                                                   into = c("N_POSITION", "T_POSITION"),
                                                   sep = '-') %>%
  drop_na(T_POSITION) %>% drop_na(NORM_T_RETROPOSITION)

retropostion_data <-
  retropostion_data %>% mutate(
    T_POSITION = ifelse(T_POSITION == "CM", "CA", T_POSITION),
    N_POSITION = ifelse(N_POSITION == "CM", "CA", N_POSITION)
  )
retropostion_data %>%
  group_by(N_POSITION) %>%
  summarise(total = n())

retropostion_data
```

## Helper functions
```{r}
rules<-function(data,formula_string){
  data<-analysis(data)
  tree<-rpart::rpart(formula(formula_string),data=data, control =  rpart.control(minsplit = 10))
  rpart.rules(tree) 
}
nrules<-function(data,formula_string){
  data<-analysis(data)
  tree<-rpart::rpart(formula(formula_string),data=data, control =  rpart.control(minsplit = 10))
  #tree<-prune(tree,cp=0.010)
  rpart.rules(tree) %>% nrow()
  #preds<-predict(tree,retropostion_data,type = 'class')
  #caret::confusionMatrix(preds,as.factor(retropostion_data$N_POSITION))
}

plot_ranges <- function(resamples,formula_s) {
  resamples$trees <- map(.x=resamples$splits,formula_string=formula_s, rules)
  resamples$trees <-
    map(resamples$trees, function(x)
      if (x %>% ncol() == 8)
        x)
  resamples$trees <-
    map(resamples$trees, function(x)
      if (!is.null(x)) {
        names(x) <- 1:8
        x
      })
  fullset_trees <- do.call(rbind, resamples$trees)
  fullset_trees <- fullset_trees %>% as.data.frame()
  ggplot(fullset_trees) +
    facet_wrap( ~ `1` + `5`,ncol = 2) +
    geom_boxplot(aes(x = 1, y = fullset_trees[['8']]  %>% as.double()), color =
                   'red') +
    geom_boxplot(aes(x = 2, y = fullset_trees[['6']]  %>% as.double()), color =
                   'blue') +
    #geom_line(aes(x = fullset_trees[['8']]  %>% as.double()), stat = "density", adjust = 1.25,color='red') +
    #geom_line(aes(x = fullset_trees[['6']]  %>% as.double()), stat = "density", adjust = 1.25,color='blue') +
    xlab("CART:  POSITION ranges: Upper limit in red, Lower limit in blue.") +
    ylab("values") +
    theme_bw()
}


plot_nrules <- function(resamples,formula_s) {
  nrules <- map(.x=resamples$splits,formula_string=formula_s, nrules)
  fullset_nrules <- do.call(rbind, nrules)
  ggplot(fullset_nrules %>% as.data.frame(), aes(x = V1)) +
    geom_line(stat = "density", adjust = 1.25) +
    #geom_boxplot(x = as.integer(1), color = 'red') +
    xlab("CART: number of rules for POSITION") +
    #ylab("rules")+
    theme_bw()
}

formula_s="T_POSITION~NORM_T_RETROPOSITION"

plot_rules_overlaping<-function(resamples,formula_s){
 resamples$trees <- map(.x=resamples$splits,formula_string=formula_s , rules)
  resamples$trees <-
    map(resamples$trees, function(x)
      if (x %>% ncol() == 8)
        x)
  resamples$trees <-
    map(resamples$trees, function(x)
      if (!is.null(x)) {
        names(x) <- 1:8
        x
      })
  fullset_trees <- do.call(rbind, resamples$trees)
  fullset_trees <- fullset_trees %>% as.data.frame()
   
  
  
fullset_trees_ranges<-fullset_trees %>% as.data.frame() %>% select(c(1,5,6,8))
names(fullset_trees_ranges)<-c("POSITION","condition","lower_limit","upper_limit")
fullset_trees_ranges[['lower_limit']]<- fullset_trees_ranges[['lower_limit']] %>% as.double()
fullset_trees_ranges[['upper_limit']]<- fullset_trees_ranges[['upper_limit']] %>% as.double()


fullset_trees_ranges %>%  mutate(ll=ifelse(condition==">=",upper_limit,lower_limit))%>% mutate(ul=ifelse(is.na(lower_limit),1,upper_limit)) %>%
mutate(ul=ifelse(condition=="< ",lower_limit,ul)) %>%
mutate(ll=ifelse(is.na(upper_limit),0,ll))  %>% select(POSITION,ll,ul) %>%
  ggplot()+
  #geom_line(aes(x=0:0.34,y=POSITION,color=POSITION),size=5)
  geom_segment(aes(y=POSITION %>% factor,
                   yend=POSITION %>% factor,
                   x=ul,
                   xend=ll,
                   color=POSITION %>% factor),
                 size=5,alpha=0.01
                 )+
  xlim(0,1)+
  ylab("POSITION")+xlab("Normalized Range")+
  theme_bw()+theme(legend.position = "none") 
}
```


# NASAL POSITION

### CART model

```{r}
tree <-
  rpart::rpart(
    N_POSITION ~ NORM_N_RETROPOSITION,
    data = retropostion_data,
    control = rpart.control(minsplit = 10, xval = 10)
  )
rpart.plot(
  tree,
  type = 1,
  extra = 101,
  box.palette = "GnBu",
  branch.lty = 3,
  shadow.col = "gray",
  nn = TRUE
)
printcp(tree)
rpart.rules(tree)


preds <- predict(tree, retropostion_data, type = 'class')
cm <-
  caret::confusionMatrix(preds, as.factor(retropostion_data$N_POSITION))
cm
```

### Calculating confidence intervals using one-tailed binomial test

The binomial test is an exact test of the statistical significance of deviations from a theoretically expected distribution of observations into two categories. for this case we consider each category/class against the rest. The clasical one-vs-all approach for dealing with multiclasses situations

#### Sulcus
```{r}
binom.test(cm$table[3,3] ,cm$table[1:3,3] %>% sum())
```

#### Ciliary Body Posterior
```{r}
binom.test(cm$table[2,2] ,cm$table[1:3,2] %>% sum())
```

#### Ciliary Body Anterior
```{r}
binom.test(cm$table[1,1] ,cm$table[1:3,1] %>% sum())

```




### Analysis of rules found by CART

#### LOOCV

Similar to CV with K = N, where N is the size of the dataset. Ideally, this approach has a low variance in the results.

#####  Number of rules per model distribution
```{r}
resamples<-loo_cv(retropostion_data)
plot_nrules(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```

##### Range variation for models with rulset size = 3

Rules can have an upper limit (>=), lower limit(<) or both (is)
```{r}

plot_ranges(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```

##### Range Overlaping for ruleset size = 3

```{r}
plot_rules_overlaping(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```

#### BOOTSTRAP 

We generate 2000 new samples with replacement and  we build 2000 new CARTs.

#####  Number of rules per model distribution
```{r}
resamples<-bootstraps(retropostion_data,strata = N_POSITION,times = 2000  )
plot_nrules(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```

##### Range variation for models with rulset size = 3
Rules can have an upper limit (>), lower limit(<) or both (is)
```{r}

plot_ranges(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```

##### Range Overlaping for ruleset size = 3

```{r}
plot_rules_overlaping(resamples,"N_POSITION~NORM_N_RETROPOSITION")
```


# TEMPORAL POSITION

### CART model

```{r}
tree <-
  rpart::rpart(
    T_POSITION ~ NORM_T_RETROPOSITION,
    data = retropostion_data,
    control = rpart.control(minsplit = 10, xval = 10)
  )
rpart.plot(
  tree,
  type = 1,
  extra = 101,
  box.palette = "GnBu",
  branch.lty = 3,
  shadow.col = "gray",
  nn = TRUE
)
printcp(tree)
rpart.rules(tree)


preds <- predict(tree, retropostion_data, type = 'class')
cm <-
  caret::confusionMatrix(preds, as.factor(retropostion_data$T_POSITION))
cm
```

### Calculating confidence intervals using one-tailed binomial test

#### Sulcus
```{r}
binom.test(cm$table[3,3] ,cm$table[1:3,3] %>% sum())
```

#### Ciliary Body Posterior
```{r}
binom.test(cm$table[2,2] ,cm$table[1:3,2] %>% sum())
```

#### Ciliary Body Anterior
```{r}
binom.test(cm$table[1,1] ,cm$table[1:3,1] %>% sum())

```


### Analysis of rules found by CART

#### LOOCV

Similar to CV with K = N, where N is the size of the dataset. Ideally, this approach has a low variance in the results.

#####  Number of rules per model distribution
```{r}
resamples<-loo_cv(retropostion_data)
plot_nrules(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```

##### Range variation for models with rulset

Rules can have an upper limit (>=), lower limit(<) or both (is)
(*)  Ruleset should include at least one rule with upper and lower limit
```{r}

plot_ranges(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```

##### Range Overlaping for ruleset 

```{r}
plot_rules_overlaping(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```

#### BOOTSTRAP 

We generate 2000 new samples with replacement and  we build 2000 new CARTs.

#####  Number of rules per model distribution
```{r}
resamples<-bootstraps(retropostion_data,strata = N_POSITION,times = 2000  )
plot_nrules(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```

##### Range variation for models with ruleset 

Rules can have an upper limit (>), lower limit(<) or both (is)
(*)  Ruleset should include at least one rule with upper and lower limit
```{r}

plot_ranges(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```

##### Range Overlaping for ruleset 


```{r}
plot_rules_overlaping(resamples,"T_POSITION~NORM_T_RETROPOSITION")
```




