library(skimr)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(VIM)
library(readr)
library(rpart.plot)

LOAD DATASET

Set some variable as factor

postop_data$SEX<-as.factor(postop_data$SEX)
postop_data$EYE<-as.factor(postop_data$EYE)
postop_data$IOL_MODEL_NEW<-as.factor(postop_data$IOL_MODEL_NEW)
#postop_data$IOL_SIZE<-as.factor(postop_data$IOL_SIZE)
postop_data$IOL_POSITION<-as.factor(postop_data$IOL_POSITION)
postop_data$POSITION<-as.factor(postop_data$POSITION)

CLEANING DATA

Remove registers with innacurated information

postop_data<-postop_data %>% filter(!(grepl("Bastias",NAME) & EYE == "OD")) 
postop_data<-postop_data %>% filter(!(grepl("Bonadeo",NAME))) 

Set PO_IOL_ANT_RAD_MM outlier from register 32 equals to register 31 (OD = OS)

postop_data[32,]$PO_IOL_ANT_RAD_MM=postop_data[31,]$PO_IOL_ANT_RAD_MM
postop_data %>% filter(grepl("Tomas",NAME)) %>% select(-NAME)

Remove IOL with very few instances

#postop_data$IOL_MODEL_NEW %>% unique()
postop_data %>% group_by(IOL_MODEL_NEW) %>% summarise(n=n()) %>% arrange(desc(n))
`summarise()` ungrouping output (override with `.groups` argument)
postop_data<- postop_data %>% filter( IOL_MODEL_NEW %in% c("VICM5","VTICM5","VTICMO","VICMO"))

EXPLORATORY DATA ANALYSIS (EDA)

Basic distribution information

Categorical variables

skim(postop_data %>% select(SEX,AGE, IOL_MODEL_NEW, IOL_SIZE,IOL_POSITION,POSITION)) %>% yank("factor") %>% knitr::kable()
skim_variable n_missing complete_rate ordered n_unique top_counts
SEX 0 1.0000 FALSE 2 F: 62, M: 18, NS: 0
IOL_MODEL_NEW 0 1.0000 FALSE 4 VIC: 49, VTI: 14, VTI: 10, VIC: 7
IOL_POSITION 0 1.0000 FALSE 3 H: 58, V: 18, O: 4
POSITION 25 0.6875 FALSE 12 CP-: 27, CA-: 5, CP-: 5, CA-: 3

Numerical variables

skim(postop_data %>% select(SEX,AGE, IOL_MODEL_NEW,
                            IOL_SIZE,
                            IOL_POSITION,
                            IOL_EQ,
                            IOL_POWER_SPH,
                            T_RETROPOSITION_MM,
                            N_RETROPOSITION_MM
                            )) %>% yank("numeric") %>% knitr::kable()
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
AGE 21 0.7375 31.2203390 5.1396625 22.000 28.0000 31.000 34.0000 44.000 ▃▅▇▂▁
IOL_SIZE 0 1.0000 13.2537500 0.2024807 12.600 13.2000 13.200 13.2000 13.700 ▁▁▇▁▁
IOL_EQ 1 0.9875 -9.5253165 3.5587548 -17.500 -11.5000 -9.000 -7.5000 0.000 ▃▃▇▃▁
IOL_POWER_SPH 0 1.0000 -9.9500000 3.4664760 -17.500 -11.7500 -9.000 -7.5000 -3.000 ▃▂▇▆▂
T_RETROPOSITION_MM 25 0.6875 0.4342909 0.2133933 0.000 0.3115 0.445 0.5295 0.982 ▃▇▇▅▁
N_RETROPOSITION_MM 25 0.6875 0.4217455 0.1797101 0.053 0.3075 0.432 0.5310 0.824 ▂▅▇▃▂

PO MEASURES:

Discriminated by gender:

NOTE: PO_IOL_ANT_RAD_MM column was removed for better visualization since it still shows some outliers (>1000)

postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(SEX=postop_data$SEX) %>% 
  reshape2::melt() %>%
  ggplot()+
    facet_wrap(~SEX)+
    geom_boxplot(aes(x=variable,y=value,fill=variable))+
    #ggdark::dark_theme_gray()+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)
Using SEX as id variables

NA

Discriminated by gender and eyes:


postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(EYE=postop_data$EYE) %>%
  tibble::add_column(SEX=postop_data$SEX) %>%
  reshape2::melt() %>%
  ggplot()+
    facet_grid(~EYE+SEX)+
    geom_boxplot(aes(x=variable,y=value,fill=variable))+
    theme_bw()+
    #ggdark::dark_theme_gray()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)
Using EYE, SEX as id variables

postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(MODEL=postop_data$IOL_MODEL_NEW) %>%
  tibble::add_column(SIZE=postop_data$IOL_SIZE) %>%
  tidyr::pivot_longer(cols = c(-MODEL,-SIZE)) %>%
  ggplot()+
    facet_wrap(~SIZE,ncol = 4)+
    geom_boxplot(aes(x=name,y=value,fill=name))+
    theme_bw()+
    #ggdark::dark_theme_gray()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)

REMOVE MISSING DATA

THe plot below shows in the Y axis the combination of variables with missing values (red) and X axis, the name of the variables. The histogram in the Y-axis show the frequency of each combination and the histogram on top of the plot shows the frecuency of missing values for each variable.



res<-aggr(postop_data, combined = TRUE, 
          numbers = TRUE, 
          sortCombs = TRUE, 
          sortVars = TRUE, 
          labels=names(postop_data),
          cex.axis=.4, 
          varheight = FALSE,
          cex.numbers=0.4,
          cex.lab=0.4,
          prop = FALSE)

 Variables sorted by number of missings: 

#res$combinations

#res$combinations
#plot(res, numbers = TRUE, prop = FALSE)
#res

Remove columns not providing enough information.

Some of the column were removed based on the number of missing values and some other just because they not provide valuable information (this must be confirmed)

library(lubridate)
postop_data <- postop_data %>% select(-CR,
                                      -AGE,
                                      -QX,
                                      -QX_DATE,
                                      -CONTROL_DATE,
                                      -IOL_MODEL_OLD,
                                      -IOL_EQ_CHECK,
                                      -IOL_EQ_CALC,
                                      -IOL_CYL,-IOL_AXIS,
                                      -PO_C_VAULT_1D_MM)


postop_data <- postop_data %>% mutate(AGE=floor(interval(dmy(DOB),today())/years(1))) %>% select(-DOB)

Remove missing values in POSITION

postop_data<-postop_data %>% filter(!is.na(POSITION))

Impute missing values

A basic KNN imputation is done for performing some other analisys such as PCA.


postop_data_imputed <- kNN(postop_data)
postop_data<-postop_data_imputed %>% select(-ends_with("_imp"))
postop_data %>% select(-NAME)
NA

CORRELATION ANALYSIS

Considered variables for PCA

library(ggplot2)
library(ggfortify)
library(cluster)
#postop_data_pca %>% ncol
#postop_data %>% nrow

postop_data_pca<-postop_data %>% select(starts_with("PO_")) %>% select_if(is.numeric) %>% 
  add_column(T_RETROPOSITION_MM=postop_data$T_RETROPOSITION_MM) %>% 
  add_column(N_RETROPOSITION_MM=postop_data$N_RETROPOSITION_MM) %>% 
  add_column(C_VAULT_MM=postop_data$C_VAULT_MM) %>%
  add_column(N_VAULT_MM=postop_data$N_VAULT_MM) %>%
  add_column(T_VAULT_MM=postop_data$T_VAULT_MM) %>% 
  add_column(AGE=postop_data$AGE) %>%
  add_column(IOL_POWER=postop_data$IOL_POWER) #%>%
  #add_column(EYE=postop_data$EYE) 
  
  
names(postop_data_pca) %>% as_tibble()

PCA

pca<-prcomp(postop_data_pca,center=TRUE,scale.=TRUE)
postop_data_pca_res<-data.frame(pca$x,position=postop_data$POSITION,
                            gender=postop_data$SEX,
                            istoric=as.factor(postop_data$IS_TORIC_LENS))

plot<-autoplot(pca,data=postop_data,colour='POSITION',loadings = TRUE,
         loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 2,shape=1,size=3)

#plot+
#  theme_bw()


#ggplot(postop_data_pca_res) +
#  geom_point(aes(x=PC1,y=PC2,color=position),shape =1, size=3 )+
#  theme_bw()




#plotly::plot_ly(postop_data_pca_res , type="scatter3d", 
#                x = ~PC2, y = ~PC1, z = ~PC3, color = ~position,
#                colors = c('blue', 'orange',"red","green"), 
#                opacity=0.5, marker = list(size = 3),text = ~age~gender) 
library(FactoMineR)
#postop_data_pca<-postop_data %>% select(starts_with("PO_")) %>% select_if(is.numeric) %>% add_column(postop_data$SEX)
res_pca = PCA(postop_data_pca, scale.unit=TRUE, ncp=6, 
              graph=T,
              quali.sup=12
              )

PCA Feature Selection

res_pca$var$contrib %>% as.data.frame() %>% add_rownames()%>% reshape2::melt() %>% group_by(variable) %>%
  slice(which.max(value))
Using rowname as id variables
# %>% summarise(max=max(value))

Correlation matrix

Using Spearman correlation

library(d3heatmap)
postop_data_cor_matrix<-cor(postop_data_pca,method="spearman")
#heatmap(postop_data_cor_matrix)
d3heatmap(postop_data_cor_matrix,colors = "Blues",cexRow = 0.8, cexCol = 0.8)

Pairs matrix

pairs(postop_data_pca)

readr::write_csv(postop_data,path="postop_data_cleaned.csv")
---
title: "IZ ICL EDA"
output: 
  html_notebook: 
    code_folding: hide
    toc: true
    toc_float: true
    toc_collapsed: true
---



```{r}
library(skimr)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(VIM)
library(readr)
library(rpart.plot)
```

# LOAD DATASET

```{r include=FALSE}
postop_data <- readr::read_delim("datasets/ARCSCAN POSTOP ICL - CLEANED.tsv",delim='\t')
```

### Set some variable as factor
```{r}
postop_data$SEX<-as.factor(postop_data$SEX)
postop_data$EYE<-as.factor(postop_data$EYE)
postop_data$IOL_MODEL_NEW<-as.factor(postop_data$IOL_MODEL_NEW)
#postop_data$IOL_SIZE<-as.factor(postop_data$IOL_SIZE)
postop_data$IOL_POSITION<-as.factor(postop_data$IOL_POSITION)
postop_data$POSITION<-as.factor(postop_data$POSITION)
```

# CLEANING DATA

### Remove registers with innacurated information

```{r}
postop_data<-postop_data %>% filter(!(grepl("Bastias",NAME) & EYE == "OD")) 
postop_data<-postop_data %>% filter(!(grepl("Bonadeo",NAME))) 

```

### Set  PO_IOL_ANT_RAD_MM outlier from register 32 equals to register 31 (OD = OS)
```{r}
postop_data[32,]$PO_IOL_ANT_RAD_MM=postop_data[31,]$PO_IOL_ANT_RAD_MM
postop_data %>% filter(grepl("Tomas",NAME)) %>% select(-NAME)
```

### Remove IOL with very few instances

```{r}
#postop_data$IOL_MODEL_NEW %>% unique()
postop_data %>% group_by(IOL_MODEL_NEW) %>% summarise(n=n()) %>% arrange(desc(n))
postop_data<- postop_data %>% filter( IOL_MODEL_NEW %in% c("VICM5","VTICM5","VTICMO","VICMO"))
```


# EXPLORATORY DATA ANALYSIS (EDA)



### Basic distribution information
### Categorical variables

```{r}
skim(postop_data %>% select(SEX,AGE, IOL_MODEL_NEW, IOL_SIZE,IOL_POSITION,POSITION)) %>% yank("factor") %>% knitr::kable()
```


### Numerical variables
```{r}
skim(postop_data %>% select(SEX,AGE, IOL_MODEL_NEW,
                            IOL_SIZE,
                            IOL_POSITION,
                            IOL_EQ,
                            IOL_POWER_SPH,
                            T_RETROPOSITION_MM,
                            N_RETROPOSITION_MM
                            )) %>% yank("numeric") %>% knitr::kable()
```

## PO MEASURES: 

#### Discriminated by gender:

NOTE: PO_IOL_ANT_RAD_MM column was removed for better visualization since it still shows some outliers (>1000)

```{r fig.height=4, fig.width=12}
postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(SEX=postop_data$SEX) %>% 
  reshape2::melt() %>%
  ggplot()+
    facet_wrap(~SEX)+
    geom_boxplot(aes(x=variable,y=value,fill=variable))+
    #ggdark::dark_theme_gray()+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)
   
```

#### Discriminated by gender and eyes:

```{r fig.height=4, fig.width=12}

postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(EYE=postop_data$EYE) %>%
  tibble::add_column(SEX=postop_data$SEX) %>%
  reshape2::melt() %>%
  ggplot()+
    facet_grid(~EYE+SEX)+
    geom_boxplot(aes(x=variable,y=value,fill=variable))+
    theme_bw()+
    #ggdark::dark_theme_gray()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)
```

```{r fig.height=4, fig.width=12}
postop_data%>% select_if(is.numeric) %>% 
  select(starts_with("PO"),-PO_IOL_ANT_RAD_MM) %>% 
  tibble::add_column(MODEL=postop_data$IOL_MODEL_NEW) %>%
  tibble::add_column(SIZE=postop_data$IOL_SIZE) %>%
  tidyr::pivot_longer(cols = c(-MODEL,-SIZE)) %>%
  ggplot()+
    facet_wrap(~SIZE,ncol = 4)+
    geom_boxplot(aes(x=name,y=value,fill=name))+
    theme_bw()+
    #ggdark::dark_theme_gray()+
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  guides(fill=FALSE)
```

# REMOVE MISSING DATA

THe plot below shows in the Y axis the combination of variables with missing values (red) and X axis, the name of the variables. The histogram in the Y-axis show the frequency of each combination and the histogram on top of the plot shows the frecuency of missing values for each variable.

```{r}


res<-aggr(postop_data, combined = TRUE, 
          numbers = TRUE, 
          sortCombs = TRUE, 
          sortVars = TRUE, 
          labels=names(postop_data),
          cex.axis=.4, 
          varheight = FALSE,
          cex.numbers=0.4,
          cex.lab=0.4,
          prop = FALSE)
#res$combinations

#res$combinations
#plot(res, numbers = TRUE, prop = FALSE)
#res
```

## Remove columns not providing enough information.

Some of the column were removed based on the number of missing values and some other just because they not provide valuable information (this must be confirmed)

```{r}
library(lubridate)
postop_data <- postop_data %>% select(-CR,
                                      -AGE,
                                      -QX,
                                      -QX_DATE,
                                      -CONTROL_DATE,
                                      -IOL_MODEL_OLD,
                                      -IOL_EQ_CHECK,
                                      -IOL_EQ_CALC,
                                      -IOL_CYL,-IOL_AXIS,
                                      -PO_C_VAULT_1D_MM)


postop_data <- postop_data %>% mutate(AGE=floor(interval(dmy(DOB),today())/years(1))) %>% select(-DOB)
```

## Remove missing values in POSITION

```{r}
postop_data<-postop_data %>% filter(!is.na(POSITION))
```



## Impute missing values
A basic KNN imputation is done for performing some other analisys such as PCA.

```{r}

postop_data_imputed <- kNN(postop_data)
postop_data<-postop_data_imputed %>% select(-ends_with("_imp"))
postop_data %>% select(-NAME)

```


# CORRELATION ANALYSIS
## Considered variables for PCA
```{r}
library(ggplot2)
library(ggfortify)
library(cluster)
#postop_data_pca %>% ncol
#postop_data %>% nrow

postop_data_pca<-postop_data %>% select(starts_with("PO_")) %>% select_if(is.numeric) %>% 
  add_column(T_RETROPOSITION_MM=postop_data$T_RETROPOSITION_MM) %>% 
  add_column(N_RETROPOSITION_MM=postop_data$N_RETROPOSITION_MM) %>% 
  add_column(C_VAULT_MM=postop_data$C_VAULT_MM) %>%
  add_column(N_VAULT_MM=postop_data$N_VAULT_MM) %>%
  add_column(T_VAULT_MM=postop_data$T_VAULT_MM) %>% 
  add_column(AGE=postop_data$AGE) %>%
  add_column(IOL_POWER=postop_data$IOL_POWER) #%>%
  #add_column(EYE=postop_data$EYE) 
  
  
names(postop_data_pca) %>% as_tibble()
```


## PCA

```{r}
pca<-prcomp(postop_data_pca,center=TRUE,scale.=TRUE)
postop_data_pca_res<-data.frame(pca$x,position=postop_data$POSITION,
                            gender=postop_data$SEX,
                            istoric=as.factor(postop_data$IS_TORIC_LENS))

plot<-autoplot(pca,data=postop_data,colour='POSITION',loadings = TRUE,
         loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 2,shape=1,size=3)

#plot+
#  theme_bw()


#ggplot(postop_data_pca_res) +
#  geom_point(aes(x=PC1,y=PC2,color=position),shape =1, size=3 )+
#  theme_bw()




#plotly::plot_ly(postop_data_pca_res , type="scatter3d", 
#                x = ~PC2, y = ~PC1, z = ~PC3, color = ~position,
#                colors = c('blue', 'orange',"red","green"), 
#                opacity=0.5, marker = list(size = 3),text = ~age~gender) 
```


```{r fig.height=8}
library(FactoMineR)
#postop_data_pca<-postop_data %>% select(starts_with("PO_")) %>% select_if(is.numeric) %>% add_column(postop_data$SEX)
res_pca = PCA(postop_data_pca, scale.unit=TRUE, ncp=6, 
              graph=T,
              quali.sup=12
              )
```


## PCA Feature Selection
```{r}
res_pca$var$contrib %>% as.data.frame() %>% add_rownames()%>% reshape2::melt() %>% group_by(variable) %>%
  slice(which.max(value))
# %>% summarise(max=max(value))
```


## Correlation matrix

Using Spearman correlation
```{r fig.height=6, fig.width=6}
library(d3heatmap)
postop_data_cor_matrix<-cor(postop_data_pca,method="spearman")
#heatmap(postop_data_cor_matrix)
d3heatmap(postop_data_cor_matrix,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
```


## Pairs matrix
```{r fig.width=12}
pairs(postop_data_pca)
```
```{r}
readr::write_csv(postop_data,path="postop_data_cleaned.csv")
```

