library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
#library(ggmap)
#library(catboost)

rad2deg <- function(rad) {(rad * 180) / (pi)}
deg2rad <- function(deg) {(deg * pi) / (180)}


# Datasets originales
#caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
#caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
#caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
#caprino_gps<-read.csv("data/caprino_gps.csv",header=F)

# Datasets nuevos
caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
caprino_gps<-read.csv("data/caprino_gps_distancia.csv",header=F) #INCLUYE UNA SEXTA COLUMNA CON la distancia euclidea
caprino_gps<-caprino_gps %>% select(-V7)

caprino_imu_acc %>% nrow()
[1] 463703
caprino_imu_gyros %>% nrow()
[1] 463703
caprino_gps %>% nrow()
[1] 231767
caprino_hours %>% nrow()
[1] 231767

Datos de los archivos

Construccion del Dataset

caprino_data<-cbind(caprino_imu_acc,caprino_imu_gyros %>% select(-V1))

caprino_data<-cbind(caprino_data[seq(2,nrow(caprino_data),2),] %>% head(nrow(caprino_gps)-1),  caprino_gps[2:nrow(caprino_gps),] ) 
names(caprino_data)<-c("imu_ts",
                       "accX",
                       "accY",
                       "accZ",
                       "gyroX",
                       "gyroY",
                       "gyroZ",
                       "gps_hour",
                       "gps_ts",
                       "lat",
                       "long",
                       "alt",
                       "distance" 
                        )
#caprino_data %>% mutate(diff=imu_ts-gps_ts) %>% select(diff)
caprino_data<-caprino_data %>% mutate(activity=ifelse(hms(gps_hour)>hms("8:56:00") & hms(gps_hour)<hms("10:38:00"),"RP",
                                       
                                        ifelse(hms(gps_hour)>=hms("10:38:00") & hms(gps_hour)<hms("11:38:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("11:38:00") & hms(gps_hour)<hms("12:05:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("12:05:00") & hms(gps_hour)<hms("15:48:00"),"RF",
                                        ifelse(hms(gps_hour)>=hms("17:30:00") & hms(gps_hour)<hms("20:33:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("20:50:00") & hms(gps_hour)<hms("21:31:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("21:33:00") & hms(gps_hour)<hms("21:50:00"),"RP",NA)
                                        )))))
                                        )
                        
                        )

caprino_data <-caprino_data %>% mutate(id=cut(as.POSIXct(hms(caprino_data$gps_hour),origin="1960-01-01",tz="GMT"),breaks="1 min")) 
caprino_data %>% head(10)

Tomamos muestras aleatorias de 1 minuto de duracion para sobre las cuales despues queremos predecir el comportamiento

#readr::write_csv(caprino_data,"data/caprino_data_sliced_distance.csv")
#set.seed(21092019)
#set.seed(21092022)
#set.seed(07092020)
#set.seed(01092020)
minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
data_train%>% group_by(id) %>% summarise(n=n())
data_test%>% group_by(id) %>% summarise(n=n())
data_train %>% filter(!is.na(activity)) %>% group_by(activity) %>% summarise(n=n()) 
NA

Aplicamos Bag-of-features.

presupuesto

Consideramos cada muestra como un vector de 5 dimensiones. La idea es aplicar un algoritmo de clustering sobre estos vectores (accX,accY,accZ,GyroX,gyroY). Una vez encontrado los centroides, para cada muestra de 1 se construye un histograma de los vectores que han caido en cada uno de estos clusters. Este histograma es el que se utiliza para entregar el algoritmo.

La hipotesis de aplicar este metodo es que aquellas muestras que pertenecen a una misma actividad tendran valores similares. Los cuales han sido representado mediantes estos histogramas.

Aplicamos k-means con 90 centroides sobre el conjunto de entrenamiento.

Se utilizan todos los sensores y la distancia.

centroids<-90
cluster_features<-c('accX','accY','accZ','gyroX','gyroY','gyroZ','distance')

## Sampling considering time slices 
  minute_slice <-
    as.vector(t(
      caprino_data %>% group_by(id) %>% summarise(n = n()) %>% select(id)
    ))
  trainidx <-
    sample(1:length(minute_slice), (length(minute_slice) * 70) / 100.0)
  data_train <- caprino_data %>% filter(id %in% minute_slice[trainidx])
  data_test <-
    caprino_data %>% filter(id %in% (minute_slice[-trainidx]))
  
  
  # Kmeans on trainset
  cluster_results <-
    kmeans(
      x = data_train %>% select(cluster_features),
      centers = centroids,
      nstart = 10
    )
  data_train_cluster <-
    cbind(data_train, cluster = as.integer(cluster_results$cluster)) %>% group_by(activity, cluster)
  
  cluster_results<-cluster_results$centers %>% 
    as.data.frame() %>% 
    tibble::rownames_to_column(var = "cluster")

Aplicamos KNN con K=3 en el conjunto de entrenamiento.

  # KNN using centroids from kmeans for testset
  
  cluster_results_test <-class::knn(train=cluster_results %>% select(-cluster),
                                    test=data_test %>% select(cluster_features)
                                    ,cl = as.integer(cluster_results$cluster),
                                    k=3 )
  data_test_cluster <-
    cbind(data_test, cluster = as.integer(cluster_results_test)) %>% group_by(activity, cluster)
  
  # Create histograms for bag-of-features and label each new datapoint 
  ## For trainset
  data_train_cluster_labeled <- data_train_cluster   %>%
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_train_cluster %>% ungroup() %>%
                 select(id, activity)  %>%
                 unique()  ,
               by = "id") %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_train_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_train_cluster_labeled)]<-data_train_cluster_labeled
  data_train_cluster_labeled<-data_cluster_histogram
  
  ## For testset
  data_test_cluster_labeled <- data_test_cluster   %>% 
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_test_cluster %>%  ungroup() %>%
                 select(id, activity)  %>%
                 unique() ,
               by = "id")  %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_test_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_test_cluster_labeled)]<-data_test_cluster_labeled
  data_test_cluster_labeled<-data_cluster_histogram

Total de actividades en Train

data_train_cluster_labeled %>% group_by(activity) %>% summarise(total=n())

Total de actividades en Test

data_test_cluster_labeled %>% group_by(activity) %>% summarise(total=n())

Aplicamos 3x5 CV para elegir la mejor version del modelo

Probamos Random Forest

rf_grid <-  expand.grid(.mtry=c(5)) 
rfFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = "rf",
  
               #tuneLength=5, 
               tuneGrid=rf_grid,
               
               verbose=2,
       
               trControl = ctrl_fast,
               ntree=200)
rfFit$results

Seleccionamos el modelo

selected_model<-rfFit

Evaluacion de los resultados de CV (CV Report)

predictions_cv<-selected_model$pred %>% select(Resample,pred,obs)

predictions_cv_cm <- predictions_cv %>% group_by(Resample) %>%
  group_modify(
    ~ as.data.frame(
      caret::confusionMatrix(
        data = .x$pred,
        reference = .x$obs,
        mode = 'everything'
      )$byClass
    ) %>%
      select(Precision, Recall, F1,) %>% 
      tibble::rownames_to_column(var = "activity")
  )

predictions_cv_results<- predictions_cv_cm %>% 
  group_by(activity) %>% 
  summarise(
    precision_mean=mean(Precision),
    precision_se=sd(Precision)/sqrt(selected_model$control$number*selected_model$control$repeats),
    precision_conf_inf=mean(Precision)-(precision_se*1.96),
    precision_conf_sup=mean(Precision)+(precision_se*1.96),
    recall_mean=mean(Recall),
    recall_se=sd(Recall)/sqrt(selected_model$control$number*selected_model$control$repeats),
    recall_conf_inf=mean(Recall)-(recall_se*1.96),
    recall_conf_sup=mean(Recall)+(recall_se*1.96),
    F1_mean=mean(F1),
    F1_se=sd(F1)/sqrt(selected_model$control$number*selected_model$control$repeats),
    F1_conf_inf=mean(F1)-(F1_se*1.96),
    F1_conf_sup=mean(F1)+(F1_se*1.96)

)
predictions_cv_cm_plot<-predictions_cv_cm %>% reshape2::melt() %>%
  ggplot()+
  facet_wrap(~variable)+
  ylim(0.6,1)+
  geom_boxplot(aes(x=activity,y=value,fill=activity))+
  labs(title=paste0(selected_model$method," for activity prediction [CV] Report"),
                    subtitle="[3x5 CV]")+
  ggdark::dark_theme_gray()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
      theme(legend.position = "none") 
   
predictions_cv_cm

predictions_cv_cm_plot

predictions_cv_results
prediction_cv_cor<-predictions_cv_cm %>% select(activity,Recall,-Resample) %>% tidyr::pivot_wider(names_from = activity, values_from = Recall) %>% ungroup() %>% select(-Resample) %>% cor(method = "spearman")
Adding missing grouping variables: `Resample`
library(d3heatmap)
d3heatmap(prediction_cv_cor,colors = "Blues",cexRow = 0.8, cexCol = 0.8)

#predictions_cv_results_error  %>% select(activity,Sensitivity_sd,Specificity_sd,F1_sd,BalancedAccuracy_sd)
#predictions_cv_results  %>% select(activity,Sensitivity_mean,Specificity_mean,F1_mean,BalancedAccuracy_mean) %>% reshape2::melt() %>%
  
  
#  ggplot()+
#  facet_wrap(~variable)+
#  geom_col(aes(x=activity,y=value,fill=activity))+
#  ggdark::dark_theme_grey()+
#  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
#  theme(legend.position = "none") 

Evaluacion sobre el conjunto de test (Test Report)

Matriz de Confusion

data_test_prediction<-predict(selected_model,data_test_cluster_labeled %>% select(-id))#,type="prob")
cm<-caret::confusionMatrix(data_test_prediction,as.factor(data_test_cluster_labeled$activity),mode='everything')
cm
Confusion Matrix and Statistics

          Reference
Prediction  G RF RP  W
        G  45  9  0  2
        RF  9 58  6  0
        RP  1  2 33  1
        W   1  0  0 32

Overall Statistics
                                          
               Accuracy : 0.8442          
                 95% CI : (0.7862, 0.8916)
    No Information Rate : 0.3467          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.786           
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: G Class: RF Class: RP Class: W
Sensitivity            0.8036    0.8406    0.8462   0.9143
Specificity            0.9231    0.8846    0.9750   0.9939
Pos Pred Value         0.8036    0.7945    0.8919   0.9697
Neg Pred Value         0.9231    0.9127    0.9630   0.9819
Precision              0.8036    0.7945    0.8919   0.9697
Recall                 0.8036    0.8406    0.8462   0.9143
F1                     0.8036    0.8169    0.8684   0.9412
Prevalence             0.2814    0.3467    0.1960   0.1759
Detection Rate         0.2261    0.2915    0.1658   0.1608
Detection Prevalence   0.2814    0.3668    0.1859   0.1658
Balanced Accuracy      0.8633    0.8626    0.9106   0.9541
test_data<-cm$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var='activity') %>%
  select(activity,Precision, Recall, F1) %>% reshape2::melt()

predictions_cv_cm_plot+
  labs(title=paste0(selected_model$method," for activity prediction [CV+Test] Report"),
                    subtitle="[3x5 CV]")+
    ylim(0.6,1)+
  geom_point(data=test_data,aes(x=activity,y=value),fill="darkgreen",color="white",shape=23,size=2)+
    theme(legend.position = "none") 

Metricas Macro y Micro



macro_metrics<-(apply(cm$byClass,2,sum)/4) [c(5,6,7)]

s=table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))

Precision_micro <-  (sum(diag(s)) / sum(apply(s,1, sum)))
Recall_micro <- (sum(diag(s)) / sum(apply(s,2, sum)))

micro_metrics <- c( 
  (sum(diag(s)) / sum(apply(s,1, sum))),
 (sum(diag(s)) / sum(apply(s,2, sum))),
 2 * ((Precision_micro*Recall_micro)/(Precision_micro+Recall_micro)))

mm<-cbind(c(macro_metrics),c(micro_metrics))


library(gridExtra)
data.frame(macro=mm[,1],micro=mm[,2])
apply(cm$table,2,sum)
 G RF RP  W 
56 69 39 35 

Matriz de confusion (plot)

len_data_test_prediction = length(data_test_prediction)
reshape2::melt(table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))) %>% mutate(value=(value)) %>%
  ggplot(aes(x=data_test_prediction,y=Var2))+
  geom_tile(aes(fill=value), colour = "white") + 
   geom_text(aes(label = sprintf("%1.2f", value)), color='black',vjust = 1)+
  scale_fill_gradient(low = "white", high = "red")+
  xlab(" Predicted Activity ")+ylab(" Actual Activity")+
  theme_bw()+ theme(legend.position = "none")


#ggsave(filename = "/home/harpo/Dropbox/ongoing-work/publicaciones/CAI2019-JAIIO/confusion-matrix.png",width = 8, height = 4)

G vs RF



GvsRFmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RF")) %>% filter(predictions %in% c("G", "RF"))

GvsRF_recall <-
  caret::confusionMatrix(
    factor(GvsRFmatrix$predictions),
    factor(GvsRFmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRF")

GvsRF_recall

RF vs RP


RFvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("RF", "RP")) %>% filter(predictions %in% c("RF", "RP"))

RFvsRP_recall <-
  caret::confusionMatrix(
    factor(RFvsRPmatrix$predictions),
    factor(RFvsRPmatrix$activity),
    mode = 'everything',
    positive = "RF"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="RFvsRP")

RFvsRP_recall

G vs RP


GvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RP")) %>% filter(predictions %in% c("G", "RP"))

GvsRP_recall <-
  caret::confusionMatrix(
    factor(GvsRPmatrix$predictions),
    factor(GvsRPmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRP")

GvsRP_recall

Session Info

sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=es_AR.UTF-8       LC_NUMERIC=C               LC_TIME=es_AR.UTF-8        LC_COLLATE=es_AR.UTF-8    
 [5] LC_MONETARY=es_AR.UTF-8    LC_MESSAGES=es_AR.UTF-8    LC_PAPER=es_AR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_AR.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] d3heatmap_0.6.1.3 gridExtra_2.3     doMC_1.3.6        iterators_1.0.12  foreach_1.4.7     caret_6.0-86      lattice_0.20-41  
 [8] plotly_4.9.2      ggplot2_3.2.1     lubridate_1.7.4   dplyr_1.0.0       readr_1.3.1      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5           tidyr_1.0.0          png_0.1-7            class_7.3-17         packrat_0.5.0        assertthat_0.2.1    
 [7] digest_0.6.25        ipred_0.9-9          R6_2.4.0             plyr_1.8.4           ggdark_0.2.1         stats4_3.6.3        
[13] e1071_1.7-2          evaluate_0.14        httr_1.4.2           pillar_1.4.3         rlang_0.4.7          lazyeval_0.2.2      
[19] rstudioapi_0.11      data.table_1.12.4    rpart_4.1-15         Matrix_1.2-18        rmarkdown_2.3        labeling_0.3        
[25] splines_3.6.3        gower_0.2.1          stringr_1.4.0        htmlwidgets_1.5.1    munsell_0.5.0        tinytex_0.16        
[31] compiler_3.6.3       xfun_0.10            base64enc_0.1-3      pkgconfig_2.0.3      htmltools_0.5.0      nnet_7.3-14         
[37] tidyselect_1.1.0     tibble_3.0.1         prodlim_2018.04.18   codetools_0.2-16     randomForest_4.6-14  fansi_0.4.0         
[43] viridisLite_0.3.0    crayon_1.3.4         withr_2.2.0          MASS_7.3-52          recipes_0.1.10       ModelMetrics_1.2.2.2
[49] grid_3.6.3           nlme_3.1-149         jsonlite_1.7.1       gtable_0.3.0         lifecycle_0.2.0      magrittr_1.5        
[55] pROC_1.16.2          scales_1.0.0         cli_2.0.2            stringi_1.4.3        reshape2_1.4.3       timeDate_3043.102   
[61] ellipsis_0.3.1       generics_0.0.2       vctrs_0.3.2          lava_1.6.6           RColorBrewer_1.1-2   tools_3.6.3         
[67] glue_1.4.1           purrr_0.3.3          hms_0.5.3            yaml_2.2.0           survival_3.1-12      colorspace_1.4-1    
[73] knitr_1.25          
---
title: "Dataset Ganado Caprino 28/9/20"
output: 
  html_notebook: 
    code_folding: show
---
```{r message=FALSE, warning=FALSE}
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
#library(ggmap)
#library(catboost)

rad2deg <- function(rad) {(rad * 180) / (pi)}
deg2rad <- function(deg) {(deg * pi) / (180)}


# Datasets originales
#caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
#caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
#caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
#caprino_gps<-read.csv("data/caprino_gps.csv",header=F)

# Datasets nuevos
caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
caprino_gps<-read.csv("data/caprino_gps_distancia.csv",header=F) #INCLUYE UNA SEXTA COLUMNA CON la distancia euclidea
caprino_gps<-caprino_gps %>% select(-V7)

caprino_imu_acc %>% nrow()
caprino_imu_gyros %>% nrow()
caprino_gps %>% nrow()
caprino_hours %>% nrow()


```


### Datos de los archivos 
* caprino_imu_acc.csv, 4 columnas: tiempo UTC en microsegundos,
aceleración X en m/s², aceleración Y en m/s², aceleración, Z en m/s².
Muestreado a 200 muestras por segundo. Filtrados

* caprino_imu_gyro.csv, 4 columnas: tiempo UTC en microsegundos,
velocidad angular X en rad/s, velocidad angular Y en rad/s, velocidad
angular Z en rad/s. Muestreado a 200 muestras por segundo. Filtrados

* caprino_gps_distancia.csv, 4 columnas: tiempo UTC en microsegundos, latitud en
radianes, longitud en radianes, altitud en metros. Muestreado a 5
muestras por segundo Se agrega ademas una columna con la distancia euclidean.

* caprino_hours.csv: tiempo en horas, minutos y segundos. Muestreado a
5 muestras por segundo. La primera hora de este archivo coincide con
el primer valor de tiempo UTC del archivo caprino_gps.csv.



## Construccion del Dataset

```{r}
caprino_data<-cbind(caprino_imu_acc,caprino_imu_gyros %>% select(-V1))

caprino_data<-cbind(caprino_data[seq(2,nrow(caprino_data),2),] %>% head(nrow(caprino_gps)-1),  caprino_gps[2:nrow(caprino_gps),] ) 
names(caprino_data)<-c("imu_ts",
                       "accX",
                       "accY",
                       "accZ",
                       "gyroX",
                       "gyroY",
                       "gyroZ",
                       "gps_hour",
                       "gps_ts",
                       "lat",
                       "long",
                       "alt",
                       "distance" 
                        )
#caprino_data %>% mutate(diff=imu_ts-gps_ts) %>% select(diff)
caprino_data<-caprino_data %>% mutate(activity=ifelse(hms(gps_hour)>hms("8:56:00") & hms(gps_hour)<hms("10:38:00"),"RP",
                                       
                                        ifelse(hms(gps_hour)>=hms("10:38:00") & hms(gps_hour)<hms("11:38:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("11:38:00") & hms(gps_hour)<hms("12:05:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("12:05:00") & hms(gps_hour)<hms("15:48:00"),"RF",
                                        ifelse(hms(gps_hour)>=hms("17:30:00") & hms(gps_hour)<hms("20:33:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("20:50:00") & hms(gps_hour)<hms("21:31:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("21:33:00") & hms(gps_hour)<hms("21:50:00"),"RP",NA)
                                        )))))
                                        )
                        
                        )

caprino_data <-caprino_data %>% mutate(id=cut(as.POSIXct(hms(caprino_data$gps_hour),origin="1960-01-01",tz="GMT"),breaks="1 min")) 
caprino_data %>% head(10)
```
  
```{r eval=FALSE, include=FALSE}
sample_results_test<-data.frame()
for (i in seq(1,30)){
  minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
  trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
  data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
  data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
  data_train<-data_train %>% select (-imu_ts,-gps_hour,-id,-long,-lat,-alt, -gps_ts,-activity)
  data_test<-data_test %>% select (-imu_ts,-gps_hour,-id,-long,-lat,-alt, -gps_ts,-activity)
 
  results_train<-data_train %>% skimr::skim() %>% skimr::yank("numeric")  %>% select(mean,sd) %>% as.data.frame()
  results_train$repetition<-i
  results_train$variable_n<-names(data_train)
  sample_results_train<-rbind(sample_results_train,results_train)
  
  
  results_test<-data_test %>% skimr::skim() %>% skimr::yank("numeric")  %>% select(mean,sd) %>% as.data.frame()
  results_test$repetition<-i
  results_test$variable_n<-names(data_train)
  sample_results_test<-rbind(sample_results_test,results_test)
  
}
setrain<-sample_results_train %>% group_by(variable_n) %>% summarise(grand_mean=mean(mean),se=sd(mean))

settest<-sample_results_test %>% group_by(variable_n) %>% summarise(grand_mean_t=mean(mean),se_t=sd(mean))


cbind(setrain,settest)

sample_results_train %>% ggplot()+
  geom_histogram(aes(x=sd))+
  facet_wrap(~variable_n,scales = 'free')
```

# Tomamos muestras aleatorias de 1 minuto de duracion para sobre las cuales despues queremos predecir el comportamiento

```{r message=FALSE, warning=FALSE}
#readr::write_csv(caprino_data,"data/caprino_data_sliced_distance.csv")
#set.seed(21092019)
#set.seed(21092022)
#set.seed(07092020)
#set.seed(01092020)
minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
data_train%>% group_by(id) %>% summarise(n=n())
data_test%>% group_by(id) %>% summarise(n=n())
data_train %>% filter(!is.na(activity)) %>% group_by(activity) %>% summarise(n=n()) 

```

# Aplicamos Bag-of-features.
## presupuesto

 Consideramos cada muestra como un vector de 5 dimensiones. La idea es aplicar un algoritmo de clustering sobre estos vectores (accX,accY,accZ,GyroX,gyroY). Una vez encontrado los centroides, para cada muestra de 1 se construye un histograma de los vectores que han caido en cada uno de estos clusters. Este histograma es el que se utiliza para entregar el algoritmo.

La hipotesis de aplicar este metodo es que aquellas muestras que pertenecen a una misma actividad tendran valores similares. Los cuales han sido representado mediantes estos histogramas. 

## Aplicamos k-means con 90 centroides sobre el conjunto de entrenamiento. 
Se utilizan todos los sensores y la distancia.

```{r message=FALSE, warning=FALSE}
centroids<-90
cluster_features<-c('accX','accY','accZ','gyroX','gyroY','gyroZ','distance')

## Sampling considering time slices 
  minute_slice <-
    as.vector(t(
      caprino_data %>% group_by(id) %>% summarise(n = n()) %>% select(id)
    ))
  trainidx <-
    sample(1:length(minute_slice), (length(minute_slice) * 70) / 100.0)
  data_train <- caprino_data %>% filter(id %in% minute_slice[trainidx])
  data_test <-
    caprino_data %>% filter(id %in% (minute_slice[-trainidx]))
  
  
  # Kmeans on trainset
  cluster_results <-
    kmeans(
      x = data_train %>% select(cluster_features),
      centers = centroids,
      nstart = 10
    )
  data_train_cluster <-
    cbind(data_train, cluster = as.integer(cluster_results$cluster)) %>% group_by(activity, cluster)
  
  cluster_results<-cluster_results$centers %>% 
    as.data.frame() %>% 
    tibble::rownames_to_column(var = "cluster")
```
### Aplicamos KNN con K=3 en el conjunto de entrenamiento.
```{r message=FALSE, warning=FALSE}
  # KNN using centroids from kmeans for testset
  
  cluster_results_test <-class::knn(train=cluster_results %>% select(-cluster),
                                    test=data_test %>% select(cluster_features)
                                    ,cl = as.integer(cluster_results$cluster),
                                    k=3 )
  data_test_cluster <-
    cbind(data_test, cluster = as.integer(cluster_results_test)) %>% group_by(activity, cluster)
  
  # Create histograms for bag-of-features and label each new datapoint 
  ## For trainset
  data_train_cluster_labeled <- data_train_cluster   %>%
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_train_cluster %>% ungroup() %>%
                 select(id, activity)  %>%
                 unique()  ,
               by = "id") %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_train_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_train_cluster_labeled)]<-data_train_cluster_labeled
  data_train_cluster_labeled<-data_cluster_histogram
  
  ## For testset
  data_test_cluster_labeled <- data_test_cluster   %>% 
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_test_cluster %>%  ungroup() %>%
                 select(id, activity)  %>%
                 unique() ,
               by = "id")  %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_test_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_test_cluster_labeled)]<-data_test_cluster_labeled
  data_test_cluster_labeled<-data_cluster_histogram
```
# Total de actividades en Train
```{r message=FALSE, warning=FALSE}
data_train_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
```
# Total de actividades en Test
```{r message=FALSE, warning=FALSE}
data_test_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
```

## Aplicamos 3x5 CV para elegir la mejor version del modelo
```{r include=FALSE}
library(caret)
library(doMC)
registerDoMC(cores=6)
ctrl_fast <- trainControl(method="repeatedcv", 
                     repeats=3,
                     number=10, 
                     returnResamp = 'final',
                     savePredictions = 'final',
                     
                     verboseIter=F,
                     #summaryFunction = prSummary,
                     classProbs=TRUE,
                     allowParallel = T
                    
                     )  
```
## Probamos Random Forest
```{r}
rf_grid <-  expand.grid(.mtry=c(5)) 
rfFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = "rf",
  
               #tuneLength=5, 
               tuneGrid=rf_grid,
               
               verbose=2,
       
               trControl = ctrl_fast,
               ntree=200)
rfFit$results
```
<!-- ## Probamos catboost -->
```{r eval=FALSE, include=FALSE}
catboostFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = catboost.caret,
  
               tuneLength=5,
               verbose=2,
       
               trControl = ctrl_fast)

```

<!-- ## Probamos glmnet -->
```{r eval=FALSE, include=FALSE}
glmnetFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = "glmnet",
  
               tuneLength=5,
               verbose=2,
       
               trControl = ctrl_fast)

```
<!-- ## modelos obtenidos -->
```{r eval=FALSE, include=FALSE}
rf_cv_results<-cbind(resample=rfFit$resample,model="rf")
catboost_cv_results<-cbind(resample=catboostFit$resample,model="catboost")
glmnet_cv_results<-cbind(resample=glmnetFit$resample,model="glmnet")

cv_results<-rbind(rf_cv_results,catboost_cv_results,glmnet_cv_results)

cv_results %>% ggplot()+
  geom_boxplot(aes(x=model,y=resample.Accuracy,fill=model))+
    geom_point(aes(x=model,y=resample.Accuracy),color='red',alpha=0.5)+
  ggdark::dark_theme_gray() +
  ylim(0.7, 1)+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  theme(legend.position = "none") 

oneway.test(resample.Accuracy~model,data=cv_results %>% select(resample.Accuracy,model)) %>% broom::tidy()

cv_results %>% group_by(model) %>% summarise(mean=mean(resample.Accuracy),sd=sd(resample.Accuracy))
```


## Seleccionamos el modelo
```{r}
selected_model<-rfFit
#save(rfFit,file = "journal-polaco/rf_model_journal_polaco2020")
#write_csv(caprino_data,path="journal-polaco/caprino_data_sliced.csv")
#write_csv(data_train_cluster_labeled,path="journal-polaco/trainset.csv")
#write_csv(data_test_cluster_labeled,path="journal-polaco/testset.csv")

```

# Evaluacion de los resultados de CV (CV Report)
```{r fig.height=3, fig.width=8, message=FALSE, warning=FALSE}
predictions_cv<-selected_model$pred %>% select(Resample,pred,obs)

predictions_cv_cm <- predictions_cv %>% group_by(Resample) %>%
  group_modify(
    ~ as.data.frame(
      caret::confusionMatrix(
        data = .x$pred,
        reference = .x$obs,
        mode = 'everything'
      )$byClass
    ) %>%
      select(Precision, Recall, F1,) %>% 
      tibble::rownames_to_column(var = "activity")
  )

predictions_cv_results<- predictions_cv_cm %>% 
  group_by(activity) %>% 
  summarise(
    precision_mean=mean(Precision),
    precision_se=sd(Precision)/sqrt(selected_model$control$number*selected_model$control$repeats),
    precision_conf_inf=mean(Precision)-(precision_se*1.96),
    precision_conf_sup=mean(Precision)+(precision_se*1.96),
    recall_mean=mean(Recall),
    recall_se=sd(Recall)/sqrt(selected_model$control$number*selected_model$control$repeats),
    recall_conf_inf=mean(Recall)-(recall_se*1.96),
    recall_conf_sup=mean(Recall)+(recall_se*1.96),
    F1_mean=mean(F1),
    F1_se=sd(F1)/sqrt(selected_model$control$number*selected_model$control$repeats),
    F1_conf_inf=mean(F1)-(F1_se*1.96),
    F1_conf_sup=mean(F1)+(F1_se*1.96)

)
predictions_cv_cm_plot<-predictions_cv_cm %>% reshape2::melt() %>%
  ggplot()+
  facet_wrap(~variable)+
  ylim(0.6,1)+
  geom_boxplot(aes(x=activity,y=value,fill=activity))+
  labs(title=paste0(selected_model$method," for activity prediction [CV] Report"),
                    subtitle="[3x5 CV]")+
  ggdark::dark_theme_gray()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
      theme(legend.position = "none") 
   
predictions_cv_cm

predictions_cv_cm_plot
predictions_cv_results
```

```{r}
prediction_cv_cor<-predictions_cv_cm %>% select(activity,Recall,-Resample) %>% tidyr::pivot_wider(names_from = activity, values_from = Recall) %>% ungroup() %>% select(-Resample) %>% cor(method = "spearman")
library(d3heatmap)
d3heatmap(prediction_cv_cor,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
```

```{r}

#predictions_cv_results_error  %>% select(activity,Sensitivity_sd,Specificity_sd,F1_sd,BalancedAccuracy_sd)
#predictions_cv_results  %>% select(activity,Sensitivity_mean,Specificity_mean,F1_mean,BalancedAccuracy_mean) %>% reshape2::melt() %>%
  
  
#  ggplot()+
#  facet_wrap(~variable)+
#  geom_col(aes(x=activity,y=value,fill=activity))+
#  ggdark::dark_theme_grey()+
#  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
#  theme(legend.position = "none") 
```


```{r eval=FALSE, include=FALSE}
varImp(selected_model)
```

## Evaluacion sobre el conjunto de test (Test Report)
Matriz de Confusion
```{r}
data_test_prediction<-predict(selected_model,data_test_cluster_labeled %>% select(-id))#,type="prob")
cm<-caret::confusionMatrix(data_test_prediction,as.factor(data_test_cluster_labeled$activity),mode='everything')
cm
```
```{r fig.height=3, fig.width=8, message=FALSE, warning=FALSE}
test_data<-cm$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var='activity') %>%
  select(activity,Precision, Recall, F1) %>% reshape2::melt()

predictions_cv_cm_plot+
  labs(title=paste0(selected_model$method," for activity prediction [CV+Test] Report"),
                    subtitle="[3x5 CV]")+
    ylim(0.6,1)+
  geom_point(data=test_data,aes(x=activity,y=value),fill="darkgreen",color="white",shape=23,size=2)+
    theme(legend.position = "none") 

```

## Metricas Macro y Micro
```{r}


macro_metrics<-(apply(cm$byClass,2,sum)/4) [c(5,6,7)]

s=table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))

Precision_micro <-  (sum(diag(s)) / sum(apply(s,1, sum)))
Recall_micro <- (sum(diag(s)) / sum(apply(s,2, sum)))

micro_metrics <- c( 
  (sum(diag(s)) / sum(apply(s,1, sum))),
 (sum(diag(s)) / sum(apply(s,2, sum))),
 2 * ((Precision_micro*Recall_micro)/(Precision_micro+Recall_micro)))

mm<-cbind(c(macro_metrics),c(micro_metrics))


library(gridExtra)
data.frame(macro=mm[,1],micro=mm[,2])
```
```{r}
apply(cm$table,2,sum)
```

## Matriz de confusion (plot)
```{r}
len_data_test_prediction = length(data_test_prediction)
reshape2::melt(table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))) %>% mutate(value=(value)) %>%
  ggplot(aes(x=data_test_prediction,y=Var2))+
  geom_tile(aes(fill=value), colour = "white") + 
   geom_text(aes(label = sprintf("%1.2f", value)), color='black',vjust = 1)+
  scale_fill_gradient(low = "white", high = "red")+
  xlab(" Predicted Activity ")+ylab(" Actual Activity")+
  theme_bw()+ theme(legend.position = "none")

#ggsave(filename = "/home/harpo/Dropbox/ongoing-work/publicaciones/CAI2019-JAIIO/confusion-matrix.png",width = 8, height = 4)


```


#### G vs RF
```{r}


GvsRFmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RF")) %>% filter(predictions %in% c("G", "RF"))

GvsRF_recall <-
  caret::confusionMatrix(
    factor(GvsRFmatrix$predictions),
    factor(GvsRFmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRF")

GvsRF_recall
```
#### RF vs RP
```{r}

RFvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("RF", "RP")) %>% filter(predictions %in% c("RF", "RP"))

RFvsRP_recall <-
  caret::confusionMatrix(
    factor(RFvsRPmatrix$predictions),
    factor(RFvsRPmatrix$activity),
    mode = 'everything',
    positive = "RF"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="RFvsRP")

RFvsRP_recall
```

#### G vs RP
```{r}

GvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RP")) %>% filter(predictions %in% c("G", "RP"))

GvsRP_recall <-
  caret::confusionMatrix(
    factor(GvsRPmatrix$predictions),
    factor(GvsRPmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRP")

GvsRP_recall
```
# Session Info
```{r}
sessionInfo()
```

