library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
#library(ggmap)
#library(catboost)
rad2deg <- function(rad) {(rad * 180) / (pi)}
deg2rad <- function(deg) {(deg * pi) / (180)}
# Datasets originales
#caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
#caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
#caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
#caprino_gps<-read.csv("data/caprino_gps.csv",header=F)
# Datasets nuevos
caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
caprino_gps<-read.csv("data/caprino_gps_distancia.csv",header=F) #INCLUYE UNA SEXTA COLUMNA CON la distancia euclidea
caprino_gps<-caprino_gps %>% select(-V7)
caprino_imu_acc %>% nrow()
[1] 463703
caprino_imu_gyros %>% nrow()
[1] 463703
caprino_gps %>% nrow()
[1] 231767
caprino_hours %>% nrow()
[1] 231767
Datos de los archivos
caprino_imu_acc.csv, 4 columnas: tiempo UTC en microsegundos, aceleración X en m/s², aceleración Y en m/s², aceleración, Z en m/s². Muestreado a 200 muestras por segundo. Filtrados
caprino_imu_gyro.csv, 4 columnas: tiempo UTC en microsegundos, velocidad angular X en rad/s, velocidad angular Y en rad/s, velocidad angular Z en rad/s. Muestreado a 200 muestras por segundo. Filtrados
caprino_gps_distancia.csv, 4 columnas: tiempo UTC en microsegundos, latitud en radianes, longitud en radianes, altitud en metros. Muestreado a 5 muestras por segundo Se agrega ademas una columna con la distancia euclidean.
caprino_hours.csv: tiempo en horas, minutos y segundos. Muestreado a 5 muestras por segundo. La primera hora de este archivo coincide con el primer valor de tiempo UTC del archivo caprino_gps.csv.
Construccion del Dataset
caprino_data<-cbind(caprino_imu_acc,caprino_imu_gyros %>% select(-V1))
caprino_data<-cbind(caprino_data[seq(2,nrow(caprino_data),2),] %>% head(nrow(caprino_gps)-1), caprino_gps[2:nrow(caprino_gps),] )
names(caprino_data)<-c("imu_ts",
"accX",
"accY",
"accZ",
"gyroX",
"gyroY",
"gyroZ",
"gps_hour",
"gps_ts",
"lat",
"long",
"alt",
"distance"
)
#caprino_data %>% mutate(diff=imu_ts-gps_ts) %>% select(diff)
caprino_data<-caprino_data %>% mutate(activity=ifelse(hms(gps_hour)>hms("8:56:00") & hms(gps_hour)<hms("10:38:00"),"RP",
ifelse(hms(gps_hour)>=hms("10:38:00") & hms(gps_hour)<hms("11:38:00"),"W",
ifelse(hms(gps_hour)>=hms("11:38:00") & hms(gps_hour)<hms("12:05:00"),"G",
ifelse(hms(gps_hour)>=hms("12:05:00") & hms(gps_hour)<hms("15:48:00"),"RF",
ifelse(hms(gps_hour)>=hms("17:30:00") & hms(gps_hour)<hms("20:33:00"),"G",
ifelse(hms(gps_hour)>=hms("20:50:00") & hms(gps_hour)<hms("21:31:00"),"W",
ifelse(hms(gps_hour)>=hms("21:33:00") & hms(gps_hour)<hms("21:50:00"),"RP",NA)
)))))
)
)
caprino_data <-caprino_data %>% mutate(id=cut(as.POSIXct(hms(caprino_data$gps_hour),origin="1960-01-01",tz="GMT"),breaks="1 min"))
caprino_data %>% head(10)
Tomamos muestras aleatorias de 1 minuto de duracion para sobre las cuales despues queremos predecir el comportamiento
#readr::write_csv(caprino_data,"data/caprino_data_sliced_distance.csv")
#set.seed(21092019)
#set.seed(21092022)
#set.seed(07092020)
#set.seed(01092020)
minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
data_train%>% group_by(id) %>% summarise(n=n())
data_test%>% group_by(id) %>% summarise(n=n())
data_train %>% filter(!is.na(activity)) %>% group_by(activity) %>% summarise(n=n())
NA
Aplicamos Bag-of-features.
presupuesto
Consideramos cada muestra como un vector de 5 dimensiones. La idea es aplicar un algoritmo de clustering sobre estos vectores (accX,accY,accZ,GyroX,gyroY). Una vez encontrado los centroides, para cada muestra de 1 se construye un histograma de los vectores que han caido en cada uno de estos clusters. Este histograma es el que se utiliza para entregar el algoritmo.
La hipotesis de aplicar este metodo es que aquellas muestras que pertenecen a una misma actividad tendran valores similares. Los cuales han sido representado mediantes estos histogramas.
Aplicamos k-means con 90 centroides sobre el conjunto de entrenamiento.
Se utilizan todos los sensores y la distancia.
centroids<-90
cluster_features<-c('accX','accY','accZ','gyroX','gyroY','gyroZ','distance')
## Sampling considering time slices
minute_slice <-
as.vector(t(
caprino_data %>% group_by(id) %>% summarise(n = n()) %>% select(id)
))
trainidx <-
sample(1:length(minute_slice), (length(minute_slice) * 70) / 100.0)
data_train <- caprino_data %>% filter(id %in% minute_slice[trainidx])
data_test <-
caprino_data %>% filter(id %in% (minute_slice[-trainidx]))
# Kmeans on trainset
cluster_results <-
kmeans(
x = data_train %>% select(cluster_features),
centers = centroids,
nstart = 10
)
data_train_cluster <-
cbind(data_train, cluster = as.integer(cluster_results$cluster)) %>% group_by(activity, cluster)
cluster_results<-cluster_results$centers %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "cluster")
Aplicamos KNN con K=3 en el conjunto de entrenamiento.
# KNN using centroids from kmeans for testset
cluster_results_test <-class::knn(train=cluster_results %>% select(-cluster),
test=data_test %>% select(cluster_features)
,cl = as.integer(cluster_results$cluster),
k=3 )
data_test_cluster <-
cbind(data_test, cluster = as.integer(cluster_results_test)) %>% group_by(activity, cluster)
# Create histograms for bag-of-features and label each new datapoint
## For trainset
data_train_cluster_labeled <- data_train_cluster %>%
group_by(id, cluster) %>% summarise(n = n()) %>%
reshape2::dcast(id ~ cluster, fill = 0) %>%
inner_join(data_train_cluster %>% ungroup() %>%
select(id, activity) %>%
unique() ,
by = "id") %>% filter(!is.na(activity))
data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_train_cluster_labeled),ncol=90))
names(data_cluster_histogram)[1:90]<-seq(1,90)
data_cluster_histogram[names(data_train_cluster_labeled)]<-data_train_cluster_labeled
data_train_cluster_labeled<-data_cluster_histogram
## For testset
data_test_cluster_labeled <- data_test_cluster %>%
group_by(id, cluster) %>% summarise(n = n()) %>%
reshape2::dcast(id ~ cluster, fill = 0) %>%
inner_join(data_test_cluster %>% ungroup() %>%
select(id, activity) %>%
unique() ,
by = "id") %>% filter(!is.na(activity))
data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_test_cluster_labeled),ncol=90))
names(data_cluster_histogram)[1:90]<-seq(1,90)
data_cluster_histogram[names(data_test_cluster_labeled)]<-data_test_cluster_labeled
data_test_cluster_labeled<-data_cluster_histogram
Total de actividades en Train
data_train_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
Total de actividades en Test
data_test_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
Aplicamos 3x5 CV para elegir la mejor version del modelo
Probamos Random Forest
rf_grid <- expand.grid(.mtry=c(5))
rfFit <- caret::train(
x= data_train_cluster_labeled %>% select(-id,-activity),
y= data_train_cluster_labeled$activity,
method = "rf",
#tuneLength=5,
tuneGrid=rf_grid,
verbose=2,
trControl = ctrl_fast,
ntree=200)
rfFit$results
Seleccionamos el modelo
selected_model<-rfFit
Evaluacion de los resultados de CV (CV Report)
predictions_cv<-selected_model$pred %>% select(Resample,pred,obs)
predictions_cv_cm <- predictions_cv %>% group_by(Resample) %>%
group_modify(
~ as.data.frame(
caret::confusionMatrix(
data = .x$pred,
reference = .x$obs,
mode = 'everything'
)$byClass
) %>%
select(Precision, Recall, F1,) %>%
tibble::rownames_to_column(var = "activity")
)
predictions_cv_results<- predictions_cv_cm %>%
group_by(activity) %>%
summarise(
precision_mean=mean(Precision),
precision_se=sd(Precision)/sqrt(selected_model$control$number*selected_model$control$repeats),
precision_conf_inf=mean(Precision)-(precision_se*1.96),
precision_conf_sup=mean(Precision)+(precision_se*1.96),
recall_mean=mean(Recall),
recall_se=sd(Recall)/sqrt(selected_model$control$number*selected_model$control$repeats),
recall_conf_inf=mean(Recall)-(recall_se*1.96),
recall_conf_sup=mean(Recall)+(recall_se*1.96),
F1_mean=mean(F1),
F1_se=sd(F1)/sqrt(selected_model$control$number*selected_model$control$repeats),
F1_conf_inf=mean(F1)-(F1_se*1.96),
F1_conf_sup=mean(F1)+(F1_se*1.96)
)
predictions_cv_cm_plot<-predictions_cv_cm %>% reshape2::melt() %>%
ggplot()+
facet_wrap(~variable)+
ylim(0.6,1)+
geom_boxplot(aes(x=activity,y=value,fill=activity))+
labs(title=paste0(selected_model$method," for activity prediction [CV] Report"),
subtitle="[3x5 CV]")+
ggdark::dark_theme_gray()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
theme(legend.position = "none")
predictions_cv_cm
predictions_cv_cm_plot

predictions_cv_results
prediction_cv_cor<-predictions_cv_cm %>% select(activity,Recall,-Resample) %>% tidyr::pivot_wider(names_from = activity, values_from = Recall) %>% ungroup() %>% select(-Resample) %>% cor(method = "spearman")
Adding missing grouping variables: `Resample`
library(d3heatmap)
d3heatmap(prediction_cv_cor,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
#predictions_cv_results_error %>% select(activity,Sensitivity_sd,Specificity_sd,F1_sd,BalancedAccuracy_sd)
#predictions_cv_results %>% select(activity,Sensitivity_mean,Specificity_mean,F1_mean,BalancedAccuracy_mean) %>% reshape2::melt() %>%
# ggplot()+
# facet_wrap(~variable)+
# geom_col(aes(x=activity,y=value,fill=activity))+
# ggdark::dark_theme_grey()+
# theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
# theme(legend.position = "none")
Evaluacion sobre el conjunto de test (Test Report)
Matriz de Confusion
data_test_prediction<-predict(selected_model,data_test_cluster_labeled %>% select(-id))#,type="prob")
cm<-caret::confusionMatrix(data_test_prediction,as.factor(data_test_cluster_labeled$activity),mode='everything')
cm
Confusion Matrix and Statistics
Reference
Prediction G RF RP W
G 45 9 0 2
RF 9 58 6 0
RP 1 2 33 1
W 1 0 0 32
Overall Statistics
Accuracy : 0.8442
95% CI : (0.7862, 0.8916)
No Information Rate : 0.3467
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.786
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: G Class: RF Class: RP Class: W
Sensitivity 0.8036 0.8406 0.8462 0.9143
Specificity 0.9231 0.8846 0.9750 0.9939
Pos Pred Value 0.8036 0.7945 0.8919 0.9697
Neg Pred Value 0.9231 0.9127 0.9630 0.9819
Precision 0.8036 0.7945 0.8919 0.9697
Recall 0.8036 0.8406 0.8462 0.9143
F1 0.8036 0.8169 0.8684 0.9412
Prevalence 0.2814 0.3467 0.1960 0.1759
Detection Rate 0.2261 0.2915 0.1658 0.1608
Detection Prevalence 0.2814 0.3668 0.1859 0.1658
Balanced Accuracy 0.8633 0.8626 0.9106 0.9541
test_data<-cm$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var='activity') %>%
select(activity,Precision, Recall, F1) %>% reshape2::melt()
predictions_cv_cm_plot+
labs(title=paste0(selected_model$method," for activity prediction [CV+Test] Report"),
subtitle="[3x5 CV]")+
ylim(0.6,1)+
geom_point(data=test_data,aes(x=activity,y=value),fill="darkgreen",color="white",shape=23,size=2)+
theme(legend.position = "none")

Metricas Macro y Micro
macro_metrics<-(apply(cm$byClass,2,sum)/4) [c(5,6,7)]
s=table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))
Precision_micro <- (sum(diag(s)) / sum(apply(s,1, sum)))
Recall_micro <- (sum(diag(s)) / sum(apply(s,2, sum)))
micro_metrics <- c(
(sum(diag(s)) / sum(apply(s,1, sum))),
(sum(diag(s)) / sum(apply(s,2, sum))),
2 * ((Precision_micro*Recall_micro)/(Precision_micro+Recall_micro)))
mm<-cbind(c(macro_metrics),c(micro_metrics))
library(gridExtra)
data.frame(macro=mm[,1],micro=mm[,2])
apply(cm$table,2,sum)
G RF RP W
56 69 39 35
Matriz de confusion (plot)
len_data_test_prediction = length(data_test_prediction)
reshape2::melt(table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))) %>% mutate(value=(value)) %>%
ggplot(aes(x=data_test_prediction,y=Var2))+
geom_tile(aes(fill=value), colour = "white") +
geom_text(aes(label = sprintf("%1.2f", value)), color='black',vjust = 1)+
scale_fill_gradient(low = "white", high = "red")+
xlab(" Predicted Activity ")+ylab(" Actual Activity")+
theme_bw()+ theme(legend.position = "none")

#ggsave(filename = "/home/harpo/Dropbox/ongoing-work/publicaciones/CAI2019-JAIIO/confusion-matrix.png",width = 8, height = 4)
G vs RF
GvsRFmatrix <-
cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RF")) %>% filter(predictions %in% c("G", "RF"))
GvsRF_recall <-
caret::confusionMatrix(
factor(GvsRFmatrix$predictions),
factor(GvsRFmatrix$activity),
mode = 'everything',
positive = "G"
)$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRF")
GvsRF_recall
RF vs RP
RFvsRPmatrix <-
cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("RF", "RP")) %>% filter(predictions %in% c("RF", "RP"))
RFvsRP_recall <-
caret::confusionMatrix(
factor(RFvsRPmatrix$predictions),
factor(RFvsRPmatrix$activity),
mode = 'everything',
positive = "RF"
)$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="RFvsRP")
RFvsRP_recall
G vs RP
GvsRPmatrix <-
cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RP")) %>% filter(predictions %in% c("G", "RP"))
GvsRP_recall <-
caret::confusionMatrix(
factor(GvsRPmatrix$predictions),
factor(GvsRPmatrix$activity),
mode = 'everything',
positive = "G"
)$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRP")
GvsRP_recall
Session Info
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=es_AR.UTF-8 LC_NUMERIC=C LC_TIME=es_AR.UTF-8 LC_COLLATE=es_AR.UTF-8
[5] LC_MONETARY=es_AR.UTF-8 LC_MESSAGES=es_AR.UTF-8 LC_PAPER=es_AR.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_AR.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] d3heatmap_0.6.1.3 gridExtra_2.3 doMC_1.3.6 iterators_1.0.12 foreach_1.4.7 caret_6.0-86 lattice_0.20-41
[8] plotly_4.9.2 ggplot2_3.2.1 lubridate_1.7.4 dplyr_1.0.0 readr_1.3.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 tidyr_1.0.0 png_0.1-7 class_7.3-17 packrat_0.5.0 assertthat_0.2.1
[7] digest_0.6.25 ipred_0.9-9 R6_2.4.0 plyr_1.8.4 ggdark_0.2.1 stats4_3.6.3
[13] e1071_1.7-2 evaluate_0.14 httr_1.4.2 pillar_1.4.3 rlang_0.4.7 lazyeval_0.2.2
[19] rstudioapi_0.11 data.table_1.12.4 rpart_4.1-15 Matrix_1.2-18 rmarkdown_2.3 labeling_0.3
[25] splines_3.6.3 gower_0.2.1 stringr_1.4.0 htmlwidgets_1.5.1 munsell_0.5.0 tinytex_0.16
[31] compiler_3.6.3 xfun_0.10 base64enc_0.1-3 pkgconfig_2.0.3 htmltools_0.5.0 nnet_7.3-14
[37] tidyselect_1.1.0 tibble_3.0.1 prodlim_2018.04.18 codetools_0.2-16 randomForest_4.6-14 fansi_0.4.0
[43] viridisLite_0.3.0 crayon_1.3.4 withr_2.2.0 MASS_7.3-52 recipes_0.1.10 ModelMetrics_1.2.2.2
[49] grid_3.6.3 nlme_3.1-149 jsonlite_1.7.1 gtable_0.3.0 lifecycle_0.2.0 magrittr_1.5
[55] pROC_1.16.2 scales_1.0.0 cli_2.0.2 stringi_1.4.3 reshape2_1.4.3 timeDate_3043.102
[61] ellipsis_0.3.1 generics_0.0.2 vctrs_0.3.2 lava_1.6.6 RColorBrewer_1.1-2 tools_3.6.3
[67] glue_1.4.1 purrr_0.3.3 hms_0.5.3 yaml_2.2.0 survival_3.1-12 colorspace_1.4-1
[73] knitr_1.25
---
title: "Dataset Ganado Caprino 28/9/20"
output: 
  html_notebook: 
    code_folding: show
---
```{r message=FALSE, warning=FALSE}
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
#library(ggmap)
#library(catboost)

rad2deg <- function(rad) {(rad * 180) / (pi)}
deg2rad <- function(deg) {(deg * pi) / (180)}


# Datasets originales
#caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
#caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
#caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
#caprino_gps<-read.csv("data/caprino_gps.csv",header=F)

# Datasets nuevos
caprino_hours<-read.csv("data/caprino_hours.csv.gz",header = F)
caprino_imu_gyros<-read.csv("data/caprino_imu_gyro.csv.gz", header=F)
caprino_imu_acc<-read.csv("data/caprino_imu_acc.csv.gz", header=F)
caprino_gps<-read.csv("data/caprino_gps_distancia.csv",header=F) #INCLUYE UNA SEXTA COLUMNA CON la distancia euclidea
caprino_gps<-caprino_gps %>% select(-V7)

caprino_imu_acc %>% nrow()
caprino_imu_gyros %>% nrow()
caprino_gps %>% nrow()
caprino_hours %>% nrow()


```


### Datos de los archivos 
* caprino_imu_acc.csv, 4 columnas: tiempo UTC en microsegundos,
aceleración X en m/s², aceleración Y en m/s², aceleración, Z en m/s².
Muestreado a 200 muestras por segundo. Filtrados

* caprino_imu_gyro.csv, 4 columnas: tiempo UTC en microsegundos,
velocidad angular X en rad/s, velocidad angular Y en rad/s, velocidad
angular Z en rad/s. Muestreado a 200 muestras por segundo. Filtrados

* caprino_gps_distancia.csv, 4 columnas: tiempo UTC en microsegundos, latitud en
radianes, longitud en radianes, altitud en metros. Muestreado a 5
muestras por segundo Se agrega ademas una columna con la distancia euclidean.

* caprino_hours.csv: tiempo en horas, minutos y segundos. Muestreado a
5 muestras por segundo. La primera hora de este archivo coincide con
el primer valor de tiempo UTC del archivo caprino_gps.csv.



## Construccion del Dataset

```{r}
caprino_data<-cbind(caprino_imu_acc,caprino_imu_gyros %>% select(-V1))

caprino_data<-cbind(caprino_data[seq(2,nrow(caprino_data),2),] %>% head(nrow(caprino_gps)-1),  caprino_gps[2:nrow(caprino_gps),] ) 
names(caprino_data)<-c("imu_ts",
                       "accX",
                       "accY",
                       "accZ",
                       "gyroX",
                       "gyroY",
                       "gyroZ",
                       "gps_hour",
                       "gps_ts",
                       "lat",
                       "long",
                       "alt",
                       "distance" 
                        )
#caprino_data %>% mutate(diff=imu_ts-gps_ts) %>% select(diff)
caprino_data<-caprino_data %>% mutate(activity=ifelse(hms(gps_hour)>hms("8:56:00") & hms(gps_hour)<hms("10:38:00"),"RP",
                                       
                                        ifelse(hms(gps_hour)>=hms("10:38:00") & hms(gps_hour)<hms("11:38:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("11:38:00") & hms(gps_hour)<hms("12:05:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("12:05:00") & hms(gps_hour)<hms("15:48:00"),"RF",
                                        ifelse(hms(gps_hour)>=hms("17:30:00") & hms(gps_hour)<hms("20:33:00"),"G",
                                        ifelse(hms(gps_hour)>=hms("20:50:00") & hms(gps_hour)<hms("21:31:00"),"W",
                                        ifelse(hms(gps_hour)>=hms("21:33:00") & hms(gps_hour)<hms("21:50:00"),"RP",NA)
                                        )))))
                                        )
                        
                        )

caprino_data <-caprino_data %>% mutate(id=cut(as.POSIXct(hms(caprino_data$gps_hour),origin="1960-01-01",tz="GMT"),breaks="1 min")) 
caprino_data %>% head(10)
```
  
```{r eval=FALSE, include=FALSE}
sample_results_test<-data.frame()
for (i in seq(1,30)){
  minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
  trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
  data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
  data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
  data_train<-data_train %>% select (-imu_ts,-gps_hour,-id,-long,-lat,-alt, -gps_ts,-activity)
  data_test<-data_test %>% select (-imu_ts,-gps_hour,-id,-long,-lat,-alt, -gps_ts,-activity)
 
  results_train<-data_train %>% skimr::skim() %>% skimr::yank("numeric")  %>% select(mean,sd) %>% as.data.frame()
  results_train$repetition<-i
  results_train$variable_n<-names(data_train)
  sample_results_train<-rbind(sample_results_train,results_train)
  
  
  results_test<-data_test %>% skimr::skim() %>% skimr::yank("numeric")  %>% select(mean,sd) %>% as.data.frame()
  results_test$repetition<-i
  results_test$variable_n<-names(data_train)
  sample_results_test<-rbind(sample_results_test,results_test)
  
}
setrain<-sample_results_train %>% group_by(variable_n) %>% summarise(grand_mean=mean(mean),se=sd(mean))

settest<-sample_results_test %>% group_by(variable_n) %>% summarise(grand_mean_t=mean(mean),se_t=sd(mean))


cbind(setrain,settest)

sample_results_train %>% ggplot()+
  geom_histogram(aes(x=sd))+
  facet_wrap(~variable_n,scales = 'free')
```

# Tomamos muestras aleatorias de 1 minuto de duracion para sobre las cuales despues queremos predecir el comportamiento

```{r message=FALSE, warning=FALSE}
#readr::write_csv(caprino_data,"data/caprino_data_sliced_distance.csv")
#set.seed(21092019)
#set.seed(21092022)
#set.seed(07092020)
#set.seed(01092020)
minute_slice<-as.vector(t(caprino_data %>% group_by(id) %>% summarise(n=n()) %>% select(id)))
trainidx<-sample(1:length(minute_slice),(length(minute_slice)*70)/100.0)
data_train<-caprino_data %>% filter( id %in% minute_slice[trainidx])
data_test<-caprino_data %>% filter( id %in% (minute_slice[-trainidx]))
data_train%>% group_by(id) %>% summarise(n=n())
data_test%>% group_by(id) %>% summarise(n=n())
data_train %>% filter(!is.na(activity)) %>% group_by(activity) %>% summarise(n=n()) 

```

# Aplicamos Bag-of-features.
## presupuesto

 Consideramos cada muestra como un vector de 5 dimensiones. La idea es aplicar un algoritmo de clustering sobre estos vectores (accX,accY,accZ,GyroX,gyroY). Una vez encontrado los centroides, para cada muestra de 1 se construye un histograma de los vectores que han caido en cada uno de estos clusters. Este histograma es el que se utiliza para entregar el algoritmo.

La hipotesis de aplicar este metodo es que aquellas muestras que pertenecen a una misma actividad tendran valores similares. Los cuales han sido representado mediantes estos histogramas. 

## Aplicamos k-means con 90 centroides sobre el conjunto de entrenamiento. 
Se utilizan todos los sensores y la distancia.

```{r message=FALSE, warning=FALSE}
centroids<-90
cluster_features<-c('accX','accY','accZ','gyroX','gyroY','gyroZ','distance')

## Sampling considering time slices 
  minute_slice <-
    as.vector(t(
      caprino_data %>% group_by(id) %>% summarise(n = n()) %>% select(id)
    ))
  trainidx <-
    sample(1:length(minute_slice), (length(minute_slice) * 70) / 100.0)
  data_train <- caprino_data %>% filter(id %in% minute_slice[trainidx])
  data_test <-
    caprino_data %>% filter(id %in% (minute_slice[-trainidx]))
  
  
  # Kmeans on trainset
  cluster_results <-
    kmeans(
      x = data_train %>% select(cluster_features),
      centers = centroids,
      nstart = 10
    )
  data_train_cluster <-
    cbind(data_train, cluster = as.integer(cluster_results$cluster)) %>% group_by(activity, cluster)
  
  cluster_results<-cluster_results$centers %>% 
    as.data.frame() %>% 
    tibble::rownames_to_column(var = "cluster")
```
### Aplicamos KNN con K=3 en el conjunto de entrenamiento.
```{r message=FALSE, warning=FALSE}
  # KNN using centroids from kmeans for testset
  
  cluster_results_test <-class::knn(train=cluster_results %>% select(-cluster),
                                    test=data_test %>% select(cluster_features)
                                    ,cl = as.integer(cluster_results$cluster),
                                    k=3 )
  data_test_cluster <-
    cbind(data_test, cluster = as.integer(cluster_results_test)) %>% group_by(activity, cluster)
  
  # Create histograms for bag-of-features and label each new datapoint 
  ## For trainset
  data_train_cluster_labeled <- data_train_cluster   %>%
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_train_cluster %>% ungroup() %>%
                 select(id, activity)  %>%
                 unique()  ,
               by = "id") %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_train_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_train_cluster_labeled)]<-data_train_cluster_labeled
  data_train_cluster_labeled<-data_cluster_histogram
  
  ## For testset
  data_test_cluster_labeled <- data_test_cluster   %>% 
    group_by(id, cluster) %>% summarise(n = n()) %>%
    reshape2::dcast(id ~ cluster, fill = 0) %>%
    inner_join(data_test_cluster %>%  ungroup() %>%
                 select(id, activity)  %>%
                 unique() ,
               by = "id")  %>% filter(!is.na(activity))
  
  data_cluster_histogram<-data.frame(matrix(0L,nrow=nrow(data_test_cluster_labeled),ncol=90))
  names(data_cluster_histogram)[1:90]<-seq(1,90)
  data_cluster_histogram[names(data_test_cluster_labeled)]<-data_test_cluster_labeled
  data_test_cluster_labeled<-data_cluster_histogram
```
# Total de actividades en Train
```{r message=FALSE, warning=FALSE}
data_train_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
```
# Total de actividades en Test
```{r message=FALSE, warning=FALSE}
data_test_cluster_labeled %>% group_by(activity) %>% summarise(total=n())
```

## Aplicamos 3x5 CV para elegir la mejor version del modelo
```{r include=FALSE}
library(caret)
library(doMC)
registerDoMC(cores=6)
ctrl_fast <- trainControl(method="repeatedcv", 
                     repeats=3,
                     number=10, 
                     returnResamp = 'final',
                     savePredictions = 'final',
                     
                     verboseIter=F,
                     #summaryFunction = prSummary,
                     classProbs=TRUE,
                     allowParallel = T
                    
                     )  
```
## Probamos Random Forest
```{r}
rf_grid <-  expand.grid(.mtry=c(5)) 
rfFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = "rf",
  
               #tuneLength=5, 
               tuneGrid=rf_grid,
               
               verbose=2,
       
               trControl = ctrl_fast,
               ntree=200)
rfFit$results
```
<!-- ## Probamos catboost -->
```{r eval=FALSE, include=FALSE}
catboostFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = catboost.caret,
  
               tuneLength=5,
               verbose=2,
       
               trControl = ctrl_fast)

```

<!-- ## Probamos glmnet -->
```{r eval=FALSE, include=FALSE}
glmnetFit <- caret::train(
               x= data_train_cluster_labeled %>% select(-id,-activity),
               y= data_train_cluster_labeled$activity,
              
               method = "glmnet",
  
               tuneLength=5,
               verbose=2,
       
               trControl = ctrl_fast)

```
<!-- ## modelos obtenidos -->
```{r eval=FALSE, include=FALSE}
rf_cv_results<-cbind(resample=rfFit$resample,model="rf")
catboost_cv_results<-cbind(resample=catboostFit$resample,model="catboost")
glmnet_cv_results<-cbind(resample=glmnetFit$resample,model="glmnet")

cv_results<-rbind(rf_cv_results,catboost_cv_results,glmnet_cv_results)

cv_results %>% ggplot()+
  geom_boxplot(aes(x=model,y=resample.Accuracy,fill=model))+
    geom_point(aes(x=model,y=resample.Accuracy),color='red',alpha=0.5)+
  ggdark::dark_theme_gray() +
  ylim(0.7, 1)+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
  theme(legend.position = "none") 

oneway.test(resample.Accuracy~model,data=cv_results %>% select(resample.Accuracy,model)) %>% broom::tidy()

cv_results %>% group_by(model) %>% summarise(mean=mean(resample.Accuracy),sd=sd(resample.Accuracy))
```


## Seleccionamos el modelo
```{r}
selected_model<-rfFit
#save(rfFit,file = "journal-polaco/rf_model_journal_polaco2020")
#write_csv(caprino_data,path="journal-polaco/caprino_data_sliced.csv")
#write_csv(data_train_cluster_labeled,path="journal-polaco/trainset.csv")
#write_csv(data_test_cluster_labeled,path="journal-polaco/testset.csv")

```

# Evaluacion de los resultados de CV (CV Report)
```{r fig.height=3, fig.width=8, message=FALSE, warning=FALSE}
predictions_cv<-selected_model$pred %>% select(Resample,pred,obs)

predictions_cv_cm <- predictions_cv %>% group_by(Resample) %>%
  group_modify(
    ~ as.data.frame(
      caret::confusionMatrix(
        data = .x$pred,
        reference = .x$obs,
        mode = 'everything'
      )$byClass
    ) %>%
      select(Precision, Recall, F1,) %>% 
      tibble::rownames_to_column(var = "activity")
  )

predictions_cv_results<- predictions_cv_cm %>% 
  group_by(activity) %>% 
  summarise(
    precision_mean=mean(Precision),
    precision_se=sd(Precision)/sqrt(selected_model$control$number*selected_model$control$repeats),
    precision_conf_inf=mean(Precision)-(precision_se*1.96),
    precision_conf_sup=mean(Precision)+(precision_se*1.96),
    recall_mean=mean(Recall),
    recall_se=sd(Recall)/sqrt(selected_model$control$number*selected_model$control$repeats),
    recall_conf_inf=mean(Recall)-(recall_se*1.96),
    recall_conf_sup=mean(Recall)+(recall_se*1.96),
    F1_mean=mean(F1),
    F1_se=sd(F1)/sqrt(selected_model$control$number*selected_model$control$repeats),
    F1_conf_inf=mean(F1)-(F1_se*1.96),
    F1_conf_sup=mean(F1)+(F1_se*1.96)

)
predictions_cv_cm_plot<-predictions_cv_cm %>% reshape2::melt() %>%
  ggplot()+
  facet_wrap(~variable)+
  ylim(0.6,1)+
  geom_boxplot(aes(x=activity,y=value,fill=activity))+
  labs(title=paste0(selected_model$method," for activity prediction [CV] Report"),
                    subtitle="[3x5 CV]")+
  ggdark::dark_theme_gray()+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
      theme(legend.position = "none") 
   
predictions_cv_cm

predictions_cv_cm_plot
predictions_cv_results
```

```{r}
prediction_cv_cor<-predictions_cv_cm %>% select(activity,Recall,-Resample) %>% tidyr::pivot_wider(names_from = activity, values_from = Recall) %>% ungroup() %>% select(-Resample) %>% cor(method = "spearman")
library(d3heatmap)
d3heatmap(prediction_cv_cor,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
```

```{r}

#predictions_cv_results_error  %>% select(activity,Sensitivity_sd,Specificity_sd,F1_sd,BalancedAccuracy_sd)
#predictions_cv_results  %>% select(activity,Sensitivity_mean,Specificity_mean,F1_mean,BalancedAccuracy_mean) %>% reshape2::melt() %>%
  
  
#  ggplot()+
#  facet_wrap(~variable)+
#  geom_col(aes(x=activity,y=value,fill=activity))+
#  ggdark::dark_theme_grey()+
#  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))+
#  theme(legend.position = "none") 
```


```{r eval=FALSE, include=FALSE}
varImp(selected_model)
```

## Evaluacion sobre el conjunto de test (Test Report)
Matriz de Confusion
```{r}
data_test_prediction<-predict(selected_model,data_test_cluster_labeled %>% select(-id))#,type="prob")
cm<-caret::confusionMatrix(data_test_prediction,as.factor(data_test_cluster_labeled$activity),mode='everything')
cm
```
```{r fig.height=3, fig.width=8, message=FALSE, warning=FALSE}
test_data<-cm$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var='activity') %>%
  select(activity,Precision, Recall, F1) %>% reshape2::melt()

predictions_cv_cm_plot+
  labs(title=paste0(selected_model$method," for activity prediction [CV+Test] Report"),
                    subtitle="[3x5 CV]")+
    ylim(0.6,1)+
  geom_point(data=test_data,aes(x=activity,y=value),fill="darkgreen",color="white",shape=23,size=2)+
    theme(legend.position = "none") 

```

## Metricas Macro y Micro
```{r}


macro_metrics<-(apply(cm$byClass,2,sum)/4) [c(5,6,7)]

s=table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))

Precision_micro <-  (sum(diag(s)) / sum(apply(s,1, sum)))
Recall_micro <- (sum(diag(s)) / sum(apply(s,2, sum)))

micro_metrics <- c( 
  (sum(diag(s)) / sum(apply(s,1, sum))),
 (sum(diag(s)) / sum(apply(s,2, sum))),
 2 * ((Precision_micro*Recall_micro)/(Precision_micro+Recall_micro)))

mm<-cbind(c(macro_metrics),c(micro_metrics))


library(gridExtra)
data.frame(macro=mm[,1],micro=mm[,2])
```
```{r}
apply(cm$table,2,sum)
```

## Matriz de confusion (plot)
```{r}
len_data_test_prediction = length(data_test_prediction)
reshape2::melt(table(data_test_prediction,as.factor(data_test_cluster_labeled$activity))) %>% mutate(value=(value)) %>%
  ggplot(aes(x=data_test_prediction,y=Var2))+
  geom_tile(aes(fill=value), colour = "white") + 
   geom_text(aes(label = sprintf("%1.2f", value)), color='black',vjust = 1)+
  scale_fill_gradient(low = "white", high = "red")+
  xlab(" Predicted Activity ")+ylab(" Actual Activity")+
  theme_bw()+ theme(legend.position = "none")

#ggsave(filename = "/home/harpo/Dropbox/ongoing-work/publicaciones/CAI2019-JAIIO/confusion-matrix.png",width = 8, height = 4)


```


#### G vs RF
```{r}


GvsRFmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RF")) %>% filter(predictions %in% c("G", "RF"))

GvsRF_recall <-
  caret::confusionMatrix(
    factor(GvsRFmatrix$predictions),
    factor(GvsRFmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRF")

GvsRF_recall
```
#### RF vs RP
```{r}

RFvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("RF", "RP")) %>% filter(predictions %in% c("RF", "RP"))

RFvsRP_recall <-
  caret::confusionMatrix(
    factor(RFvsRPmatrix$predictions),
    factor(RFvsRPmatrix$activity),
    mode = 'everything',
    positive = "RF"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="RFvsRP")

RFvsRP_recall
```

#### G vs RP
```{r}

GvsRPmatrix <-
  cbind(data_test_cluster_labeled, predictions = data_test_prediction) %>% select(activity, predictions) %>% filter(activity %in% c("G", "RP")) %>% filter(predictions %in% c("G", "RP"))

GvsRP_recall <-
  caret::confusionMatrix(
    factor(GvsRPmatrix$predictions),
    factor(GvsRPmatrix$activity),
    mode = 'everything',
    positive = "G"
  )$byClass %>% as.data.frame() %>% tibble::rownames_to_column(var = "metric") %>% filter(metric == "Recall" | metric == "Specificity") %>% mutate(activities="GvsRP")

GvsRP_recall
```
# Session Info
```{r}
sessionInfo()
```

