Librerias necesarias

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(magrittr)

Nota: no vamos a usar tildes.

Introduccion

En esta publicacion se muestra como construir modelos estadisticos para estimar el SOH de baterias de litio.

Leyendo los datos

El primer paso es leer los datos.

bat_05 <- read.csv(file='raw_data_05.csv')
bat_06 <- read.csv(file='raw_data_06.csv')
bat_07 <- read.csv(file='raw_data_07.csv')
bat_18 <- read.csv(file='raw_data_18.csv')

Creando la variable SOH

El SOH se puede obtener usando la siguiente expresion:

\[ \text{SOH} = \frac{\text{Capatity}}{\text{Initial capacity}} \]

Vamos a crear la variable SOH y la variable bateria, esas dos nuevas variables se van a agregar a los datos originales.

bat_05 %<>% mutate(soh = capacity/capacity[1], bateria=5)
bat_06 %<>% mutate(soh = capacity/capacity[1], bateria=6)
bat_07 %<>% mutate(soh = capacity/capacity[1], bateria=7)
bat_18 %<>% mutate(soh = capacity/capacity[1], bateria=18)

Vamos a crear un marco de datos consolidado para hacer graficos comparativos.

datos <- rbind(bat_05, bat_06, bat_07, bat_18)
datos$bateria <- factor(datos$bateria)

Explorando los datos

Vamos a explorar los datos.

library(tibble)
glimpse(datos)
## Rows: 185,721
## Columns: 12
## $ cycle                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ ambient_temperature  <int> 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 2~
## $ datetime             <chr> "2008-04-02 15:25:41", "2008-04-02 15:25:41", "20~
## $ capacity             <dbl> 1.856487, 1.856487, 1.856487, 1.856487, 1.856487,~
## $ voltage_measured     <dbl> 4.191492, 4.190749, 3.974871, 3.951717, 3.934352,~
## $ current_measured     <dbl> -0.004901589, -0.001478006, -2.012528324, -2.0139~
## $ temperature_measured <dbl> 24.33003, 24.32599, 24.38909, 24.54475, 24.73139,~
## $ current_load         <dbl> -0.0006, -0.0006, -1.9982, -1.9982, -1.9982, -1.9~
## $ voltage_load         <dbl> 0.000, 4.206, 3.062, 3.030, 3.011, 2.991, 2.977, ~
## $ time                 <dbl> 0.000, 16.781, 35.703, 53.781, 71.922, 90.094, 10~
## $ soh                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ bateria              <fct> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5~

Vamos a explorar la cantidad de ciclos y el tiempo promedio de cada ciclo por bateria.

datos %>% group_by(bateria) %>% summarise(num_ciclos=max(cycle), 
                                          tiempo_prom_ciclo=mean(max(time)))
## # A tibble: 4 x 3
##   bateria num_ciclos tiempo_prom_ciclo
##   <fct>        <int>             <dbl>
## 1 5              168             3690.
## 2 6              168             3690.
## 3 7              168             3690.
## 4 18             132             3435.

Cada bateria fue observada durante 168 ciclos y cada ciclo duro en promedio 3690 segundos o un poco mas de una hora.

Vamos a calcular el número de observaciones dentro de cada ciclo para la bateria 05.

bat_05 %>% group_by(cycle) %>% summarise(n())
## # A tibble: 168 x 2
##    cycle `n()`
##    <int> <int>
##  1     1   197
##  2     2   196
##  3     3   195
##  4     4   194
##  5     5   194
##  6     6   195
##  7     7   195
##  8     8   191
##  9     9   190
## 10    10   190
## # ... with 158 more rows

Vamos a explorar los valores de capacidad, soh, time, voltage_load y current_load para las lineas 1, 2, 3, 4 y 168 de cada ciclo.

bat_05 %>% group_by(cycle) %>% 
  select(cycle, capacity, soh, time, voltage_load, current_load) %>% 
  slice(c(1:4, 168))
## # A tibble: 840 x 6
## # Groups:   cycle [168]
##    cycle capacity   soh   time voltage_load current_load
##    <int>    <dbl> <dbl>  <dbl>        <dbl>        <dbl>
##  1     1     1.86 1        0           0         -0.0006
##  2     1     1.86 1       16.8         4.21      -0.0006
##  3     1     1.86 1       35.7         3.06      -2.00  
##  4     1     1.86 1       53.8         3.03      -2.00  
##  5     1     1.86 1     3112.          2.36      -2.00  
##  6     2     1.85 0.995    0           0         -0.0006
##  7     2     1.85 0.995   16.7         4.20      -0.0006
##  8     2     1.85 0.995   35.7         3.06      -2.00  
##  9     2     1.85 0.995   53.8         3.02      -2.00  
## 10     2     1.85 0.995 3113.          2.35      -2.00  
## # ... with 830 more rows

De la tabla anterior se observa que la capacidad y el SOH son constantes dentro del ciclo. Vamos a construir una figura para apreciar mejor esta situacion.

bat_05 %>% filter(cycle <= 16) %>% 
  ggplot(mapping = aes(x=time, y=capacity, col=factor(cycle))) + 
  geom_line() + 
  geom_point() +
  facet_wrap( ~ cycle)

Vamos a crear un dataframe mas pequeno que solo van a tener la informacion en el tiempo 0 para todos los ciclos.

d0 <- datos %>% filter(time == 0)

Vamos a graficar la capacidad versus ciclo.

ggplot(d0, mapping = aes(x=cycle, y=capacity, col=bateria)) + 
  geom_line()

Vamos a graficar la SOH versus ciclo. La linea roja corresponde al umbral del 70% en el cual la bateria ya cumple su ciclo de vida y ya es recomendable hacer el cambio.

ggplot(d0, mapping = aes(x=cycle, y=soh, col=bateria)) + 
  geom_line() + 
  geom_hline(yintercept = 0.7, col='blue', lty='dashed') + 
  labs(title='SOH versus cycle. La línea horizontal azul corresponde al umbral.')

Vamos a explorar las variables temperature_measured, voltage_measured, current_measured, voltage_load, current_load en la base de datos completa.

datos %>% 
  select(temperature_measured, voltage_measured, current_measured, voltage_load, current_load) %>% 
  summary()
##  temperature_measured voltage_measured current_measured    voltage_load  
##  Min.   :22.35        Min.   :1.737    Min.   :-2.02910   Min.   :0.000  
##  1st Qu.:29.57        1st Qu.:3.378    1st Qu.:-2.01142   1st Qu.:2.410  
##  Median :32.36        Median :3.501    Median :-2.00902   Median :2.558  
##  Mean   :32.38        Mean   :3.497    Mean   :-1.83257   Mean   :2.366  
##  3rd Qu.:35.42        3rd Qu.:3.656    3rd Qu.:-1.98997   3rd Qu.:2.718  
##  Max.   :42.33        Max.   :4.233    Max.   : 0.01431   Max.   :4.249  
##   current_load   
##  Min.   :-2.000  
##  1st Qu.: 1.998  
##  Median : 1.999  
##  Mean   : 1.465  
##  3rd Qu.: 1.999  
##  Max.   : 2.000

Modelo para predecir SOH

Para hacer las predicciones del SOH vamos a utilizar solo la informacion de los ciclos 2 en adelante, el primer ciclo no se va a considerar porque la bateria esta nueva.

data_train <- datos %>% filter(cycle > 1)

Modelo de regresion normal

Vamos a iniciar con un modelo sencillo y que se puede resumir en la siguiente expresion:

\[ \begin{align*} SOH_i &\sim N(\mu_i, \sigma), \\ \mu_i &= \beta_0 + \beta_1 cycle + \beta_2 time + \beta_3 temp + \beta_4 volt + \beta_5 current + \beta_6 voltage_{load} + \beta_7 current_{load}, \\ \sigma &= constante. \end{align*} \]

mod1 <- lm(soh ~ cycle + time + temperature_measured + voltage_measured + 
            current_measured + voltage_load + current_load, 
          data=data_train)
summary(mod1)
## 
## Call:
## lm(formula = soh ~ cycle + time + temperature_measured + voltage_measured + 
##     current_measured + voltage_load + current_load, data = data_train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.126092 -0.035766  0.009155  0.034756  0.234710 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.702e-01  6.296e-03   58.79   <2e-16 ***
## cycle                -1.831e-03  3.568e-06 -513.17   <2e-16 ***
## time                  3.242e-05  3.905e-07   83.03   <2e-16 ***
## temperature_measured  1.709e-03  1.021e-04   16.74   <2e-16 ***
## voltage_measured      1.271e-01  1.143e-03  111.17   <2e-16 ***
## current_measured     -1.608e-02  6.474e-04  -24.83   <2e-16 ***
## voltage_load          1.242e-02  5.356e-04   23.19   <2e-16 ***
## current_load         -1.931e-03  1.116e-04  -17.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04757 on 184756 degrees of freedom
## Multiple R-squared:  0.8008, Adjusted R-squared:  0.8008 
## F-statistic: 1.061e+05 on 7 and 184756 DF,  p-value: < 2.2e-16

Vamos a comparar visualmente los resultados de SOH y \(\widehat{SOH}\).

data_train %<>% mutate(yhat1 = fitted(mod1))
ggplot(data_train, aes(x=soh, y=yhat1, col=bateria)) + 
  geom_point(size=0.5) +
  geom_abline(intercept = 0, slope = 1, size = 1.0, col='blue') +
  geom_hline(yintercept = 1, col='red', lty='dashed') +
  labs(main='Predicciones versus SOH, la linea azul es la referencia.')

Modelo de regresión beta

Vamos ahora a considerar otro modelo que tiene la siguiente expresion:

\[ \begin{align*} SOH_i &\sim Beta(\mu_i, \sigma), \\ \mu_i &= \beta_0 + \beta_1 cycle + \beta_2 time + \beta_3 temp + \beta_4 volt + \beta_5 current + \beta_6 voltage_{load} + \beta_7 current_{load}, \\ \sigma &= constante. \end{align*} \]

library(gamlss)

mod2 <- gamlss(soh ~ cycle + time + temperature_measured + voltage_measured + 
            current_measured + voltage_load + current_load, 
            family=BE,
            data=data_train)
## GAMLSS-RS iteration 1: Global Deviance = -415221.1 
## GAMLSS-RS iteration 2: Global Deviance = -572800.4 
## GAMLSS-RS iteration 3: Global Deviance = -601138.3 
## GAMLSS-RS iteration 4: Global Deviance = -601789.4 
## GAMLSS-RS iteration 5: Global Deviance = -601793.3 
## GAMLSS-RS iteration 6: Global Deviance = -601793.4 
## GAMLSS-RS iteration 7: Global Deviance = -601793.4
summary(mod2)
## ******************************************************************
## Family:  c("BE", "Beta") 
## 
## Call:  gamlss(formula = soh ~ cycle + time + temperature_measured +  
##     voltage_measured + current_measured + voltage_load +  
##     current_load, family = BE, data = data_train) 
## 
## Fitting method: RS() 
## 
## ------------------------------------------------------------------
## Mu link function:  logit
## Mu Coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -8.368e-01  4.839e-02  -17.295  < 2e-16 ***
## cycle                -1.303e-02  2.787e-05 -467.733  < 2e-16 ***
## time                  2.508e-04  3.378e-06   74.246  < 2e-16 ***
## temperature_measured  4.703e-03  8.211e-04    5.727 1.02e-08 ***
## voltage_measured      7.656e-01  8.666e-03   88.341  < 2e-16 ***
## current_measured     -2.552e-01  5.858e-03  -43.567  < 2e-16 ***
## voltage_load          1.094e-01  4.755e-03   23.001  < 2e-16 ***
## current_load         -1.962e-01  1.521e-03 -128.933  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ------------------------------------------------------------------
## Sigma link function:  logit
## Sigma Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.738856   0.001894    -918   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ------------------------------------------------------------------
## No. of observations in the fit:  184764 
## Degrees of Freedom for the fit:  9
##       Residual Deg. of Freedom:  184755 
##                       at cycle:  7 
##  
## Global Deviance:     -601793.4 
##             AIC:     -601775.4 
##             SBC:     -601684.2 
## ******************************************************************
data_train %<>% mutate(yhat2 = predict(object=mod2, what='mu', type='response'))
ggplot(data_train, aes(x=soh, y=yhat2, col=bateria)) + 
  geom_point(size=0.5) +
  geom_abline(intercept = 0, slope = 1, size = 1.0, col='blue') +
  labs(main='Predicciones versus SOH, la linea azul es la referencia.')

Para resumir los resultados obtenidos usamos lo siguiente:

data_train %>% dplyr::select(soh, yhat1, yhat2) %>% cor() %>% round(digits=2)
##        soh yhat1 yhat2
## soh   1.00  0.89  0.87
## yhat1 0.89  1.00  0.98
## yhat2 0.87  0.98  1.00