#**Linear regression CO2 Aranet vs CO2 SCD30 **
#**21 january 2021, sample each minute**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(moderndive)
library(skimr)

df <- read_excel("C:/Aire/Procesamiento de datos CO2/Comparison.xlsx")

#**File at google drive:
#**https://docs.google.com/spreadsheets/d/1O4R_fHeQrUQ5U3MAqbr672pdQ4MJkGbTD49tPPa0r3o/edit?usp=sharing


View(df)

glimpse(df)
## Rows: 256
## Columns: 4
## $ numval <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
## $ hora   <chr> "1/21/2021 11:20:46 a. m.", "1/21/2021 11:21:46 a. m.", "1/2...
## $ co2ara <dbl> 441, 424, 423, 407, 497, 456, 451, 477, 453, 434, 435, 513, ...
## $ co2scd <dbl> 300, 300, 305, 306, 300, 333, 317, 311, 313, 311, 310, 309, ...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 4
##    numval hora                     co2ara co2scd
##     <dbl> <chr>                     <dbl>  <dbl>
##  1    135 1/21/2021 1:36:46 p. m.     444    284
##  2    210 1/21/2021 2:53:46 p. m.     625    438
##  3     74 1/21/2021 12:34:46 p. m.   1195    821
##  4     45 1/21/2021 12:04:46 p. m.    428    296
##  5    241 1/21/2021 3:24:46 p. m.     531    372
##  6    145 1/21/2021 1:46:46 p. m.     417    305
##  7    144 1/21/2021 1:45:46 p. m.     430    294
##  8     77 1/21/2021 12:37:46 p. m.   1177    836
##  9    224 1/21/2021 3:07:46 p. m.     610    414
## 10    142 1/21/2021 1:43:46 p. m.     418    287
fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2scd, name = 'CO2 SCD30', mode = 'lines+markers') 

fig
#**Original data**

df %>% select(co2scd, co2ara) %>% skim()
Data summary
Name Piped data
Number of rows 256
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
co2scd 0 1 468.22 196.36 284 299.00 387.5 688.75 843 ▇▃▁▁▃
co2ara 0 1 673.67 276.88 397 440.75 557.0 999.75 1212 ▇▃▁▁▃
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = co2scd ~ co2ara)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.990
ggplot(df, aes(x = co2scd, y = co2ara)) +
  geom_point(alpha = 0.2) +
  labs(x = "CO2 SCD30", y = "CO2 Aranet",
       title = "Relationship between SCD30 and Aranet") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(co2ara ~ co2scd, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.3      6.46       3.15   0.002     7.60    33.1 
## 2 co2scd        1.40     0.013    110.     0         1.37     1.42
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 256 x 5
##       ID co2ara co2scd co2ara_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    441    300       439.     2.06
##  2     2    424    300       439.   -14.9 
##  3     3    423    305       446.   -22.9 
##  4     4    407    306       447.   -40.3 
##  5     5    497    300       439.    58.1 
##  6     6    456    333       485.   -29.0 
##  7     7    451    317       463.   -11.7 
##  8     8    477    311       454.    22.7 
##  9     9    453    313       457.    -4.08
## 10    10    434    311       454.   -20.3 
## # ... with 246 more rows
df <- mutate(df, co2aracalculate = (20.3 + (1.4 * co2scd)))

view(df)

fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2aracalculate, name = 'CO2 cal SCD30', mode = 'lines+markers') 

fig
#**Conclusion:**
#**The correlation is near to 1, the sensors measure the same.
#**The SCD30 is bad calibrated.
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*