Comparison-Aranet-SCD30

#**Linear regression CO2 Aranet vs CO2 SCD30 **
#**21 january 2021, sample each minute**

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

df <- read_excel("C:/Aire/Procesamiento de datos CO2/Comparison.xlsx")

#**File at google drive:
#**https://docs.google.com/spreadsheets/d/1O4R_fHeQrUQ5U3MAqbr672pdQ4MJkGbTD49tPPa0r3o/edit?usp=sharing


View(df)

glimpse(df)

## Rows: 256
## Columns: 4
## $ numval <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
## $ hora   <chr> "1/21/2021 11:20:46 a. m.", "1/21/2021 11:21:46 a. m.", "1/2...
## $ co2ara <dbl> 441, 424, 423, 407, 497, 456, 451, 477, 453, 434, 435, 513, ...
## $ co2scd <dbl> 300, 300, 305, 306, 300, 333, 317, 311, 313, 311, 310, 309, ...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 4
##    numval hora                     co2ara co2scd
##     <dbl> <chr>                     <dbl>  <dbl>
##  1    135 1/21/2021 1:36:46 p. m.     444    284
##  2    210 1/21/2021 2:53:46 p. m.     625    438
##  3     74 1/21/2021 12:34:46 p. m.   1195    821
##  4     45 1/21/2021 12:04:46 p. m.    428    296
##  5    241 1/21/2021 3:24:46 p. m.     531    372
##  6    145 1/21/2021 1:46:46 p. m.     417    305
##  7    144 1/21/2021 1:45:46 p. m.     430    294
##  8     77 1/21/2021 12:37:46 p. m.   1177    836
##  9    224 1/21/2021 3:07:46 p. m.     610    414
## 10    142 1/21/2021 1:43:46 p. m.     418    287

fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2scd, name = 'CO2 SCD30', mode = 'lines+markers') 

fig

#**Original data**

df %>% select(co2scd, co2ara) %>% skim()

Data summary
Name	Piped data
Number of rows	256
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
co2scd	0	1	468.22	196.36	284	299.00	387.5	688.75	843	▇▃▁▁▃
co2ara	0	1	673.67	276.88	397	440.75	557.0	999.75	1212	▇▃▁▁▃

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = co2scd ~ co2ara)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.990

ggplot(df, aes(x = co2scd, y = co2ara)) +
  geom_point(alpha = 0.2) +
  labs(x = "CO2 SCD30", y = "CO2 Aranet",
       title = "Relationship between SCD30 and Aranet") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(co2ara ~ co2scd, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.3      6.46       3.15   0.002     7.60    33.1 
## 2 co2scd        1.40     0.013    110.     0         1.37     1.42

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 256 x 5
##       ID co2ara co2scd co2ara_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    441    300       439.     2.06
##  2     2    424    300       439.   -14.9 
##  3     3    423    305       446.   -22.9 
##  4     4    407    306       447.   -40.3 
##  5     5    497    300       439.    58.1 
##  6     6    456    333       485.   -29.0 
##  7     7    451    317       463.   -11.7 
##  8     8    477    311       454.    22.7 
##  9     9    453    313       457.    -4.08
## 10    10    434    311       454.   -20.3 
## # ... with 246 more rows

df <- mutate(df, co2aracalculate = (20.3 + (1.4 * co2scd)))

view(df)

fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2aracalculate, name = 'CO2 cal SCD30', mode = 'lines+markers') 

fig

#**Conclusion:**
#**The correlation is near to 1, the sensors measure the same.
#**The SCD30 is bad calibrated.
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*

Comparison-Aranet-SCD30_resume.R

DBB

2021-01-22