#**Linear regression CO2 Aranet vs CO2 SCD30 **
#**21 january 2021, sample each minute**
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(moderndive)
library(skimr)
df <- read_excel("C:/Aire/Procesamiento de datos CO2/Comparison.xlsx")
#**File at google drive:
#**https://docs.google.com/spreadsheets/d/1O4R_fHeQrUQ5U3MAqbr672pdQ4MJkGbTD49tPPa0r3o/edit?usp=sharing
View(df)
glimpse(df)
## Rows: 256
## Columns: 4
## $ numval <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
## $ hora <chr> "1/21/2021 11:20:46 a. m.", "1/21/2021 11:21:46 a. m.", "1/2...
## $ co2ara <dbl> 441, 424, 423, 407, 497, 456, 451, 477, 453, 434, 435, 513, ...
## $ co2scd <dbl> 300, 300, 305, 306, 300, 333, 317, 311, 313, 311, 310, 309, ...
df %>%
sample_n(size = 10)
## # A tibble: 10 x 4
## numval hora co2ara co2scd
## <dbl> <chr> <dbl> <dbl>
## 1 135 1/21/2021 1:36:46 p. m. 444 284
## 2 210 1/21/2021 2:53:46 p. m. 625 438
## 3 74 1/21/2021 12:34:46 p. m. 1195 821
## 4 45 1/21/2021 12:04:46 p. m. 428 296
## 5 241 1/21/2021 3:24:46 p. m. 531 372
## 6 145 1/21/2021 1:46:46 p. m. 417 305
## 7 144 1/21/2021 1:45:46 p. m. 430 294
## 8 77 1/21/2021 12:37:46 p. m. 1177 836
## 9 224 1/21/2021 3:07:46 p. m. 610 414
## 10 142 1/21/2021 1:43:46 p. m. 418 287
fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~co2scd, name = 'CO2 SCD30', mode = 'lines+markers')
fig
#**Original data**
df %>% select(co2scd, co2ara) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
256 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| co2scd |
0 |
1 |
468.22 |
196.36 |
284 |
299.00 |
387.5 |
688.75 |
843 |
▇▃▁▁▃ |
| co2ara |
0 |
1 |
673.67 |
276.88 |
397 |
440.75 |
557.0 |
999.75 |
1212 |
▇▃▁▁▃ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = co2scd ~ co2ara)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.990
ggplot(df, aes(x = co2scd, y = co2ara)) +
geom_point(alpha = 0.2) +
labs(x = "CO2 SCD30", y = "CO2 Aranet",
title = "Relationship between SCD30 and Aranet") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(co2ara ~ co2scd, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 20.3 6.46 3.15 0.002 7.60 33.1
## 2 co2scd 1.40 0.013 110. 0 1.37 1.42
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 256 x 5
## ID co2ara co2scd co2ara_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 441 300 439. 2.06
## 2 2 424 300 439. -14.9
## 3 3 423 305 446. -22.9
## 4 4 407 306 447. -40.3
## 5 5 497 300 439. 58.1
## 6 6 456 333 485. -29.0
## 7 7 451 317 463. -11.7
## 8 8 477 311 454. 22.7
## 9 9 453 313 457. -4.08
## 10 10 434 311 454. -20.3
## # ... with 246 more rows
df <- mutate(df, co2aracalculate = (20.3 + (1.4 * co2scd)))
view(df)
fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~co2aracalculate, name = 'CO2 cal SCD30', mode = 'lines+markers')
fig
#**Conclusion:**
#**The correlation is near to 1, the sensors measure the same.
#**The SCD30 is bad calibrated.
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*