Comparison-Qtrack_Aranet

#**Linear regression CO2 Qtrack, Aranet and 4 low cost sensors: SCD30, MH-Z14a and CM1106 **
#**24 february - 1 march 2021, sample each minute during 6 days**

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

df <- read_excel("C:/Aire/CO2 modelaciones/Final Qtrack & Aranet & 4sensores.xlsx")

View(df)

glimpse(df)

## Rows: 6,991
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Time      <chr> "2021-02-24 18:00:00", "2021-02-24 18:01:00", "2021-02-24...
## $ Qtrack    <dbl> 579, 592, 598, 615, 631, 654, 669, 681, 690, 712, 754, 87...
## $ Aranet    <dbl> 757, 794, 822, 815, 799, 784, 810, 803, 830, 899, 1077, 1...
## $ CM1106v33 <dbl> 650, 667, 648, 662, 676, 677, 690, 692, 701, 745, 893, 11...
## $ MHZ14a    <dbl> 524, 535, 551, 564, 577, 591, 599, 608, 624, 643, 678, 76...
## $ SCD30     <dbl> 836, 836, 817, NA, 816, 828, 836, 838, 846, 906, 1160, 16...
## $ SCD30box  <dbl> 703, 711, 725, 743, 777, 811, 815, 820, 837, 922, NA, 128...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 8
##      Num Time                Qtrack Aranet CM1106v33 MHZ14a SCD30 SCD30box
##    <dbl> <chr>                <dbl>  <dbl>     <dbl>  <dbl> <dbl>    <dbl>
##  1  1864 2021-02-26 01:03:00   1573     NA      1268     NA  1705     1672
##  2  6565 2021-03-01 07:24:00    816    868       787    676    NA      883
##  3  1664 2021-02-25 21:43:00    744     NA       714    660   834      786
##  4  2692 2021-02-26 14:51:00   1393   1357      1128   1072  1455     1485
##  5  4695 2021-02-28 00:14:00    869   1045       924    770  1077      994
##  6   589 2021-02-25 03:48:00    883    935       791    754   988      941
##  7  5550 2021-02-28 14:29:00   1051   1090       931    865  1103     1112
##  8  2426 2021-02-26 10:25:00    706    780       696    624   784      770
##  9  6757 2021-03-01 10:36:00    723    772       726    597   795      777
## 10  6350 2021-03-01 03:49:00   1059   1109       947    836  1145     1127

fig <- plot_ly(df, x = ~Num, y = ~Qtrack, name = 'Qtrack', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Aranet, name = 'Aranet', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~CM1106v33, name = 'CM1106v3.3', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~MHZ14a, name = 'MHZ14a', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SCD30, name = 'SCD30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SCD30box, name = 'SCD30box', mode = 'lines+markers') 

fig

#Caso 1: Qtrack VS Aranet

df %>% select(Qtrack, Aranet) %>% skim()

Data summary
Name	Piped data
Number of rows	6991
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Qtrack	39	0.99	991.87	503.16	379	699.75	853	1145	3300	▇▂▁▁▁
Aranet	854	0.88	1045.63	544.40	408	741.00	903	1172	4318	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ Aranet, na.rm = TRUE)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

ggplot(df, aes(x = Qtrack, y = Aranet)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 Aranet",
       title = "Relationship between Qtrack and Aranet") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 854 rows containing non-finite values (stat_smooth).

## Warning: Removed 854 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(Qtrack ~ Aranet, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   27.7       2.46       11.3       0   22.9     32.6  
## 2 Aranet       0.908     0.002     435.        0    0.904    0.912

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 6,137 x 5
##       ID Qtrack Aranet Qtrack_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    579    757       715.   -136. 
##  2     2    592    794       748.   -156. 
##  3     3    598    822       774.   -176. 
##  4     4    615    815       768.   -153. 
##  5     5    631    799       753.   -122. 
##  6     6    654    784       739.    -85.4
##  7     7    669    810       763.    -94.0
##  8     8    681    803       757.    -75.7
##  9     9    690    830       781.    -91.2
## 10    10    712    899       844.   -132. 
## # ... with 6,127 more rows

#Caso 2: Qtrack VS CM1106v33

df %>% select(Qtrack, CM1106v33) %>% skim()

Data summary
Name	Piped data
Number of rows	6991
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Qtrack	39	0.99	991.87	503.16	379	699.75	853	1145	3300	▇▂▁▁▁
CM1106v33	99	0.99	905.44	377.38	445	696.00	795	1001	3200	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ CM1106v33, na.rm = TRUE)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

ggplot(df, aes(x = Qtrack, y = CM1106v33)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 CM1106v33",
       title = "Relationship between Qtrack and CM1106v33") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 99 rows containing non-finite values (stat_smooth).

## Warning: Removed 99 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(CM1106v33 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept  173.        1.77       97.6       0  169.     176.   
## 2 Qtrack       0.739     0.002     464.        0    0.736    0.742

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 6,892 x 5
##       ID CM1106v33 Qtrack CM1106v33_hat residual
##    <int>     <dbl>  <dbl>         <dbl>    <dbl>
##  1     1       650    579          601.     49.2
##  2     2       667    592          610.     56.6
##  3     3       648    598          615.     33.1
##  4     4       662    615          627.     34.6
##  5     5       676    631          639.     36.7
##  6     6       677    654          656.     20.7
##  7     7       690    669          667.     22.6
##  8     8       692    681          676.     15.8
##  9     9       701    690          683.     18.1
## 10    10       745    712          699.     45.9
## # ... with 6,882 more rows

#Caso 3: Qtrack VS MHZ14a

df %>% select(Qtrack, MHZ14a) %>% skim()

Data summary
Name	Piped data
Number of rows	6991
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Qtrack	39	0.99	991.87	503.16	379	699.75	853	1145	3300	▇▂▁▁▁
MHZ14a	143	0.98	823.37	379.42	332	612.00	724	923	2918	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ MHZ14a, na.rm = TRUE)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = Qtrack, y = MHZ14a)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 MHZ14a",
       title = "Relationship between Qtrack and MHZ14a") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 143 rows containing non-finite values (stat_smooth).

## Warning: Removed 143 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(MHZ14a ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   77.9       0.942      82.7       0    76.1    79.8  
## 2 Qtrack       0.752     0.001     887.        0     0.75    0.753

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 6,848 x 5
##       ID MHZ14a Qtrack MHZ14a_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    524    579       513.     10.8
##  2     2    535    592       523.     12.0
##  3     3    551    598       528.     23.5
##  4     4    564    615       540.     23.7
##  5     5    577    631       552.     24.7
##  6     6    591    654       570.     21.4
##  7     7    599    669       581.     18.1
##  8     8    608    681       590.     18.1
##  9     9    624    690       597.     27.3
## 10    10    643    712       613.     29.8
## # ... with 6,838 more rows

#Caso 4: Qtrack VS SCD30

df %>% select(Qtrack, SCD30) %>% skim()

Data summary
Name	Piped data
Number of rows	6991
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Qtrack	39	0.99	991.87	503.16	379	699.75	853	1145.00	3300	▇▂▁▁▁
SCD30	295	0.96	1105.02	588.39	410	775.00	945	1268.25	4935	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ SCD30, na.rm = TRUE)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.980

ggplot(df, aes(x = Qtrack, y = SCD30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 SCD30",
       title = "Relationship between Qtrack and SCD30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 295 rows containing non-finite values (stat_smooth).

## Warning: Removed 295 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -30.5      3.14      -9.72       0   -36.7    -24.4 
## 2 Qtrack        1.14     0.003    405.         0     1.14     1.15

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 6,696 x 5
##       ID SCD30 Qtrack SCD30_hat residual
##    <int> <dbl>  <dbl>     <dbl>    <dbl>
##  1     1   836    579      632.    204. 
##  2     2   836    592      647.    189. 
##  3     3   817    598      654.    163. 
##  4     4   816    631      692.    124. 
##  5     5   828    654      718.    110. 
##  6     6   836    669      735.    101. 
##  7     7   838    681      749.     89.2
##  8     8   846    690      759.     86.9
##  9     9   906    712      784.    122. 
## 10    10  1160    754      832.    328. 
## # ... with 6,686 more rows

#Caso 5: Qtrack VS SCD30box

df %>% select(Qtrack, SCD30box) %>% skim()

Data summary
Name	Piped data
Number of rows	6991
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Qtrack	39	0.99	991.87	503.16	379	699.75	853	1145.0	3300	▇▂▁▁▁
SCD30box	252	0.96	1077.04	578.90	380	751.00	920	1236.5	4576	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ SCD30box, na.rm = TRUE)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987

ggplot(df, aes(x = Qtrack, y = SCD30box)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 SCD30box",
       title = "Relationship between Qtrack and SCD30box") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 252 rows containing non-finite values (stat_smooth).

## Warning: Removed 252 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30box ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -51.1      2.47      -20.7       0   -55.9    -46.2 
## 2 Qtrack        1.14     0.002     512.        0     1.13     1.14

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 6,739 x 5
##       ID SCD30box Qtrack SCD30box_hat residual
##    <int>    <dbl>  <dbl>        <dbl>    <dbl>
##  1     1      703    579         608.     95.4
##  2     2      711    592         622.     88.6
##  3     3      725    598         629.     95.7
##  4     4      743    615         649.     94.4
##  5     5      777    631         667.    110. 
##  6     6      811    654         693.    118. 
##  7     7      815    669         710.    105. 
##  8     8      820    681         724.     96.3
##  9     9      837    690         734.    103. 
## 10    10      922    712         759.    163. 
## # ... with 6,729 more rows

#**Conclusion:**
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*

Comparison-Qtrack_Aranet_4sensores.R

DBB

2021-04-16