#**Linear regression CO2 Qtrack, Aranet and 4 low cost sensors: SCD30, MH-Z14a and CM1106 **
#**24 february - 1 march 2021, sample each minute during 6 days**
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(moderndive)
library(skimr)
df <- read_excel("C:/Aire/CO2 modelaciones/Final Qtrack & Aranet & 4sensores.xlsx")
View(df)
glimpse(df)
## Rows: 6,991
## Columns: 8
## $ Num <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Time <chr> "2021-02-24 18:00:00", "2021-02-24 18:01:00", "2021-02-24...
## $ Qtrack <dbl> 579, 592, 598, 615, 631, 654, 669, 681, 690, 712, 754, 87...
## $ Aranet <dbl> 757, 794, 822, 815, 799, 784, 810, 803, 830, 899, 1077, 1...
## $ CM1106v33 <dbl> 650, 667, 648, 662, 676, 677, 690, 692, 701, 745, 893, 11...
## $ MHZ14a <dbl> 524, 535, 551, 564, 577, 591, 599, 608, 624, 643, 678, 76...
## $ SCD30 <dbl> 836, 836, 817, NA, 816, 828, 836, 838, 846, 906, 1160, 16...
## $ SCD30box <dbl> 703, 711, 725, 743, 777, 811, 815, 820, 837, 922, NA, 128...
df %>%
sample_n(size = 10)
## # A tibble: 10 x 8
## Num Time Qtrack Aranet CM1106v33 MHZ14a SCD30 SCD30box
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1864 2021-02-26 01:03:00 1573 NA 1268 NA 1705 1672
## 2 6565 2021-03-01 07:24:00 816 868 787 676 NA 883
## 3 1664 2021-02-25 21:43:00 744 NA 714 660 834 786
## 4 2692 2021-02-26 14:51:00 1393 1357 1128 1072 1455 1485
## 5 4695 2021-02-28 00:14:00 869 1045 924 770 1077 994
## 6 589 2021-02-25 03:48:00 883 935 791 754 988 941
## 7 5550 2021-02-28 14:29:00 1051 1090 931 865 1103 1112
## 8 2426 2021-02-26 10:25:00 706 780 696 624 784 770
## 9 6757 2021-03-01 10:36:00 723 772 726 597 795 777
## 10 6350 2021-03-01 03:49:00 1059 1109 947 836 1145 1127
fig <- plot_ly(df, x = ~Num, y = ~Qtrack, name = 'Qtrack', type = 'scatter', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~Aranet, name = 'Aranet', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~CM1106v33, name = 'CM1106v3.3', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~MHZ14a, name = 'MHZ14a', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~SCD30, name = 'SCD30', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~SCD30box, name = 'SCD30box', mode = 'lines+markers')
fig
#Caso 1: Qtrack VS Aranet
df %>% select(Qtrack, Aranet) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
6991 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| Qtrack |
39 |
0.99 |
991.87 |
503.16 |
379 |
699.75 |
853 |
1145 |
3300 |
▇▂▁▁▁ |
| Aranet |
854 |
0.88 |
1045.63 |
544.40 |
408 |
741.00 |
903 |
1172 |
4318 |
▇▂▁▁▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = Qtrack ~ Aranet, na.rm = TRUE)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.984
ggplot(df, aes(x = Qtrack, y = Aranet)) +
geom_point(alpha = 0.2) +
labs(x = "PM25 Qtrack", y = "PM25 Aranet",
title = "Relationship between Qtrack and Aranet") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 854 rows containing non-finite values (stat_smooth).
## Warning: Removed 854 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(Qtrack ~ Aranet, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 27.7 2.46 11.3 0 22.9 32.6
## 2 Aranet 0.908 0.002 435. 0 0.904 0.912
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,137 x 5
## ID Qtrack Aranet Qtrack_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 579 757 715. -136.
## 2 2 592 794 748. -156.
## 3 3 598 822 774. -176.
## 4 4 615 815 768. -153.
## 5 5 631 799 753. -122.
## 6 6 654 784 739. -85.4
## 7 7 669 810 763. -94.0
## 8 8 681 803 757. -75.7
## 9 9 690 830 781. -91.2
## 10 10 712 899 844. -132.
## # ... with 6,127 more rows
#Caso 2: Qtrack VS CM1106v33
df %>% select(Qtrack, CM1106v33) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
6991 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| Qtrack |
39 |
0.99 |
991.87 |
503.16 |
379 |
699.75 |
853 |
1145 |
3300 |
▇▂▁▁▁ |
| CM1106v33 |
99 |
0.99 |
905.44 |
377.38 |
445 |
696.00 |
795 |
1001 |
3200 |
▇▂▁▁▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = Qtrack ~ CM1106v33, na.rm = TRUE)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.984
ggplot(df, aes(x = Qtrack, y = CM1106v33)) +
geom_point(alpha = 0.2) +
labs(x = "PM25 Qtrack", y = "PM25 CM1106v33",
title = "Relationship between Qtrack and CM1106v33") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 99 rows containing non-finite values (stat_smooth).
## Warning: Removed 99 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(CM1106v33 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 173. 1.77 97.6 0 169. 176.
## 2 Qtrack 0.739 0.002 464. 0 0.736 0.742
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,892 x 5
## ID CM1106v33 Qtrack CM1106v33_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 650 579 601. 49.2
## 2 2 667 592 610. 56.6
## 3 3 648 598 615. 33.1
## 4 4 662 615 627. 34.6
## 5 5 676 631 639. 36.7
## 6 6 677 654 656. 20.7
## 7 7 690 669 667. 22.6
## 8 8 692 681 676. 15.8
## 9 9 701 690 683. 18.1
## 10 10 745 712 699. 45.9
## # ... with 6,882 more rows
#Caso 3: Qtrack VS MHZ14a
df %>% select(Qtrack, MHZ14a) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
6991 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| Qtrack |
39 |
0.99 |
991.87 |
503.16 |
379 |
699.75 |
853 |
1145 |
3300 |
▇▂▁▁▁ |
| MHZ14a |
143 |
0.98 |
823.37 |
379.42 |
332 |
612.00 |
724 |
923 |
2918 |
▇▂▁▁▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = Qtrack ~ MHZ14a, na.rm = TRUE)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.996
ggplot(df, aes(x = Qtrack, y = MHZ14a)) +
geom_point(alpha = 0.2) +
labs(x = "PM25 Qtrack", y = "PM25 MHZ14a",
title = "Relationship between Qtrack and MHZ14a") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 143 rows containing non-finite values (stat_smooth).
## Warning: Removed 143 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(MHZ14a ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 77.9 0.942 82.7 0 76.1 79.8
## 2 Qtrack 0.752 0.001 887. 0 0.75 0.753
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,848 x 5
## ID MHZ14a Qtrack MHZ14a_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 524 579 513. 10.8
## 2 2 535 592 523. 12.0
## 3 3 551 598 528. 23.5
## 4 4 564 615 540. 23.7
## 5 5 577 631 552. 24.7
## 6 6 591 654 570. 21.4
## 7 7 599 669 581. 18.1
## 8 8 608 681 590. 18.1
## 9 9 624 690 597. 27.3
## 10 10 643 712 613. 29.8
## # ... with 6,838 more rows
#Caso 4: Qtrack VS SCD30
df %>% select(Qtrack, SCD30) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
6991 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| Qtrack |
39 |
0.99 |
991.87 |
503.16 |
379 |
699.75 |
853 |
1145.00 |
3300 |
▇▂▁▁▁ |
| SCD30 |
295 |
0.96 |
1105.02 |
588.39 |
410 |
775.00 |
945 |
1268.25 |
4935 |
▇▂▁▁▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = Qtrack ~ SCD30, na.rm = TRUE)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.980
ggplot(df, aes(x = Qtrack, y = SCD30)) +
geom_point(alpha = 0.2) +
labs(x = "PM25 Qtrack", y = "PM25 SCD30",
title = "Relationship between Qtrack and SCD30") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 295 rows containing non-finite values (stat_smooth).
## Warning: Removed 295 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -30.5 3.14 -9.72 0 -36.7 -24.4
## 2 Qtrack 1.14 0.003 405. 0 1.14 1.15
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,696 x 5
## ID SCD30 Qtrack SCD30_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 836 579 632. 204.
## 2 2 836 592 647. 189.
## 3 3 817 598 654. 163.
## 4 4 816 631 692. 124.
## 5 5 828 654 718. 110.
## 6 6 836 669 735. 101.
## 7 7 838 681 749. 89.2
## 8 8 846 690 759. 86.9
## 9 9 906 712 784. 122.
## 10 10 1160 754 832. 328.
## # ... with 6,686 more rows
#Caso 5: Qtrack VS SCD30box
df %>% select(Qtrack, SCD30box) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
6991 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| Qtrack |
39 |
0.99 |
991.87 |
503.16 |
379 |
699.75 |
853 |
1145.0 |
3300 |
▇▂▁▁▁ |
| SCD30box |
252 |
0.96 |
1077.04 |
578.90 |
380 |
751.00 |
920 |
1236.5 |
4576 |
▇▂▁▁▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = Qtrack ~ SCD30box, na.rm = TRUE)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.987
ggplot(df, aes(x = Qtrack, y = SCD30box)) +
geom_point(alpha = 0.2) +
labs(x = "PM25 Qtrack", y = "PM25 SCD30box",
title = "Relationship between Qtrack and SCD30box") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 252 rows containing non-finite values (stat_smooth).
## Warning: Removed 252 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30box ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -51.1 2.47 -20.7 0 -55.9 -46.2
## 2 Qtrack 1.14 0.002 512. 0 1.13 1.14
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,739 x 5
## ID SCD30box Qtrack SCD30box_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 703 579 608. 95.4
## 2 2 711 592 622. 88.6
## 3 3 725 598 629. 95.7
## 4 4 743 615 649. 94.4
## 5 5 777 631 667. 110.
## 6 6 811 654 693. 118.
## 7 7 815 669 710. 105.
## 8 8 820 681 724. 96.3
## 9 9 837 690 734. 103.
## 10 10 922 712 759. 163.
## # ... with 6,729 more rows
#**Conclusion:**
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*