#**Linear regression CO2 Qtrack, Aranet and 4 low cost sensors: SCD30, MH-Z14a and CM1106 **
#**24 february - 1 march 2021, sample each minute during 6 days**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(moderndive)
library(skimr)

df <- read_excel("C:/Aire/CO2 modelaciones/Final Qtrack & Aranet & 4sensores.xlsx")

View(df)

glimpse(df)
## Rows: 6,991
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Time      <chr> "2021-02-24 18:00:00", "2021-02-24 18:01:00", "2021-02-24...
## $ Qtrack    <dbl> 579, 592, 598, 615, 631, 654, 669, 681, 690, 712, 754, 87...
## $ Aranet    <dbl> 757, 794, 822, 815, 799, 784, 810, 803, 830, 899, 1077, 1...
## $ CM1106v33 <dbl> 650, 667, 648, 662, 676, 677, 690, 692, 701, 745, 893, 11...
## $ MHZ14a    <dbl> 524, 535, 551, 564, 577, 591, 599, 608, 624, 643, 678, 76...
## $ SCD30     <dbl> 836, 836, 817, NA, 816, 828, 836, 838, 846, 906, 1160, 16...
## $ SCD30box  <dbl> 703, 711, 725, 743, 777, 811, 815, 820, 837, 922, NA, 128...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 8
##      Num Time                Qtrack Aranet CM1106v33 MHZ14a SCD30 SCD30box
##    <dbl> <chr>                <dbl>  <dbl>     <dbl>  <dbl> <dbl>    <dbl>
##  1  1864 2021-02-26 01:03:00   1573     NA      1268     NA  1705     1672
##  2  6565 2021-03-01 07:24:00    816    868       787    676    NA      883
##  3  1664 2021-02-25 21:43:00    744     NA       714    660   834      786
##  4  2692 2021-02-26 14:51:00   1393   1357      1128   1072  1455     1485
##  5  4695 2021-02-28 00:14:00    869   1045       924    770  1077      994
##  6   589 2021-02-25 03:48:00    883    935       791    754   988      941
##  7  5550 2021-02-28 14:29:00   1051   1090       931    865  1103     1112
##  8  2426 2021-02-26 10:25:00    706    780       696    624   784      770
##  9  6757 2021-03-01 10:36:00    723    772       726    597   795      777
## 10  6350 2021-03-01 03:49:00   1059   1109       947    836  1145     1127
fig <- plot_ly(df, x = ~Num, y = ~Qtrack, name = 'Qtrack', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Aranet, name = 'Aranet', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~CM1106v33, name = 'CM1106v3.3', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~MHZ14a, name = 'MHZ14a', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SCD30, name = 'SCD30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SCD30box, name = 'SCD30box', mode = 'lines+markers') 

fig
#Caso 1: Qtrack VS Aranet

df %>% select(Qtrack, Aranet) %>% skim()
Data summary
Name Piped data
Number of rows 6991
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Qtrack 39 0.99 991.87 503.16 379 699.75 853 1145 3300 ▇▂▁▁▁
Aranet 854 0.88 1045.63 544.40 408 741.00 903 1172 4318 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ Aranet, na.rm = TRUE)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984
ggplot(df, aes(x = Qtrack, y = Aranet)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 Aranet",
       title = "Relationship between Qtrack and Aranet") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 854 rows containing non-finite values (stat_smooth).
## Warning: Removed 854 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(Qtrack ~ Aranet, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   27.7       2.46       11.3       0   22.9     32.6  
## 2 Aranet       0.908     0.002     435.        0    0.904    0.912
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,137 x 5
##       ID Qtrack Aranet Qtrack_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    579    757       715.   -136. 
##  2     2    592    794       748.   -156. 
##  3     3    598    822       774.   -176. 
##  4     4    615    815       768.   -153. 
##  5     5    631    799       753.   -122. 
##  6     6    654    784       739.    -85.4
##  7     7    669    810       763.    -94.0
##  8     8    681    803       757.    -75.7
##  9     9    690    830       781.    -91.2
## 10    10    712    899       844.   -132. 
## # ... with 6,127 more rows
#Caso 2: Qtrack VS CM1106v33

df %>% select(Qtrack, CM1106v33) %>% skim()
Data summary
Name Piped data
Number of rows 6991
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Qtrack 39 0.99 991.87 503.16 379 699.75 853 1145 3300 ▇▂▁▁▁
CM1106v33 99 0.99 905.44 377.38 445 696.00 795 1001 3200 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ CM1106v33, na.rm = TRUE)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984
ggplot(df, aes(x = Qtrack, y = CM1106v33)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 CM1106v33",
       title = "Relationship between Qtrack and CM1106v33") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 99 rows containing non-finite values (stat_smooth).
## Warning: Removed 99 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(CM1106v33 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept  173.        1.77       97.6       0  169.     176.   
## 2 Qtrack       0.739     0.002     464.        0    0.736    0.742
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,892 x 5
##       ID CM1106v33 Qtrack CM1106v33_hat residual
##    <int>     <dbl>  <dbl>         <dbl>    <dbl>
##  1     1       650    579          601.     49.2
##  2     2       667    592          610.     56.6
##  3     3       648    598          615.     33.1
##  4     4       662    615          627.     34.6
##  5     5       676    631          639.     36.7
##  6     6       677    654          656.     20.7
##  7     7       690    669          667.     22.6
##  8     8       692    681          676.     15.8
##  9     9       701    690          683.     18.1
## 10    10       745    712          699.     45.9
## # ... with 6,882 more rows
#Caso 3: Qtrack VS MHZ14a

df %>% select(Qtrack, MHZ14a) %>% skim()
Data summary
Name Piped data
Number of rows 6991
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Qtrack 39 0.99 991.87 503.16 379 699.75 853 1145 3300 ▇▂▁▁▁
MHZ14a 143 0.98 823.37 379.42 332 612.00 724 923 2918 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ MHZ14a, na.rm = TRUE)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996
ggplot(df, aes(x = Qtrack, y = MHZ14a)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 MHZ14a",
       title = "Relationship between Qtrack and MHZ14a") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 143 rows containing non-finite values (stat_smooth).
## Warning: Removed 143 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(MHZ14a ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   77.9       0.942      82.7       0    76.1    79.8  
## 2 Qtrack       0.752     0.001     887.        0     0.75    0.753
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,848 x 5
##       ID MHZ14a Qtrack MHZ14a_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    524    579       513.     10.8
##  2     2    535    592       523.     12.0
##  3     3    551    598       528.     23.5
##  4     4    564    615       540.     23.7
##  5     5    577    631       552.     24.7
##  6     6    591    654       570.     21.4
##  7     7    599    669       581.     18.1
##  8     8    608    681       590.     18.1
##  9     9    624    690       597.     27.3
## 10    10    643    712       613.     29.8
## # ... with 6,838 more rows
#Caso 4: Qtrack VS SCD30

df %>% select(Qtrack, SCD30) %>% skim()
Data summary
Name Piped data
Number of rows 6991
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Qtrack 39 0.99 991.87 503.16 379 699.75 853 1145.00 3300 ▇▂▁▁▁
SCD30 295 0.96 1105.02 588.39 410 775.00 945 1268.25 4935 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ SCD30, na.rm = TRUE)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.980
ggplot(df, aes(x = Qtrack, y = SCD30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 SCD30",
       title = "Relationship between Qtrack and SCD30") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 295 rows containing non-finite values (stat_smooth).
## Warning: Removed 295 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30 ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -30.5      3.14      -9.72       0   -36.7    -24.4 
## 2 Qtrack        1.14     0.003    405.         0     1.14     1.15
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,696 x 5
##       ID SCD30 Qtrack SCD30_hat residual
##    <int> <dbl>  <dbl>     <dbl>    <dbl>
##  1     1   836    579      632.    204. 
##  2     2   836    592      647.    189. 
##  3     3   817    598      654.    163. 
##  4     4   816    631      692.    124. 
##  5     5   828    654      718.    110. 
##  6     6   836    669      735.    101. 
##  7     7   838    681      749.     89.2
##  8     8   846    690      759.     86.9
##  9     9   906    712      784.    122. 
## 10    10  1160    754      832.    328. 
## # ... with 6,686 more rows
#Caso 5: Qtrack VS SCD30box

df %>% select(Qtrack, SCD30box) %>% skim()
Data summary
Name Piped data
Number of rows 6991
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Qtrack 39 0.99 991.87 503.16 379 699.75 853 1145.0 3300 ▇▂▁▁▁
SCD30box 252 0.96 1077.04 578.90 380 751.00 920 1236.5 4576 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = Qtrack ~ SCD30box, na.rm = TRUE)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987
ggplot(df, aes(x = Qtrack, y = SCD30box)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 Qtrack", y = "PM25 SCD30box",
       title = "Relationship between Qtrack and SCD30box") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 252 rows containing non-finite values (stat_smooth).
## Warning: Removed 252 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(SCD30box ~ Qtrack, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -51.1      2.47      -20.7       0   -55.9    -46.2 
## 2 Qtrack        1.14     0.002     512.        0     1.13     1.14
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 6,739 x 5
##       ID SCD30box Qtrack SCD30box_hat residual
##    <int>    <dbl>  <dbl>        <dbl>    <dbl>
##  1     1      703    579         608.     95.4
##  2     2      711    592         622.     88.6
##  3     3      725    598         629.     95.7
##  4     4      743    615         649.     94.4
##  5     5      777    631         667.    110. 
##  6     6      811    654         693.    118. 
##  7     7      815    669         710.    105. 
##  8     8      820    681         724.     96.3
##  9     9      837    690         734.    103. 
## 10    10      922    712         759.    163. 
## # ... with 6,729 more rows
#**Conclusion:**
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*