#**Linear regression CO2 Aranet vs CO2 SCD30 **
#**21 january 2021, sample each minute**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(moderndive)
library(skimr)

df <- read_excel("C:/Aire/Procesamiento de datos CO2/Comparacion.xlsx")

View(df)

glimpse(df)
## Rows: 256
## Columns: 4
## $ numval <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
## $ hora   <chr> "1/21/2021 11:20:46 a. m.", "1/21/2021 11:21:46 a. m.", "1/2...
## $ co2ara <dbl> 441, 424, 423, 407, 497, 456, 451, 477, 453, 434, 435, 513, ...
## $ co2scd <dbl> 300, 300, 305, 306, 300, 333, 317, 311, 313, 311, 310, 309, ...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 4
##    numval hora                     co2ara co2scd
##     <dbl> <chr>                     <dbl>  <dbl>
##  1    247 1/21/2021 3:30:46 p. m.     524    349
##  2      7 1/21/2021 11:26:46 a. m.    451    317
##  3    211 1/21/2021 2:54:46 p. m.     641    438
##  4     97 1/21/2021 12:57:46 p. m.   1041    722
##  5     40 1/21/2021 11:59:46 a. m.    413    291
##  6    108 1/21/2021 1:09:46 p. m.    1103    748
##  7    173 1/21/2021 2:15:46 p. m.     456    316
##  8     84 1/21/2021 12:44:46 p. m.   1190    841
##  9     90 1/21/2021 12:50:46 p. m.   1117    758
## 10    210 1/21/2021 2:53:46 p. m.     625    438
fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2scd, name = 'CO2 SCD30', mode = 'lines+markers') 

fig
#**co2ara Sensor boxplot and outliers**

impute_outliersP <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataP <- impute_outliersP(df$co2ara)

par(mfrow = c(1,2))

boxplot(df$co2ara, main = "CO2 with outliers Aranet",
        col = 3)
boxplot(imputed_dataP, main = "CO2 without outliers Aranet",col=2)

#**CO2 Aranet Sensor boxplot and outliers**

impute_outliersH <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataH <- impute_outliersH(df$co2scd)

par(mfrow = c(1,2))

boxplot(df$co2scd, main = "CO2 with outliers SCD30",
        col = 3)
boxplot(imputed_dataH, main = "CO2 without outliers SCD30",col=2)

df[c("co2araH")] <- imputed_dataP

df[c("co2scdH")] <- imputed_dataH

dfT <- df %>%
  select(hora, co2scdH, co2araH)

dfC <- na.omit(dfT)

#**Original data**

df %>% select(co2scd, co2ara) %>% skim()
Data summary
Name Piped data
Number of rows 256
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
co2scd 0 1 468.22 196.36 284 299.00 387.5 688.75 843 ▇▃▁▁▃
co2ara 0 1 673.67 276.88 397 440.75 557.0 999.75 1212 ▇▃▁▁▃
#**Data without outliers**

dfC %>% select(co2scdH, co2araH) %>% skim()
Data summary
Name Piped data
Number of rows 222
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
co2scdH 0 1 460.77 181.65 291 301.25 398.5 643.75 827 ▇▃▁▁▃
co2araH 0 1 664.15 256.52 413 446.00 567.5 892.75 1169 ▇▃▁▁▃
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = co2scd ~ co2ara)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.990
#**Pearson correlation coefficient without outliers**

dfC %>% 
  get_correlation(formula = co2scdH ~ co2araH)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987
# Choose original data with better Pearson 

ggplot(df, aes(x = co2scd, y = co2ara)) +
  geom_point(alpha = 0.2) +
  labs(x = "CO2 SCD30", y = "CO2 Aranet",
       title = "Relationship between SCD30 and Aranet") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(co2ara ~ co2scd, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.3      6.46       3.15   0.002     7.60    33.1 
## 2 co2scd        1.40     0.013    110.     0         1.37     1.42
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 256 x 5
##       ID co2ara co2scd co2ara_hat residual
##    <int>  <dbl>  <dbl>      <dbl>    <dbl>
##  1     1    441    300       439.     2.06
##  2     2    424    300       439.   -14.9 
##  3     3    423    305       446.   -22.9 
##  4     4    407    306       447.   -40.3 
##  5     5    497    300       439.    58.1 
##  6     6    456    333       485.   -29.0 
##  7     7    451    317       463.   -11.7 
##  8     8    477    311       454.    22.7 
##  9     9    453    313       457.    -4.08
## 10    10    434    311       454.   -20.3 
## # ... with 246 more rows
df <- mutate(df, co2aracalculate = (20.3 + (1.4 * co2scd)))

view(df)

fig <- plot_ly(df, x = ~numval, y = ~co2ara, name = 'CO2 Aranet', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~co2aracalculate, name = 'CO2 cal SCD30', mode = 'lines+markers') 

fig
#**Conclusion:**
#**Better correlation with outliers**
#**Pearson correlation coefficient = 0.990
#**Linear regression: y = b0 + b1 * x => VALco2scd = -4.57 + (0.702 * co2ara)**
#*