#**Linear regression CanAirIO (Humedal La Vaca) vs Oficial Kennedy**
#**12feb2020 7april2020 resolution 1hour**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
EstacionOficialVSEstacionCanAirIO <- read_excel("Estacion Oficial VS Estacion CanAirIO.xlsx")
## New names:
## * `` -> ...1
dfOri <- na.omit(EstacionOficialVSEstacionCanAirIO)

df <- dfOri %>%
  select(date, ken, cana)

glimpse(df)
## Rows: 5,336
## Columns: 3
## $ date <chr> "30-10-2019 24:00", "31-10-2019 01:00", "31-10-2019 02:00", "3...
## $ ken  <dbl> 48, 48, 25, 21, 24, 23, 35, 50, 39, 66, 86, 68, 27, 21, 16, 14...
## $ cana <dbl> 16.787234, 16.981132, 16.533333, 16.976190, 14.725490, 37.2352...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 3
##    date               ken  cana
##    <chr>            <dbl> <dbl>
##  1 18-03-2020 10:00    65 84.7 
##  2 05-06-2020 22:00     7  6.47
##  3 26-05-2020 14:00    10  3.33
##  4 06-03-2020 19:00    44 65.7 
##  5 01-04-2020 05:00    20 23   
##  6 26-12-2019 14:00    16  2.59
##  7 11-11-2019 01:00    16  5.39
##  8 12-02-2020 09:00    95  8.98
##  9 15-02-2020 21:00    22 15.3 
## 10 25-04-2020 02:00    15 26.1
ggplot(df, aes(x=date)) + 
  geom_line(aes(y = cana), group = 1, color = "red") + 
  geom_line(aes(y = ken), group = 2, color = "blue") 

#**CanAirIO Sensor boxplot and outliers**

impute_outliersP <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataP <- impute_outliersP(df$cana)

par(mfrow = c(1,2))

boxplot(df$cana, main = "PM2.5 with outliers CanAirIO",
        col = 3)
boxplot(imputed_dataP, main = "PPM2.5 without outliers CanAirIO",col=2)

#**Kennedy Sensor boxplot and outliers**

impute_outliersH <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataH <- impute_outliersH(df$ken)

par(mfrow = c(1,2))

boxplot(df$ken, main = "PM2.5 with outliers Kennedy",
        col = 3)
boxplot(imputed_dataH, main = "PPM2.5 without outliers Kennedy",col=2)

df[c("kenH")] <- imputed_dataH

df[c("canaH")] <- imputed_dataP

dfT <- df %>%
  select(date, kenH, canaH)

dfC <- na.omit(dfT)

#**Original data**

df %>% select(ken, cana) %>% skim()
Data summary
Name Piped data
Number of rows 5336
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ken 0 1 24.46 15.19 0.00 13.00 22.00 32.00 127.0 ▇▅▁▁▁
cana 0 1 22.81 18.27 0.24 9.34 18.25 31.14 233.8 ▇▁▁▁▁
#**Data without outliers**

dfC %>% select(kenH, canaH) %>% skim()
Data summary
Name Piped data
Number of rows 4470
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
kenH 0 1 23.18 11.29 5.00 15.00 22.0 31.00 53.00 ▆▇▅▃▂
canaH 0 1 21.06 12.92 3.73 10.44 18.5 29.33 57.55 ▇▆▃▂▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = ken ~ cana)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.585
#**Pearson correlation coefficient without outliers**

dfC %>% 
  get_correlation(formula = kenH ~ canaH)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.549
ggplot(df, aes(x = cana, y = ken)) +
  geom_point(alpha = 0.2) +
  labs(x = "CanAirIO", y = "Kennedy",
       title = "Relationship between Kennedy and CanAirIO scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataP, y = imputed_dataH)) +
  geom_point(alpha = 0.2) +
  labs(x = "CanAirIO", y = "Kennedy",
       title = "Relationship between Kennedy and CanAirIO scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 866 rows containing non-finite values (stat_smooth).
## Warning: Removed 866 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(ken ~ cana, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   13.4       0.27       49.5       0   12.8     13.9  
## 2 cana         0.487     0.009      52.7       0    0.469    0.505
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 5,336 x 5
##       ID   ken  cana ken_hat residual
##    <int> <dbl> <dbl>   <dbl>    <dbl>
##  1     1    48  16.8    21.5   26.5  
##  2     2    48  17.0    21.6   26.4  
##  3     3    25  16.5    21.4    3.59 
##  4     4    21  17.0    21.6   -0.621
##  5     5    24  14.7    20.5    3.47 
##  6     6    23  37.2    31.5   -8.48 
##  7     7    35  25.1    25.6    9.44 
##  8     8    50  24.5    25.3   24.7  
##  9     9    39  33.9    29.9    9.12 
## 10    10    66  24.6    25.3   40.7  
## # ... with 5,326 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(kenH ~ canaH, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    13.1      0.27       48.4       0   12.6     13.6  
## 2 canaH         0.48     0.011      43.9       0    0.458    0.501
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 4,470 x 5
##       ID  kenH canaH kenH_hat residual
##    <int> <dbl> <dbl>    <dbl>    <dbl>
##  1     1    48 16.8      21.1   26.9  
##  2     2    48 17.0      21.2   26.8  
##  3     3    25 16.5      21.0    3.99 
##  4     4    21 17.0      21.2   -0.225
##  5     5    24 14.7      20.1    3.86 
##  6     6    23 37.2      30.9   -7.94 
##  7     7    35 25.1      25.1    9.90 
##  8     8    50 24.5      24.8   25.2  
##  9     9    39 33.9      29.4    9.64 
## 10    10    27  8.51     17.2    9.84 
## # ... with 4,460 more rows
Vken = mutate(df, VALken = 13.4 + (0.487 * cana))
Vrest = mutate(Vken, Valori = ken )
Vfin = mutate (Vrest, VALrest = VALken - ken )
View(Vfin)

#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALUEkennedy = 13.4 + 0.487 * VALUEcanairio**