#**Linear regression 4 models lcs: Plantower, Honeywell, Nova, Panasonic**
#**12feb2020 7april2020 resolution 1hour**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(readr)

grafana_data_export_4_sensors_1hour <- read_csv("grafana_data_export 4 sensors 1hour.csv", col_types = cols(date = col_datetime(format = "%Y-%m-%d %H:%M:%S")))

View(grafana_data_export_4_sensors_1hour)

df <- na.omit(grafana_data_export_4_sensors_1hour)

glimpse(df)
## Rows: 3,042
## Columns: 5
## $ date  <dttm> 2020-02-25 00:00:00, 2020-02-25 01:00:00, 2020-02-25 02:00:0...
## $ plan  <dbl> 20.12308, 24.93750, 20.89062, 19.41538, 24.27692, 22.27692, 3...
## $ honey <dbl> 16.51724, 21.60345, 17.74138, 15.50000, 18.79310, 17.00000, 2...
## $ nova  <dbl> 11.034483, 18.224138, 14.810345, 12.000000, 14.456140, 12.793...
## $ pana  <dbl> 9.614035, 12.701754, 9.896552, 9.122807, 11.551724, 10.413793...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 5
##    date                  plan honey   nova   pana
##    <dttm>               <dbl> <dbl>  <dbl>  <dbl>
##  1 2020-03-03 09:00:00 30.2   24.4  16.8   14.1  
##  2 2020-03-28 03:00:00 39.6   31.0  28.5   18.5  
##  3 2020-05-01 16:00:00  0.185  1.55  0.714  0.05 
##  4 2020-03-29 10:00:00 52.4   39.6  30.9   23.3  
##  5 2020-05-24 21:00:00  1.25   3.52  1.47   0.456
##  6 2020-02-29 17:00:00 24.9   19.5  13.4   10.8  
##  7 2020-07-03 04:00:00  6.31   7.69  3.54   3.21 
##  8 2020-03-31 12:00:00  9.15   9.75  5.79   4.25 
##  9 2020-03-24 16:00:00 43.8   34.6  31.6   20.5  
## 10 2020-03-23 19:00:00 47.3   33.1  24.1   20.4
ggplot(df, aes(x=date)) + 
  geom_line(aes(y = plan), group = 1, color = "red") + 
  geom_line(aes(y = honey), group = 2, color = "blue") +
  geom_line(aes(y = nova), group = 3, color = "green") +
  geom_line(aes(y = pana), group = 4, color = "yellow")

#**Plantower Sensor boxplot and outliers**

impute_outliersPlan <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataPlan <- impute_outliersPlan(df$plan)

par(mfrow = c(1,2))

boxplot(df$plan, main = "PM2.5 with outliers Plantower", col = 3)
boxplot(imputed_dataPlan, main = "PPM2.5 without outliers Plantower",col=2)

#**Honeywell Sensor boxplot and outliers**

impute_outliersHoney <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataHoney <- impute_outliersHoney(df$honey)

par(mfrow = c(1,2))

boxplot(df$honey, main = "PM2.5 with outliers Honey",
        col = 3)
boxplot(imputed_dataHoney, main = "PPM2.5 without outliers Honey",col=2)

#**Nova Sensor boxplot and outliers**

impute_outliersNova <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataNova <- impute_outliersNova(df$nova)

par(mfrow = c(1,2))

boxplot(df$nova, main = "PM2.5 with outliers Nova",
        col = 3)
boxplot(imputed_dataNova, main = "PPM2.5 without outliers Nova",col=2)

#**Panasonic Sensor boxplot and outliers**

impute_outliersPana <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataPana <- impute_outliersPana(df$pana)

par(mfrow = c(1,2))

boxplot(df$pana, main = "PM2.5 with outliers Panasonic",
        col = 3)
boxplot(imputed_dataPana, main = "PPM2.5 without outliers Panasonic",col=2)

df[c("plantower")] <- imputed_dataPlan

df[c("honeywell")] <- imputed_dataHoney

df[c("novasen")] <- imputed_dataNova

df[c("panasonic")] <- imputed_dataPana

dfT <- df %>%
  select(date, plantower, honeywell, novasen, panasonic)

View(dfT)

dfC <- na.omit(dfT)

#**Original data**

df %>% select(plan, honey,nova,pana) %>% skim()
Data summary
Name Piped data
Number of rows 3042
Number of columns 4
_______________________
Column type frequency:
numeric 4
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
plan 0 1 15.17 17.24 0 1.67 7.85 23.82 91.33 ▇▂▁▁▁
honey 0 1 13.30 12.13 0 3.81 8.91 19.49 65.07 ▇▃▂▁▁
nova 0 1 9.42 10.38 0 1.67 5.06 13.81 55.61 ▇▂▁▁▁
pana 0 1 6.92 7.87 0 0.76 3.71 10.73 41.26 ▇▂▁▁▁
#**Data without outliers**

dfC %>% select(plantower, honeywell, novasen, panasonic) %>% skim()
Data summary
Name Piped data
Number of rows 2629
Number of columns 4
_______________________
Column type frequency:
numeric 4
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
plantower 0 1 13.36 13.31 0.08 2.22 8.14 21.68 51.81 ▇▂▂▂▁
honeywell 0 1 12.13 9.30 1.55 4.33 9.02 17.80 38.78 ▇▃▂▂▁
novasen 0 1 8.26 7.73 0.47 2.00 5.25 12.59 32.26 ▇▃▂▁▁
panasonic 0 1 6.07 6.02 0.00 1.05 3.83 9.81 23.52 ▇▂▂▂▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = plan ~ honey)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987
df %>% 
  get_correlation(formula = plan ~ nova)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.988
df %>% 
  get_correlation(formula = plan ~ pana)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.993
df %>% 
  get_correlation(formula = honey ~ nova)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985
df %>% 
  get_correlation(formula = honey ~ pana)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985
df %>% 
  get_correlation(formula = nova ~ pana)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985
#**Pearson correlation coefficient without outliers**

dfC %>% 
  get_correlation(formula = plantower ~ honeywell)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.980
dfC %>% 
  get_correlation(formula = plantower ~ novasen)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.983
dfC %>% 
  get_correlation(formula = plantower ~ panasonic)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.989
dfC %>% 
  get_correlation(formula = honeywell ~ novasen)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.978
dfC %>% 
  get_correlation(formula = honeywell ~ panasonic)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.976
dfC %>% 
  get_correlation(formula = novasen ~ panasonic)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.979
ggplot(df, aes(x = plan , y = honey)) +
  geom_point(alpha = 0.2) +
  labs(x = "Plantower", y = "Honeywell",
       title = "Relationship between Plantower and Honeywell scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataPlan, y = imputed_dataHoney)) +
  geom_point(alpha = 0.2) +
  labs(x = "Plantower", y = "Honeywell",
       title = "Relationship between Plantower and Honeywell scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 370 rows containing non-finite values (stat_smooth).
## Warning: Removed 370 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(honey ~ plan, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    2.76      0.046      59.8       0    2.67     2.85 
## 2 plan         0.695     0.002     345.        0    0.691    0.699
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 3,042 x 5
##       ID honey  plan honey_hat residual
##    <int> <dbl> <dbl>     <dbl>    <dbl>
##  1     1  16.5  20.1      16.7   -0.224
##  2     2  21.6  24.9      20.1    1.52 
##  3     3  17.7  20.9      17.3    0.467
##  4     4  15.5  19.4      16.2   -0.749
##  5     5  18.8  24.3      19.6   -0.833
##  6     6  17    22.3      18.2   -1.24 
##  7     7  22.5  31.6      24.7   -2.20 
##  8     8  17.9  24.1      19.5   -1.56 
##  9     9  18.0  21.7      17.8    0.213
## 10    10  19.6  24.4      19.7   -0.122
## # ... with 3,032 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(honeywell ~ plantower, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    2.98      0.051      58.1       0     2.88     3.08
## 2 plantower    0.685     0.003     252.        0     0.68     0.69
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 2,629 x 5
##       ID honeywell plantower honeywell_hat residual
##    <int>     <dbl>     <dbl>         <dbl>    <dbl>
##  1     1      16.5      20.1          16.8   -0.243
##  2     2      21.6      24.9          20.1    1.55 
##  3     3      17.7      20.9          17.3    0.456
##  4     4      15.5      19.4          16.3   -0.775
##  5     5      18.8      24.3          19.6   -0.812
##  6     6      17        22.3          18.2   -1.24 
##  7     7      22.5      31.6          24.6   -2.11 
##  8     8      17.9      24.1          19.5   -1.54 
##  9     9      18.0      21.7          17.8    0.21 
## 10    10      19.6      24.4          19.7   -0.099
## # ... with 2,619 more rows
#Vken = mutate(df, VALken = 13.4 + (0.487 * cana))
#Vrest = mutate(Vken, Valori = ken )
#Vfin = mutate (Vrest, VALrest = VALken - ken )
#View(Vfin)

#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALUEkennedy = 13.4 + 0.487 * VALUEcanairio**