#**Linear regression SML complete Panasonic vs Honeywell**
#**12feb2020 7april2020 resolution 1hour**

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)

Honeywel_PanasonicSML_1hour_nonull <- read_excel("Honeywel&PanasonicSML_1hour_nonull.xlsx")
## New names:
## * `` -> ...6
df <- Honeywel_PanasonicSML_1hour_nonull %>%
  select(date, honey, pana)

glimpse(df)
## Rows: 3,273
## Columns: 3
## $ date  <chr> "2/12/2020 12:00:00.000000000 AM", "2/12/2020 1:00:00.0000000...
## $ honey <dbl> 21.568966, 18.137931, 16.500000, 16.325000, 18.847458, 29.719...
## $ pana  <dbl> 14.350877, 12.413793, 11.137931, 11.153846, 13.051724, 21.708...
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 3
##    date                            honey   pana
##    <chr>                           <dbl>  <dbl>
##  1 6/11/2020 12:00:00.000000000 PM  4.22  1.04 
##  2 5/15/2020 2:00:00.000000000 PM   2.35  0.193
##  3 3/21/2020 5:00:00.000000000 AM  40.5  26.1  
##  4 3/30/2020 7:00:00.000000000 AM  65.1  41.3  
##  5 3/16/2020 12:00:00.000000000 AM 18.4  11.3  
##  6 2/22/2020 3:00:00.000000000 PM  20.4  13.5  
##  7 6/28/2020 2:00:00.000000000 AM   4.11  0.789
##  8 3/6/2020 7:00:00.000000000 AM   48.7  32.1  
##  9 3/9/2020 4:00:00.000000000 PM   23.8  13.1  
## 10 3/27/2020 4:00:00.000000000 PM  39.8  22.6
ggplot(df, aes(x=date)) + 
  geom_line(aes(y = pana), group = 1, color = "red") + 
  geom_line(aes(y = honey), group = 2, color = "blue") 

#**Panasonic Sensor boxplot and outliers**

impute_outliersP <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.25, 0.75), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataP <- impute_outliersP(df$pana)

par(mfrow = c(1,2))

boxplot(df$pana, main = "PM2.5 with outliers Panasonic",
        col = 3)
boxplot(imputed_dataP, main = "PPM2.5 without outliers Panasonic",col=2)

#**Honeywell Sensor boxplot and outliers**

impute_outliersH <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.25, 0.75), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataH <- impute_outliersH(df$honey)

par(mfrow = c(1,2))

boxplot(df$honey, main = "PM2.5 with outliers Honeywell",
        col = 3)
boxplot(imputed_dataH, main = "PPM2.5 without outliers Honeywell",col=2)

df[c("honeyH")] <- imputed_dataH

df[c("panaH")] <- imputed_dataP

View(df)

dfT <- df %>%
  select(date, honeyH, panaH)

dfC <- na.omit(dfT)

#**Original data**

df %>% select(honey, pana) %>% skim()
Data summary
Name Piped data
Number of rows 3273
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
honey 0 1 13.4 11.91 0 4.12 9.17 19.47 65.07 ▇▃▂▁▁
pana 0 1 7.1 7.77 0 0.94 4.17 10.98 41.26 ▇▂▁▁▁
#**Data without outliers**

dfC %>% select(honeyH, panaH) %>% skim()
Data summary
Name Piped data
Number of rows 1513
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
honeyH 0 1 10.14 4.24 4.12 6.38 9.31 13.60 19.47 ▇▆▅▃▂
panaH 0 1 4.91 2.80 0.95 2.48 4.34 7.16 10.98 ▇▆▅▃▃
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = honey ~ pana)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.986
#**Pearson correlation coefficient without outliers**

dfC %>% 
  get_correlation(formula = honeyH ~ panaH)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.949
ggplot(df, aes(x = pana, y = honey)) +
  geom_point(alpha = 0.2) +
  labs(x = "Panasonic", y = "Honeywell",
       title = "Relationship between Honeywell and Panasonic scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataP, y = imputed_dataH)) +
  geom_point(alpha = 0.2) +
  labs(x = "Panasonic", y = "Honeywell",
       title = "Relationship between Honeywell and Panasonic scores") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1760 rows containing non-finite values (stat_smooth).
## Warning: Removed 1760 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(honey ~ pana, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     2.68     0.048      56.0       0     2.59     2.78
## 2 pana          1.51     0.005     332.        0     1.50     1.52
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 3,273 x 5
##       ID honey  pana honey_hat residual
##    <int> <dbl> <dbl>     <dbl>    <dbl>
##  1     1  21.6  14.4      24.3    -2.78
##  2     2  18.1  12.4      21.4    -3.29
##  3     3  16.5  11.1      19.5    -3.00
##  4     4  16.3  11.2      19.5    -3.20
##  5     5  18.8  13.1      22.4    -3.54
##  6     6  29.7  21.7      35.5    -5.74
##  7     7  42.5  31.5      50.2    -7.72
##  8     8  37.2  27.9      44.8    -7.65
##  9     9  45.7  34.6      54.9    -9.19
## 10    10  50.6  35.6      56.4    -5.88
## # ... with 3,263 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(honeyH ~ panaH, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     3.07     0.069      44.3       0     2.94     3.21
## 2 panaH         1.44     0.012     117.        0     1.42     1.46
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 1,513 x 5
##       ID honeyH panaH honeyH_hat residual
##    <int>  <dbl> <dbl>      <dbl>    <dbl>
##  1     1   7.93  4.79       9.97    -2.04
##  2     2   6.45  3.32       7.85    -1.40
##  3     3   5.33  2.48       6.65    -1.32
##  4     4   5.12  2.38       6.50    -1.38
##  5     5   7     3.66       8.33    -1.33
##  6     6   8.97  4.97      10.2     -1.25
##  7     7  10.2   5.91      11.6     -1.34
##  8     8  16.3  10.9       18.8     -2.58
##  9     9  14.5   9.84      17.2     -2.77
## 10    10  16.6  10.9       18.7     -2.12
## # ... with 1,503 more rows
View(df)

Vhoney = mutate(df, VALhoney = 2.68 + (1.51 * pana))
Vrest = mutate(Vhoney, Valori = honey )
Vfin = mutate (Vrest, VALrest = VALhoney - honey )
View(Vfin)

#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALhoney = 2.68 + 1.51 * Valpana**