SensoresLowCost.R

#**Linear regression 4 models lcs: Plantower, Honeywell, Nova, Panasonic**
#**12feb2020 7april2020 resolution 1hour**

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(moderndive)
library(skimr)
library(gapminder)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(readr)

grafana_data_export_4_sensors_1hour <- read_csv("grafana_data_export 4 sensors 1hour.csv", col_types = cols(date = col_datetime(format = "%Y-%m-%d %H:%M:%S")))

View(grafana_data_export_4_sensors_1hour)

df <- na.omit(grafana_data_export_4_sensors_1hour)

glimpse(df)

## Rows: 3,042
## Columns: 5
## $ date  <dttm> 2020-02-25 00:00:00, 2020-02-25 01:00:00, 2020-02-25 02:00:0...
## $ plan  <dbl> 20.12308, 24.93750, 20.89062, 19.41538, 24.27692, 22.27692, 3...
## $ honey <dbl> 16.51724, 21.60345, 17.74138, 15.50000, 18.79310, 17.00000, 2...
## $ nova  <dbl> 11.034483, 18.224138, 14.810345, 12.000000, 14.456140, 12.793...
## $ pana  <dbl> 9.614035, 12.701754, 9.896552, 9.122807, 11.551724, 10.413793...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 5
##    date                  plan honey   nova   pana
##    <dttm>               <dbl> <dbl>  <dbl>  <dbl>
##  1 2020-03-03 09:00:00 30.2   24.4  16.8   14.1  
##  2 2020-03-28 03:00:00 39.6   31.0  28.5   18.5  
##  3 2020-05-01 16:00:00  0.185  1.55  0.714  0.05 
##  4 2020-03-29 10:00:00 52.4   39.6  30.9   23.3  
##  5 2020-05-24 21:00:00  1.25   3.52  1.47   0.456
##  6 2020-02-29 17:00:00 24.9   19.5  13.4   10.8  
##  7 2020-07-03 04:00:00  6.31   7.69  3.54   3.21 
##  8 2020-03-31 12:00:00  9.15   9.75  5.79   4.25 
##  9 2020-03-24 16:00:00 43.8   34.6  31.6   20.5  
## 10 2020-03-23 19:00:00 47.3   33.1  24.1   20.4

ggplot(df, aes(x=date)) + 
  geom_line(aes(y = plan), group = 1, color = "red") + 
  geom_line(aes(y = honey), group = 2, color = "blue") +
  geom_line(aes(y = nova), group = 3, color = "green") +
  geom_line(aes(y = pana), group = 4, color = "yellow")

#**Plantower Sensor boxplot and outliers**

impute_outliersPlan <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataPlan <- impute_outliersPlan(df$plan)

par(mfrow = c(1,2))

boxplot(df$plan, main = "PM2.5 with outliers Plantower", col = 3)
boxplot(imputed_dataPlan, main = "PPM2.5 without outliers Plantower",col=2)

#**Honeywell Sensor boxplot and outliers**

impute_outliersHoney <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataHoney <- impute_outliersHoney(df$honey)

par(mfrow = c(1,2))

boxplot(df$honey, main = "PM2.5 with outliers Honey",
        col = 3)
boxplot(imputed_dataHoney, main = "PPM2.5 without outliers Honey",col=2)

#**Nova Sensor boxplot and outliers**

impute_outliersNova <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataNova <- impute_outliersNova(df$nova)

par(mfrow = c(1,2))

boxplot(df$nova, main = "PM2.5 with outliers Nova",
        col = 3)
boxplot(imputed_dataNova, main = "PPM2.5 without outliers Nova",col=2)

#**Panasonic Sensor boxplot and outliers**

impute_outliersPana <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- NA
  x[x>quantiles[2]] <- NA
  x
}

imputed_dataPana <- impute_outliersPana(df$pana)

par(mfrow = c(1,2))

boxplot(df$pana, main = "PM2.5 with outliers Panasonic",
        col = 3)
boxplot(imputed_dataPana, main = "PPM2.5 without outliers Panasonic",col=2)

df[c("plantower")] <- imputed_dataPlan

df[c("honeywell")] <- imputed_dataHoney

df[c("novasen")] <- imputed_dataNova

df[c("panasonic")] <- imputed_dataPana

dfT <- df %>%
  select(date, plantower, honeywell, novasen, panasonic)

View(dfT)

dfC <- na.omit(dfT)

#**Original data**

df %>% select(plan, honey,nova,pana) %>% skim()

Data summary
Name	Piped data
Number of rows	3042
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p25	p50	p75	p100	hist
plan	1	15.17	17.24	1.67	7.85	23.82	91.33	▇▂▁▁▁
honey	1	13.30	12.13	3.81	8.91	19.49	65.07	▇▃▂▁▁
nova	1	9.42	10.38	1.67	5.06	13.81	55.61	▇▂▁▁▁
pana	1	6.92	7.87	0.76	3.71	10.73	41.26	▇▂▁▁▁

#**Data without outliers**

dfC %>% select(plantower, honeywell, novasen, panasonic) %>% skim()

Data summary
Name	Piped data
Number of rows	2629
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
plantower	1	13.36	13.31	0.08	2.22	8.14	21.68	51.81	▇▂▂▂▁
honeywell	1	12.13	9.30	1.55	4.33	9.02	17.80	38.78	▇▃▂▂▁
novasen	1	8.26	7.73	0.47	2.00	5.25	12.59	32.26	▇▃▂▁▁
panasonic	1	6.07	6.02	0.00	1.05	3.83	9.81	23.52	▇▂▂▂▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = plan ~ honey)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987

df %>% 
  get_correlation(formula = plan ~ nova)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.988

df %>% 
  get_correlation(formula = plan ~ pana)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.993

df %>% 
  get_correlation(formula = honey ~ nova)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985

df %>% 
  get_correlation(formula = honey ~ pana)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985

df %>% 
  get_correlation(formula = nova ~ pana)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985

#**Pearson correlation coefficient without outliers**

dfC %>% 
  get_correlation(formula = plantower ~ honeywell)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.980

dfC %>% 
  get_correlation(formula = plantower ~ novasen)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.983

dfC %>% 
  get_correlation(formula = plantower ~ panasonic)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.989

dfC %>% 
  get_correlation(formula = honeywell ~ novasen)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.978

dfC %>% 
  get_correlation(formula = honeywell ~ panasonic)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.976

dfC %>% 
  get_correlation(formula = novasen ~ panasonic)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.979

ggplot(df, aes(x = plan , y = honey)) +
  geom_point(alpha = 0.2) +
  labs(x = "Plantower", y = "Honeywell",
       title = "Relationship between Plantower and Honeywell scores") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataPlan, y = imputed_dataHoney)) +
  geom_point(alpha = 0.2) +
  labs(x = "Plantower", y = "Honeywell",
       title = "Relationship between Plantower and Honeywell scores") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 370 rows containing non-finite values (stat_smooth).

## Warning: Removed 370 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(honey ~ plan, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    2.76      0.046      59.8       0    2.67     2.85 
## 2 plan         0.695     0.002     345.        0    0.691    0.699

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 3,042 x 5
##       ID honey  plan honey_hat residual
##    <int> <dbl> <dbl>     <dbl>    <dbl>
##  1     1  16.5  20.1      16.7   -0.224
##  2     2  21.6  24.9      20.1    1.52 
##  3     3  17.7  20.9      17.3    0.467
##  4     4  15.5  19.4      16.2   -0.749
##  5     5  18.8  24.3      19.6   -0.833
##  6     6  17    22.3      18.2   -1.24 
##  7     7  22.5  31.6      24.7   -2.20 
##  8     8  17.9  24.1      19.5   -1.56 
##  9     9  18.0  21.7      17.8    0.213
## 10    10  19.6  24.4      19.7   -0.122
## # ... with 3,032 more rows

#**Fit regression model withput outliers:**
score_modelC <- lm(honeywell ~ plantower, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    2.98      0.051      58.1       0     2.88     3.08
## 2 plantower    0.685     0.003     252.        0     0.68     0.69

regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC

## # A tibble: 2,629 x 5
##       ID honeywell plantower honeywell_hat residual
##    <int>     <dbl>     <dbl>         <dbl>    <dbl>
##  1     1      16.5      20.1          16.8   -0.243
##  2     2      21.6      24.9          20.1    1.55 
##  3     3      17.7      20.9          17.3    0.456
##  4     4      15.5      19.4          16.3   -0.775
##  5     5      18.8      24.3          19.6   -0.812
##  6     6      17        22.3          18.2   -1.24 
##  7     7      22.5      31.6          24.6   -2.11 
##  8     8      17.9      24.1          19.5   -1.54 
##  9     9      18.0      21.7          17.8    0.21 
## 10    10      19.6      24.4          19.7   -0.099
## # ... with 2,619 more rows

#Vken = mutate(df, VALken = 13.4 + (0.487 * cana))
#Vrest = mutate(Vken, Valori = ken )
#Vfin = mutate (Vrest, VALrest = VALken - ken )
#View(Vfin)

#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALUEkennedy = 13.4 + 0.487 * VALUEcanairio**

SensoresLowCost.R

DBB

2020-07-28