# Load the "dplyr" package (install it first with install.packages("dplyr") if it is not installed yet)

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# The pipe operator: %>% is a handy operator that allows for a simpler and more natural way of writing code.

# The pipe operator takes the output of the expression or function on its left-hand side and uses it as the first argument of the function on its right-hand side.

# This allows for chaining multiple operations together without the need for nested function calls or creating intermediate objects.

# Example: Suppose we want to clean a dataset and remove all observations that contain an "NA". Use the dataset "airquality".



# Check dataset

# View() opens a spreadsheet-style data viewer, which needs an interactive
# session; guard the call so the script also runs under Rscript or knitting.
if (interactive()) {
  View(airquality)
}



# Conventional way: filter(airquality, !is.na(Ozone) & !is.na(Solar.R))

# We can also write this as:



# Notice that I don't need to specify the first argument in the "filter" function.


# Standardize Ozone within each month. scale() returns a one-column matrix, so
# the result is converted to a plain numeric vector; otherwise the new column
# would be stored as a matrix and printed as "z_Ozone[,1]".
# How would the results change if 'Month' was not a group?
airquality %>%
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  group_by(Month) %>%
  mutate(z_Ozone = as.numeric(scale(Ozone))) %>%
  ungroup() # Return an ungrouped tibble.
## # A tibble: 111 × 7
##    Ozone Solar.R  Wind  Temp Month   Day z_Ozone
##    <int>   <int> <dbl> <int> <int> <int>   <dbl>
##  1    41     190   7.4    67     5     1  0.737 
##  2    36     118   8      72     5     2  0.519 
##  3    12     149  12.6    74     5     3 -0.530 
##  4    18     313  11.5    62     5     4 -0.268 
##  5    23     299   8.6    65     5     7 -0.0492
##  6    19      99  13.8    59     5     8 -0.224 
##  7     8      19  20.1    61     5     9 -0.705 
##  8    16     256   9.7    69     5    12 -0.355 
##  9    11     290   9.2    66     5    13 -0.573 
## 10    14     274  10.9    68     5    14 -0.442 
## # ℹ 101 more rows
# Let us add more operations in the filtered data



# Create a table with the mean and standard deviation of the observation variables by groups.

# Number of complete observations (non-missing Ozone and Solar.R) per month.
# count(Month) groups and tallies in a single step and returns an ungrouped
# table with the columns Month and n.
air_count <- airquality %>%
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  count(Month)



# Per-month mean and standard deviation of Ozone over the complete rows.
# The summary columns are named explicitly (with backticks) so they can be
# looked up as `mean(Ozone)` and `sd(Ozone)`.
air_transform <- airquality %>%
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  group_by(Month) %>%
  summarise(`mean(Ozone)` = mean(Ozone), `sd(Ozone)` = sd(Ozone))



#Create vectors for the mean and standard deviation of the variable Ozone.

# Expand the per-month statistics to one value per observation: each month's
# mean / standard deviation is repeated as many times as that month has rows.
# NOTE: the resulting vectors line up with the filtered data only while its
# rows are ordered by Month, as they are in "airquality".
meanOzone <- rep(air_transform$`mean(Ozone)`, times = air_count$n)

stdOzone <- rep(air_transform$`sd(Ozone)`, times = air_count$n)



# Add the standardized z scores of the variables of interest

# Attach the per-observation mean and standard deviation vectors as columns,
# then derive the standardized Ozone score from them.
mod_airquality <- airquality %>%
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  mutate(
    meanOzone = meanOzone,
    stdOzone = stdOzone,
    stand_Ozone = (Ozone - meanOzone) / stdOzone
  )

# Visualizing correlations among the variables of interest.
# Select only a subset of the variables using the "select" function.
# Keep the complete observations (neither Ozone nor Solar.R missing) and
# retain just the three columns Ozone, Solar.R and Wind.
airquality_subset <- airquality %>%
  filter(!(is.na(Ozone) | is.na(Solar.R))) %>%
  select(c(Ozone, Solar.R, Wind))
  


# Scatterplot matrix for every pair of the selected variables.
pairs(airquality_subset)

# Ozone and Solar.R appear to be the only positively correlated pair.
# Check this numerically with the Pearson correlation matrix from "cor".

cor(airquality_subset, method = "pearson")
##              Ozone    Solar.R       Wind
## Ozone    1.0000000  0.3483417 -0.6124966
## Solar.R  0.3483417  1.0000000 -0.1271835
## Wind    -0.6124966 -0.1271835  1.0000000
# Let us confirm our observations by modelling a regression 
# Simple linear regression of Ozone on Wind; summary() prints the
# coefficients together with the fit statistics.
summary(
  lm(formula = Ozone ~ Wind, data = airquality_subset)
)
## 
## Call:
## lm(formula = Ozone ~ Wind, data = airquality_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.513 -18.597  -5.035  15.814  88.437 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  99.0413     7.4724   13.25  < 2e-16 ***
## Wind         -5.7288     0.7082   -8.09 9.09e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.42 on 109 degrees of freedom
## Multiple R-squared:  0.3752, Adjusted R-squared:  0.3694 
## F-statistic: 65.44 on 1 and 109 DF,  p-value: 9.089e-13
# We find confirmation of the negative relationship between Wind and Ozone. Let us now check by including the other variables.
# Multiple linear regression of Ozone on Wind and Solar.R.
summary(
  lm(formula = Ozone ~ Wind + Solar.R, data = airquality_subset)
)
## 
## Call:
## lm(formula = Ozone ~ Wind + Solar.R, data = airquality_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.651 -18.164  -5.959  18.514  85.237 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 77.24604    9.06751   8.519 1.05e-13 ***
## Wind        -5.40180    0.67324  -8.024 1.34e-12 ***
## Solar.R      0.10035    0.02628   3.819 0.000224 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.92 on 108 degrees of freedom
## Multiple R-squared:  0.4495, Adjusted R-squared:  0.4393 
## F-statistic: 44.09 on 2 and 108 DF,  p-value: 1.003e-14
# What does the regression model tell you? What metric will you observe to comment on the efficacy of the model?
# Fit the model once and reuse its summary in the calculations below.
ozone_fit_summary <- summary(
  lm(Ozone ~ Wind + Solar.R, data = airquality_subset)
)

# Residual variance, i.e. the variance left *unexplained* by the model
# (the squared residual standard error).
ozone_fit_summary$sigma^2
## [1] 620.8586
# Variance of the dependent variable.
var(airquality_subset$Ozone)
## [1] 1107.29
# Therefore, the proportion of *explained* variance is computed below. Both
# variances are corrected for their degrees of freedom, so the result matches
# the Adjusted R-squared (0.4393), not the Multiple R-squared (0.4495).
(var(airquality_subset$Ozone) - ozone_fit_summary$sigma^2) /
  var(airquality_subset$Ozone)
## [1] 0.439299
# Compare it with the R-sq value of the model.

# Standardize Ozone over all complete observations (no grouping). scale()
# returns a one-column matrix carrying centering/scaling attributes, so it is
# converted to a plain numeric vector before being stored as a column.
airquality %>%
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  mutate(Ozone = as.numeric(scale(Ozone)))
##           Ozone Solar.R Wind Temp Month Day
## 1   -0.03302982     190  7.4   67     5   1
## 2   -0.18328840     118  8.0   72     5   2
## 3   -0.90452961     149 12.6   74     5   3
## 4   -0.72421931     313 11.5   62     5   4
## 5   -0.57396073     299  8.6   65     5   7
## 6   -0.69416759      99 13.8   59     5   8
## 7   -1.02473648      19 20.1   61     5   9
## 8   -0.78432275     256  9.7   69     5  12
## 9   -0.93458133     290  9.2   66     5  13
## 10  -0.84442618     274 10.9   68     5  14
## 11  -0.72421931      65 13.2   58     5  15
## 12  -0.84442618     334 11.5   64     5  16
## 13  -0.24339184     307 12.0   66     5  17
## 14  -1.08483992      78 18.4   57     5  18
## 15  -0.36359870     322 11.5   68     5  19
## 16  -0.93458133      44  9.7   62     5  20
## 17  -1.23509850       8  9.7   59     5  21
## 18  -0.93458133     320 16.6   73     5  22
## 19  -1.14494335      25  9.7   61     5  23
## 20  -0.30349527      92 12.0   61     5  24
## 21  -0.57396073      13 12.0   67     5  28
## 22   0.08717705     252 14.9   81     5  29
## 23   2.19079726     223  5.7   79     5  30
## 24  -0.15323668     279  7.4   76     5  31
## 25  -0.39365042     127  9.7   82     6   7
## 26   0.86852170     291 13.8   90     6   9
## 27  -0.09313325     323 11.5   87     6  10
## 28  -0.57396073     148  8.0   82     6  13
## 29  -0.63406416     191 14.9   77     6  16
## 30  -0.15323668     284 20.7   72     6  17
## 31  -0.66411588      37  9.2   65     6  18
## 32  -0.90452961     120 11.5   73     6  19
## 33  -0.87447790     137 10.3   76     6  20
## 34   2.79183160     269  4.1   84     7   1
## 35   0.20738392     248  9.2   85     7   2
## 36  -0.30349527     236  9.2   81     7   3
## 37   0.65815968     175  4.6   83     7   5
## 38  -0.06308153     314 10.9   83     7   6
## 39   1.04883200     276  5.1   88     7   7
## 40   1.64986635     267  6.3   92     7   8
## 41   1.64986635     272  5.7   92     7   9
## 42   1.28924574     175  7.4   89     7  10
## 43  -0.96463305     264 14.3   73     7  12
## 44  -0.45375386     175 14.9   81     7  13
## 45  -1.05478820      48 14.3   80     7  15
## 46   0.17733221     260  6.9   81     7  16
## 47  -0.21334012     274 10.3   82     7  17
## 48   0.56800453     285  6.3   84     7  18
## 49   1.10893544     187  5.1   87     7  19
## 50   0.62810796     220 11.5   85     7  20
## 51  -0.78432275       7  6.9   74     7  21
## 52   1.13898716     294  8.6   86     7  24
## 53   1.98043524     223  8.0   85     7  25
## 54  -0.66411588      81  8.6   82     7  26
## 55   0.29753907      82 12.0   86     7  27
## 56   1.19909059     213  7.4   88     7  28
## 57   0.23743564     275  7.4   86     7  29
## 58   0.65815968     253  7.4   83     7  30
## 59   0.50790109     254  9.2   81     7  31
## 60  -0.09313325      83  6.9   81     8   1
## 61  -0.99468477      24 13.8   81     8   2
## 62  -0.78432275      77  7.4   82     8   3
## 63   2.40115928     255  4.0   89     8   7
## 64   1.40945261     229 10.3   90     8   8
## 65   2.04053867     207  8.0   90     8   9
## 66   0.05712534     192 11.5   86     8  12
## 67  -0.42370214     273 11.5   82     8  13
## 68   0.68821140     157  9.7   80     8  14
## 69  -0.60401244      71 10.3   77     8  16
## 70   0.50790109      51  6.3   79     8  17
## 71  -0.57396073     115  7.4   76     8  18
## 72  -0.33354699     244 10.9   78     8  19
## 73   0.05712534     190 10.3   78     8  20
## 74  -0.63406416     259 15.5   77     8  21
## 75  -0.99468477      36 14.3   72     8  22
## 76   0.08717705     212  9.7   79     8  24
## 77   3.78353827     238  3.4   81     8  25
## 78   0.92862514     215  8.0   86     8  26
## 79   1.01878029     203  9.7   97     8  28
## 80   2.28095241     225  2.3   94     8  29
## 81   1.25919402     237  6.3   96     8  30
## 82   1.28924574     188  6.3   94     8  31
## 83   1.61981463     167  6.9   91     9   1
## 84   1.07888372     197  5.1   92     9   2
## 85   0.92862514     183  2.8   93     9   3
## 86   1.46955605     189  4.6   93     9   4
## 87   0.14728049      95  7.4   87     9   5
## 88  -0.30349527      92 15.5   84     9   6
## 89  -0.66411588     252 10.9   80     9   7
## 90  -0.57396073     220 10.3   78     9   8
## 91  -0.63406416     230 10.9   75     9   9
## 92  -0.54390901     259  9.7   73     9  10
## 93   0.05712534     236 14.9   81     9  11
## 94  -0.63406416     259 15.5   76     9  12
## 95  -0.42370214     238  6.3   77     9  13
## 96  -0.99468477      24 10.9   71     9  14
## 97  -0.87447790     112 11.5   71     9  15
## 98   0.11722877     237  6.9   78     9  16
## 99  -0.72421931     224 13.8   67     9  17
## 100 -0.87447790      27 10.3   76     9  18
## 101 -0.54390901     238 10.3   68     9  19
## 102 -0.78432275     201  8.0   82     9  20
## 103 -0.87447790     238 12.6   64     9  21
## 104 -0.57396073      14  9.2   71     9  22
## 105 -0.18328840     139 10.3   81     9  23
## 106 -1.05478820      49 10.3   69     9  24
## 107 -0.84442618      20 16.6   63     9  25
## 108 -0.36359870     193  6.9   70     9  26
## 109 -0.84442618     191 14.3   75     9  28
## 110 -0.72421931     131  8.0   76     9  29
## 111 -0.66411588     223 11.5   68     9  30