Preparing Workspace

rm(list = ls()) # Clear environment
gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 533753 28.6    1190298 63.6         NA   669420 35.8
## Vcells 986097  7.6    8388608 64.0      16384  1851779 14.2
cat("\f")       # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device 
##           1

Installing Packages

 packages <- c("psych",
               "tidyverse",
               "ggplot2",
               "lemon",
               "gridExtra",
               "ggrepel",
               "scales",
               "dplyr"
   )
   
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i], dependencies = TRUE)
    }
    library(packages[i], character.only = TRUE)
  }
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'lemon'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     %||%
## 
## 
## 
## Attaching package: 'gridExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## 
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
  rm(packages)

Retrieving Dataset

data("airquality")
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Imputing Missing Values

colSums(is.na(airquality))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0
ozone.source.data <- airquality$Ozone #Storing source data

ozone.mean <- mean(airquality$Ozone, na.rm = T) #calculating mean ozone level

airquality$Ozone[is.na(airquality$Ozone)] <- mean(airquality$Ozone, na.rm=TRUE) #Replacing NAs with mean ozone level

sum(is.na(airquality$Ozone)) #Check 
## [1] 0
summary(airquality$Ozone)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   21.00   42.13   42.13   46.00  168.00
solar.r.source.data <- airquality$Solar.R #Storing source data

solar.r.mean <- mean(airquality$Solar.R, na.rm = T) #calculating mean ozone level

airquality$Solar.R[is.na(airquality$Solar.R)] <- mean(airquality$Solar.R, na.rm=TRUE) #Replacing NAs with mean ozone level

sum(is.na(airquality$Solar.R)) #Check 
## [1] 0
summary(airquality$Solar.R)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     7.0   120.0   194.0   185.9   256.0   334.0
colSums(is.na(airquality))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##       0       0       0       0       0       0

Estimating Equation:

\[ Ozone \sim \beta_0 + Solar Radiation * \beta_1 + \epsilon \] \[\epsilon \sim N(0,\sigma^2)\]

Testing Correlation

cor(airquality$Ozone, airquality$Solar.R) #positive correlation
## [1] 0.3029695

Fitting Linear Model

air.quality.lm <- lm(airquality$Ozone ~ airquality$Solar.R)
summary(air.quality.lm)
## 
## Call:
## lm(formula = airquality$Ozone ~ airquality$Solar.R)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.379 -17.125  -6.332  10.143 120.725 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        23.75348    5.20054   4.567 1.02e-05 ***
## airquality$Solar.R  0.09883    0.02530   3.907 0.000141 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.44 on 151 degrees of freedom
## Multiple R-squared:  0.09179,    Adjusted R-squared:  0.08578 
## F-statistic: 15.26 on 1 and 151 DF,  p-value: 0.0001409

Plotting Line of Best Fit

plot(x = airquality$Solar.R, y = airquality$Ozone,
     main = "Solar Radiation vs. Ozone Level",
     xlab = "Solar Radiation",
     ylab = "Ozone Level"
     )
abline(air.quality.lm, col = "red")

Replicating Slope Parameter

covariance.xy <- cov(airquality$Solar.R, airquality$Ozone)
variance.x <- var(airquality$Solar.R)

beta.1 <- covariance.xy / variance.x
print(beta.1)
## [1] 0.09883118

Replicating Intercept Parameter

beta.0 <- mean(airquality$Ozone) - beta.1 * mean(airquality$Solar.R)
print(beta.0)
## [1] 23.75348

Part II

Plotting results

plot(air.quality.lm)