library(tidyverse)

\(\underline{\textbf{Oxygen Purity Data:}}\)
\(\bullet\ \ Y =\) Purity of Oxygen
\(\bullet\ \ X =\) Percentage of Hydrocarbons

# Load the oxygen-purity data set from the working directory.
df1 <- read_csv(file = "problem1-oxygenpurity.csv")
# Put the columns of df1 on the search path so the chunks below can refer to
# them by bare name (`purity`, `hydro`); undone by detach(df1) at the end.
attach(what = df1)

Part 1 - Fit a Simple Linear Regression (SLR) Model

\(\large \hat{Y} = \widehat{\beta}_0 + \widehat{\beta}_1 X\)

# Least-squares slope, built from the mean-centered data.
# Deviations of the predictor and the response from their sample means.
x_diffs <- hydro - mean(hydro)
y_diffs <- purity - mean(purity)
# Sum of squared x-deviations and sum of x-y cross-products. Neither is
# divided by (n - 1); that factor would cancel in the ratio below.
Var_x <- sum(x_diffs * x_diffs)
Cov <- sum(x_diffs * y_diffs)
# Beta 1 (slope): cross-product sum over the x sum of squares.
hat_beta_1 <- Cov / Var_x

\(\widehat{\beta}_1 = \dfrac{ \sum (y_i - \overline{y})(x_i - \overline{x}) }{ \sum (x_i - \overline{x})^2 } = \dfrac{Cov(Y, X)}{Var(X)} =\) 11.801

# Beta 0 (intercept): chosen so the fitted line passes through the point of
# sample means (mean(hydro), mean(purity)).
hat_beta_0 <- mean(purity) - hat_beta_1 * mean(hydro)

\(\widehat{\beta}_0 = \overline{y} - \widehat{\beta}_1 \overline{x} =\) 77.8633

\(\underline{\textbf{SLR Fitted Model:}} \hspace{0.25cm} \hat Y =\) 77.8633 \(+\) 11.801 \(X\)

Part 2: Hypothesis Tests on \(\hat\beta_1\)

\(\large H_0: \ \beta_1 = 0 \hspace{0.75cm} \text{and} \hspace{0.75cm} H_1: \ \beta_1 \neq 0\)

The estimated \(\hat\beta_1\) is statistically significant if the p-value \((p)\) of the test statistic \((t^*)\) is less than the desired significance level \((\alpha = 0.05)\).

# Standard error and t statistic for the slope estimate.
# Fitted values from the hand-computed intercept and slope.
fitted <- hat_beta_0 + hat_beta_1 * hydro
# Residual (error) sum of squares.
SSE <- sum((purity - fitted)^2)
# Estimated error variance: SSE over the residual degrees of freedom, n - 2
# (two coefficients are estimated). n is taken from the data rather than
# hard-coded so the calculation stays correct if the sample size changes.
hat_variance <- SSE / (length(purity) - 2)
# Standard error of the slope and the t statistic for H0: beta_1 = 0.
seBeta_1 <- sqrt(hat_variance / Var_x)
tBeta_1 <- hat_beta_1 / seBeta_1

\(\text{s.e.} \left ( \hat\beta_1 \right ) =\) 3.4851 , \(\hspace{1cm} t^* = \dfrac{\hat\beta_1}{\text{s.e.} \left ( \hat\beta_1 \right )} =\) 3.3861 , \(\hspace{1cm} p =\) .00329 \(\hspace{0.15cm} \therefore \hspace{0.15cm} p < \alpha\)

\(\underline{\textbf{Conclusion:}} \hspace{0.25cm}\) Reject \(H_0\) and conclude that \(\hat\beta_1\) is statistically significant.

Part 3: Coefficient of Determination, \(r^2\)

\(\hspace{0.7cm} \text{SST} \hspace{0.6cm} = \hspace{0.8cm} \text{SSR} \hspace{0.7cm} + \hspace{0.7cm} \text{SSE}\)

\(\sum (y_i - \overline{y})^2 \ = \ \sum (\hat{y}_i - \overline{y})^2 \ + \ \sum (y_i - \hat{y}_i)^2\)

# Total sum of squares of the response about its mean.
SST <- sum(y_diffs * y_diffs)
# Coefficient of determination: one minus the share of total variation left
# in the residuals.
rSquared <- 1 - SSE / SST

\(\large r^2 = 1 - \frac {\text{SSE}} {\text{SST}} =\) 0.3891

Part 4: 95% Confidence Interval on the Slope

# Fit the same simple linear regression with lm() and keep its summary.
Fit1 <- lm(purity ~ hydro, data = df1)
sumFit1 <- summary(Fit1)
# Confidence intervals for the coefficients (default level is 0.95). Call the
# generic confint() and let S3 dispatch select the lm method, rather than
# invoking the method confint.lm() directly.
confint(Fit1)
##              2.5 % 97.5 %
## (Intercept) 69.042  86.68
## hydro        4.479  19.12

Part 5: 95% C.I. on the mean response \(\mu_{Y|x}\) when \(x = 1.05\)

# Confidence interval for the mean response of Fit1 at hydro = 1.05.
# interval = "confidence" requests the interval for the mean, and
# level = 0.95 spells out the default.
predict(
  object = Fit1,
  newdata = tibble(hydro = 1.05),
  interval = "confidence",
  level = 0.95
)
##     fit   lwr  upr
## 1 90.25 88.31 92.2

Part 6: Sample Correlation Coefficient: \(\ r_{xy}\)

# Sample correlation coefficient recovered from r^2. sqrt() alone always
# returns a non-negative value, so the sign is taken from the fitted slope;
# in simple linear regression the correlation and the slope share a sign.
Rxy <- sign(hat_beta_1) * sqrt(rSquared)

\(\large r_{xy} = \text{sign}(\widehat{\beta}_1) \sqrt{r^2} =\) 0.6238

Part 7: t-Test on \(\ r_{xy}\)

\(\large H_0: \ \rho = 0 \hspace{0.75cm} \text{and} \hspace{0.75cm} H_1: \ \rho \neq 0\)

# t statistic for H0: rho = 0, t = r * sqrt(n - 2) / sqrt(1 - r^2).
# The n - 2 degrees of freedom come from the data rather than a hard-coded 18,
# so the statistic stays correct if the sample size changes.
tCor <- (Rxy * sqrt(length(purity) - 2)) / sqrt(1 - Rxy^2)
# Cross-check with the built-in Pearson correlation test.
cor.test(purity, hydro)
## 
##  Pearson's product-moment correlation
## 
## data:  purity and hydro
## t = 3.4, df = 18, p-value = 0.003
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2504 0.8356
## sample estimates:
##    cor 
## 0.6238

\(\underline{\textbf{Conclusion:}} \hspace{0.25cm}\) Reject \(H_0\). There is significant evidence supporting the alternative hypothesis that \(\rho \neq 0\).

# Remove df1 from the search path, undoing the earlier attach(df1) so its
# column names no longer resolve as bare variables.
detach(df1)