1. (15 marks) Three observations for a response random variable Y are {6, 12, 18}; the corresponding values observed for the explanatory variable X are {4, 5, 6}.

  1. Perform the same matrix algebra calculations of part (a) and (b) using R.
Y <- c(6, 12, 18)
X <- c(4, 5, 6)

# Matrix
X_matrix <- cbind(1, X)

XtX <- t(X_matrix) %*% X_matrix

XtX_inv <- solve(XtX)

XtX_inv_Xt <- XtX_inv %*% t(X_matrix)

beta_hat <- XtX_inv_Xt %*% Y

print(beta_hat)
##   [,1]
##    -18
## X    6
  1. Estimate the coeffcients using the function lm in R.
##              [,1]
## [1,] 4.973799e-14
## [2,] 4.263256e-14
## [3,] 2.842171e-14
## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = Y ~ X, data = data)
## 
## Residuals:
##          1          2          3 
## -7.252e-16  1.450e-15 -7.252e-16 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept) -1.800e+01  6.364e-15 -2.829e+15 2.25e-16 ***
## X            6.000e+00  1.256e-15  4.777e+15  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.776e-15 on 1 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.282e+31 on 1 and 1 DF,  p-value: < 2.2e-16

3. (30 marks) Using the data in the provided CSV le (happy.csv), generate a simple linear regression model to describe the relationship between Income and Happiness. Then answer the questions below.

library(ggplot2)  
## Warning: package 'ggplot2' was built under R version 4.2.3
library(dplyr)     
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lmtest)   
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
data <- read.csv("happy.csv")

str(data)
## 'data.frame':    498 obs. of  3 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ income   : num  3.86 4.98 4.92 3.21 7.2 ...
##  $ happiness: num  2.31 3.43 4.6 2.79 5.6 ...
summary(data)
##        X             income        happiness    
##  Min.   :  1.0   Min.   :1.506   Min.   :0.266  
##  1st Qu.:125.2   1st Qu.:3.006   1st Qu.:2.266  
##  Median :249.5   Median :4.424   Median :3.473  
##  Mean   :249.5   Mean   :4.467   Mean   :3.393  
##  3rd Qu.:373.8   3rd Qu.:5.992   3rd Qu.:4.503  
##  Max.   :498.0   Max.   :7.482   Max.   :6.863
model <- lm(happiness ~ income, data = data)

summary(model)
## 
## Call:
## lm(formula = happiness ~ income, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02479 -0.48526  0.04078  0.45898  2.37805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.20427    0.08884   2.299   0.0219 *  
## income       0.71383    0.01854  38.505   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7181 on 496 degrees of freedom
## Multiple R-squared:  0.7493, Adjusted R-squared:  0.7488 
## F-statistic:  1483 on 1 and 496 DF,  p-value: < 2.2e-16
new_data <- data.frame(income = c(5, 7, 9))  
predictions <- predict(model, newdata = new_data)

ggplot(data, aes(x = income, y = happiness)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Simple Linear Regression", x = "Income ($10,000)", y = "Happiness (1-10)")
## `geom_smooth()` using formula = 'y ~ x'

bptest(model) 
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.3515, df = 1, p-value = 0.245
coefficients <- coef(model)

beta_hat_0 <- coefficients["(Intercept)"]
beta_hat_1 <- coefficients["income"]

Validate regression model

plot(model, which = 1)

plot(model, which = 2)  

Well labelled plot of the observations

ggplot(data, aes(x = income, y = happiness)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Simple Linear Regression", x = "Income ($10,000)", y = "Happiness (1-10)")
## `geom_smooth()` using formula = 'y ~ x'

Predictions

income_values <- c(3.65, 6.87, 8.49)
predictions <- 0.204 + 0.714 * income_values

print(predictions)
## [1] 2.81010 5.10918 6.26586

In this question we are going to be working with the CalCOFI Dataset (CC BY 4.0 License). This dataset contains comprehensive oceanographic measurement data from California from 1949 up to the present day. The dataset includes features such as water temperature, salinity, measurement depth, O2 level, and more. For our purposes, we will only be using the water temperature (T degC) feature to predict the water salinity (Salnty).

library(readr)

data <- read_csv("calcofi.csv")
## Rows: 864863 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): T_degC, Salnty
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)  
sample_size <- round(0.01 * nrow(data))
sample_data <- data[sample(1:nrow(data), sample_size), ]

library(ggplot2)

ggplot(sample_data, aes(x = T_degC, y = Salnty)) +
  geom_point() +
  labs(title = "Water Temperature vs. Water Salinity",
       x = "Water Temperature (T degC)",
       y = "Water Salinity (Salnty)")
## Warning: Removed 514 rows containing missing values (`geom_point()`).

correlation <- cor(sample_data$T_degC, sample_data$Salnty)
print(paste("Correlation Coefficient:", round(correlation, 3)))
## [1] "Correlation Coefficient: NA"
model <- lm(Salnty ~ T_degC, data = sample_data)

summary(model)
## 
## Call:
## lm(formula = Salnty ~ T_degC, data = sample_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.90135 -0.22568  0.00459  0.19475  3.08921 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.448676   0.012267 2808.32   <2e-16 ***
## T_degC      -0.055923   0.001056  -52.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.399 on 8133 degrees of freedom
##   (514 observations deleted due to missingness)
## Multiple R-squared:  0.2562, Adjusted R-squared:  0.2561 
## F-statistic:  2802 on 1 and 8133 DF,  p-value: < 2.2e-16
new_data <- data.frame(T_degC = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
print(predictions)
##        1        2        3 
## 34.03037 33.40851 32.97511

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.