Y <- c(6, 12, 18)
X <- c(4, 5, 6)
# Matrix
X_matrix <- cbind(1, X)
XtX <- t(X_matrix) %*% X_matrix
XtX_inv <- solve(XtX)
XtX_inv_Xt <- XtX_inv %*% t(X_matrix)
beta_hat <- XtX_inv_Xt %*% Y
print(beta_hat)
## [,1]
## -18
## X 6
## [,1]
## [1,] 4.973799e-14
## [2,] 4.263256e-14
## [3,] 2.842171e-14
## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = Y ~ X, data = data)
##
## Residuals:
## 1 2 3
## -7.252e-16 1.450e-15 -7.252e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.800e+01 6.364e-15 -2.829e+15 2.25e-16 ***
## X 6.000e+00 1.256e-15 4.777e+15 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.776e-15 on 1 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.282e+31 on 1 and 1 DF, p-value: < 2.2e-16
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data <- read.csv("happy.csv")
str(data)
## 'data.frame': 498 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ income : num 3.86 4.98 4.92 3.21 7.2 ...
## $ happiness: num 2.31 3.43 4.6 2.79 5.6 ...
summary(data)
## X income happiness
## Min. : 1.0 Min. :1.506 Min. :0.266
## 1st Qu.:125.2 1st Qu.:3.006 1st Qu.:2.266
## Median :249.5 Median :4.424 Median :3.473
## Mean :249.5 Mean :4.467 Mean :3.393
## 3rd Qu.:373.8 3rd Qu.:5.992 3rd Qu.:4.503
## Max. :498.0 Max. :7.482 Max. :6.863
model <- lm(happiness ~ income, data = data)
summary(model)
##
## Call:
## lm(formula = happiness ~ income, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02479 -0.48526 0.04078 0.45898 2.37805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20427 0.08884 2.299 0.0219 *
## income 0.71383 0.01854 38.505 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7181 on 496 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7488
## F-statistic: 1483 on 1 and 496 DF, p-value: < 2.2e-16
new_data <- data.frame(income = c(5, 7, 9))
predictions <- predict(model, newdata = new_data)
ggplot(data, aes(x = income, y = happiness)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Simple Linear Regression", x = "Income ($10,000)", y = "Happiness (1-10)")
## `geom_smooth()` using formula = 'y ~ x'
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.3515, df = 1, p-value = 0.245
coefficients <- coef(model)
beta_hat_0 <- coefficients["(Intercept)"]
beta_hat_1 <- coefficients["income"]
plot(model, which = 1)
plot(model, which = 2)
ggplot(data, aes(x = income, y = happiness)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Simple Linear Regression", x = "Income ($10,000)", y = "Happiness (1-10)")
## `geom_smooth()` using formula = 'y ~ x'
income_values <- c(3.65, 6.87, 8.49)
predictions <- 0.204 + 0.714 * income_values
print(predictions)
## [1] 2.81010 5.10918 6.26586
library(readr)
data <- read_csv("calcofi.csv")
## Rows: 864863 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): T_degC, Salnty
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
sample_size <- round(0.01 * nrow(data))
sample_data <- data[sample(1:nrow(data), sample_size), ]
library(ggplot2)
ggplot(sample_data, aes(x = T_degC, y = Salnty)) +
geom_point() +
labs(title = "Water Temperature vs. Water Salinity",
x = "Water Temperature (T degC)",
y = "Water Salinity (Salnty)")
## Warning: Removed 514 rows containing missing values (`geom_point()`).
correlation <- cor(sample_data$T_degC, sample_data$Salnty)
print(paste("Correlation Coefficient:", round(correlation, 3)))
## [1] "Correlation Coefficient: NA"
model <- lm(Salnty ~ T_degC, data = sample_data)
summary(model)
##
## Call:
## lm(formula = Salnty ~ T_degC, data = sample_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.90135 -0.22568 0.00459 0.19475 3.08921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.448676 0.012267 2808.32 <2e-16 ***
## T_degC -0.055923 0.001056 -52.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.399 on 8133 degrees of freedom
## (514 observations deleted due to missingness)
## Multiple R-squared: 0.2562, Adjusted R-squared: 0.2561
## F-statistic: 2802 on 1 and 8133 DF, p-value: < 2.2e-16
new_data <- data.frame(T_degC = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
print(predictions)
## 1 2 3
## 34.03037 33.40851 32.97511
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.