linReg_closedForm <- function(X, y) {
# Add a column of 1s to X for the intercept
X <- cbind(1, X)
# Calculate MLE of beta
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
# Compute residuals
residuals <- y - X %*% beta_hat
# Estimate sigma^2
sigma2_hat <- sum(residuals^2) / (nrow(X) - ncol(X))
# Compute the covariance matrix
cov_matrix <- sigma2_hat * solve(t(X) %*% X)
# Calculate standard errors of beta
sd_beta <- sqrt(diag(cov_matrix))
# Calculate t-statistics
t_beta <- beta_hat / sd_beta
# Calculate p-values
p_beta <- 2 * pt(-abs(t_beta), df = nrow(X) - ncol(X))
# Return a dataframe with the results
result <- data.frame(beta = beta_hat, sd_beta = sd_beta, t_beta = t_beta, p_beta = p_beta)
row.names(result) <- c("Intercept", paste("X", 1:(ncol(X) - 1), sep = "."))
return(result)
}
# Load the data
data_linReg <- read.csv("C:/Users/User/Downloads/data_linReg_-1226050742.csv")
# Prepare the data
y <- data_linReg[, 1]
X <- as.matrix(data_linReg[, -1])
# Apply the linReg_closedForm function
result_closedForm <- linReg_closedForm(X, y)
# Apply the lm function
result_lm <- lm(y ~ ., data = data_linReg)
# Print the results
print("Results from linReg_closedForm:")
## [1] "Results from linReg_closedForm:"
print(result_closedForm)
## beta sd_beta t_beta p_beta
## Intercept -0.39558662 0.1449342 -2.7294224 6.400287e-03
## X.1 1.09732283 0.1450665 7.5642762 5.917272e-14
## X.2 1.91351236 0.1409708 13.5738161 3.303174e-40
## X.3 2.96408750 0.1462851 20.2624042 3.692855e-83
## X.4 -0.05828176 0.1451228 -0.4016030 6.880192e-01
## X.5 -0.13714888 0.1461317 -0.9385296 3.480860e-01
print("Results from lm:")
## [1] "Results from lm:"
print(summary(result_lm)$coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.39558662 0.1449342 -2.7294224 6.400287e-03
## X.1 1.09732283 0.1450665 7.5642762 5.917272e-14
## X.2 1.91351236 0.1409708 13.5738161 3.303174e-40
## X.3 2.96408750 0.1462851 20.2624042 3.692855e-83
## X.4 -0.05828176 0.1451228 -0.4016030 6.880192e-01
## X.5 -0.13714888 0.1461317 -0.9385296 3.480860e-01
linReg_GD <- function(X, y, maxIter, step_size, tol) {
n <- nrow(X)
p <- ncol(X)
X <- cbind(1, X) # Add intercept
beta_hat <- matrix(0, p + 1, 1)
llh <- numeric(maxIter)
for (t in 1:maxIter) {
residuals <- y - X %*% beta_hat
gradient <- -t(X) %*% residuals
# Update coefficients using the gradient descent formula
beta_hat <- beta_hat - step_size * gradient
# Evaluate negative loss function llh(β) = −D(β) at each iteration
llh[t] <- -1/2 * sum(residuals^2)
# Check for convergence
if (t > 1 && abs((llh[t] - llh[t-1]) / abs(llh[t])) < tol) {
llh <- llh[1:t]
break
}
}
# After convergence, compute standard errors, t-statistics, and p-values
sigma2_hat <- sum(residuals^2) / (n - p - 1)
cov_matrix <- sigma2_hat * solve(t(X) %*% X)
sd_beta <- sqrt(diag(cov_matrix))
t_stat <- beta_hat / sd_beta
p_values <- 2 * pt(-abs(t_stat), df = n - p - 1)
# Prepare coefficients data frame
coefficients <- cbind(beta_hat, sd_beta, t_stat, p_values)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
# Return a list with coefficients and log-likelihood values
return(list(coefficient = coefficients, llh = llh))
}
# Load the data
data_linReg <- read.csv("C:/Users/User/Downloads/data_linReg_-1226050742.csv")
# Prepare the data
y <- data_linReg[, 1]
X <- as.matrix(data_linReg[, -1])
# Apply linReg_GD
results_GD <- linReg_GD(X, y, maxIter = 500, step_size = 1e-4, tol = 1e-6)
# Print results from Gradient Descent
print("Results from linReg_GD:")
## [1] "Results from linReg_GD:"
print(results_GD$coefficient)
## Estimate Std. Error t value Pr(>|t|)
## -0.39506038 0.1449343 -2.7257898 6.470882e-03
## X.1 1.09597956 0.1450666 7.5550114 6.341492e-14
## X.2 1.91146934 0.1409709 13.5593145 3.958887e-40
## X.3 2.95885983 0.1462852 20.2266545 6.740942e-83
## X.4 -0.05715256 0.1451229 -0.3938218 6.937547e-01
## X.5 -0.13880234 0.1461318 -0.9498438 3.423067e-01
# Plot llh values
plot(results_GD$llh, type = "l", ylim=c(-52000, -42000),xlab = "Iteration", ylab = "Log-Likelihood", main = "Log-Likelihood vs Iteration for linReg_GD")
abline(h=-sum((y-cbind(1,X) %*% result_closedForm$beta)^2)/2,col="red",lty=2)
legend("bottomright",
legend=c("Exact","Gradient Descent"),
lty=c(2,1),
col=c("red","black"),
bty="n",
cex=0.75,
text.width=0.5,
inset=c(0.3,0))
This analysis explores the application of logistic regression to predict the likelihood of a heart attack based on patient characteristics. The dataset includes 14 attributes which includes age, sex, chest pain type, and resting blood pressure.The primary questions include: Can the likelihood of a heart attack, be accurately predicted and which features play a significant role in determining the risk?
The dataset contains patient attributes related to heart health. The questions of interest revolve around predicting heart attack risk and identifying influential features.
# loading the dataset
data_heart <- read.csv("C:/Users/User/Downloads/Part_II_data.csv")
# loading libraries
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(rlang)
Logistic regression method was used for prediction and statistical inference. The statistical inferences include z and p coefficients, a likelihood ratio test for overall model significance, and confidence intervals for odds ratios.
# checking the structure if the data set
str(data_heart)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trtbps : int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalachh: int 150 187 172 178 163 148 153 173 162 174 ...
## $ exng : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slp : int 0 0 2 2 2 1 1 2 2 2 ...
## $ caa : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thall : int 1 2 2 2 2 1 2 3 3 2 ...
## $ output : int 1 1 1 1 1 1 1 1 1 1 ...
# Split the data into training and testing sets
# Set seed for reproducibility
set.seed(123)
sample_index <- sample(1:nrow(data_heart), 0.8 * nrow(data_heart))
train_data <- data_heart[sample_index, ]
test_data <- data_heart[-sample_index, ]
# Fit logistic regression model
logistic_model <- glm(output ~ ., data = train_data, family = "binomial")
# Perform statistical inference
summary_logistic <- summary(logistic_model)
# Calculate standard errors of coefficients
se_coefs <- summary_logistic$coefficients[, "Std. Error"]
# Calculate confidence intervals for individual coefficients
conf_intervals <- confint(logistic_model)
## Waiting for profiling to be done...
# Z tests for individual coefficients
z_tests <- coef(logistic_model) / se_coefs
# Two-tailed p-values for the Z tests
p_values <- 2 * (1 - pnorm(abs(z_tests)))
# Create a data frame for results
z_test_results <- data.frame(
"Coefficient" = names(coef(logistic_model)),
"Estimate" = coef(logistic_model),
"Std. Error" = se_coefs,
"Z Value" = z_tests,
"P Value" = p_values,
"Lower CI" = conf_intervals[, 1],
"Upper CI" = conf_intervals[, 2]
)
cat("Z Tests and Confidence Intervals for Coefficients:\n")
## Z Tests and Confidence Intervals for Coefficients:
print(z_test_results)
## Coefficient Estimate Std..Error Z.Value P.Value
## (Intercept) (Intercept) 4.555346811 2.79468302 1.6300048 0.1031004774
## age age -0.013283007 0.02528902 -0.5252480 0.5994107729
## sex sex -1.540661055 0.50986062 -3.0217298 0.0025133476
## cp cp 0.769377658 0.19530886 3.9392871 0.0000817241
## trtbps trtbps -0.017915647 0.01100329 -1.6282082 0.1034807553
## chol chol -0.003181799 0.00417662 -0.7618120 0.4461722345
## fbs fbs 0.132747941 0.56624763 0.2344344 0.8146477323
## restecg restecg 0.658516345 0.38900912 1.6928044 0.0904927097
## thalachh thalachh 0.019761139 0.01143924 1.7274863 0.0840803561
## exng exng -1.077693823 0.44860946 -2.4022985 0.0162924088
## oldpeak oldpeak -0.678520376 0.24096094 -2.8158936 0.0048641771
## slp slp 0.263729055 0.41185859 0.6403388 0.5219523341
## caa caa -0.646435428 0.20134611 -3.2105682 0.0013247284
## thall thall -1.015509639 0.33019374 -3.0754964 0.0021015250
## Lower.CI Upper.CI
## (Intercept) -0.824207577 10.203977508
## age -0.063370318 0.036370605
## sex -2.590379694 -0.577759417
## cp 0.398452863 1.168863539
## trtbps -0.040177495 0.003295871
## chol -0.011434855 0.005243270
## fbs -0.961922049 1.269705956
## restecg -0.100546557 1.433637107
## thalachh -0.002325345 0.042804671
## exng -1.969247858 -0.198902742
## oldpeak -1.175550096 -0.224013920
## slp -0.562751364 1.065280624
## caa -1.050369590 -0.254333029
## thall -1.683909963 -0.381796748
# Likelihood ratio test
likelihood_ratio_test <- lrtest(logistic_model)
cat("Likelihood Ratio Test:\n")
## Likelihood Ratio Test:
print(likelihood_ratio_test)
## Likelihood ratio test
##
## Model 1: output ~ age + sex + cp + trtbps + chol + fbs + restecg + thalachh +
## exng + oldpeak + slp + caa + thall
## Model 2: output ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 14 -86.535
## 2 1 -166.740 -13 160.41 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Confidence intervals for odds ratios
odds_ratio_intervals <- exp(confint(logistic_model))
## Waiting for profiling to be done...
cat("Confidence Intervals for Odds Ratios:\n", odds_ratio_intervals, "\n")
## Confidence Intervals for Odds Ratios:
## 0.4385824 0.9385958 0.07499156 1.489518 0.9606189 0.9886303 0.3821577 0.904343 0.9976774 0.1395618 0.3086491 0.5696396 0.3498084 0.1856467 27010.41 1.03704 0.5611543 3.218333 1.003301 1.005257 3.559806 4.193925 1.043734 0.8196296 0.799304 2.901653 0.7754335 0.6826338
# Plotting coefficients
barplot(coef(summary_logistic)[, "Estimate"], col = "lightblue", main = "Logistic Regression Coefficients")