PART 1

linReg_closedForm <- function(X, y) {
  # Add a column of 1s to X for the intercept
  X <- cbind(1, X)
  
  # Calculate MLE of beta
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
  
  # Compute residuals
  residuals <- y - X %*% beta_hat
  
  # Estimate sigma^2
  sigma2_hat <- sum(residuals^2) / (nrow(X) - ncol(X))
  
  # Compute the covariance matrix
  cov_matrix <- sigma2_hat * solve(t(X) %*% X)
  
  # Calculate standard errors of beta
  sd_beta <- sqrt(diag(cov_matrix))
  
  # Calculate t-statistics
  t_beta <- beta_hat / sd_beta
  
  # Calculate p-values
  p_beta <- 2 * pt(-abs(t_beta), df = nrow(X) - ncol(X))
  
  # Return a dataframe with the results
  result <- data.frame(beta = beta_hat, sd_beta = sd_beta, t_beta = t_beta, p_beta = p_beta)
  row.names(result) <- c("Intercept", paste("X", 1:(ncol(X) - 1), sep = "."))
  return(result)
}

# Load the data
data_linReg <- read.csv("C:/Users/User/Downloads/data_linReg_-1226050742.csv")

# Prepare the data
y <- data_linReg[, 1]
X <- as.matrix(data_linReg[, -1])

# Apply the linReg_closedForm function
result_closedForm <- linReg_closedForm(X, y)

# Apply the lm function
result_lm <- lm(y ~ ., data = data_linReg)

# Print the results
print("Results from linReg_closedForm:")

## [1] "Results from linReg_closedForm:"

print(result_closedForm)

##                  beta   sd_beta     t_beta       p_beta
## Intercept -0.39558662 0.1449342 -2.7294224 6.400287e-03
## X.1        1.09732283 0.1450665  7.5642762 5.917272e-14
## X.2        1.91351236 0.1409708 13.5738161 3.303174e-40
## X.3        2.96408750 0.1462851 20.2624042 3.692855e-83
## X.4       -0.05828176 0.1451228 -0.4016030 6.880192e-01
## X.5       -0.13714888 0.1461317 -0.9385296 3.480860e-01

print("Results from lm:")

## [1] "Results from lm:"

print(summary(result_lm)$coefficients)

##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.39558662  0.1449342 -2.7294224 6.400287e-03
## X.1          1.09732283  0.1450665  7.5642762 5.917272e-14
## X.2          1.91351236  0.1409708 13.5738161 3.303174e-40
## X.3          2.96408750  0.1462851 20.2624042 3.692855e-83
## X.4         -0.05828176  0.1451228 -0.4016030 6.880192e-01
## X.5         -0.13714888  0.1461317 -0.9385296 3.480860e-01

linReg_GD <- function(X, y, maxIter, step_size, tol) {
  n <- nrow(X)
  p <- ncol(X)
  X <- cbind(1, X)  # Add intercept
  beta_hat <- matrix(0, p + 1, 1)
  llh <- numeric(maxIter)
  
  for (t in 1:maxIter) {
    residuals <- y - X %*% beta_hat
    gradient <- -t(X) %*% residuals
    
    # Update coefficients using the gradient descent formula
    beta_hat <- beta_hat - step_size * gradient
    
    # Evaluate negative loss function llh(β) = −D(β) at each iteration
    llh[t] <- -1/2 * sum(residuals^2)
    
    # Check for convergence
    if (t > 1 && abs((llh[t] - llh[t-1]) / abs(llh[t])) < tol) {
      llh <- llh[1:t]
      break
    }
  }
  
  # After convergence, compute standard errors, t-statistics, and p-values
  sigma2_hat <- sum(residuals^2) / (n - p - 1)
  cov_matrix <- sigma2_hat * solve(t(X) %*% X)
  sd_beta <- sqrt(diag(cov_matrix))
  t_stat <- beta_hat / sd_beta
  p_values <- 2 * pt(-abs(t_stat), df = n - p - 1)
  
  # Prepare coefficients data frame
  coefficients <- cbind(beta_hat, sd_beta, t_stat, p_values)
  colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
  
  # Return a list with coefficients and log-likelihood values
  return(list(coefficient = coefficients, llh = llh))
}

# Load the data
data_linReg <- read.csv("C:/Users/User/Downloads/data_linReg_-1226050742.csv")

# Prepare the data
y <- data_linReg[, 1]
X <- as.matrix(data_linReg[, -1])

# Apply linReg_GD
results_GD <- linReg_GD(X, y, maxIter = 500, step_size = 1e-4, tol = 1e-6)

# Print results from Gradient Descent
print("Results from linReg_GD:")

## [1] "Results from linReg_GD:"

print(results_GD$coefficient)

##        Estimate Std. Error    t value     Pr(>|t|)
##     -0.39506038  0.1449343 -2.7257898 6.470882e-03
## X.1  1.09597956  0.1450666  7.5550114 6.341492e-14
## X.2  1.91146934  0.1409709 13.5593145 3.958887e-40
## X.3  2.95885983  0.1462852 20.2266545 6.740942e-83
## X.4 -0.05715256  0.1451229 -0.3938218 6.937547e-01
## X.5 -0.13880234  0.1461318 -0.9498438 3.423067e-01

# Plot llh values
plot(results_GD$llh, type = "l", ylim=c(-52000, -42000),xlab = "Iteration", ylab = "Log-Likelihood", main = "Log-Likelihood vs Iteration for linReg_GD")
abline(h=-sum((y-cbind(1,X) %*% result_closedForm$beta)^2)/2,col="red",lty=2)
legend("bottomright", 
       legend=c("Exact","Gradient Descent"),
       lty=c(2,1),
       col=c("red","black"),
       bty="n",   
       cex=0.75,  
       text.width=0.5, 
       inset=c(0.3,0))

PART 2

Abstract

This analysis explores the application of logistic regression to predict the likelihood of a heart attack based on patient characteristics. The dataset includes 14 attributes which includes age, sex, chest pain type, and resting blood pressure.The primary questions include: Can the likelihood of a heart attack, be accurately predicted and which features play a significant role in determining the risk?

Dataset description

The dataset contains patient attributes related to heart health. The questions of interest revolve around predicting heart attack risk and identifying influential features.

# loading the dataset
data_heart <- read.csv("C:/Users/User/Downloads/Part_II_data.csv")

# loading libraries
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-7

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(rlang)

Method used

Logistic regression method was used for prediction and statistical inference. The statistical inferences include z and p coefficients, a likelihood ratio test for overall model significance, and confidence intervals for odds ratios.

# checking the structure if the data set
str(data_heart)

## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp      : int  3 2 1 1 0 0 1 1 2 2 ...
##  $ trtbps  : int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalachh: int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exng    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slp     : int  0 0 2 2 2 1 1 2 2 2 ...
##  $ caa     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ thall   : int  1 2 2 2 2 1 2 3 3 2 ...
##  $ output  : int  1 1 1 1 1 1 1 1 1 1 ...

# Split the data into training and testing sets

# Set seed for reproducibility
set.seed(123)

sample_index <- sample(1:nrow(data_heart), 0.8 * nrow(data_heart))
train_data <- data_heart[sample_index, ]
test_data <- data_heart[-sample_index, ]

# Fit logistic regression model
logistic_model <- glm(output ~ ., data = train_data, family = "binomial")

# Perform statistical inference
summary_logistic <- summary(logistic_model)

# Calculate standard errors of coefficients
se_coefs <- summary_logistic$coefficients[, "Std. Error"]

# Calculate confidence intervals for individual coefficients
conf_intervals <- confint(logistic_model)

## Waiting for profiling to be done...

# Z tests for individual coefficients
z_tests <- coef(logistic_model) / se_coefs

# Two-tailed p-values for the Z tests
p_values <- 2 * (1 - pnorm(abs(z_tests)))

# Create a data frame for results
z_test_results <- data.frame(
  "Coefficient" = names(coef(logistic_model)),
  "Estimate" = coef(logistic_model),
  "Std. Error" = se_coefs,
  "Z Value" = z_tests,
  "P Value" = p_values,
  "Lower CI" = conf_intervals[, 1],
  "Upper CI" = conf_intervals[, 2]
)

cat("Z Tests and Confidence Intervals for Coefficients:\n")

## Z Tests and Confidence Intervals for Coefficients:

print(z_test_results)

##             Coefficient     Estimate Std..Error    Z.Value      P.Value
## (Intercept) (Intercept)  4.555346811 2.79468302  1.6300048 0.1031004774
## age                 age -0.013283007 0.02528902 -0.5252480 0.5994107729
## sex                 sex -1.540661055 0.50986062 -3.0217298 0.0025133476
## cp                   cp  0.769377658 0.19530886  3.9392871 0.0000817241
## trtbps           trtbps -0.017915647 0.01100329 -1.6282082 0.1034807553
## chol               chol -0.003181799 0.00417662 -0.7618120 0.4461722345
## fbs                 fbs  0.132747941 0.56624763  0.2344344 0.8146477323
## restecg         restecg  0.658516345 0.38900912  1.6928044 0.0904927097
## thalachh       thalachh  0.019761139 0.01143924  1.7274863 0.0840803561
## exng               exng -1.077693823 0.44860946 -2.4022985 0.0162924088
## oldpeak         oldpeak -0.678520376 0.24096094 -2.8158936 0.0048641771
## slp                 slp  0.263729055 0.41185859  0.6403388 0.5219523341
## caa                 caa -0.646435428 0.20134611 -3.2105682 0.0013247284
## thall             thall -1.015509639 0.33019374 -3.0754964 0.0021015250
##                 Lower.CI     Upper.CI
## (Intercept) -0.824207577 10.203977508
## age         -0.063370318  0.036370605
## sex         -2.590379694 -0.577759417
## cp           0.398452863  1.168863539
## trtbps      -0.040177495  0.003295871
## chol        -0.011434855  0.005243270
## fbs         -0.961922049  1.269705956
## restecg     -0.100546557  1.433637107
## thalachh    -0.002325345  0.042804671
## exng        -1.969247858 -0.198902742
## oldpeak     -1.175550096 -0.224013920
## slp         -0.562751364  1.065280624
## caa         -1.050369590 -0.254333029
## thall       -1.683909963 -0.381796748

# Likelihood ratio test
likelihood_ratio_test <- lrtest(logistic_model)
cat("Likelihood Ratio Test:\n")

## Likelihood Ratio Test:

print(likelihood_ratio_test)

## Likelihood ratio test
## 
## Model 1: output ~ age + sex + cp + trtbps + chol + fbs + restecg + thalachh + 
##     exng + oldpeak + slp + caa + thall
## Model 2: output ~ 1
##   #Df   LogLik  Df  Chisq Pr(>Chisq)    
## 1  14  -86.535                          
## 2   1 -166.740 -13 160.41  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Confidence intervals for odds ratios
odds_ratio_intervals <- exp(confint(logistic_model))

## Waiting for profiling to be done...

cat("Confidence Intervals for Odds Ratios:\n", odds_ratio_intervals, "\n")

## Confidence Intervals for Odds Ratios:
##  0.4385824 0.9385958 0.07499156 1.489518 0.9606189 0.9886303 0.3821577 0.904343 0.9976774 0.1395618 0.3086491 0.5696396 0.3498084 0.1856467 27010.41 1.03704 0.5611543 3.218333 1.003301 1.005257 3.559806 4.193925 1.043734 0.8196296 0.799304 2.901653 0.7754335 0.6826338

# Plotting coefficients
barplot(coef(summary_logistic)[, "Estimate"], col = "lightblue", main = "Logistic Regression Coefficients")

Inferences

The coefficients correspond to the variables in the logistic regression model.

Estimates provides the estimated values for each coefficient. It represents the estimated effect size of each variable on the log-odds of the response variable.

Standard Error indicates the variability or uncertainty in the estimation of each coefficient.

Z-test measure of how many standard deviations an estimate is from the mean. It is used to assess the significance of each coefficient.

each coefficient tests the null hypothesis that the true value of the coefficient is zero (no effect). A low P-value (typically below 0.05) suggests that the variable is statistically significant.

For the “sex” variable, the estimate is approximately -1.54. This means that, holding other variables constant, being male (assuming “sex” is coded as 0 for female and 1 for male) is associated with a decrease in the log-odds of the response variable by approximately 1.54.

The Z-value for “sex” is -3.02, and the corresponding P-value is 0.0025. This indicates that the “sex” variable is statistically significant, as the P-value is below the commonly used threshold of 0.05.

Part1 and part 2

xxx

12/18/2023