Homework5.knit

# Question 1

# Symmetric 3 x 3 matrix studied throughout Question 1, entered row by row.
A <- matrix(
  c(1, 7, 3,
    7, 4, 5,
    3, 5, 0),
  nrow = 3, byrow = TRUE
)

# (a) Eigenvalues and Eigenvectors
# Keep the two components of the eigen() result under their own names.
eig <- eigen(A)
eigenvalues <- eig$values
eigenvectors <- eig$vectors

cat("Eigenvalues:\n")

## Eigenvalues:

print(eigenvalues)

## [1] 12.364127 -2.454628 -4.909498

cat("\nEigenvectors:\n")

## 
## Eigenvectors:

print(eigenvectors)

##            [,1]       [,2]       [,3]
## [1,] -0.5534805 -0.5016948  0.6648020
## [2,] -0.7167664 -0.1195787 -0.6869839
## [3,] -0.4241524  0.8567399  0.2934135

# (b) Is the matrix positive definite?
# Decided by the sign of the eigenvalues: the smallest one must be > 0.
is_positive_definite <- min(eigenvalues) > 0
cat("\nIs the matrix positive definite?:", is_positive_definite, "\n")

## 
## Is the matrix positive definite?: FALSE

# (c) Is A^2 positive definite?
# Same eigenvalue test, applied to the matrix product of A with itself.
A_squared <- A %*% A
eig_A_squared <- eigen(A_squared)
eigenvalues_A_squared <- eig_A_squared$values
is_A_squared_positive_definite <- min(eigenvalues_A_squared) > 0

cat("\nEigenvalues of A^2:\n")

## 
## Eigenvalues of A^2:

print(eigenvalues_A_squared)

## [1] 152.871626  24.103175   6.025199

cat("\nIs A^2 positive definite?:", is_A_squared_positive_definite, "\n")

## 
## Is A^2 positive definite?: TRUE

# (d) Orthogonality of Eigenvectors
# V^T V should be the identity; compare entry-wise with a small tolerance
# because the off-diagonal entries are only zero up to rounding error.
orthogonality_check <- t(eigenvectors) %*% eigenvectors
deviation_from_identity <- abs(orthogonality_check - diag(nrow(A)))
is_orthogonal <- max(deviation_from_identity) < 1e-10

cat("\nOrthogonality Check (V^T * V):\n")

## 
## Orthogonality Check (V^T * V):

print(orthogonality_check)

##               [,1]          [,2]          [,3]
## [1,]  1.000000e+00 -5.551115e-17  1.942890e-16
## [2,] -5.551115e-17  1.000000e+00 -2.220446e-16
## [3,]  1.942890e-16 -2.220446e-16  1.000000e+00

cat("\nAre eigenvectors orthogonal?:", is_orthogonal, "\n")

## 
## Are eigenvectors orthogonal?: TRUE

# (e) Matrix Decomposition
# Rebuild A as V E V^(-1), with the eigenvalues on the diagonal of E.
V <- eigenvectors
E <- diag(eigenvalues)
A_reconstructed <- V %*% E %*% solve(V)

cat("\nDiagonal Matrix (E):\n")

## 
## Diagonal Matrix (E):

print(E)

##          [,1]      [,2]      [,3]
## [1,] 12.36413  0.000000  0.000000
## [2,]  0.00000 -2.454628  0.000000
## [3,]  0.00000  0.000000 -4.909498

cat("\nMatrix of Eigenvectors (V):\n")

## 
## Matrix of Eigenvectors (V):

print(V)

##            [,1]       [,2]       [,3]
## [1,] -0.5534805 -0.5016948  0.6648020
## [2,] -0.7167664 -0.1195787 -0.6869839
## [3,] -0.4241524  0.8567399  0.2934135

cat("\nReconstructed A (V * E * V^(-1)):\n")

## 
## Reconstructed A (V * E * V^(-1)):

print(A_reconstructed)

##      [,1] [,2]         [,3]
## [1,]    1    7  3.00000e+00
## [2,]    7    4  5.00000e+00
## [3,]    3    5 -4.82947e-15

# Entry-wise comparison with the same tolerance used in part (d).
matches_original <- max(abs(A - A_reconstructed)) < 1e-10
cat("\nIs the reconstructed A equal to original A?:", matches_original, "\n")

## 
## Is the reconstructed A equal to original A?: TRUE

# Question 2
# Coefficients applied to x1 and x2 (the mean computed below has no intercept)
beta1 <- 7.19
beta2 <- -5.23

# Residual standard deviation (Root Mean Square Error)
sigma_residual <- 0.871

# Predictor values at which the response is evaluated
x1 <- 1
x2 <- -3

# 1. Conditional mean E(y | (x1, x2)): linear combination of the predictors
E_y <- beta1 * x1 + beta2 * x2
cat("E(y | (x1, x2) = (1, -3)) =", E_y, "\n")

## E(y | (x1, x2) = (1, -3)) = 22.88

# 2. P(y > 21 | (x1, x2)), treating y as N(E_y, sigma_residual^2)
# Standardise the cut-off so the standard normal CDF can be applied
y_threshold <- 21
z_score <- (y_threshold - E_y) / sigma_residual
cat("Z-score for y = 21:", z_score, "\n")

## Z-score for y = 21: -2.158439

# Upper tail as the complement of the lower tail:
# P(y > 21) = 1 - P(Z <= z_score)
p_less_equal <- pnorm(z_score)
p_greater <- 1 - p_less_equal
cat("P(y > 21 | (x1, x2) = (1, -3)) =", p_greater, "\n")

## P(y > 21 | (x1, x2) = (1, -3)) = 0.9845531

# Question 3

credit_data <- read.csv("credit_score.csv")

# Drop every row that contains at least one NA
credit_data_cleaned <- na.omit(credit_data)

# Restrict to the numeric columns, the only ones cov() is applied to here
# NOTE(review): the recorded output below shows an `ID` column among them, and
# its variance (1.87e9) dominates the matrix. If `ID` is only a row identifier
# it probably should be excluded before the decomposition -- confirm against
# the assignment.
is_numeric_column <- vapply(credit_data_cleaned, is.numeric, logical(1))
numerical_data <- credit_data_cleaned[is_numeric_column]

# Covariance matrix of the numeric columns
cov_matrix <- cov(numerical_data)

# Eigenvalues of the covariance matrix
# (this reuses, and therefore overwrites, the name `eigenvalues`)
eigen_result <- eigen(cov_matrix)
eigenvalues <- eigen_result$values

# Count the eigenvalues whose distance from the largest one is at most
# 0.1% of the largest one (the largest itself is always counted)
largest_eigenvalue <- max(eigenvalues)
threshold <- 0.001 * largest_eigenvalue
gap_to_largest <- abs(eigenvalues - largest_eigenvalue)
within_01_percent <- sum(gap_to_largest <= threshold)

# Report the matrix, its eigenvalues and the count
cat("Covariance Matrix:\n")

## Covariance Matrix:

print(cov_matrix)

##                                     ID Monthly_Inhand_Salary Num_Bank_Accounts
## ID                        1.872689e+09           -791821.199      27833.730262
## Monthly_Inhand_Salary    -7.918212e+05          10151801.315      -3987.411696
## Num_Bank_Accounts         2.783373e+04             -3987.412      13403.717184
## Num_Credit_Card          -1.578806e+04             -3192.155          8.429534
## Interest_Rate             8.072086e+04             -8078.645       -314.620170
## Delay_from_due_date      -8.917304e+02            -11840.630         28.136646
## Num_Credit_Inquiries      5.674590e+04             -4637.933        -37.868717
## Credit_Utilization_Ratio -1.393385e+03              2802.141         -2.899387
## Total_EMI_per_month       1.225724e+06            200433.236       1266.982309
##                          Num_Credit_Card Interest_Rate Delay_from_due_date
## ID                         -15788.059289  80720.857611          -891.73040
## Monthly_Inhand_Salary       -3192.155095  -8078.645310        -11840.62951
## Num_Bank_Accounts               8.429534   -314.620170            28.13665
## Num_Credit_Card             16652.876400   -253.581333            14.96018
## Interest_Rate                -253.581333 220583.134877            56.11761
## Delay_from_due_date            14.960178     56.117608           221.07271
## Num_Credit_Inquiries         -124.603490    105.214322            29.91397
## Credit_Utilization_Ratio       -2.828593     -7.709259            -4.63557
## Total_EMI_per_month           -37.712646  23412.840967          -476.49065
##                          Num_Credit_Inquiries Credit_Utilization_Ratio
## ID                               56745.902935             -1393.384548
## Monthly_Inhand_Salary            -4637.932648              2802.141469
## Num_Bank_Accounts                  -37.868717                -2.899387
## Num_Credit_Card                   -124.603490                -2.828593
## Interest_Rate                      105.214322                -7.709259
## Delay_from_due_date                 29.913973                -4.635570
## Num_Credit_Inquiries             36832.769272                -2.036039
## Credit_Utilization_Ratio            -2.036039                26.134196
## Total_EMI_per_month             -10519.388820               132.332927
##                          Total_EMI_per_month
## ID                              1.225724e+06
## Monthly_Inhand_Salary           2.004332e+05
## Num_Bank_Accounts               1.266982e+03
## Num_Credit_Card                -3.771265e+01
## Interest_Rate                   2.341284e+04
## Delay_from_due_date            -4.764906e+02
## Num_Credit_Inquiries           -1.051939e+04
## Credit_Utilization_Ratio        1.323329e+02
## Total_EMI_per_month             6.923916e+07

cat("\nEigenvalues:\n")

## 
## Eigenvalues:

print(eigenvalues)

## [1] 1.872691e+09 6.923903e+07 1.015081e+07 2.205660e+05 3.682817e+04
## [6] 1.665066e+04 1.340119e+04 2.071932e+02 2.534909e+01

cat(
  "\nNumber of principal components within 0.1% of the largest component:",
  within_01_percent,
  "\n"
)

## 
## Number of principal components within 0.1% of the largest component: 1

# Load required libraries
library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## Loading required package: lattice

# Load the dataset
hotel_data <- read.csv("Hotel Reservations.csv")

# PART 1: Correlation Analysis
# Pairwise correlations between all numeric columns
numeric_columns <- vapply(hotel_data, is.numeric, logical(1))
correlations <- cor(hotel_data[numeric_columns])

# Rank every other numeric variable by the strength of its correlation with
# the target and keep the top three. Strength is the absolute value, so a
# strongly negative correlation is not overlooked, and the target is removed
# by name rather than by assuming it sorts into first position.
target_variable <- "avg_price_per_room"
target_row <- correlations[target_variable, ]
other_correlations <- target_row[names(target_row) != target_variable]
strongest_variable <- head(
  names(sort(abs(other_correlations), decreasing = TRUE)),
  3
)
cat("Strongest correlated variables with avg_price_per_room:\n")

## Strongest correlated variables with avg_price_per_room:

print(strongest_variable)

# NOTE(review): the output below was recorded with the earlier signed ranking;
# re-knit to confirm it is unchanged under the absolute-value ranking.
## [1] "no_of_children"         "no_of_adults"           "no_of_special_requests"

# PART 2: Simple Linear Regression
# Predictor: the first variable reported in Part 1; response: the room price
X <- hotel_data[["no_of_children"]]
y <- hotel_data[["avg_price_per_room"]]

# Using normal equations
# Design matrix: a column of ones (intercept) next to the predictor
X_with_intercept <- cbind(1, X)
# Solve (X'X) b = X'y as a linear system rather than forming the explicit
# inverse of X'X and multiplying through; this is the numerically safer way
# to evaluate the same normal equations.
coefficients_normal <- solve(
  crossprod(X_with_intercept),
  crossprod(X_with_intercept, y)
)
cat("Coefficients using normal equations (simple linear regression):\n")

## Coefficients using normal equations (simple linear regression):

print(coefficients_normal)

##        [,1]
##   100.32498
## X  29.43185

# Using lm()
# Same model fitted by lm() as a cross-check of the coefficients above
lm_simple <- lm(y ~ X)
cat("Coefficients using lm() (simple linear regression):\n")

## Coefficients using lm() (simple linear regression):

print(coef(lm_simple))

## (Intercept)           X 
##   100.32498    29.43185

# PART 3: Multi-Variable Linear Regression
# The three variables reported in Part 1, kept as a data frame for lm()
X_multi <- hotel_data[
  c("no_of_children", "no_of_adults", "no_of_special_requests")
]

# Using normal equations
# Design matrix: a column of ones (intercept) next to the three predictors
X_multi_with_intercept <- cbind(1, as.matrix(X_multi))
# Solve (X'X) b = X'y as a linear system rather than forming the explicit
# inverse of X'X and multiplying through; this is the numerically safer way
# to evaluate the same normal equations.
coefficients_multi_normal <- solve(
  crossprod(X_multi_with_intercept),
  crossprod(X_multi_with_intercept, y)
)
cat("Coefficients using normal equations (multi-variable regression):\n")

## Coefficients using normal equations (multi-variable regression):

print(coefficients_multi_normal)

##                             [,1]
##                        62.144079
## no_of_children         28.963747
## no_of_adults           19.391623
## no_of_special_requests  3.959254

# Using lm()
# `.` expands to every column of X_multi; `y` is the response vector that
# already exists in the workspace
lm_multi <- lm(y ~ ., data = X_multi)
cat("Coefficients using lm() (multi-variable regression):\n")

## Coefficients using lm() (multi-variable regression):

print(coef(lm_multi))

##            (Intercept)         no_of_children           no_of_adults 
##              62.144079              28.963747              19.391623 
## no_of_special_requests 
##               3.959254

# Adjusted R-squared values
#
# adjusted_r2(model, X, y): adjusted R-squared of a fitted linear model.
#   model  a fitted model for which summary(model)$r.squared, nobs(),
#          terms() and df.residual() are available (e.g. an lm fit)
#   X, y   kept so existing calls keep working; they are no longer read
# Returns a single number.
#
# The sample size and degrees of freedom come from the fitted model itself
# instead of from the dimensions of X. The result therefore no longer depends
# on whether the caller passes a design matrix that includes an intercept
# column (which used to be counted as a predictor) or a bare vector (for
# which nrow() is NULL).
adjusted_r2 <- function(model, X, y) {
  r2 <- summary(model)$r.squared
  n <- nobs(model)
  # 1 when the model has an intercept, 0 otherwise
  intercept_df <- attr(terms(model), "intercept")
  # With an intercept and p predictors this equals n - p - 1
  residual_df <- df.residual(model)
  1 - (1 - r2) * (n - intercept_df) / residual_df
}

# Pass predictor columns only (no intercept column), so the predictor count
# for the simple model is 1 and for the multi-variable model is 3.
adj_r2_simple <- adjusted_r2(lm_simple, cbind(X), y)
adj_r2_multi <- adjusted_r2(lm_multi, as.matrix(X_multi), y)
cat("Adjusted R-squared (simple regression):", adj_r2_simple, "\n")

# NOTE: the value recorded below was produced when the design matrix with its
# intercept column was passed (predictor count 2 instead of 1); re-knit to
# refresh it.
## Adjusted R-squared (simple regression): 0.1140113

cat("Adjusted R-squared (multi-variable regression):", adj_r2_multi, "\n")

## Adjusted R-squared (multi-variable regression): 0.2136433

# PART 4: Train-Test Split and RMSE Calculation
# Fix the seed before drawing the partition, then split the rows using the
# indices returned for p = 0.8
set.seed(42)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
train <- hotel_data[trainIndex, ]
test <- hotel_data[-trainIndex, ]

# Fit the three-predictor model on the training rows only
lm_train <- lm(
  avg_price_per_room ~ no_of_children + no_of_adults + no_of_special_requests,
  data = train
)

# Score the held-out rows: root of the mean squared prediction error
predictions <- predict(lm_train, newdata = test)
test_errors <- test$avg_price_per_room - predictions
rmse_test <- sqrt(mean(test_errors^2))
cat("RMSE on test set:", rmse_test, "\n")

## RMSE on test set: 31.05055

# Same model fitted to, and evaluated on, every row for comparison
lm_full <- lm(
  avg_price_per_room ~ no_of_children + no_of_adults + no_of_special_requests,
  data = hotel_data
)
predictions_full <- predict(lm_full, newdata = hotel_data)
full_errors <- hotel_data$avg_price_per_room - predictions_full
rmse_full <- sqrt(mean(full_errors^2))
cat("RMSE on full dataset:", rmse_full, "\n")

## RMSE on full dataset: 31.11445