# Question 1
A = matrix(c(1, 7, 3, 7, 4, 5, 3, 5, 0), nrow = 3, byrow = TRUE)
# (a) Eigenvalues and Eigenvectors
eig = eigen(A)
eigenvalues = eig$values
eigenvectors = eig$vectors
cat("Eigenvalues:\n")
## Eigenvalues:
print(eigenvalues)
## [1] 12.364127 -2.454628 -4.909498
cat("\nEigenvectors:\n")
##
## Eigenvectors:
print(eigenvectors)
## [,1] [,2] [,3]
## [1,] -0.5534805 -0.5016948 0.6648020
## [2,] -0.7167664 -0.1195787 -0.6869839
## [3,] -0.4241524 0.8567399 0.2934135
# (b) Is the matrix positive definite?
is_positive_definite = all(eigenvalues > 0)
cat("\nIs the matrix positive definite?:", is_positive_definite, "\n")
##
## Is the matrix positive definite?: FALSE
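# A symmetric matrix is positive definite exactly when all of its eigenvalues are
# strictly positive. As an optional cross-check (a sketch, not part of the output
# above), chol() throws an error for matrices that are not positive definite:
chol_check = tryCatch({ chol(A); TRUE }, error = function(e) FALSE)
cat("Cholesky-based check agrees with the eigenvalue test:",
    chol_check == is_positive_definite, "\n")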
# (c) Is A^2 positive definite?
A_squared = A %*% A
eig_A_squared = eigen(A_squared)
eigenvalues_A_squared = eig_A_squared$values
is_A_squared_positive_definite = all(eigenvalues_A_squared > 0)
cat("\nEigenvalues of A^2:\n")
##
## Eigenvalues of A^2:
print(eigenvalues_A_squared)
## [1] 152.871626 24.103175 6.025199
cat("\nIs A^2 positive definite?:", is_A_squared_positive_definite, "\n")
##
## Is A^2 positive definite?: TRUE
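# The eigenvalues of A^2 are the squares of the eigenvalues of A, so A^2 is
# positive definite whenever A has no zero eigenvalue. A quick numerical
# confirmation (a sketch using the objects computed above):
cat("Squared eigenvalues of A match the eigenvalues of A^2:",
    all(abs(sort(eigenvalues^2, decreasing = TRUE) - eigenvalues_A_squared) < 1e-8), "\n")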
# (d) Orthogonality of Eigenvectors
orthogonality_check = t(eigenvectors) %*% eigenvectors
is_orthogonal = all(abs(orthogonality_check - diag(nrow(A))) < 1e-10)
cat("\nOrthogonality Check (V^T * V):\n")
##
## Orthogonality Check (V^T * V):
print(orthogonality_check)
## [,1] [,2] [,3]
## [1,] 1.000000e+00 -5.551115e-17 1.942890e-16
## [2,] -5.551115e-17 1.000000e+00 -2.220446e-16
## [3,] 1.942890e-16 -2.220446e-16 1.000000e+00
cat("\nAre eigenvectors orthogonal?:", is_orthogonal, "\n")
##
## Are eigenvectors orthogonal?: TRUE
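# Equivalent, slightly more idiomatic form of the same check: crossprod(V)
# computes t(V) %*% V in a single call. Orthonormal eigenvectors are expected
# here because A is symmetric (this sketch just restates the result above).
cat("crossprod() orthogonality check:",
    all(abs(crossprod(eigenvectors) - diag(nrow(A))) < 1e-10), "\n")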
# (e) Matrix Decomposition
E = diag(eigenvalues)
V = eigenvectors
A_reconstructed = V %*% E %*% solve(V)
cat("\nDiagonal Matrix (E):\n")
##
## Diagonal Matrix (E):
print(E)
## [,1] [,2] [,3]
## [1,] 12.36413 0.000000 0.000000
## [2,] 0.00000 -2.454628 0.000000
## [3,] 0.00000 0.000000 -4.909498
cat("\nMatrix of Eigenvectors (V):\n")
##
## Matrix of Eigenvectors (V):
print(V)
## [,1] [,2] [,3]
## [1,] -0.5534805 -0.5016948 0.6648020
## [2,] -0.7167664 -0.1195787 -0.6869839
## [3,] -0.4241524 0.8567399 0.2934135
cat("\nReconstructed A (V * E * V^(-1)):\n")
##
## Reconstructed A (V * E * V^(-1)):
print(A_reconstructed)
## [,1] [,2] [,3]
## [1,] 1 7 3.00000e+00
## [2,] 7 4 5.00000e+00
## [3,] 3 5 -4.82947e-15
cat("\nIs the reconstructed A equal to original A?:", all(abs(A - A_reconstructed) < 1e-10), "\n")
##
## Is the reconstructed A equal to original A?: TRUE
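# Because A is symmetric, V is orthogonal and solve(V) equals t(V), so the same
# reconstruction can be written as the spectral decomposition A = V E V^T.
# A minimal sketch of that equivalent form (A_spectral is a new name used only here):
A_spectral = V %*% E %*% t(V)
cat("Spectral form V * E * V^T also recovers A:",
    all(abs(A - A_spectral) < 1e-10), "\n")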
# Question 2
beta1 = 7.19
beta2 = -5.23
# Residual standard deviation (standard deviation of the error term)
sigma_residual = 0.871
# Given values of x1 and x2
x1 = 1
x2 = -3
# 1. Calculate E(y | (x1, x2))
E_y = beta1 * x1 + beta2 * x2
cat("E(y | (x1, x2) = (1, -3)) =", E_y, "\n")
## E(y | (x1, x2) = (1, -3)) = 22.88
# 2. Calculate P(y > 21 | (x1, x2))
# y follows N(E(y), sigma_residual^2)
# Calculate Z-score for y = 21
y_threshold = 21
z_score = (y_threshold - E_y) / sigma_residual
cat("Z-score for y = 21:", z_score, "\n")
## Z-score for y = 21: -2.158439
# Calculate probability P(y > 21)
# P(y > 21) = 1 - P(Z <= z_score)
p_less_equal = pnorm(z_score)
p_greater = 1 - p_less_equal
cat("P(y > 21 | (x1, x2) = (1, -3)) =", p_greater, "\n")
## P(y > 21 | (x1, x2) = (1, -3)) = 0.9845531
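# Equivalent one-liner (a sketch): pnorm() can return the upper tail directly on
# the original scale, avoiding the manual 1 - P(Z <= z) step.
p_greater_direct = pnorm(y_threshold, mean = E_y, sd = sigma_residual, lower.tail = FALSE)
cat("P(y > 21) via lower.tail = FALSE:", p_greater_direct, "\n")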
# Question 3
credit_data = read.csv("credit_score.csv")
# Clean the data by removing rows with NA values
credit_data_cleaned = na.omit(credit_data)
# Select only numerical columns for covariance matrix calculation
numerical_data = credit_data_cleaned[sapply(credit_data_cleaned, is.numeric)]
# Calculate the covariance matrix
cov_matrix = cov(numerical_data)
# Calculate eigenvalues of the covariance matrix
eigen_result = eigen(cov_matrix)
eigenvalues = eigen_result$values
# Find the largest eigenvalue and count how many eigenvalues lie within 0.1% of it
largest_eigenvalue = max(eigenvalues)
threshold = 0.001 * largest_eigenvalue
within_01_percent = sum(abs(eigenvalues - largest_eigenvalue) <= threshold)
# Print results
cat("Covariance Matrix:\n")
## Covariance Matrix:
print(cov_matrix)
## ID Monthly_Inhand_Salary Num_Bank_Accounts
## ID 1.872689e+09 -791821.199 27833.730262
## Monthly_Inhand_Salary -7.918212e+05 10151801.315 -3987.411696
## Num_Bank_Accounts 2.783373e+04 -3987.412 13403.717184
## Num_Credit_Card -1.578806e+04 -3192.155 8.429534
## Interest_Rate 8.072086e+04 -8078.645 -314.620170
## Delay_from_due_date -8.917304e+02 -11840.630 28.136646
## Num_Credit_Inquiries 5.674590e+04 -4637.933 -37.868717
## Credit_Utilization_Ratio -1.393385e+03 2802.141 -2.899387
## Total_EMI_per_month 1.225724e+06 200433.236 1266.982309
## Num_Credit_Card Interest_Rate Delay_from_due_date
## ID -15788.059289 80720.857611 -891.73040
## Monthly_Inhand_Salary -3192.155095 -8078.645310 -11840.62951
## Num_Bank_Accounts 8.429534 -314.620170 28.13665
## Num_Credit_Card 16652.876400 -253.581333 14.96018
## Interest_Rate -253.581333 220583.134877 56.11761
## Delay_from_due_date 14.960178 56.117608 221.07271
## Num_Credit_Inquiries -124.603490 105.214322 29.91397
## Credit_Utilization_Ratio -2.828593 -7.709259 -4.63557
## Total_EMI_per_month -37.712646 23412.840967 -476.49065
## Num_Credit_Inquiries Credit_Utilization_Ratio
## ID 56745.902935 -1393.384548
## Monthly_Inhand_Salary -4637.932648 2802.141469
## Num_Bank_Accounts -37.868717 -2.899387
## Num_Credit_Card -124.603490 -2.828593
## Interest_Rate 105.214322 -7.709259
## Delay_from_due_date 29.913973 -4.635570
## Num_Credit_Inquiries 36832.769272 -2.036039
## Credit_Utilization_Ratio -2.036039 26.134196
## Total_EMI_per_month -10519.388820 132.332927
## Total_EMI_per_month
## ID 1.225724e+06
## Monthly_Inhand_Salary 2.004332e+05
## Num_Bank_Accounts 1.266982e+03
## Num_Credit_Card -3.771265e+01
## Interest_Rate 2.341284e+04
## Delay_from_due_date -4.764906e+02
## Num_Credit_Inquiries -1.051939e+04
## Credit_Utilization_Ratio 1.323329e+02
## Total_EMI_per_month 6.923916e+07
cat("\nEigenvalues:\n")
##
## Eigenvalues:
print(eigenvalues)
## [1] 1.872691e+09 6.923903e+07 1.015081e+07 2.205660e+05 3.682817e+04
## [6] 1.665066e+04 1.340119e+04 2.071932e+02 2.534909e+01
cat("\nNumber of principal components within 0.1% of the largest component:",
within_01_percent, "\n")
##
## Number of principal components within 0.1% of the largest component: 1
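# Optional cross-check (a sketch): prcomp() performs the same PCA, and the
# squared standard deviations of its components should equal the eigenvalues of
# the covariance matrix computed above; pca is a new object used only here.
pca = prcomp(numerical_data, center = TRUE, scale. = FALSE)
cat("prcomp() variances match the covariance eigenvalues:",
    isTRUE(all.equal(pca$sdev^2, eigenvalues)), "\n")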
# Load required libraries
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: lattice
# Load the dataset
hotel_data = read.csv("Hotel Reservations.csv")
# PART 1: Correlation Analysis
# Calculate correlations
correlations = cor(hotel_data[sapply(hotel_data, is.numeric)])
# Rank by correlation with avg_price_per_room; position 1 is the variable itself,
# so positions 2:4 give the three most strongly (positively) correlated variables
strongest_variables = names(sort(correlations["avg_price_per_room", ], decreasing = TRUE)[2:4])
cat("Strongest correlated variables with avg_price_per_room:\n")
## Strongest correlated variables with avg_price_per_room:
print(strongest_variables)
## [1] "no_of_children" "no_of_adults" "no_of_special_requests"
# PART 2: Simple Linear Regression
# Extract the strongest correlated variable
X = hotel_data[["no_of_children"]]
y = hotel_data[["avg_price_per_room"]]
# Using normal equations
X_with_intercept = cbind(1, X) # Add intercept
coefficients_normal = solve(t(X_with_intercept) %*% X_with_intercept) %*% t(X_with_intercept) %*% y
cat("Coefficients using normal equations (simple linear regression):\n")
## Coefficients using normal equations (simple linear regression):
print(coefficients_normal)
## [,1]
## 100.32498
## X 29.43185
# Using lm()
lm_simple = lm(y ~ X)
cat("Coefficients using lm() (simple linear regression):\n")
## Coefficients using lm() (simple linear regression):
print(coef(lm_simple))
## (Intercept) X
## 100.32498 29.43185
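# The explicit inverse above works for this small design matrix, but solving the
# normal equations as a linear system is the more numerically stable idiom.
# A sketch of the equivalent computation (coefficients_stable is a new name):
coefficients_stable = solve(crossprod(X_with_intercept), crossprod(X_with_intercept, y))
print(coefficients_stable)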
# PART 3: Multi-Variable Linear Regression
# Extract top 3 correlated variables
X_multi = hotel_data[c("no_of_children", "no_of_adults", "no_of_special_requests")]
# Using normal equations
X_multi_with_intercept = cbind(1, as.matrix(X_multi))
coefficients_multi_normal = solve(t(X_multi_with_intercept) %*% X_multi_with_intercept) %*% t(X_multi_with_intercept) %*% y
cat("Coefficients using normal equations (multi-variable regression):\n")
## Coefficients using normal equations (multi-variable regression):
print(coefficients_multi_normal)
## [,1]
## 62.144079
## no_of_children 28.963747
## no_of_adults 19.391623
## no_of_special_requests 3.959254
# Using lm()
lm_multi = lm(y ~ ., data = X_multi)
cat("Coefficients using lm() (multi-variable regression):\n")
## Coefficients using lm() (multi-variable regression):
print(coef(lm_multi))
## (Intercept) no_of_children no_of_adults
## 62.144079 28.963747 19.391623
## no_of_special_requests
## 3.959254
# Adjusted R-squared values
# Helper: X should be the matrix of predictors (without an intercept column),
# so that p counts only the predictors
adjusted_r2 = function(model, X, y) {
  r2 = summary(model)$r.squared
  n = nrow(X)   # number of observations
  p = ncol(X)   # number of predictors
  return(1 - (1 - r2) * (n - 1) / (n - p - 1))
}
adj_r2_simple = adjusted_r2(lm_simple, as.matrix(X), y)  # predictors only, no intercept column
adj_r2_multi = adjusted_r2(lm_multi, as.matrix(X_multi), y)
cat("Adjusted R-squared (simple regression):", adj_r2_simple, "\n")
## Adjusted R-squared (simple regression): 0.1140113
cat("Adjusted R-squared (multi-variable regression):", adj_r2_multi, "\n")
## Adjusted R-squared (multi-variable regression): 0.2136433
# PART 4: Train-Test Split and RMSE Calculation
set.seed(42)
trainIndex = createDataPartition(y, p = 0.8, list = FALSE)
train = hotel_data[trainIndex, ]
test = hotel_data[-trainIndex, ]
# Train model on training set
lm_train = lm(avg_price_per_room ~ no_of_children + no_of_adults + no_of_special_requests, data = train)
# Predict on the test set
predictions = predict(lm_train, newdata = test)
rmse_test = sqrt(mean((test$avg_price_per_room - predictions)^2))
cat("RMSE on test set:", rmse_test, "\n")
## RMSE on test set: 31.05055
# RMSE on the full dataset for comparison
lm_full = lm(avg_price_per_room ~ no_of_children + no_of_adults + no_of_special_requests, data = hotel_data)
predictions_full = predict(lm_full, newdata = hotel_data)
rmse_full = sqrt(mean((hotel_data$avg_price_per_room - predictions_full)^2))
cat("RMSE on full dataset:", rmse_full, "\n")
## RMSE on full dataset: 31.11445
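# caret also ships an RMSE() helper; as a sketch, it should reproduce the manual
# calculations above for both the test set and the full dataset.
cat("RMSE via caret::RMSE (test set):", RMSE(predictions, test$avg_price_per_room), "\n")
cat("RMSE via caret::RMSE (full data):", RMSE(predictions_full, hotel_data$avg_price_per_room), "\n")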