# Load the pizza data set from the assignment folder.
# NOTE(review): the working directory is a machine-specific absolute path, so
# this only runs where that folder exists; "pizza.csv" is resolved against it.
setwd(dir = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_class_assignments/ICA6")
df_pizza <- read.csv(file = "pizza.csv")

# a. Covariance and correlation between moisture (mois) and cal, computed
#    with the built-in cov() and cor().

covariance <- with(df_pizza, cov(mois, cal))
print(paste("Covariance between mois and cal: ", covariance))
## [1] "Covariance between mois and cal:  -4.5279181270903"
correlation <- with(df_pizza, cor(mois, cal))
print(paste("Correlation between mois and cal: ", correlation))
## [1] "Correlation between mois and cal:  -0.764440543783477"
# b. Check the covariance through the identity Cov(X, Y) = E(XY) - E(X)E(Y).
#    mean() divides by n while cov() divides by n - 1, so the two results
#    agree only approximately.

EX <- mean(df_pizza[["mois"]])
EY <- mean(df_pizza[["cal"]])
EXY <- mean(df_pizza[["mois"]] * df_pizza[["cal"]])
verify_covariance <- EXY - EX * EY

cat("Calculation of covariance:", verify_covariance, "\n")
## Calculation of covariance: -4.512825
# c. Verify the correlation value by dividing the covariance by the product
#    of the standard deviations.
#
# sd() uses the sample (n - 1) denominator, whereas verify_covariance was
# built from plain means (denominator n). Rescale it by n / (n - 1) so the
# numerator and denominator follow the same convention; without this the
# ratio is off by a factor of (n - 1) / n and does not reproduce cor().

n_obs <- nrow(df_pizza)
sample_covariance <- verify_covariance * n_obs / (n_obs - 1)

std_mois <- sd(df_pizza$mois)
std_cal <- sd(df_pizza$cal)
verify_correlation <- sample_covariance / (std_mois * std_cal)

cat("Manual calculation of correlation:", verify_correlation, "\n")
## Manual calculation of correlation: -0.7644405
# d. Examine the entire correlation matrix. Which variable has the strongest
#    correlation with cal?

# Select the numeric columns only. vapply() always yields a logical vector
# (sapply() may change its return type on unusual input), and drop = FALSE
# keeps a data frame even when a single column qualifies, so cor() still
# receives matrix-like input. Note that the numeric id column is included.
numeric_cols <- vapply(df_pizza, is.numeric, logical(1))
correlation_matrix <- cor(df_pizza[, numeric_cols, drop = FALSE])

cat("Correlation matrix:\n")
## Correlation matrix:
print(correlation_matrix)
##                 id        mois        prot         fat         ash      sodium
## id      1.00000000  0.03259505 -0.07648546 -0.00486006 -0.03407156 -0.01515608
## mois    0.03259505  1.00000000  0.36024768 -0.17131821  0.26555552 -0.10227890
## prot   -0.07648546  0.36024768  1.00000000  0.49800167  0.82384370  0.42912952
## fat    -0.00486006 -0.17131821  0.49800167  1.00000000  0.79163396  0.93332522
## ash    -0.03407156  0.26555552  0.82384370  0.79163396  1.00000000  0.80812215
## sodium -0.01515608 -0.10227890  0.42912952  0.93332522  0.80812215  1.00000000
## carb    0.01496614 -0.59180165 -0.85354226 -0.64023817 -0.89898837 -0.62017634
## cal    -0.02120862 -0.76444054  0.07025810  0.76456710  0.32646845  0.67195750
##               carb         cal
## id      0.01496614 -0.02120862
## mois   -0.59180165 -0.76444054
## prot   -0.85354226  0.07025810
## fat    -0.64023817  0.76456710
## ash    -0.89898837  0.32646845
## sodium -0.62017634  0.67195750
## carb    1.00000000 -0.02348458
## cal    -0.02348458  1.00000000
# Reduce the matrix to its "cal" row without the cal-vs-cal entry. The result
# is a named vector of correlations between cal and each other numeric
# column; the name correlation_matrix is reused for that vector.
correlation_matrix <- correlation_matrix["cal", colnames(correlation_matrix) != "cal"]

# Largest (most positive) correlation with cal and the column it belongs to
strongest_positive_variable <- names(which.max(correlation_matrix))
strongest_positive_correlation <- max(correlation_matrix)

# Smallest (most negative) correlation with cal and the column it belongs to
strongest_negative_variable <- names(which.min(correlation_matrix))
strongest_negative_correlation <- min(correlation_matrix)

cat(
  "Variable with the strongest positive correlation with calories:",
  strongest_positive_variable,
  "with correlation:",
  strongest_positive_correlation,
  "\n"
)
## Variable with the strongest positive correlation with calories: fat with correlation: 0.7645671
cat(
  "Variable with the strongest negative correlation with calories:",
  strongest_negative_variable,
  "with correlation:",
  strongest_negative_correlation,
  "\n"
)
## Variable with the strongest negative correlation with calories: mois with correlation: -0.7644405
# Simulation settings: one million draws from a normal distribution with
# mean 8 and standard deviation 2.

n_simulations <- 1000000
mean_value <- 8
std_value <- 2

# Fix the seed so the run is reproducible, then draw x followed by y.
# Both vectors come from the same random stream, so the draw order matters.
set.seed(123)
x <- rnorm(n = n_simulations, mean = mean_value, sd = std_value)
y <- rnorm(n = n_simulations, mean = mean_value, sd = std_value)

# a. What is the covariance between x and y? Why?
#    x and y are separate rnorm() draws, so their sample covariance is
#    expected to sit near zero, away from it only by sampling noise.
covariance_xy <- cov(x = x, y = y)
cat("Covariance between x and y: ", covariance_xy, "\n")
## Covariance between x and y:  -0.003257275
# b. Define z = x + y and verify Var(z) = Var(x) + Var(y) = 2 * Var(x).
#    The sample values match only approximately because the sample
#    covariance of x and y is not exactly zero.
z <- x + y
variance_z <- var(z)
variance_y <- var(y)
variance_x <- var(x)
cat("Variance of x: ", variance_x, "\n")
## Variance of x:  3.999417
cat("Variance of y: ", variance_y, "\n")
## Variance of y:  3.998049
cat("Variance of z: ", variance_z, "\n")
## Variance of z:  7.990951
cat("2 * Variance of x: ", variance_x * 2, "\n")
## 2 * Variance of x:  7.998833
# c. What is the covariance and correlation between z and x?
#    Since z = x + y, Cov(z, x) = Var(x) + Cov(x, y).
correlation_zx <- cor(x = z, y = x)
covariance_zx <- cov(x = z, y = x)
cat("Covariance between z and x: ", covariance_zx, "\n")
## Covariance between z and x:  3.996159
cat("Correlation between z and x: ", correlation_zx, "\n")
## Correlation between z and x:  0.7068793
# d. Define w = z + x and verify Var(w) = Var(z) + Var(x) + 2 * Cov(z, x).
#    The direct sample variance of w is compared with the value assembled
#    from the previously computed variances and covariance.
w <- x + z
calculated_variance_w <- variance_z + variance_x + covariance_zx * 2
variance_w <- var(w)
cat("Variance of w: ", variance_w, "\n")
## Variance of w:  19.98269
cat("Calculated variance of w (using the formula): ", calculated_variance_w, "\n")
## Calculated variance of w (using the formula):  19.98269