1 Descriptive Statistics

url1 <- "https://raw.githubusercontent.com/novrisuhermi/dataset/refs/heads/main/data_sample.csv"
data <- read.csv(url1)
head(data)
data_stats <- summary(data)
print(data_stats)
       x        
 Min.   : 3.00  
 1st Qu.: 8.00  
 Median :10.00  
 Mean   : 9.94  
 3rd Qu.:12.00  
 Max.   :19.00  

1.1 Mean

\text { Mean: } \bar{x}=\frac{1}{n} \sum_{i=1}^n x_i

sample_mean <- function(data){
  n <- length(data)
  sum_data <- 0
  for (i in 1:n){
    sum_data <- sum_data + data[i]
  }
  return(sum_data/n)
}
sample_mean(data$x)
[1] 9.94
# built-in function in R
mean(data$x)
[1] 9.94

1.2 Variance

\text { Variance: } S^2=\frac{1}{n-1} \sum_{i=1}^n\left(x_i-\bar{x}\right)^2

sample_variance <- function(data){
  n <- length(data)
  
  sum_data <- 0
  for (i in 1:n){
    sum_data <- sum_data + data[i]
  }
  mean_data <- sum_data/n
  
  var_data <- 0
  for (i in 1:n){
    var_data <- var_data + (data[i]-mean_data)^2
  }
  
  return(var_data/(n-1))
}

sample_variance(data$x)
[1] 10.76404
# built-in function in R
var(data$x)
[1] 10.76404

1.3 Median, Quartiles & Percentiles

# Sorting Algorithm
bubble_sort <- function(x) {
  n <- length(x)
  
  for (i in 1:(n-1)) {
    for (j in 1:(n-i)) {
      if (x[j] > x[j+1]) {
        temp <- x[j]
        x[j] <- x[j+1]
        x[j+1] <- temp
      }
    }
  }
  
  return(x)
}
bubble_sort(data$x)
  [1]  3  4  4  4  4  4  5  5  5  5  6  6  6  6  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9
 [44]  9  9 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13
 [87] 13 13 14 14 15 15 15 16 16 16 16 17 17 19
sort(data$x)
  [1]  3  4  4  4  4  4  5  5  5  5  6  6  6  6  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9
 [44]  9  9 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13
 [87] 13 13 14 14 15 15 15 16 16 16 16 17 17 19

Calculating the Sample 100p-th Percentile

  1. Order the data from smallest to largest.
  2. Determine the product (sample size) \times (proportion) = n p.

If np is not an integer, round it up to the next integer and find the corresponding ordered value.

If np is an integer, say k, calculate the average of the kth and (k+1)st ordered values.

\begin{array}{ll} \text { Lower (first) quartile } & \mathrm{Q}_1=25 \text {th percentile } \\ \text { Second quartile (or median) } & \mathrm{Q}_2=50 \text {th percentile } \\ \text { Upper (third) quartile } & \mathrm{Q}_3=75 \text {th percentile } \end{array}

sample_median <- function(data){
  n <- length(data)
  sorted_data <- sort(data)
  if (n %% 2 == 1){
    med <- sorted_data[(n+1)/2]
  } else{
    med <- 0.5*(sorted_data[n/2] + sorted_data[n/2+1])
  }
  return(med)
}
sample_median(data$x)
[1] 10
median(data$x)
[1] 10
sample_percentile <- function(data, p){
  n <- length(data)
  sorted_data <- sort(data)
  np <- n*p
  
  if(np %% 1 != 0){                
    k <- ceiling(np)
    q <- sorted_data[k]
  } else{ 
    k <- np
    q <- 0.5 * (sorted_data[k] + sorted_data[k + 1])
  }
  return(q)
}

cat("Sample Quartiles\n",
    "-----------------\n",
    "Q1 (25%) :", sample_percentile(data$x, 0.25), "\n",
    "Q2 (50%) :", sample_percentile(data$x, 0.50), "\n",
    "Q3 (75%) :", sample_percentile(data$x, 0.75), "\n")
Sample Quartiles
 -----------------
 Q1 (25%) : 8 
 Q2 (50%) : 10 
 Q3 (75%) : 12 
quantile(data$x, probs = c(0.25,0.50,0.75))
25% 50% 75% 
  8  10  12 

1.4 Modes

sample_modes <- function(data){
  tab <- table(data)
  max_freq <- max(tab)
  modes <- names(tab)[tab == max_freq]
  list(modes = modes, frequency = max_freq)
}

sample_modes(data$x)
$modes
[1] "11"

$frequency
[1] 16
table(data$x)

 3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 
 1  5  4  4  7 14 10 10 16 11  6  2  3  4  2  1 
sample_modes(c(1,1,2,2,3,4,5,6,7))
$modes
[1] "1" "2"

$frequency
[1] 2

2 Bivariate & Multivariate Statistics

2.1 Covariance

df <- mtcars
head(df)

\operatorname{Cov}(x, y)=\frac{1}{n-1} \sum_{i=1}^n\left(x_i-\bar{x}\right)\left(y_i-\bar{y}\right)

sample_covariance <- function(x,y){
  n <- length(x)
  meanX <- mean(x)
  meanY <- mean(y)
  
  sample_cov <- 0
  for (i in 1:n){
    sample_cov <- sample_cov + (x[i]-meanX)*(y[i]-meanY)
  }
  
  return(sample_cov/(n-1))
}

sample_covariance(df$mpg, df$wt)
[1] -5.116685
cov(df$mpg, df$wt)
[1] -5.116685

2.2 Pearson’s Correlation Coefficient

r_{x y}=\frac{\sum_{i=1}^n\left(x_i-\bar{x}\right)\left(y_i-\bar{y}\right)}{\sqrt{\sum_{i=1}^n\left(x_i-\bar{x}\right)^2} \sqrt{\sum_{i=1}^n\left(y_i-\bar{y}\right)^2}}

sample_correlation <- function(x,y){
  n <- length(x)
  meanX <- mean(x)
  meanY <- mean(y)
  
  Sxy <- 0
  Sxx <- 0
  Syy <- 0
  
  for (i in 1:n){
    Sxy <- Sxy + (x[i]-meanX)*(y[i]-meanY)
    Sxx <- Sxx + (x[i]-meanX)^2
    Syy <- Syy + (y[i]-meanY)^2
  }
  
  Sx <- sqrt(Sxx)
  Sy <- sqrt(Syy)
  
  return(Sxy/(Sx*Sy))
}

sample_correlation(df$mpg, df$wt)
[1] -0.8676594
cor(df$mpg, df$wt)
[1] -0.8676594

2.3 Variance-Covariance Matrix

2.3.1 Multivariate Sample Setup

Suppose we observe:

  • n observations
  • p variables

The data matrix is:

X = \begin{pmatrix} x_{11} & x_{12} & \cdots & x_{1p} \\ x_{21} & x_{22} & \cdots & x_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{np} \end{pmatrix}

The sample mean of variable j:

\bar{x}_j = \frac{1}{n}\sum_{i=1}^{n} x_{ij}

Sample Variance (variable j)

s_{jj} = \frac{1}{n-1} \sum_{i=1}^{n} (x_{ij}-\bar{x}_j)^2

Sample Covariance (variables j and k)

s_{jk} = \frac{1}{n-1} \sum_{i=1}^{n} (x_{ij}-\bar{x}_j)(x_{ik}-\bar{x}_k)

The sample variance–covariance matrix is:

S=\frac{1}{n-1}\left(\begin{array}{cccc} \sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)^2 & \sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)\left(x_{i 2}-\bar{x}_2\right) & \cdots & \sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)\left(x_{i p}-\bar{x}_p\right) \\ \sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)\left(x_{i 1}-\bar{x}_1\right) & \sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)^2 & \cdots & \sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)\left(x_{i p}-\bar{x}_p\right) \\ \vdots & \vdots & \ddots & \vdots \\ \sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)\left(x_{i 1}-\bar{x}_1\right) & \sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)\left(x_{i 2}-\bar{x}_2\right) & \cdots & \sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)^2 \end{array}\right)


2.3.2 Centering the Data

Define the mean vector:

\bar{x} = \begin{pmatrix} \bar{x}_1 \\ \bar{x}_2 \\ \vdots \\ \bar{x}_p \end{pmatrix}

Let \mathbf{1} be an n \times 1 vector of ones.

The centered data matrix:

X_c = X - \mathbf{1}\bar{x}^\top

Each row becomes:

x_i - \bar{x}


2.3.3 Matrix Derivation

Consider:

X_c^\top X_c = \sum_{i=1}^{n} (x_i - \bar{x})(x_i - \bar{x})^\top

The (j,k)-th element equals:

\sum_{i=1}^{n} (x_{ij}-\bar{x}_j)(x_{ik}-\bar{x}_k)

Therefore,

S = \frac{1}{n-1} X_c^\top X_c


2.3.4 Alternative Expression

Expanding:

X_c^\top X_c = X^\top X - n \bar{x}\bar{x}^\top

Thus,

S = \frac{1}{n-1} \left( X^\top X - n \bar{x}\bar{x}^\top \right)


2.3.5 Write variance-covariance matrix function

covariance_matrix <- function(data){
  n <- dim(data)[1]
  MeanX <- colMeans(data)
  Cov <- t(data) %*% data - n * MeanX %*% t(MeanX)
  return(Cov/(n-1))
}

covariance_matrix(as.matrix(df)[,1:4])
             mpg        cyl       disp        hp
mpg    36.324103  -9.172379  -633.0972 -320.7321
cyl    -9.172379   3.189516   199.6603  101.9315
disp -633.097208 199.660282 15360.7998 6721.1587
hp   -320.732056 101.931452  6721.1587 4700.8669
cov(as.matrix(df)[,1:4])
             mpg        cyl       disp        hp
mpg    36.324103  -9.172379  -633.0972 -320.7321
cyl    -9.172379   3.189516   199.6603  101.9315
disp -633.097208 199.660282 15360.7998 6721.1587
hp   -320.732056 101.931452  6721.1587 4700.8669

2.4 Correlation Matrix

2.4.1 Scalar form sample correlation

The sample correlation between variables j and k:

r_{jk} = \frac{s_{jk}}{\sqrt{s_{jj}}\sqrt{s_{kk}}}

where

s_{jj} = \text{Var}(X_j), \quad s_{kk} = \text{Var}(X_k)


2.4.2 Explicit Matrix Form

R = \begin{pmatrix} 1 & r_{12} & \cdots & r_{1p} \\ r_{21} & 1 & \cdots & r_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ r_{p1} & r_{p2} & \cdots & 1 \end{pmatrix}


2.4.3 Standardization

Define the standard deviation of variable j:

s_j = \sqrt{s_{jj}}

Define the diagonal matrix of standard deviations:

D = \begin{pmatrix} s_1 & 0 & \cdots & 0 \\ 0 & s_2 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & s_p \end{pmatrix}


2.4.4 Matrix Transformation

The correlation matrix is obtained by standardizing the covariance matrix:

R = D^{-1} S D^{-1}

Each element becomes:

r_{jk} = \frac{s_{jk}}{s_j s_k}


2.4.5 Alternative Derivation

Define the standardized variables:

z_{ij} = \frac{x_{ij}-\bar{x}_j}{s_j}

Let Z be the standardized data matrix.

Then,

R = \frac{1}{n-1} Z^\top Z

Thus, the correlation matrix is simply the covariance matrix of standardized variables.


2.4.6 Write correlation matrix function

correlation_matrix <- function(data){
  S <- cov(data)
  D <- diag(sqrt(diag(S)))
  Corr <- solve(D) %*% S %*% solve(D)
  return(Corr)
}

correlation_matrix(as.matrix(df)[,1:4])
           [,1]       [,2]       [,3]       [,4]
[1,]  1.0000000 -0.8521620 -0.8475514 -0.7761684
[2,] -0.8521620  1.0000000  0.9020329  0.8324475
[3,] -0.8475514  0.9020329  1.0000000  0.7909486
[4,] -0.7761684  0.8324475  0.7909486  1.0000000
cor(as.matrix(df)[,1:4])
            mpg        cyl       disp         hp
mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684
cyl  -0.8521620  1.0000000  0.9020329  0.8324475
disp -0.8475514  0.9020329  1.0000000  0.7909486
hp   -0.7761684  0.8324475  0.7909486  1.0000000
correlation_matrix_std <- function(data){
  n <- dim(data)[1]
  Z <- scale(data, center = TRUE, scale = TRUE)
  R <- (t(Z) %*% Z) / (n - 1)
  return(R)
}

correlation_matrix_std(as.matrix(df)[,1:4])
            mpg        cyl       disp         hp
mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684
cyl  -0.8521620  1.0000000  0.9020329  0.8324475
disp -0.8475514  0.9020329  1.0000000  0.7909486
hp   -0.7761684  0.8324475  0.7909486  1.0000000

3 Handling Categorical Data

3.1 Contingency Table (Crosstab)

df$Transmission <- factor(df$am,
                          levels = c(0,1),
                          labels = c("Automatic","Manual"))

df$Cylinders <- factor(df$cyl)

df$Engine <- factor(df$vs,
                    levels = c(0,1),
                    labels = c("V-shaped","Straight"))

head(df[,c("Transmission","Cylinders","Engine")], 10)

3.1.1 Crosstab

tab1 <- table(df$Transmission, df$Engine)
tab1
           
            V-shaped Straight
  Automatic       12        7
  Manual           6        7

3.1.2 Row Proportions

prop.table(tab1, margin = 1)
           
             V-shaped  Straight
  Automatic 0.6315789 0.3684211
  Manual    0.4615385 0.5384615

3.1.3 Column Proportions

prop.table(tab1, margin = 2)
           
             V-shaped  Straight
  Automatic 0.6666667 0.5000000
  Manual    0.3333333 0.5000000

3.1.4 Three-way Contingency Table

tab3 <- table(df$Transmission,
              df$Cylinders,
              df$Engine)

tab3
, ,  = V-shaped

           
             4  6  8
  Automatic  0  0 12
  Manual     1  3  2

, ,  = Straight

           
             4  6  8
  Automatic  3  4  0
  Manual     7  0  0
ftable(tab3)
             V-shaped Straight
                              
Automatic 4         0        3
          6         0        4
          8        12        0
Manual    4         1        7
          6         3        0
          8         2        0
---
title: "Computational Statistics Week 2"

output:
  html_notebook:
    math_method: katex
    theme: yeti
    toc: true
    toc_float:
      toc_collapsed: true
    number_sections: true
    df_print: paged
---

# Descriptive Statistics


```{r}
url1 <- "https://raw.githubusercontent.com/novrisuhermi/dataset/refs/heads/main/data_sample.csv"
data <- read.csv(url1)
head(data)
```

```{r}
data_stats <- summary(data)
print(data_stats)
```

## Mean

$$
\text { Mean: } \bar{x}=\frac{1}{n} \sum_{i=1}^n x_i
$$

```{r}
sample_mean <- function(data){
  n <- length(data)
  sum_data <- 0
  for (i in 1:n){
    sum_data <- sum_data + data[i]
  }
  return(sum_data/n)
}
```


```{r}
sample_mean(data$x)
```

```{r}
# built-in function in R
mean(data$x)
```

---

## Variance

$$
\text { Variance: } S^2=\frac{1}{n-1} \sum_{i=1}^n\left(x_i-\bar{x}\right)^2
$$

```{r}
sample_variance <- function(data){
  n <- length(data)
  
  sum_data <- 0
  for (i in 1:n){
    sum_data <- sum_data + data[i]
  }
  mean_data <- sum_data/n
  
  var_data <- 0
  for (i in 1:n){
    var_data <- var_data + (data[i]-mean_data)^2
  }
  
  return(var_data/(n-1))
}

sample_variance(data$x)
```

```{r}
# built-in function in R
var(data$x)
```

---

## Median, Quartiles & Percentiles


```{r}
# Sorting Algorithm
bubble_sort <- function(x) {
  n <- length(x)
  
  for (i in 1:(n-1)) {
    for (j in 1:(n-i)) {
      if (x[j] > x[j+1]) {
        temp <- x[j]
        x[j] <- x[j+1]
        x[j+1] <- temp
      }
    }
  }
  
  return(x)
}
```

```{r}
bubble_sort(data$x)
```

```{r}
sort(data$x)
```

Calculating the Sample $100p$-th Percentile

1. Order the data from smallest to largest.
2. Determine the product (sample size) $\times$ (proportion) $= n p$.

If $np$ is not an integer, round it up to the next integer and find the corresponding ordered value.

If $np$ is an integer, say $k$, calculate the average of the $k$th and $(k+1)$st ordered values.

$$
\begin{array}{ll}
\text { Lower (first) quartile } & \mathrm{Q}_1=25 \text {th percentile } \\
\text { Second quartile (or median) } & \mathrm{Q}_2=50 \text {th percentile } \\
\text { Upper (third) quartile } & \mathrm{Q}_3=75 \text {th percentile }
\end{array}
$$

```{r}
sample_median <- function(data){
  n <- length(data)
  sorted_data <- sort(data)
  if (n %% 2 == 1){
    med <- sorted_data[(n+1)/2]
  } else{
    med <- 0.5*(sorted_data[n/2] + sorted_data[n/2+1])
  }
  return(med)
}
sample_median(data$x)
```

```{r}
median(data$x)
```

```{r}
sample_percentile <- function(data, p){
  n <- length(data)
  sorted_data <- sort(data)
  np <- n*p
  
  if(np %% 1 != 0){                
    k <- ceiling(np)
    q <- sorted_data[k]
  } else{ 
    k <- np
    q <- 0.5 * (sorted_data[k] + sorted_data[k + 1])
  }
  return(q)
}

cat("Sample Quartiles\n",
    "-----------------\n",
    "Q1 (25%) :", sample_percentile(data$x, 0.25), "\n",
    "Q2 (50%) :", sample_percentile(data$x, 0.50), "\n",
    "Q3 (75%) :", sample_percentile(data$x, 0.75), "\n")
```

```{r}
quantile(data$x, probs = c(0.25,0.50,0.75))
```

---

## Modes

```{r}
sample_modes <- function(data){
  tab <- table(data)
  max_freq <- max(tab)
  modes <- names(tab)[tab == max_freq]
  list(modes = modes, frequency = max_freq)
}

sample_modes(data$x)
```

```{r}
table(data$x)
```


```{r}
sample_modes(c(1,1,2,2,3,4,5,6,7))
```

# Bivariate & Multivariate Statistics

## Covariance 

```{r}
df <- mtcars
head(df)
```

$$
\operatorname{Cov}(x, y)=\frac{1}{n-1} \sum_{i=1}^n\left(x_i-\bar{x}\right)\left(y_i-\bar{y}\right)
$$

```{r}
sample_covariance <- function(x,y){
  n <- length(x)
  meanX <- mean(x)
  meanY <- mean(y)
  
  sample_cov <- 0
  for (i in 1:n){
    sample_cov <- sample_cov + (x[i]-meanX)*(y[i]-meanY)
  }
  
  return(sample_cov/(n-1))
}

sample_covariance(df$mpg, df$wt)
```

```{r}
cov(df$mpg, df$wt)
```

---

## Pearson's Correlation Coefficient

$$
r_{x y}=\frac{\sum_{i=1}^n\left(x_i-\bar{x}\right)\left(y_i-\bar{y}\right)}{\sqrt{\sum_{i=1}^n\left(x_i-\bar{x}\right)^2} \sqrt{\sum_{i=1}^n\left(y_i-\bar{y}\right)^2}}
$$
```{r}
sample_correlation <- function(x,y){
  n <- length(x)
  meanX <- mean(x)
  meanY <- mean(y)
  
  Sxy <- 0
  Sxx <- 0
  Syy <- 0
  
  for (i in 1:n){
    Sxy <- Sxy + (x[i]-meanX)*(y[i]-meanY)
    Sxx <- Sxx + (x[i]-meanX)^2
    Syy <- Syy + (y[i]-meanY)^2
  }
  
  Sx <- sqrt(Sxx)
  Sy <- sqrt(Syy)
  
  return(Sxy/(Sx*Sy))
}

sample_correlation(df$mpg, df$wt)
```

```{r}
cor(df$mpg, df$wt)
```

---

## Variance-Covariance Matrix

### Multivariate Sample Setup

Suppose we observe:

- \( n \) observations  
- \( p \) variables  

The data matrix is:

\[
X =
\begin{pmatrix}
x_{11} & x_{12} & \cdots & x_{1p} \\
x_{21} & x_{22} & \cdots & x_{2p} \\
\vdots & \vdots & \ddots & \vdots \\
x_{n1} & x_{n2} & \cdots & x_{np}
\end{pmatrix}
\]

The sample mean of variable \( j \):

\[
\bar{x}_j = \frac{1}{n}\sum_{i=1}^{n} x_{ij}
\]

Sample Variance (variable \( j \))

\[
s_{jj} =
\frac{1}{n-1}
\sum_{i=1}^{n}
(x_{ij}-\bar{x}_j)^2
\]

Sample Covariance (variables \( j \) and \( k \))

\[
s_{jk} =
\frac{1}{n-1}
\sum_{i=1}^{n}
(x_{ij}-\bar{x}_j)(x_{ik}-\bar{x}_k)
\]





The sample variance–covariance matrix is:

$$
S=\frac{1}{n-1}\left(\begin{array}{cccc}
\sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)^2 & \sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)\left(x_{i 2}-\bar{x}_2\right) & \cdots & \sum_{i=1}^n\left(x_{i 1}-\bar{x}_1\right)\left(x_{i p}-\bar{x}_p\right) \\
\sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)\left(x_{i 1}-\bar{x}_1\right) & \sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)^2 & \cdots & \sum_{i=1}^n\left(x_{i 2}-\bar{x}_2\right)\left(x_{i p}-\bar{x}_p\right) \\
\vdots & \vdots & \ddots & \vdots \\
\sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)\left(x_{i 1}-\bar{x}_1\right) & \sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)\left(x_{i 2}-\bar{x}_2\right) & \cdots & \sum_{i=1}^n\left(x_{i p}-\bar{x}_p\right)^2
\end{array}\right)
$$

---

### Centering the Data

Define the mean vector:

\[
\bar{x} =
\begin{pmatrix}
\bar{x}_1 \\
\bar{x}_2 \\
\vdots \\
\bar{x}_p
\end{pmatrix}
\]

Let \( \mathbf{1} \) be an \( n \times 1 \) vector of ones.

The centered data matrix:

\[
X_c = X - \mathbf{1}\bar{x}^\top
\]

Each row becomes:

\[
x_i - \bar{x}
\]

---

### Matrix Derivation

Consider:

\[
X_c^\top X_c
=
\sum_{i=1}^{n}
(x_i - \bar{x})(x_i - \bar{x})^\top
\]

The \( (j,k) \)-th element equals:

\[
\sum_{i=1}^{n}
(x_{ij}-\bar{x}_j)(x_{ik}-\bar{x}_k)
\]

Therefore,

\[
S = \frac{1}{n-1} X_c^\top X_c
\]

---

### Alternative Expression

Expanding:

\[
X_c^\top X_c
=
X^\top X - n \bar{x}\bar{x}^\top
\]

Thus,

\[
S =
\frac{1}{n-1}
\left(
X^\top X - n \bar{x}\bar{x}^\top
\right)
\]

---

### Write variance-covariance matrix function

```{r}
covariance_matrix <- function(data){
  n <- dim(data)[1]
  MeanX <- colMeans(data)
  Cov <- t(data) %*% data - n * MeanX %*% t(MeanX)
  return(Cov/(n-1))
}

covariance_matrix(as.matrix(df)[,1:4])
```

```{r}
cov(as.matrix(df)[,1:4])
```

## Correlation Matrix

### Scalar form sample correlation

The sample correlation between variables \( j \) and \( k \):

\[
r_{jk} =
\frac{s_{jk}}{\sqrt{s_{jj}}\sqrt{s_{kk}}}
\]

where

\[
s_{jj} = \text{Var}(X_j), \quad
s_{kk} = \text{Var}(X_k)
\]

---

### Explicit Matrix Form

\[
R =
\begin{pmatrix}
1 & r_{12} & \cdots & r_{1p} \\
r_{21} & 1 & \cdots & r_{2p} \\
\vdots & \vdots & \ddots & \vdots \\
r_{p1} & r_{p2} & \cdots & 1
\end{pmatrix}
\]

---

### Standardization

Define the standard deviation of variable \( j \):

\[
s_j = \sqrt{s_{jj}}
\]

Define the diagonal matrix of standard deviations:

\[
D =
\begin{pmatrix}
s_1 & 0 & \cdots & 0 \\
0 & s_2 & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & s_p
\end{pmatrix}
\]

---

### Matrix Transformation

The correlation matrix is obtained by standardizing the covariance matrix:

\[
R = D^{-1} S D^{-1}
\]

Each element becomes:

\[
r_{jk} =
\frac{s_{jk}}{s_j s_k}
\]

---

### Alternative Derivation

Define the standardized variables:

\[
z_{ij} =
\frac{x_{ij}-\bar{x}_j}{s_j}
\]

Let \( Z \) be the standardized data matrix.

Then,

\[
R = \frac{1}{n-1} Z^\top Z
\]

Thus, the correlation matrix is simply the covariance matrix of standardized variables.

---

### Write correlation matrix function

```{r}
correlation_matrix <- function(data){
  S <- cov(data)
  D <- diag(sqrt(diag(S)))
  Corr <- solve(D) %*% S %*% solve(D)
  return(Corr)
}

correlation_matrix(as.matrix(df)[,1:4])
```

```{r}
cor(as.matrix(df)[,1:4])
```

```{r}
correlation_matrix_std <- function(data){
  n <- dim(data)[1]
  Z <- scale(data, center = TRUE, scale = TRUE)
  R <- (t(Z) %*% Z) / (n - 1)
  return(R)
}

correlation_matrix_std(as.matrix(df)[,1:4])
```

# Handling Categorical Data

## Contingency Table (Crosstab)

```{r}
df$Transmission <- factor(df$am,
                          levels = c(0,1),
                          labels = c("Automatic","Manual"))

df$Cylinders <- factor(df$cyl)

df$Engine <- factor(df$vs,
                    levels = c(0,1),
                    labels = c("V-shaped","Straight"))

head(df[,c("Transmission","Cylinders","Engine")], 10)
```

### Crosstab

```{r}
tab1 <- table(df$Transmission, df$Engine)
tab1
```

### Row Proportions

```{r}
prop.table(tab1, margin = 1)
```

### Column Proportions

```{r}
prop.table(tab1, margin = 2)
```

### Three-way Contingency Table

```{r}
tab3 <- table(df$Transmission,
              df$Cylinders,
              df$Engine)

tab3
```

```{r}
ftable(tab3)
```




