공분산과 상관계수 in r

cov

\[ Population: Cov(X, Y) = E((X - \mu_X)(Y - \mu_Y)) \\ \; \\ Sample: S_{XY} = \frac{1}{n-1}\sum_{i=1}^n(x_i-\bar x)(y_i - \bar y) \]

X, Y가 독립이면 E(XY) = E(x)E(Y) = \(\mu\nu\)이다.

options(digits = 7)
pg_na_omitted <- na.omit(pg)
pg_sub <- pg[, c(3,4)]
pg_sub[is.na(pg$sex), ] <- NA

pg_bill_length <- pg_na_omitted$bill_length_mm
pg_bill_depth <- pg_na_omitted$bill_depth_mm

sum((pg_bill_length-mean(pg_bill_length))*(pg_bill_depth-mean(pg_bill_depth)))/
  (length(pg_bill_length)-1)
# [1] -2.462091
cov(pg_bill_length, pg_bill_depth)
# [1] -2.462091
pg_sub[sample(1:length(pg_sub$bill_length_mm), 10), "bill_length_mm"] <- NA
colSums(is.na(pg_sub))
# bill_length_mm  bill_depth_mm 
#             20             11
cov(pg_sub$bill_length_mm, pg_sub$bill_depth_mm, use = "complete")
# [1] -2.441408
cov(pg_sub$bill_length_mm, pg_sub$bill_depth_mm, use = "na.or.complete")
# [1] -2.441408
cov(pg_sub$bill_length_mm, pg_sub$bill_depth_mm, use = "pairwise")
# [1] -2.441408

cor

\[ Population: \rho = \frac{Cov(X, Y)}{\sigma_X\sigma_Y} \\ \; \\ Sample: r = \frac{\sum_{i=1}^n[(x_i-\bar x)(y_i-\bar y)]} {\sqrt{\sum_{i=1}^n(x_i-\bar x)^2 \times (y_i-\bar y)^2}} \]

cov(pg_bill_length, pg_bill_depth)/(sd(pg_bill_length)*sd(pg_bill_depth))
# [1] -0.2286256
sum((pg_bill_length-mean(pg_bill_length))*(pg_bill_depth-mean(pg_bill_depth)))/
  sqrt(sum((pg_bill_length-mean(pg_bill_length))^2)*
       sum((pg_bill_depth-mean(pg_bill_depth))^2))
# [1] -0.2286256
cor(pg_bill_length, pg_bill_depth)
# [1] -0.2286256
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    Cor <- abs(cor(x, y)) # Remove abs function if desired
    txt <- paste0(prefix, format(c(Cor, 0.123456789), digits = digits)[1])
    if(missing(cex.cor)) {
        cex.cor <- 0.4 / strwidth(txt)
    }
    text(0.5, 0.5, txt,
         cex = 1 + cex.cor * Cor) # Resize the text by level of correlation
}


panel.hist <- function(x, ...) {
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5))
    his <- hist(x, plot = FALSE)
    breaks <- his$breaks
    nB <- length(breaks)
    y <- his$counts
    y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = rgb(0, 1, 1, alpha = 0.5), ...)
    # lines(density(x), col = 2, lwd = 2) # Uncomment to add density lines
}

pairs(data.frame(`Bill Length` = na.omit(pg$bill_length_mm), 
                 `Bill Depth` = na.omit(pg$bill_depth_mm)),
      upper.panel = panel.cor, diag.panel = panel.hist)

{ plot(pg_bill_length, pg_bill_depth, pch = 19)
abline(lm(pg_bill_depth ~ pg_bill_length), col = "red", lwd = 3)
text(paste("Correlation:", round(cor(pg_bill_length, pg_bill_depth), 4)),
     x = 36, y = 13.3, ajd = c(0, 0.5), cex = 1.2)
}

.EOD.