code 10.1

콜레스키 분해를 이용한 공분산행렬 기반의 난수생성

r.cov<-function(mu.e, mu.x1, mu.x2, cov.ee, cov.e.x1, cov.e.x2, cov.x1.x1, cov.x1.x2, cov.x2.x2, n)
{
  mu<-matrix(c(mu.e, mu.x1, mu.x2), ncol=1)
    cov<-matrix(c(cov.ee, cov.e.x1, cov.e.x2, cov.e.x1, cov.x1.x1, cov.x1.x2, cov.e.x2, cov.x1.x2, cov.x2.x2), ncol=3)

    print("평균벡터")
    print(mu)
    print("공분산행렬")
    print(cov)

    print("공분산행렬의 콜레스키 분해")
    l <- chol(cov) 

    print(l)
    temp <- t(l)%*%matrix(rnorm(dim(cov)[1]*n), ncol=n)
    y<-as.data.frame(t(temp))
    names(y) <- c("error", "x1", "x2")  
    y$error<-y$error + mu.e
    y$x1<-y$x1 + mu.x1
    y$x2<-y$x2 + mu.x2
    return(y)
}

salary<-r.cov(0, 60, 30,  2,  0, 0.1,  2, -0.8, 1, 1000)

## [1] "평균벡터"
##      [,1]
## [1,]    0
## [2,]   60
## [3,]   30
## [1] "공분산행렬"
##      [,1] [,2] [,3]
## [1,]  2.0  0.0  0.1
## [2,]  0.0  2.0 -0.8
## [3,]  0.1 -0.8  1.0
## [1] "공분산행렬의 콜레스키 분해"
##          [,1]     [,2]        [,3]
## [1,] 1.414214 0.000000  0.07071068
## [2,] 0.000000 1.414214 -0.56568542
## [3,] 0.000000 0.000000  0.82158384

cov(salary)

##            error          x1         x2
## error 1.92652757  0.06743591  0.1167598
## x1    0.06743591  2.04033120 -0.7970544
## x2    0.11675983 -0.79705436  0.9412816

cor(salary)

##            error          x1          x2
## error 1.00000000  0.03401365  0.08670538
## x1    0.03401365  1.00000000 -0.57514563
## x2    0.08670538 -0.57514563  1.00000000

salary$y<-with(salary, 1 + 0.2*x1 + 0.3*x2 + error)

library(ggplot2)

ggplot(salary, aes(x=x1, y=x2)) + geom_point() + stat_smooth(method=lm, se=FALSE)

10.2

콜레스키 분해를 이용한 공분산행렬 기반의 난수생성

r.cov<-function(mu.x1, mu.x2, cov.x1.x1, cov.x2.x2, cov.x1.x2, n)
{
    mu<-matrix(c(mu.x1, mu.x2), ncol=1)
    cov<-matrix(c(cov.x1.x1, cov.x1.x2, cov.x1.x2, cov.x2.x2), ncol=2)
    
    print("평균 벡터")
    print(mu)

    print("공분산 행렬")
    print(cov)

    print("공분산행렬의 콜레스키 분해")
    l <- chol(cov) 

    print(l)
    temp <- t(l)%*%matrix(rnorm(dim(cov)[1]*n), ncol=n)
    y<-as.data.frame(t(temp))
    names(y) <- c("x1", "x2")   
    y$x1<-y$x1 + mu.x1
    y$x2<-y$x2 + mu.x2
    return(y)

}

#상관관계가 음수일때

cor1<-r.cov(0, 0, 1, 1, -0.1, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0 -0.1
## [2,] -0.1  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]       [,2]
## [1,]    1 -0.1000000
## [2,]    0  0.9949874

cor2<-r.cov(0, 0, 1, 1, -0.3, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0 -0.3
## [2,] -0.3  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]       [,2]
## [1,]    1 -0.3000000
## [2,]    0  0.9539392

cor3<-r.cov(0, 0, 1, 1, -0.5, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0 -0.5
## [2,] -0.5  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]       [,2]
## [1,]    1 -0.5000000
## [2,]    0  0.8660254

cor4<-r.cov(0, 0, 1, 1, -0.9, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0 -0.9
## [2,] -0.9  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]       [,2]
## [1,]    1 -0.9000000
## [2,]    0  0.4358899

par(mfrow=c(2,2))   

lm1<-lm(x2~x1, data=cor1)
plot(cor1$x1, cor1$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.1")
abline(lm1)


lm2<-lm(x2~x1, data=cor2)
plot(cor2$x1, cor2$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.3")
abline(lm2)

lm3<-lm(x2~x1, data=cor3)
plot(cor3$x1, cor3$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.5")
abline(lm3)

lm4<-lm(x2~x1, data=cor4)
plot(cor4$x1, cor4$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.9")
abline(lm4)

#상관관계가 양수일때
cor1<-r.cov(0, 0, 1, 1, 0.1, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0  0.1
## [2,]  0.1  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]      [,2]
## [1,]    1 0.1000000
## [2,]    0 0.9949874

cor2<-r.cov(0, 0, 1, 1, 0.3, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0  0.3
## [2,]  0.3  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]      [,2]
## [1,]    1 0.3000000
## [2,]    0 0.9539392

cor3<-r.cov(0, 0, 1, 1, 0.5, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0  0.5
## [2,]  0.5  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]      [,2]
## [1,]    1 0.5000000
## [2,]    0 0.8660254

cor4<-r.cov(0, 0, 1, 1, 0.9, 100)

## [1] "평균 벡터"
##      [,1]
## [1,]    0
## [2,]    0
## [1] "공분산 행렬"
##      [,1] [,2]
## [1,]  1.0  0.9
## [2,]  0.9  1.0
## [1] "공분산행렬의 콜레스키 분해"
##      [,1]      [,2]
## [1,]    1 0.9000000
## [2,]    0 0.4358899

par(mfrow=c(2,2))   

lm1<-lm(x2~x1, data=cor1)
plot(cor1$x1, cor1$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.1")
abline(lm1)


lm2<-lm(x2~x1, data=cor2)
plot(cor2$x1, cor2$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.3")
abline(lm2)

lm3<-lm(x2~x1, data=cor3)
plot(cor3$x1, cor3$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.5")
abline(lm3)

lm4<-lm(x2~x1, data=cor4)
plot(cor4$x1, cor4$x2, xlim=c(-3, 3), ylim=c(-3, 3), main="cor=-0.9")
abline(lm4)

code 10.3

set.seed(1092)
i<-seq(1, 20)
x<-rnorm(20, mean=20, sd=10)
y<-2*x + rnorm(20, mean=20, sd=10)
y[20]<-400
sim<-data.frame(i, x, y)

cor(x,y)

## [1] 0.4356212

cor(x[i<20], y[i<20])

## [1] 0.9423874

library(ggplot2)

# 원본대로 그리기

ggplot(sim, aes(x=x, y=y)) + geom_point() + ylim(c(-10,400)) +
    stat_smooth(method=lm, se=FALSE)

X11()   #창 하나 더 띄우기

ggplot(sim[i<20,], aes(x=x, y=y)) + geom_point() + ylim(c(-10,400)) +
    stat_smooth(method=lm, se=FALSE)

# 두 개 같이 그려 비교하기

x1<-append(x, x)
y1<-append(y, y)
y1[40]<-NA
type<-c(rep("cor1", each=20, times=1), rep("cor2", each=20, times=1))

sim2<-data.frame(x1, y1, type)

ggplot(sim, aes(x=x1, y=y1, color=type)) + geom_point() + ylim(c(-10,400)) +
    stat_smooth(method=lm, se=FALSE)

## Warning: Removed 1 rows containing missing values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

code 10.4

coef<-c(rep(2, each=20, times=1), rep(10, each=20, times=1))
x<-rnorm(40, mean=20, sd=10)
y<-coef*x + rnorm(40, mean=0, sd=0)

sim<-data.frame(coef, x, y)
sim$coef.type<-with(sim, as.factor(coef))

#같이 그리기
ggplot(sim, aes(x=x, y=y, color=coef.type)) + geom_point() +
    stat_smooth(method=lm, se=FALSE)

#따로 그리기
ggplot(sim, aes(x=x, y=y, color=coef.type)) + geom_point() +
    stat_smooth(method=lm, se=FALSE) + facet_grid( ~ coef)

code 10.5

x<-seq(1, 20)
y<-x^2 -20*x + rnorm(20, mean=0, sd=1)

sim<-data.frame(x, y)

ggplot(sim, aes(x=x, y=y)) + geom_point() + 
    stat_smooth(method=lm, se=FALSE)

code 10.6

install.packages(“corrplot”) install.packages(“psych”) install.packages(“ggm”)

id<-c(1, 2, 3, 4, 5)
q1<-c(6, 4, 4, 3, 1)
q2<-c(6, 6, 4, 1, 2)
q3<-c(5, 5, 4, 4, 1)
q4<-c(4, 3, 2, 2, 1)

cron<-data.frame(id, q1, q2, q3, q4)
summary(cron)

##        id          q1            q2            q3            q4     
##  Min.   :1   Min.   :1.0   Min.   :1.0   Min.   :1.0   Min.   :1.0  
##  1st Qu.:2   1st Qu.:3.0   1st Qu.:2.0   1st Qu.:4.0   1st Qu.:2.0  
##  Median :3   Median :4.0   Median :4.0   Median :4.0   Median :2.0  
##  Mean   :3   Mean   :3.6   Mean   :3.8   Mean   :3.8   Mean   :2.4  
##  3rd Qu.:4   3rd Qu.:4.0   3rd Qu.:6.0   3rd Qu.:5.0   3rd Qu.:3.0  
##  Max.   :5   Max.   :6.0   Max.   :6.0   Max.   :5.0   Max.   :4.0

cor(cron[,2:5])

##           q1        q2        q3        q4
## q1 1.0000000 0.7604172 0.8877834 0.9414689
## q2 0.7604172 1.0000000 0.6538566 0.8076923
## q3 0.8877834 0.6538566 1.0000000 0.8540168
## q4 0.9414689 0.8076923 0.8540168 1.0000000

plot(cron[,2:5])
pairs(cron[,2:5])

#다양한 상관관계 plot

#함수 정의
panel.cor <- function(x,y,digits=2,prefix="",cex.cor, ...)
{
    usr <-par("usr")
    on.exit(par(usr))
    par(usr =c(0, 1, 0, 1))
    r <-abs(cor(x,y,use="complete.obs"))
    txt <-format(c(r, 0.123456789),digits=digits)[1]
    txt <-paste(prefix,txt,sep="")
    if(missing(cex.cor))cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5,txt,cex =cex.cor * (1 +r) / 2)
}

panel.hist <- function(x, ...)
{
    usr <-par("usr")
    on.exit(par(usr))
    par(usr =c(usr[1:2], 0, 1.5) )
    h <-hist(x,plot = FALSE)
    breaks <-h$breaks
    nB <-length(breaks)
    y <-h$counts
    y <-y/max(y)
    rect(breaks[-nB], 0,breaks[-1],y,col="white", ...)
}

panel.lm <- function (x,y,col =par("col"),bg = NA,pch =par("pch"),
cex = 1,col.smooth = "black", ...) 
{
    points(x,y,pch =pch,col =col,bg =bg,cex =cex)
    abline(stats::lm(y ~ x),col =col.smooth, ...)
}

#다양한 matrix 그림
pairs(cron[,2:5],upper.panel =panel.cor,
        diag.panel  =panel.hist, lower.panel =panel.lm)

#상관관계 행렬을 나타내는 다양한 그림

library(corrplot)

corrplot(cor(cron[,2:5]))

corrplot(cor(cron[,2:5]) ,method="shade",shade.col=NA,tl.col="black",tl.srt=45)

corrplot(cor(cron[,2:5]),method="shade",shade.col=NA,tl.col="black",tl.srt=45,
    addCoef.col="black",addcolorlabel="no",order="AOE")

## Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt
## = tl.srt, : "addcolorlabel"는 그래픽 매개변수가 아닙니다

## Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col
## = tl.col, : "addcolorlabel"는 그래픽 매개변수가 아닙니다

## Warning in title(title, ...): "addcolorlabel"는 그래픽 매개변수가 아닙니다

# 귀무가설 검정(rho=0)


library(psych)

## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%

?corr.test

## starting httpd help server ... done

tt<-corr.test(cron[,2:5])
print(tt, short=FALSE)

## Call:corr.test(x = cron[, 2:5])
## Correlation matrix 
##      q1   q2   q3   q4
## q1 1.00 0.76 0.89 0.94
## q2 0.76 1.00 0.65 0.81
## q3 0.89 0.65 1.00 0.85
## q4 0.94 0.81 0.85 1.00
## Sample Size 
## [1] 5
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      q1   q2   q3   q4
## q1 0.00 0.29 0.22 0.10
## q2 0.14 0.00 0.29 0.29
## q3 0.04 0.23 0.00 0.26
## q4 0.02 0.10 0.07 0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
## 
##  Confidence intervals based upon normal theory.  To get bootstrapped values, try cor.ci
##       lower    r upper    p
## q1-q2 -0.37 0.76  0.98 0.14
## q1-q3  0.03 0.89  0.99 0.04
## q1-q4  0.35 0.94  1.00 0.02
## q2-q3 -0.54 0.65  0.97 0.23
## q2-q4 -0.26 0.81  0.99 0.10
## q3-q4 -0.11 0.85  0.99 0.07

library(ggm)

## Loading required package: igraph
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
## 
## 
## Attaching package: 'ggm'
## 
## The following object is masked from 'package:igraph':
## 
##     pa

pcor(c("q1", "q3", "q2", "q4" ), var(cron[,2:5]))

## [1] 0.4808766

#앞의 둘은 편상관계수 구할 넘, 나머지 뒤의 벡터는 통제


#test statistic for rho=0
p.coef<-pcor(c("q1", "q3", "q2", "q4" ), var(cron[,2:5]))

pcor.test(p.coef, 2, 5)

## $tval
## [1] 0.5484524
## 
## $df
## [1] 1
## 
## $pvalue
## [1] 0.680637

## code 10.8

cov(cron[,2:5])

##      q1   q2   q3   q4
## q1 3.30 3.15 2.65 1.95
## q2 3.15 5.20 2.45 2.10
## q3 2.65 2.45 2.70 1.60
## q4 1.95 2.10 1.60 1.30

library(psych)
alpha(cron[,2:5])

## 
## Reliability analysis   
## Call: alpha(x = cron[, 2:5])
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean  sd
##       0.92      0.95    0.95      0.82  18 0.24  3.4 1.6
## 
##  lower alpha upper     95% confidence boundaries
## 0.45 0.92 1.39 
## 
##  Reliability if an item is dropped:
##    raw_alpha std.alpha G6(smc) average_r  S/N alpha se
## q1      0.86      0.91    0.90      0.77 10.1     0.36
## q2      0.94      0.96    0.95      0.89 25.4     0.30
## q3      0.89      0.94    0.93      0.84 15.4     0.34
## q4      0.89      0.91    0.90      0.77  9.9     0.34
## 
##  Item statistics 
##    n raw.r std.r r.cor r.drop mean  sd
## q1 5  0.96  0.97  0.97   0.92  3.6 1.8
## q2 5  0.89  0.87  0.80   0.76  3.8 2.3
## q3 5  0.90  0.91  0.88   0.83  3.8 1.6
## q4 5  0.96  0.97  0.97   0.94  2.4 1.1
## 
## Non missing response frequency for each item
##      1   2   3   4   5   6 miss
## q1 0.2 0.0 0.2 0.4 0.0 0.2    0
## q2 0.2 0.2 0.0 0.2 0.0 0.4    0
## q3 0.2 0.0 0.0 0.4 0.4 0.0    0
## q4 0.2 0.4 0.2 0.2 0.0 0.0    0

#척도변화에 따른 알파값의 변화

cron2<-cron[,2:5]
cron2$q5<-with(cron, q1*10)

cron

##   id q1 q2 q3 q4
## 1  1  6  6  5  4
## 2  2  4  6  5  3
## 3  3  4  4  4  2
## 4  4  3  1  4  2
## 5  5  1  2  1  1

alpha(cron2[,2:5])

## 
## Reliability analysis   
## Call: alpha(x = cron2[, 2:5])
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean  sd
##       0.44      0.95    0.95      0.82  18 0.45   12 5.6
## 
##  lower alpha upper     95% confidence boundaries
## -0.44 0.44 1.32 
## 
##  Reliability if an item is dropped:
##    raw_alpha std.alpha G6(smc) average_r  S/N alpha se
## q2      0.33      0.96    0.95      0.89 25.4     0.48
## q3      0.36      0.94    0.93      0.84 15.4     0.49
## q4      0.40      0.91    0.90      0.77  9.9     0.50
## q5      0.86      0.91    0.90      0.77 10.1     0.36
## 
##  Item statistics 
##    n raw.r std.r r.cor r.drop mean   sd
## q2 5  0.80  0.87  0.80   0.76  3.8  2.3
## q3 5  0.90  0.91  0.88   0.88  3.8  1.6
## q4 5  0.95  0.97  0.97   0.95  2.4  1.1
## q5 5  1.00  0.97  0.97   0.92 36.0 18.2