library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
library(ggthemr)
自殺資料
ref <- c(5427, 5688, 6198, 6462, 6635, 7336,
7248, 7491, 8161, 8578, 9000)
自訂標準化函數
myStd <- function(x, ref){
x <- na.omit(x)
x1 <- (x - min(x)) / (max(x) - min(x))
x2 <- x1 * (max(ref) - min(ref)) + min(ref)
x2
}
隨機生成10萬組 1999 - 2009 的亂數資料集
set.seed(2015)
mat <- matrix(runif(1100000, 0, 1), nrow = 11) %>%
apply(2, myStd, ref=ref)
計算這10萬組資料集與自殺資料的相關係數 得到平均數為 -0.0016 及其分佈圖
out <- apply(mat, 2, function(y) cor(ref, y))
ggthemr('fresh')
ggplot(data.frame(cor=out)) +
geom_histogram(aes(cor), binwidth=0.1) +
theme(text=element_text(size=18)) +
labs(x="Correlation", y="Count") +
geom_vline(xintercept=mean(out)) +
annotate("text", label = "mean = 0", x = 0.24, y = 2600, size = 5)
從10萬組資料中取相關性最高的資料畫圖
df <- data.frame(year=1999:2009, suicides=ref, simulation1=mat[, which.max(out)]) %>%
select(year, suicides, simulation1) %>%
melt(id.vars="year")
ggplot(df, aes(x=year, y=value, colour=variable)) +
geom_point(size=3) + geom_line(size=1) +
theme(text=element_text(size=18), legend.position="bottom") +
labs(title=paste("correlation =", round(max(out),4)))