data-science-danger-zone.R

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(reshape2)
library(ggplot2)
library(ggthemr)

自殺資料

ref <- c(5427,  5688,   6198,   6462,   6635,   7336,   
       7248,    7491,   8161,   8578,   9000)

自訂標準化函數

myStd <- function(x, ref){
  x <- na.omit(x)
  x1 <- (x - min(x)) / (max(x) - min(x))
  x2 <- x1 * (max(ref) - min(ref)) + min(ref)
  x2
}

隨機生成10萬組 1999 - 2009 的亂數資料集

set.seed(2015)
mat <- matrix(runif(1100000, 0, 1), nrow = 11) %>%
  apply(2, myStd, ref=ref)

計算這10萬組資料集與自殺資料的相關係數得到平均數為 -0.0016 及其分佈圖

out <- apply(mat, 2, function(y) cor(ref, y))
ggthemr('fresh')
ggplot(data.frame(cor=out)) + 
  geom_histogram(aes(cor), binwidth=0.1) + 
  theme(text=element_text(size=18)) +
  labs(x="Correlation", y="Count") +
  geom_vline(xintercept=mean(out)) + 
  annotate("text", label = "mean = 0", x = 0.24, y = 2600, size = 5)

從10萬組資料中取相關性最高的資料畫圖

df <- data.frame(year=1999:2009, suicides=ref, simulation1=mat[, which.max(out)]) %>%
  select(year, suicides, simulation1) %>%
  melt(id.vars="year")

ggplot(df, aes(x=year, y=value, colour=variable)) + 
  geom_point(size=3) + geom_line(size=1) + 
  theme(text=element_text(size=18), legend.position="bottom") +
  labs(title=paste("correlation =", round(max(out),4)))

data-science-danger-zone.R

johnson

Sun Sep 20 20:17:48 2015