library(data.table)
library(curl)
## Using libcurl 7.64.1 with Schannel
library(openxlsx)
# Load the three input files. The .xlsx sheet is converted to a data.table so
# that all three objects share the same class.
df1 <- fread("anova.csv")
df2 <- fread("tempbig.csv")
df3 <- as.data.table(read.xlsx("twoway.xlsx"))

# Overall mean, standard deviation and row count of y.
# `.N` is used directly: wrapping it as `.(.N)` makes N a list column, which
# prints the same but is not a plain integer column.
out1 <- df2[, .(Mean = mean(y), SD = sd(y), N = .N)]
out1
## Mean SD N
## 1: 61.21455 27.51436 10000000

# Same summary per (x1, x2) cell; `by` keeps groups in order of first appearance.
out2 <- df2[, .(Mean = mean(y), SD = sd(y), N = .N), by = .(x1, x2)]
out2
## x1 x2 Mean SD N
## 1: 1 5 51.32877 26.42764 489860
## 2: 1 6 47.39352 25.75961 354670
## 3: 1 4 54.75922 26.60266 789890
## 4: 1 2 62.80916 27.61108 1395690
## 5: 1 1 60.22331 29.10055 1021910
## 6: 1 3 60.82691 28.03215 1246580
## 7: 2 6 51.27192 24.85099 87710
## 8: 2 2 66.91873 26.11230 1671400
## 9: 2 1 63.30948 26.89169 1843800
## 10: 2 3 66.28037 25.14893 677430
## 11: 2 4 58.39985 26.92955 275400
## 12: 2 5 53.88967 28.43449 145660

# `keyby` additionally sorts the result by the grouping columns.
out3 <- df2[, .(Mean = mean(y), SD = sd(y), N = .N), keyby = .(x1, x2)]
out3
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
## 5: 1 5 51.32877 26.42764 489860
## 6: 1 6 47.39352 25.75961 354670
## 7: 2 1 63.30948 26.89169 1843800
## 8: 2 2 66.91873 26.11230 1671400
## 9: 2 3 66.28037 25.14893 677430
## 10: 2 4 58.39985 26.92955 275400
## 11: 2 5 53.88967 28.43449 145660
## 12: 2 6 51.27192 24.85099 87710

# Restrict the rows (the `i` argument) before summarising: x2 in 1..4 only.
out3 <- df2[x2 <= 4, .(Mean = mean(y), SD = sd(y), N = .N), keyby = .(x1, x2)]
out3
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
## 5: 2 1 63.30948 26.89169 1843800
## 6: 2 2 66.91873 26.11230 1671400
## 7: 2 3 66.28037 25.14893 677430
## 8: 2 4 58.39985 26.92955 275400

# Two row conditions combined elementwise with `&`.
out3 <- df2[x2 <= 4 & x1 == 1, .(Mean = mean(y), SD = sd(y), N = .N), keyby = .(x1, x2)]
out3
## x1 x2 Mean SD N
## 1: 1 1 60.22331 29.10055 1021910
## 2: 1 2 62.80916 27.61108 1395690
## 3: 1 3 60.82691 28.03215 1246580
## 4: 1 4 54.75922 26.60266 789890
#**************************************************
#' Append the Korean topic particle ("은" or "는") to a word.
#'
#' The particle is chosen from the last character of `word`. A precomposed
#' Hangul syllable (U+AC00..U+D7A3) that ends in a final consonant takes "은";
#' anything else (open syllable, non-Hangul text, empty string) takes "는".
#'
#' @param word A single character string.
#' @return `word` followed by the chosen particle and one trailing space.
josa <- function(word) {
  # Code point range of the precomposed Hangul syllables block. Comparing code
  # points instead of strings avoids locale-dependent collation, and the upper
  # bound is the true last syllable U+D7A3.
  hangul_first <- 44032L  # U+AC00
  hangul_last <- 55203L   # U+D7A3

  n_chars <- nchar(word)
  code <- utf8ToInt(substr(word, n_chars, n_chars))

  # Syllables are laid out in runs of 28 per (initial, vowel) pair; offset 0
  # within a run means "no final consonant". `code` is empty for "".
  has_final_consonant <-
    length(code) == 1L && !is.na(code) &&
    code >= hangul_first && code <= hangul_last &&
    (code - hangul_first) %% 28L > 0L

  if (has_final_consonant) {
    paste0(word, "은 ")
  } else {
    paste0(word, "는 ")
  }
}
#**************************************************
#' One-way ANOVA with descriptive statistics and a Korean summary sentence.
#'
#' Column 1 of `data` is the grouping variable and column 2 the numeric
#' response. Rows with a missing value in either column are dropped. Any
#' number of groups (>= 2) is supported.
#'
#' @param data A data.frame-like object with at least two named columns.
#' @return A list with elements
#'   `hypo`  (hypothesis sentence),
#'   `table` (3 x 5 ANOVA table: SS, df, MS, F, p),
#'   `stat`  ((k + 1) x 5 matrix: per-group and overall mean, SD, n, plus F, p),
#'   `res`   (result sentence).
#'   The list is returned for significant and non-significant tests alike.
#' Stops with an error when there are fewer than two groups, no residual
#' degrees of freedom, or the F statistic is undefined.
ANOVA <- function(data) {
  vname <- colnames(data)

  # Going through as.matrix() works for data.frame and data.table input and
  # turns factor groups into character labels.
  x <- as.vector(as.matrix(data[, 1]))
  y <- as.vector(as.matrix(data[, 2]))
  complete <- !(is.na(x) | is.na(y))
  x <- x[complete]
  y <- y[complete]

  groups <- factor(x)
  group_mean <- as.vector(tapply(y, groups, mean))
  group_sd <- as.vector(tapply(y, groups, sd))
  group_n <- as.vector(tapply(y, groups, length))
  k <- nlevels(groups)
  n_total <- length(y)
  if (k < 2L || n_total <= k) {
    stop("ANOVA needs at least two groups and more observations than groups",
         call. = FALSE)
  }

  # Sums of squares; the within part is obtained as total minus between.
  grand_mean <- mean(y)
  ssb <- sum((group_mean - grand_mean)^2 * group_n)
  sst <- sum((y - grand_mean)^2)
  ssw <- sst - ssb

  df_between <- k - 1L
  df_within <- n_total - k
  df_total <- n_total - 1L
  f <- (ssb / df_between) / (ssw / df_within)
  p <- pf(f, df_between, df_within, lower.tail = FALSE)
  if (is.na(p)) {
    stop("F statistic is undefined (no variation in the response)",
         call. = FALSE)
  }

  # ANOVA table; cells that do not apply stay NaN.
  anova_table <- matrix(NaN, 3, 5)
  anova_table[, 1] <- c(ssb, ssw, sst)
  anova_table[, 2] <- c(df_between, df_within, df_total)
  anova_table[1:2, 3] <- c(ssb / df_between, ssw / df_within)
  anova_table[1, 4:5] <- c(f, p)
  colnames(anova_table) <- c("제곱합", "자유도", "평균제곱", "F", "p")
  # NOTE(review): "합계계" looks like a typo for "합계"; kept unchanged because
  # callers may index the table by this row name.
  rownames(anova_table) <- c("집단간", "집단내", "합계계")

  # Exercise 1: table of means. One row per group plus a final totals row,
  # sized from the actual number of groups.
  stat <- matrix(NaN, k + 1L, 5)
  stat[seq_len(k), 1:3] <- cbind(group_mean, group_sd, group_n)
  stat[k + 1L, 1:3] <- c(grand_mean, sd(y), n_total)
  stat[1, 4:5] <- c(f, p)
  colnames(stat) <- c("평균", "표준편차", "사례수", "F", "p")
  rownames(stat) <- c(levels(groups), "합계")

  subject <- josa(vname[2])
  hypo <- paste0("가설: ", subject, vname[1], "에 따라 통계적으로 차이가 있을 것이다.")

  if (p >= 0.05) {
    res <- paste0(subject, vname[1], "에 따라 차이가 없을 것이다.")
  } else {
    # Report the strictest conventional significance level that p satisfies.
    level <- if (p < 0.001) {
      "p < 0.001"
    } else if (p < 0.01) {
      "p < 0.01"
    } else {
      "p < 0.05"
    }
    res <- paste0(subject, vname[1], "에 따라 통계적으로 ", level, "수준에서 의미있는 차이가 있다. (F = ", f, ", df1 = ", df_between, ", df2= ", df_within, ", p =", round(p, 4), ")")
  }

  list(hypo = hypo, table = anova_table, stat = stat, res = res)
}
# Read the two-column input file and run the one-way ANOVA on it. The call is
# left as a bare top-level expression so the returned list is auto-printed.
data <- read.csv(file = "anova.csv")
ANOVA(data)
## $hypo
## [1] "가설: 근무만족도는 상사의유형에 따라 통계적으로 차이가 있을 것이다."
##
## $table
## 제곱합 자유도 평균제곱 F p
## 집단간 40.44444 2 20.222222 4.0625 0.03891091
## 집단내 74.66667 15 4.977778 NaN NaN
## 합계계 115.11111 17 NaN NaN NaN
##
## $stat
## 평균 표준편차 사례수 F p
## 민주형 11.333333 3.559026 6 4.0625 0.03891091
## 자유방임형 8.333333 1.032796 6 NaN NaN
## 전제형 8.000000 1.095445 6 NaN NaN
## 합계 9.222222 2.602161 18 NaN NaN
##
## $res
## [1] "근무만족도는 상사의유형에 따라 통계적으로 p < 0.05수준에서 의미있는 차이가 있다. (p = 4.0625, df1 = 2, df2= 15, p =0.0389)"
library(ggplot2)
# Example data for a bar chart: one value per month index.
month <- c(1, 2, 3, 4, 5, 6)
rain <- c(55, 50, 45, 50, 60, 70)
df <- data.frame(x = month, y = rain)

# Map the aesthetics to the columns of `df` (x, y) rather than to the global
# vectors, so the plot really is driven by the data frame passed in. labs()
# keeps the axis titles "month" and "rain". geom_col() draws bar heights
# straight from y, i.e. the same as geom_bar(stat = "identity").
ggplot(df, aes(x = x, y = y)) +
  geom_col(width = 0.7, fill = "steelblue") +
  labs(x = "month", y = "rain")