Statistics with R

R 강의 Summary (Last Day)

2024년 5월 10일 AI Bigdata Statistics with R 마지막 강의 요약본입니다.

강의(2024.5.10)에는 다음과 같은 내용이 포함됩니다.

R ANOVA Program
- ANOVA 기본코드
- function을 이용한 코드
- 조사처리를 위한 function 추가
R Graph
- R Base Graph
- ggplot Graph
- Graph Example URLs

1-1. ANOVA Program (I)

ANOVA 프로그램의 기본

library(openxlsx)

# Hand-computed one-way ANOVA for the worked example in anova1.xlsx.
# The code expects column 1 to be the grouping factor (상사의유형) and
# column 2 the dependent variable (근무만족도), with exactly three groups.
df = read.xlsx("anova1.xlsx")
vname = colnames(df)

# Result containers: descriptive statistics (three groups + total row) and
# the ANOVA table (between / within / total). Unused ANOVA cells stay NaN.
stat = as.data.frame(matrix(0,4,4))
anovat = as.data.frame(matrix(NaN,3,5))

# Per-group mean, standard deviation and count of non-missing scores.
m = aggregate(근무만족도 ~ 상사의유형, df, FUN="mean")
s = aggregate(근무만족도 ~ 상사의유형, df, FUN="sd")
n = aggregate(!is.na(근무만족도) ~ 상사의유형, df, FUN="sum" )

# The formula interface of aggregate() drops rows with missing values, so the
# overall statistics are computed on the same complete rows; otherwise a
# single NA would turn the grand mean and every sum of squares into NA.
score = df[complete.cases(df[,1:2]),2]

stat[1:3,] = cbind(m,s[,2],n[,2])
stat[4,2] = tmean = mean(score)
stat[4,3] = sd(score)
stat[4,4] = sum(stat[1:3,4])
stat[4,1] = '합계'
colnames(stat) = c("상사의 유형","평균","표준편차","사례수")

# Sums of squares, degrees of freedom and mean squares.
anovat[3,1] = SST = sum((score - tmean)^2)
anovat[1,1] = SSB = sum(n[,2]*(m[,2] - tmean)^2)
anovat[2,1] = SSW = SST - SSB
anovat[1,2] = df1 = nrow(m) - 1
anovat[3,2] = dft = stat[4,4] - 1
anovat[2,2] = df2 = dft - df1
anovat[1,3] = MSB = SSB/df1
anovat[2,3] = MSW = SSW/df2
# The F ratio is stored as Fval: assigning to `F` at top level would mask the
# built-in shorthand for FALSE for the rest of the session.
anovat[1,4] = Fval = MSB/MSW
anovat[1,5] = p = pf(Fval,df1,df2,lower.tail = FALSE)
colnames(anovat) = c("SS", 'df', "MS", 'F', 'p')
rownames(anovat) = c("집단간","집단내","전체")

# Research hypothesis and the conclusion sentence for the observed p-value.
hypo = sprintf("%s는(은) %s에 따라 차이가 있을 것이다.",vname[2],vname[1])
if (p >= 0.05) {
  res = sprintf("%s는(은) %s에 따라 통계적으로 차이가 없다.",vname[2],vname[1])
} else {
  # Report the strictest conventional significance level that p satisfies.
  level = if (p < 0.001) "p<0.001" else if (p < 0.01) "p<0.01" else "p<0.05"
  res = sprintf("%s는(은) %s에 따라 통계적으로 %s 수준에서 유의한 차이가 있다.",vname[2],vname[1],level)
}

output = list(hypothesis = hypo, anovatable = anovat, meantable=stat, result=res)

print(output)

## $hypothesis
## [1] "근무만족도는(은) 상사의유형에 따라 차이가 있을 것이다."
## 
## $anovatable
##            SS df     MS     F       p
## 집단간  40.44  2 20.222 4.063 0.03891
## 집단내  74.67 15  4.978   NaN     NaN
## 전체   115.11 17    NaN   NaN     NaN
## 
## $meantable
##   상사의 유형   평균 표준편차 사례수
## 1      민주형 11.333    3.559      6
## 2  자유방임형  8.333    1.033      6
## 3      전제형  8.000    1.095      6
## 4        합계  9.222    2.602     18
## 
## $result
## [1] "근무만족도는(은) 상사의유형에 따라 통계적으로 p<0.05 수준에서 유의한 차이가 있다."

1-2. ANOVA Program (II)

ANOVA 프로그램 (function 활용)

# One-way ANOVA computed by hand from a two-column data frame.
#
# Args:
#   df: data frame; column 1 is the grouping variable and column 2 the numeric
#       dependent variable. The column names are free (they are only used to
#       word the sentences) and any number of groups >= 2 is accepted. Rows
#       with a missing value in either column are ignored.
#
# Returns:
#   A list with four elements:
#     hypothesis - research hypothesis sentence built from the column names
#     anovatable - data frame with SS / df / MS / F / p for the between,
#                  within and total rows (unused cells are NaN)
#     meantable  - data frame with mean, sd and n per group plus a total row;
#                  the first header stays "상사의 유형" for compatibility with
#                  the original output
#     result     - conclusion sentence at the 0.05 / 0.01 / 0.001 levels
#
# NOTE: this name shadows stats::anova() once the definition is sourced; it is
# kept because the calling code uses it.
anova <- function(df) {
  stopifnot(is.data.frame(df), ncol(df) >= 2)
  vname <- colnames(df)

  # Use the same complete rows for the group and the overall statistics so a
  # single NA cannot turn the grand mean and the sums of squares into NA.
  keep  <- complete.cases(df[[1]], df[[2]])
  group <- df[[1]][keep]
  score <- df[[2]][keep]

  # Per-group mean, standard deviation and size (groups come back sorted).
  m <- aggregate(score, by = list(group), FUN = mean)
  s <- aggregate(score, by = list(group), FUN = sd)
  n <- aggregate(score, by = list(group), FUN = length)
  counts <- as.numeric(n[[2]])
  k <- nrow(m)
  if (k < 2 || length(score) <= k) {
    stop("anova() needs at least two groups and more observations than groups",
         call. = FALSE)
  }

  # Descriptive table: one row per group followed by the total row.
  tmean <- mean(score)
  stat <- data.frame(
    c(as.character(m[[1]]), "합계"),
    c(m[[2]], tmean),
    c(s[[2]], sd(score)),
    c(counts, sum(counts)),
    stringsAsFactors = FALSE
  )
  colnames(stat) <- c("상사의 유형", "평균", "표준편차", "사례수")

  # Sums of squares, degrees of freedom, mean squares, F ratio and p-value.
  SST <- sum((score - tmean)^2)
  SSB <- sum(counts * (m[[2]] - tmean)^2)
  SSW <- SST - SSB
  df1 <- k - 1
  dft <- sum(counts) - 1
  df2 <- dft - df1
  MSB <- SSB / df1
  MSW <- SSW / df2
  Fval <- MSB / MSW  # not named `F`, which is the shorthand for FALSE
  p <- pf(Fval, df1, df2, lower.tail = FALSE)

  tab <- matrix(NaN, 3, 5)
  tab[1, ] <- c(SSB, df1, MSB, Fval, p)
  tab[2, 1:3] <- c(SSW, df2, MSW)
  tab[3, 1:2] <- c(SST, dft)
  anovat <- as.data.frame(tab)
  colnames(anovat) <- c("SS", "df", "MS", "F", "p")
  rownames(anovat) <- c("집단간", "집단내", "전체")

  # Research hypothesis and the conclusion sentence for the observed p-value.
  hypo <- sprintf("%s는(은) %s에 따라 차이가 있을 것이다.", vname[2], vname[1])
  if (p >= 0.05) {
    res <- sprintf("%s는(은) %s에 따라 통계적으로 차이가 없다.", vname[2], vname[1])
  } else {
    # Report the strictest conventional significance level that p satisfies.
    level <- if (p < 0.001) "p<0.001" else if (p < 0.01) "p<0.01" else "p<0.05"
    res <- sprintf("%s는(은) %s에 따라 통계적으로 %s 수준에서 유의한 차이가 있다.",
                   vname[2], vname[1], level)
  }

  list(hypothesis = hypo, anovatable = anovat, meantable = stat, result = res)
}

library(openxlsx)

# Read the example workbook and print the complete ANOVA report.
df <- read.xlsx("anova1.xlsx")

print(anova(df = df))

## $hypothesis
## [1] "근무만족도는(은) 상사의유형에 따라 차이가 있을 것이다."
## 
## $anovatable
##            SS df     MS     F       p
## 집단간  40.44  2 20.222 4.063 0.03891
## 집단내  74.67 15  4.978   NaN     NaN
## 전체   115.11 17    NaN   NaN     NaN
## 
## $meantable
##   상사의 유형   평균 표준편차 사례수
## 1      민주형 11.333    3.559      6
## 2  자유방임형  8.333    1.033      6
## 3      전제형  8.000    1.095      6
## 4        합계  9.222    2.602     18
## 
## $result
## [1] "근무만족도는(은) 상사의유형에 따라 통계적으로 p<0.05 수준에서 유의한 차이가 있다."

1-3. ANOVA Program (III)

ANOVA 프로그램 (function + 조사처리)

# Append the Korean topic particle ("은" or "는") to a word.
#
# A precomposed Hangul syllable has the code point
#   0xAC00 + (initial * 21 + medial) * 28 + final
# so its offset from 0xAC00 modulo 28 is non-zero exactly when the syllable
# ends in a final consonant. Such words take "은"; everything else (open
# syllables, non-Hangul text, the empty string) takes "는".
#
# Args:
#   word: a single character string.
#
# Returns:
#   `word` with the matching particle appended.
josa <- function(word) {
  hangul_first <- 0xAC00L  # "가", first precomposed syllable
  hangul_last  <- 0xD7A3L  # "힣", last precomposed syllable

  last_char <- substr(word, nchar(word), nchar(word))
  code <- utf8ToInt(last_char)

  # Compare code points rather than strings: string ordering depends on the
  # locale's collation. `&&` short-circuits, so an empty or missing last
  # character falls through to "는" instead of raising an error.
  has_final <- length(code) == 1 && !is.na(code) &&
    code >= hangul_first && code <= hangul_last &&
    (code - hangul_first) %% 28 > 0

  if (has_final) {
    paste0(word, "은")
  } else {
    paste0(word, "는")
  }
}

# One-way ANOVA computed by hand from a two-column data frame, with the
# Korean topic particle of the sentences chosen by josa().
#
# Args:
#   df: data frame; column 1 is the grouping variable and column 2 the numeric
#       dependent variable. The column names are free (they are only used to
#       word the sentences) and any number of groups >= 2 is accepted. Rows
#       with a missing value in either column are ignored.
#
# Returns:
#   A list with four elements:
#     hypothesis - research hypothesis sentence built from the column names
#     anovatable - data frame with SS / df / MS / F / p for the between,
#                  within and total rows (unused cells are NaN)
#     meantable  - data frame with mean, sd and n per group plus a total row;
#                  the first header stays "상사의 유형" for compatibility with
#                  the original output
#     result     - conclusion sentence at the 0.05 / 0.01 / 0.001 levels
#
# NOTE: this name shadows stats::anova() once the definition is sourced; it is
# kept because the calling code uses it.
anova <- function(df) {
  stopifnot(is.data.frame(df), ncol(df) >= 2)
  vname <- colnames(df)

  # Use the same complete rows for the group and the overall statistics so a
  # single NA cannot turn the grand mean and the sums of squares into NA.
  keep  <- complete.cases(df[[1]], df[[2]])
  group <- df[[1]][keep]
  score <- df[[2]][keep]

  # Per-group mean, standard deviation and size (groups come back sorted).
  m <- aggregate(score, by = list(group), FUN = mean)
  s <- aggregate(score, by = list(group), FUN = sd)
  n <- aggregate(score, by = list(group), FUN = length)
  counts <- as.numeric(n[[2]])
  k <- nrow(m)
  if (k < 2 || length(score) <= k) {
    stop("anova() needs at least two groups and more observations than groups",
         call. = FALSE)
  }

  # Descriptive table: one row per group followed by the total row.
  tmean <- mean(score)
  stat <- data.frame(
    c(as.character(m[[1]]), "합계"),
    c(m[[2]], tmean),
    c(s[[2]], sd(score)),
    c(counts, sum(counts)),
    stringsAsFactors = FALSE
  )
  colnames(stat) <- c("상사의 유형", "평균", "표준편차", "사례수")

  # Sums of squares, degrees of freedom, mean squares, F ratio and p-value.
  SST <- sum((score - tmean)^2)
  SSB <- sum(counts * (m[[2]] - tmean)^2)
  SSW <- SST - SSB
  df1 <- k - 1
  dft <- sum(counts) - 1
  df2 <- dft - df1
  MSB <- SSB / df1
  MSW <- SSW / df2
  Fval <- MSB / MSW  # not named `F`, which is the shorthand for FALSE
  p <- pf(Fval, df1, df2, lower.tail = FALSE)

  tab <- matrix(NaN, 3, 5)
  tab[1, ] <- c(SSB, df1, MSB, Fval, p)
  tab[2, 1:3] <- c(SSW, df2, MSW)
  tab[3, 1:2] <- c(SST, dft)
  anovat <- as.data.frame(tab)
  colnames(anovat) <- c("SS", "df", "MS", "F", "p")
  rownames(anovat) <- c("집단간", "집단내", "전체")

  # Subject of both sentences with its particle already attached.
  subject <- josa(vname[2])

  # Research hypothesis and the conclusion sentence for the observed p-value.
  hypo <- sprintf("%s %s에 따라 차이가 있을 것이다.", subject, vname[1])
  if (p >= 0.05) {
    res <- sprintf("%s %s에 따라 통계적으로 차이가 없다.", subject, vname[1])
  } else {
    # Report the strictest conventional significance level that p satisfies.
    level <- if (p < 0.001) "p<0.001" else if (p < 0.01) "p<0.01" else "p<0.05"
    res <- sprintf("%s %s에 따라 통계적으로 %s 수준에서 유의한 차이가 있다.",
                   subject, vname[1], level)
  }

  list(hypothesis = hypo, anovatable = anovat, meantable = stat, result = res)
}

library(openxlsx)

# Read the example workbook and print the complete ANOVA report.
df <- read.xlsx("anova1.xlsx")

print(anova(df = df))

## $hypothesis
## [1] "근무만족도는 상사의유형에 따라 차이가 있을 것이다."
## 
## $anovatable
##            SS df     MS     F       p
## 집단간  40.44  2 20.222 4.063 0.03891
## 집단내  74.67 15  4.978   NaN     NaN
## 전체   115.11 17    NaN   NaN     NaN
## 
## $meantable
##   상사의 유형   평균 표준편차 사례수
## 1      민주형 11.333    3.559      6
## 2  자유방임형  8.333    1.033      6
## 3      전제형  8.000    1.095      6
## 4        합계  9.222    2.602     18
## 
## $result
## [1] "근무만족도는 상사의유형에 따라 통계적으로 p<0.05 수준에서 유의한 차이가 있다."

2-1. R Base Graph

R Base를 이용한 Graph Examples

#***** R Base Graph *******

# NOTE(review): this silences warnings for the rest of the session, which
# also hides mistakes such as misspelled graphical parameters.
options(warn = -1)
library(openxlsx)
df1 = read.xlsx("twoway.xlsx")
df3 = read.xlsx("regress.xlsx")
df4 = read.xlsx("ClassExample.xlsx")
df4 = na.omit(df4)   # drop every row that has a missing value

# Bar chart of the sliptime frequencies: defaults first, then with titles.
barplot(table(df4$sliptime))

barplot(table(df4$sliptime),
        main = "Sleep Time 빈도",
        xlab = "Sleep Time 정도",
        ylab = "빈도",
        col  = 'red'
        )

# Box plot of 근무만족도 for each 상사의유형: defaults, then with titles.
boxplot(df1$근무만족도 ~ df1$상사의유형)

boxplot(df1$근무만족도 ~ df1$상사의유형,
        main = '상사의 유형별 근무만족도',
        xlab = "상사의 유형",
        ylab = "근무만족도",
        col = 'yellow')

# Scatter plot of 근무평정 against 근무만족도 with small filled points.
plot(df3$근무만족도,df3$근무평정,
     main = '근무만족도-근무평정 산점도',
     xlab = '근무만족도',
     ylab = '근무평정',
     col = "red",
     pch = 20)

# Scatter-plot matrix of every column pair in df3. The point symbol is set
# with `pch`; the former `pct` is not a graphical parameter and had no effect.
pairs(df3,
      main = 'Multi Plots',
      pch = 20,
      col = "red")

# Pie chart of the sliptime frequencies with default labels.
pie(table(df4$sliptime))

# data.frame() expands the table into a category column and a frequency
# column, so column 2 holds the counts and column 3 the custom labels.
# NOTE(review): assumes sliptime has exactly six distinct values - confirm.
df6 = table(df4$sliptime)
df6 = data.frame(freq=df6,lab=c("1-3hrs","3-5hrs","6~7hrs",
                                "7~8hrs","8~9hrs","10~hrs"))

pie(df6[,2],labels=df6[,3])

# Mean weight per sliptime value: points only, then a titled line chart.
df7 = aggregate(df4$weight ~df4$sliptime, df4, mean)

plot(df7[,1],df7[,2])

plot(df7[,1],df7[,2],
     main = "Weight by Sleeptime",
     xlab = "Sleep Time", ylab="Weight", type='o')

2-2. ggplot Graph

ggplot을 이용한 Graph Examples

#***** ggplot Graph *******
#*
library(ggplot2)
# NOTE(review): this silences warnings for the rest of the session.
options(warn = -1)
library(openxlsx)
df1 = read.xlsx("twoway.xlsx")
df3 = read.xlsx("regress.xlsx")
df4 = read.xlsx("ClassExample.xlsx")
df4 = na.omit(df4)   # drop every row that has a missing value

month = c(1,2,3,4,5,6)
rain  = c(55,50,45,50,60,70)

# The columns carry the names used in aes() below, so the plots read their
# data from df rather than from the global vectors of the same name.
df = data.frame(month=month, rain=rain)

# Bar charts of rain per month: vertical, horizontal, then with title/labels.
ggplot(df,aes(x=month, y=rain)) +
  geom_bar(stat = "identity",
           width = 0.7,
           fill = 'steelblue')

ggplot(df,aes(x=month, y=rain)) +
  geom_bar(stat = "identity",
           width = 0.7,
           fill = 'steelblue') +
  coord_flip()

ggplot(df, aes(x=month, y=rain)) +
  geom_bar(stat = "identity",
           width=0.7,
           fill="steelblue") +
  ggtitle("월별강수량") +
  theme(plot.title=element_text(size=25,face="bold",color="steelblue")) +
  labs(x='월',y="강수량") +
  coord_flip()

# Histograms on iris: one variable, then side-by-side bars per species.
ggplot(iris, aes(x=Petal.Length)) +
  geom_histogram(binwidth=0.5, fill='red', color='black')

ggplot(iris,aes(x=Sepal.Width, fill=Species, color=Species)) +
  geom_histogram(binwidth =0.5, position='dodge', color='black') +
  theme(legend.position = "top")

# Scatter plots of petal length against width: plain, then colored by species.
ggplot(iris, aes(x=Petal.Length, y=Petal.Width)) +
  geom_point()

ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=Species)) +
  geom_point(size=3) +
  ggtitle("꽃잎의 길이와 폭") +
  theme(plot.title=element_text(size=25,face="bold", color='steelblue'))

# Box plot of petal length per species.
ggplot(iris, aes(y=Petal.Length, fill=Species)) +
  geom_boxplot()

# Line chart of the airmiles series against the years 1937-1960.
year = 1937:1960
cnt = as.vector(airmiles)
df  = data.frame(year,cnt)
ggplot(df, aes(x=year, y=cnt)) +
  geom_line(col='red')

# From here on the data and the mapping are given to the geom, not ggplot().
ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy,color=class), data=mpg)

ggplot() +
  geom_point(mapping=aes(x=cty, y=hwy, color=displ), data=mpg)

ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy, color=class, shape=drv), data=mpg)

ggplot() +
  geom_point(mapping=aes(x=age, y=circumference), data=Orange)

ggplot() +
  geom_line(mapping=aes(x=age, y=circumference), data=Orange)

ggplot() +
  geom_line(mapping=aes(x=age, y=circumference, linetype=Tree), data=Orange)

# Scatter plot with a smoothed trend line.
ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Two series in one plot: hwy with defaults, cty as red open circles with a
# dashed red trend line.
ggplot() +
  geom_point(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_smooth(mapping=aes(x=displ, y=hwy), data=mpg) +
  geom_point(mapping=aes(x=displ, y=cty), data=mpg, col='red', shape=1) +
  geom_smooth(mapping=aes(x=displ, y=cty), data=mpg, linetype=2, col='red')

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Stacked bars of class filled by drv, density of hwy per drv, and
# proportion-filled bars drawn in polar coordinates.
ggplot(mpg) + geom_bar(aes(class, fill=drv))

ggplot(mpg, aes(hwy, color=drv)) + geom_density()

ggplot(mpg, aes(class, fill=drv)) +
  geom_bar(position='fill') + coord_polar()

2-3. R Graph Example URLs

https://kkokkilkon.tistory.com/17 (기본)

https://bioinformaticsandme.tistory.com/255

https://rpubs.com/kimwoohyung/ggplot2/