Machine Learning Class

# Read the tab-delimited course data set from its absolute path.
# The script deliberately does not call setwd() or rm(list = ls()):
# both silently alter the session of whoever runs or sources this file,
# and neither is needed because the path below is absolute.
# NOTE(review): `df` is not referenced by any later statement in this
# report -- confirm whether this read is still required.
df <- read.delim("c:/data/Data1.txt")
library(WRS2)

## Warning: 패키지 'WRS2'는 R 버전 4.3.2에서 작성되었습니다

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the `diet` data set that ships with WRS2 into the workspace,
# then print a compact column-by-column preview of it.
data("diet", package = "WRS2")
dplyr::glimpse(diet)

## Rows: 76
## Columns: 7
## $ gender         <fct> Female, Female, Female, Female, Female, Female, Female,…
## $ age            <int> 22, 46, 55, 33, 50, 50, 37, 28, 28, 45, 60, 48, 41, 37,…
## $ height         <int> 159, 192, 170, 171, 170, 201, 174, 176, 165, 165, 173, …
## $ diet.type      <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, B, B, B, B, B…
## $ initial.weight <int> 58, 60, 64, 64, 65, 66, 67, 69, 70, 70, 72, 72, 72, 82,…
## $ final.weight   <dbl> 54.2, 54.0, 63.3, 61.1, 62.2, 64.0, 65.0, 60.5, 68.1, 6…
## $ weight.loss    <dbl> 3.8, 6.0, 0.7, 2.9, 2.8, 2.0, 2.0, 8.5, 1.9, 3.1, 1.5, …

# Number of observations in each diet.type group.
count(diet, diet.type)

##   diet.type  n
## 1         A 24
## 2         B 25
## 3         C 27

# One-way ANOVA of weight.loss on diet.type; the fitted model is kept in
# gc.out1 because the post-hoc and residual checks below reuse it.
gc.out1 <- aov(formula = weight.loss ~ diet.type, data = diet)
summary(gc.out1)

##             Df Sum Sq Mean Sq F value Pr(>F)   
## diet.type    2   60.5  30.264   5.383 0.0066 **
## Residuals   73  410.4   5.622                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

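# Null hypothesis: mean weight loss is the same for every diet type.
# The p-value (0.0066) is below the 0.05 significance level,
# so the null hypothesis is rejected (analysis):
# mean weight loss differs for at least one diet type.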

# Post-hoc analysis
library(multcomp)

## Warning: 패키지 'multcomp'는 R 버전 4.3.2에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: mvtnorm

## Warning: 패키지 'mvtnorm'는 R 버전 4.3.2에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: survival

## 필요한 패키지를 로딩중입니다: TH.data

## Warning: 패키지 'TH.data'는 R 버전 4.3.2에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: MASS

## 
## 다음의 패키지를 부착합니다: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## 다음의 패키지를 부착합니다: 'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

# Tukey all-pairwise contrasts between the diet.type levels, computed on
# the ANOVA fit stored in gc.out1.
gc.out2 <- glht(gc.out1, linfct = mcp(diet.type = "Tukey"))
summary(gc.out2)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = weight.loss ~ diet.type, data = diet)
## 
## Linear Hypotheses:
##            Estimate Std. Error t value Pr(>|t|)  
## B - A == 0  -0.0320     0.6776  -0.047   0.9988  
## C - A == 0   1.8481     0.6652   2.778   0.0188 *
## C - B == 0   1.8801     0.6581   2.857   0.0152 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

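# Pairwise mean differences between diet types:
# B - A has an adjusted p-value above 0.05 (0.9988), so that difference is not statistically significant.
# C - A (0.0188) and C - B (0.0152) have adjusted p-values below 0.05,
# so diet C differs significantly in mean weight loss from both diet A and diet B.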


# Homogeneity-of-variance test
library(lawstat)

## Warning: 패키지 'lawstat'는 R 버전 4.3.2에서 작성되었습니다

# Levene-type test of equal weight.loss variance across diet.type groups,
# using absolute deviations from the group medians.
levene.test(
  y = diet$weight.loss,
  group = diet$diet.type,
  location = "median"
)

## 
##  Modified robust Brown-Forsythe Levene-type test based on the absolute
##  deviations from the median
## 
## data:  diet$weight.loss
## Test Statistic = 0.46291, p-value = 0.6313

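# (Analysis) The p-value (0.6313) exceeds the 0.05 significance level, so equal variances across the diet types cannot be rejected; the homogeneity-of-variance assumption is treated as met.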


# Normality test
# Shapiro-Wilk test on the residuals of the ANOVA fit. The residuals are
# obtained through the residuals() extractor rather than by reaching into
# the fitted object's `$residuals` component directly.
shapiro.test(residuals(gc.out1))

## 
##  Shapiro-Wilk normality test
## 
## data:  gc.out1$residuals
## W = 0.99175, p-value = 0.9088

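# (Analysis) The p-value (0.9088) exceeds the 0.05 significance level, so the
# null hypothesis of normality is not rejected; the ANOVA residuals are treated as normally distributed.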

# Nonparametric test
# Kruskal-Wallis rank sum test of weight.loss across the diet.type groups.
kruskal.test(weight.loss ~ diet.type, data = diet)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  weight.loss by diet.type
## Kruskal-Wallis chi-squared = 9.4159, df = 2, p-value = 0.009023

# (Analysis) The p-value (0.009023) is below 0.05, so the null hypothesis is rejected:
# the location parameters of the groups are not all equal.


# Box plot
# Side-by-side box plots of weight.loss, one box per diet.type level.
boxplot(weight.loss ~ diet.type, data = diet)

# (Analysis) The box plot shows that group C has a larger location parameter than groups A and B.

Machine Learning Class - Report

Kim, Dong-Hyun

2023-11-30