#7월 19일
# Clear every object left in the workspace from earlier work.
# NOTE(review): rm(list = ls()) and setwd() are kept so the recorded output
# below stays valid, but both make the script depend on one machine's layout.
rm(list = ls())
setwd("C:/data")
getwd()
## [1] "C:/data"
test <- read.csv("Data1.csv")
# Build the happiness variable.
# attach()/detach() were removed: transform() already looks up Q11..Q15 inside
# `test`, and an attached copy would not pick up the new Happiness1 column.
names(test)
## [1] "Q1" "Q2" "Q3" "Q4" "Q5" "Q6"
## [7] "Q7" "Q8" "Q9" "Q10" "Q11" "Q12"
## [13] "Q13" "Q14" "Q15" "Q16" "Q17" "Q18"
## [19] "Q19" "Q20" "Gender1" "EDU1" "BF" "BM"
## [25] "Happiness" "Peace"
# Happiness1 is the mean of the five items Q11..Q15.
test <- transform(test, Happiness1 = (Q11 + Q12 + Q13 + Q14 + Q15) / 5)
names(test)
## [1] "Q1" "Q2" "Q3" "Q4" "Q5"
## [6] "Q6" "Q7" "Q8" "Q9" "Q10"
## [11] "Q11" "Q12" "Q13" "Q14" "Q15"
## [16] "Q16" "Q17" "Q18" "Q19" "Q20"
## [21] "Gender1" "EDU1" "BF" "BM" "Happiness"
## [26] "Peace" "Happiness1"
# Homogeneity-of-variance check: F test comparing the variance of Happiness1
# between the two Gender1 groups.
var.test(
  Happiness1 ~ Gender1,
  data = test
)
##
## F test to compare two variances
##
## data: Happiness1 by Gender1
## F = 0.96247, num df = 1135, denom df = 788, p-value = 0.5567
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.8456798 1.0937359
## sample estimates:
## ratio of variances
## 0.962471
# Number of rows (respondents) in the data frame.
dim(test)[1L]
## [1] 1925
# Reading the equal-variance test printed above:
#   Null hypothesis: the two groups have equal variances.
#   Alternative hypothesis: the two groups do not have equal variances.
#   p-value (0.5567) is greater than the 0.05 significance level,
#   so the null hypothesis is not rejected.
# Test of the research hypothesis: two-sample t-test with pooled variance.
t.test(Happiness1 ~ Gender1, data = test, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Happiness1 by Gender1
## t = 1.3585, df = 1923, p-value = 0.1745
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.0208855 0.1150460
## sample estimates:
## mean in group 0 mean in group 1
## 3.566725 3.519645
# "Two Sample t-test" in the output above is the independent-samples t-test.
# t = 1.3585
# df = 1923: degrees of freedom = sample size - 2
nrow(test)
## [1] 1925
# Show the last six rows, including the derived Happiness1 column.
tail(test, n = 6L)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15 Q16 Q17 Q18 Q19 Q20
## 1920 4 4 3 4 4 2 2 3 4 2 2 4 3 4 4 3 4 4 3 4
## 1921 2 2 2 1 2 2 2 2 2 2 1 3 2 1 3 2 2 2 2 2
## 1922 3 2 2 2 3 1 1 1 1 1 3 3 3 4 4 4 4 5 2 2
## 1923 5 4 4 4 4 2 2 2 2 3 3 4 3 4 3 3 3 4 4 4
## 1924 4 4 4 2 2 4 2 4 4 3 3 2 3 4 3 4 4 4 3 4
## 1925 3 3 1 1 2 1 1 1 1 1 4 4 3 2 2 3 4 4 3 2
## Gender1 EDU1 BF BM Happiness Peace Happiness1
## 1920 1 3 3.8 2.6 3.4 3.6 3.4
## 1921 1 2 1.8 2.0 2.0 2.0 2.0
## 1922 0 2 2.4 1.0 3.4 3.4 3.4
## 1923 0 2 4.2 2.2 3.4 3.6 3.4
## 1924 1 2 3.2 3.4 3.0 3.8 3.0
## 1925 0 3 2.0 1.0 3.0 3.2 3.0
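# p-value = 0.1745: the probability, assuming the null hypothesis is true,
#   of observing a test statistic at least this extreme
# Null hypothesis: there is no difference in happiness between men and women.
# Alternative hypothesis: there is a difference in happiness between men and women.
# p-value = 0.1745 is greater than the 0.05 significance level,
# so the null hypothesis is not rejected.
# "alternative hypothesis" in the output: the alternative hypothesis
# "95 percent confidence interval" in the output: the 95% confidence interval
# -0.0208855 to 0.1150460 contains 0, so the null hypothesis cannot be rejected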
# Box plots of Happiness1, one box per Gender1 group.
boxplot(
  Happiness1 ~ Gender1,
  data = test
)
# Descriptive statistics for each group.
library(psych)

describeBy(test[["Happiness1"]], group = test[["Gender1"]])
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1136 3.57 0.74 3.6 3.59 0.59 1.4 5 3.6 -0.39 -0.21 0.02
## ------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 789 3.52 0.76 3.6 3.54 0.59 1.4 5 3.6 -0.35 -0.18 0.03
# 2) How to run a paired t-test
# Paired test: tests the before/after difference within a single group.
# Independent-samples test: tests the difference in means of two separate groups.
before <- c(59, 72, 85, 69, 78, 82, 55)
after <- c(54, 65, 84, 63, 72, 83, 51)
# Null hypothesis: the diet pill has no effect.
# Alternative hypothesis: the diet pill has an effect
#   (one-sided here: the mean of before - after is greater than 0).
# paired = TRUE is written out in full because `T` is an ordinary variable
# that can be reassigned, which would silently change the test being run.
t.test(before, after, mu = 0, alternative = "greater", paired = TRUE)
##
## Paired t-test
##
## data: before and after
## t = 3.5949, df = 6, p-value = 0.005718
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
## 1.837829 Inf
## sample estimates:
## mean difference
## 4
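# One-sample t-test: tests the mean of a single group
# Independent-samples t-test: tests the difference in means of two independent groups
# Paired-samples t-test: tests the mean difference within matched pairs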
# Load the mtcars data set into the workspace.
data(mtcars)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact column-by-column overview (type and first values of each column).
mtcars %>% glimpse()
## Rows: 32
## Columns: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
# One-sample t-test on mpg; one-sided alternative: the true mean is greater than 20.
t.test(mtcars$mpg, alternative = "greater", mu = 20)
##
## One Sample t-test
##
## data: mtcars$mpg
## t = 0.08506, df = 31, p-value = 0.4664
## alternative hypothesis: true mean is greater than 20
## 95 percent confidence interval:
## 18.28418 Inf
## sample estimates:
## mean of x
## 20.09062
# Example 1: load iris and show its first six rows.
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Example 2: per-column summary of iris.
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Example 3: descriptive statistics of iris Sepal.Length.
# Arithmetic mean.
mean(iris[["Sepal.Length"]])
## [1] 5.843333
# Median.
median(iris[["Sepal.Length"]])
## [1] 5.8
# Sample standard deviation.
sd(iris[["Sepal.Length"]])
## [1] 0.8280661
# Sample variance.
var(iris[["Sepal.Length"]])
## [1] 0.6856935
# Default quantiles: 0%, 25%, 50%, 75%, 100%.
quantile(iris[["Sepal.Length"]])
## 0% 25% 50% 75% 100%
## 4.3 5.1 5.8 6.4 7.9
# Third quartile only.
quantile(iris[["Sepal.Length"]], probs = 0.75)
## 75%
## 6.4
# Largest and smallest observed values.
max(iris[["Sepal.Length"]])
## [1] 7.9
min(iris[["Sepal.Length"]])
## [1] 4.3
# Example 4: the Animals data set from MASS.
library(MASS)
##
## 다음의 패키지를 부착합니다: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data(Animals)
# First six rows.
head(Animals, n = 6L)
## body brain
## Mountain beaver 1.35 8.1
## Cow 465.00 423.0
## Grey wolf 36.33 119.5
## Goat 27.66 115.0
## Guinea pig 1.04 5.5
## Dipliodocus 11700.00 50.0
# Per-column summary.
summary(object = Animals)
## body brain
## Min. : 0.02 Min. : 0.40
## 1st Qu.: 3.10 1st Qu.: 22.23
## Median : 53.83 Median : 137.00
## Mean : 4278.44 Mean : 574.52
## 3rd Qu.: 479.00 3rd Qu.: 420.00
## Max. :87000.00 Max. :5712.00
# Default quantiles of the body column.
quantile(Animals[["body"]])
## 0% 25% 50% 75% 100%
## 0.023 3.100 53.830 479.000 87000.000
library(ISLR)
# NOTE(review): `attitude` looks like the data set shipped in base R's
# datasets package, so ISLR may not be needed for it -- confirm before removing.
data("attitude")
# Compact column-by-column overview of attitude.
attitude %>% glimpse()
## Rows: 30
## Columns: 7
## $ rating <dbl> 43, 63, 71, 61, 81, 43, 58, 71, 72, 67, 64, 67, 69, 68, 77,…
## $ complaints <dbl> 51, 64, 70, 63, 78, 55, 67, 75, 82, 61, 53, 60, 62, 83, 77,…
## $ privileges <dbl> 30, 51, 68, 45, 56, 49, 42, 50, 72, 45, 53, 47, 57, 83, 54,…
## $ learning <dbl> 39, 54, 69, 47, 66, 44, 56, 55, 67, 47, 58, 39, 42, 45, 72,…
## $ raises <dbl> 61, 63, 76, 54, 71, 54, 66, 70, 71, 62, 58, 59, 55, 59, 79,…
## $ critical <dbl> 92, 73, 86, 84, 83, 49, 68, 66, 83, 80, 67, 74, 63, 77, 77,…
## $ advance <dbl> 45, 47, 48, 35, 47, 34, 35, 41, 31, 41, 34, 41, 25, 35, 46,…
# Linear regression of rating on every other column of attitude
# (`.` on the right-hand side stands for all remaining columns).
out <- lm(formula = rating ~ ., data = attitude)
# Coefficient table, residual summary and fit statistics.
summary(object = out)
##
## Call:
## lm(formula = rating ~ ., data = attitude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9418 -4.3555 0.3158 5.5425 11.5990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.78708 11.58926 0.931 0.361634
## complaints 0.61319 0.16098 3.809 0.000903 ***
## privileges -0.07305 0.13572 -0.538 0.595594
## learning 0.32033 0.16852 1.901 0.069925 .
## raises 0.08173 0.22148 0.369 0.715480
## critical 0.03838 0.14700 0.261 0.796334
## advance -0.21706 0.17821 -1.218 0.235577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.068 on 23 degrees of freedom
## Multiple R-squared: 0.7326, Adjusted R-squared: 0.6628
## F-statistic: 10.5 on 6 and 23 DF, p-value: 1.24e-05
# lm(): `dependent ~ .` enters every other variable as a predictor.
# out<-lm(rating~complaints+privileges,data=attitude)
# Regression results are inspected with summary().
data("attitude")
# Direct call instead of `attitude %>% NROW`: counting rows should not depend
# on a pipe operator from a package attached much earlier in the script.
NROW(attitude)
## [1] 30
# 30 (sample size) - (number of predictors (6) + 1) = degrees of freedom (23)