pitch <- c(233,204,242,130,112,142) #categorial var.
sex <- c(rep("female", 3), rep("male",3))
my.df <- data.frame(sex, pitch); my.df
## sex pitch
## 1 female 233
## 2 female 204
## 3 female 242
## 4 male 130
## 5 male 112
## 6 male 142
data.frame(gender = sex, pitch)
## gender pitch
## 1 female 233
## 2 female 204
## 3 female 242
## 4 male 130
## 5 male 112
## 6 male 142
xmdl <- lm(pitch ~ sex, my.df)
# 모델
summary(xmdl)
##
## Call:
## lm(formula = pitch ~ sex, data = my.df)
##
## Residuals:
## 1 2 3 4 5 6
## 6.667 -22.333 15.667 2.000 -16.000 14.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 226.33 10.18 22.224 2.43e-05 ***
## sexmale -98.33 14.40 -6.827 0.00241 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.64 on 4 degrees of freedom
## Multiple R-squared: 0.921, Adjusted R-squared: 0.9012
## F-statistic: 46.61 on 1 and 4 DF, p-value: 0.002407
with(my.df, mean(pitch[sex=='female']))
## [1] 226.3333
mean(my.df[my.df$sex=="female",]$pitch)
## [1] 226.3333
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.2에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
tapply(my.df$pitch, sex, mean)
## female male
## 226.3333 128.0000
my.df %>%
group_by(sex) %>%
summarise(mean=mean(pitch))
## # A tibble: 2 × 2
## sex mean
## <chr> <dbl>
## 1 female 226.
## 2 male 128
age <- c(14,23,35,48,52,67) #continuous var.
pitch2 <- c(252,244,240,233,212,204)
my.df2 <- data.frame(age, pitch2,stringAsFactor = T)
xmdl2 <- lm(pitch2 ~ age, my.df2)
summary(xmdl2)
##
## Call:
## lm(formula = pitch2 ~ age, data = my.df2)
##
## Residuals:
## 1 2 3 4 5 6
## -2.338 -2.149 4.769 9.597 -7.763 -2.115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 267.0765 6.8522 38.98 2.59e-06 ***
## age -0.9099 0.1569 -5.80 0.00439 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.886 on 4 degrees of freedom
## Multiple R-squared: 0.8937, Adjusted R-squared: 0.8672
## F-statistic: 33.64 on 1 and 4 DF, p-value: 0.004395
#plot
plot(age, pitch2, data= my.df2)
## Warning in plot.window(...): "data"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "data"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "data"는 그래픽 매
## 개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "data"는 그래픽 매
## 개변수가 아닙니다
## Warning in box(...): "data"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "data"는 그래픽 매개변수가 아닙니다
pitch ~ sex + age + e#plot of residuals(rotated)
#잔차(residuals): 표본집단에서의 (기댓값-관측값) 차이=fitted values
plot(fitted(xmdl2), residuals(xmdl2)) # (rotated!) residual plot: 잔차(fitted vlaues)가 y=0인 x절편위에 있음.
abline(a=0,b=0, col=5, lty="dashed") #좌표에 직선을 그음 0콤마0
ex) "how average talking speed affects intelligence ratings?"에 관해 알고자 한다.
아래와 같은 회귀식 만들 수 있음.
intelligence rating ~ talking speed
talking speed를 알아보기 위해 초당 발화하는 syllables/words/sentences를 측정
--> 말을 빨리하면 초당 발화하는 syllables/words/sentences이 많아지는 것이 당연하다.
--> 그래서 초당 syllables/words/sentences를 predictors로 사용하면 collinearity 문제가 생김!
plot(rnorm(100),rnorm(100))
hist(residuals(xmdl2)) #히스토그램으로 확인 #ideal shape: bell 모양
qqnorm(residuals(xmdl2)) #qqplot으로 확인 #ideal shape: 정규분포처럼 1자
dfbeta(xmdl) #DFbeta 값
## (Intercept) sexmale
## 1 3.333333e+00 -3.333333
## 2 -1.116667e+01 11.166667
## 3 7.833333e+00 -7.833333
## 4 -1.359740e-16 1.000000
## 5 1.087792e-15 -8.000000
## 6 -9.518180e-16 7.000000
politness <- read.csv("C:\\Users\\csjja\\Desktop\\ChosunSL-main\\politeness_data.csv", header=T)
summary(politness)
## subject gender scenario attitude
## Length:84 Length:84 Min. :1 Length:84
## Class :character Class :character 1st Qu.:2 Class :character
## Mode :character Mode :character Median :4 Mode :character
## Mean :4
## 3rd Qu.:6
## Max. :7
##
## frequency
## Min. : 82.2
## 1st Qu.:131.6
## Median :203.9
## Mean :193.6
## 3rd Qu.:248.6
## Max. :306.8
## NA's :1
politness$scenario<- as.factor(politness$scenario)
class(politness$scenario)
## [1] "factor"
which(is.na(politness$frequency)) #방법1
## [1] 39
which(!complete.cases(politness)) #방법2
## [1] 39
boxplot(frequency ~ attitude*gender, col=c("white","lightgray"),politness)
library(lme4)
## Warning: 패키지 'lme4'는 R 버전 4.2.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: Matrix
politeness.model <- lmer(frequency ~ attitude + (1|subject) + (1|scenario), data=politness)
summary(politeness.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: frequency ~ attitude + (1 | subject) + (1 | scenario)
## Data: politness
##
## REML criterion at convergence: 793.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2006 -0.5817 -0.0639 0.5625 3.4385
##
## Random effects:
## Groups Name Variance Std.Dev.
## scenario (Intercept) 219 14.80
## subject (Intercept) 4015 63.36
## Residual 646 25.42
## Number of obs: 83, groups: scenario, 7; subject, 6
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 202.588 26.754 7.572
## attitudepol -19.695 5.585 -3.527
##
## Correlation of Fixed Effects:
## (Intr)
## attitudepol -0.103
politeness.model = lmer(frequency ~ attitude + gender + (1|subject) + (1|scenario), data=politness)
summary(politeness.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: frequency ~ attitude + gender + (1 | subject) + (1 | scenario)
## Data: politness
##
## REML criterion at convergence: 775.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2591 -0.6236 -0.0772 0.5388 3.4795
##
## Random effects:
## Groups Name Variance Std.Dev.
## scenario (Intercept) 219.5 14.81
## subject (Intercept) 615.6 24.81
## Residual 645.9 25.41
## Number of obs: 83, groups: scenario, 7; subject, 6
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 256.846 16.116 15.938
## attitudepol -19.721 5.584 -3.532
## genderM -108.516 21.013 -5.164
##
## Correlation of Fixed Effects:
## (Intr) atttdp
## attitudepol -0.173
## genderM -0.652 0.004
<추가 정보>
lmetest 사용하면 p-vlaue볼 수 있음.