Data 1: sex, pitch, my.df

  1. Create dummy data
pitch <- c(233,204,242,130,112,142) #categorial var.
sex <- c(rep("female", 3), rep("male",3))
my.df <- data.frame(sex, pitch); my.df
##      sex pitch
## 1 female   233
## 2 female   204
## 3 female   242
## 4   male   130
## 5   male   112
## 6   male   142


  1. Change vector name
data.frame(gender = sex, pitch)
##   gender pitch
## 1 female   233
## 2 female   204
## 3 female   242
## 4   male   130
## 5   male   112
## 6   male   142


  1. Make linear model
xmdl <- lm(pitch ~ sex, my.df)
# 모델
summary(xmdl)
## 
## Call:
## lm(formula = pitch ~ sex, data = my.df)
## 
## Residuals:
##       1       2       3       4       5       6 
##   6.667 -22.333  15.667   2.000 -16.000  14.000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   226.33      10.18  22.224 2.43e-05 ***
## sexmale       -98.33      14.40  -6.827  0.00241 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.64 on 4 degrees of freedom
## Multiple R-squared:  0.921,  Adjusted R-squared:  0.9012 
## F-statistic: 46.61 on 1 and 4 DF,  p-value: 0.002407


  1. Calculate mean in terms of sex (sex지정하여)
with(my.df, mean(pitch[sex=='female']))
## [1] 226.3333
mean(my.df[my.df$sex=="female",]$pitch)
## [1] 226.3333


  1. dplyr 활용하여 sex별 pitch 평균 한꺼번에 보기
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.2에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
tapply(my.df$pitch, sex, mean)
##   female     male 
## 226.3333 128.0000
my.df %>% 
  group_by(sex) %>% 
  summarise(mean=mean(pitch))
## # A tibble: 2 × 2
##   sex     mean
##   <chr>  <dbl>
## 1 female  226.
## 2 male    128


Data 2: age, pitch2, my.df2

  1. age에 따른 pitch 변화보기_make data
age <- c(14,23,35,48,52,67) #continuous var.
pitch2 <- c(252,244,240,233,212,204)
my.df2 <- data.frame(age, pitch2,stringAsFactor = T)
xmdl2 <- lm(pitch2 ~ age, my.df2)
summary(xmdl2)
## 
## Call:
## lm(formula = pitch2 ~ age, data = my.df2)
## 
## Residuals:
##      1      2      3      4      5      6 
## -2.338 -2.149  4.769  9.597 -7.763 -2.115 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 267.0765     6.8522   38.98 2.59e-06 ***
## age          -0.9099     0.1569   -5.80  0.00439 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.886 on 4 degrees of freedom
## Multiple R-squared:  0.8937, Adjusted R-squared:  0.8672 
## F-statistic: 33.64 on 1 and 4 DF,  p-value: 0.004395
#plot
plot(age, pitch2, data= my.df2)
## Warning in plot.window(...): "data"는 그래픽 매개변수가 아닙니다
## Warning in plot.xy(xy, type, ...): "data"는 그래픽 매개변수가 아닙니다
## Warning in axis(side = side, at = at, labels = labels, ...): "data"는 그래픽 매
## 개변수가 아닙니다

## Warning in axis(side = side, at = at, labels = labels, ...): "data"는 그래픽 매
## 개변수가 아닙니다
## Warning in box(...): "data"는 그래픽 매개변수가 아닙니다
## Warning in title(...): "data"는 그래픽 매개변수가 아닙니다


linear model이 되기 위한 조건

  1. Linearity
#plot of residuals(rotated)
#잔차(residuals): 표본집단에서의 (기댓값-관측값) 차이=fitted values

plot(fitted(xmdl2), residuals(xmdl2)) #  (rotated!) residual plot: 잔차(fitted vlaues)가 y=0인 x절편위에 있음.
abline(a=0,b=0, col=5, lty="dashed") #좌표에 직선을 그음 0콤마0


  1. Absence of collinearity=공산성이 없어야 함
ex) "how average talking speed affects intelligence ratings?"에 관해 알고자 한다.
아래와 같은 회귀식 만들 수 있음.
intelligence rating ~ talking speed

talking speed를 알아보기 위해 초당 발화하는 syllables/words/sentences를 측정
  --> 말을 빨리하면 초당 발화하는 syllables/words/sentences이 많아지는 것이 당연하다.
    --> 그래서 초당 syllables/words/sentences를 predictors로 사용하면 collinearity 문제가 생김!


  1. Homoskedasticity(등분산성) OR ‘absence of heteroskedasticity(이분산성)’
plot(rnorm(100),rnorm(100))


  1. Normality of residuals
hist(residuals(xmdl2)) #히스토그램으로 확인 #ideal shape: bell 모양

qqnorm(residuals(xmdl2)) #qqplot으로 확인 #ideal shape: 정규분포처럼 1자


  1. Absence of influential data points
dfbeta(xmdl) #DFbeta 값
##     (Intercept)   sexmale
## 1  3.333333e+00 -3.333333
## 2 -1.116667e+01 11.166667
## 3  7.833333e+00 -7.833333
## 4 -1.359740e-16  1.000000
## 5  1.087792e-15 -8.000000
## 6 -9.518180e-16  7.000000


  1. Indepencence !


Data 3: Politness(subject, gender, scenario, attitude, freq)

  1. Import Politeness Data
politness <- read.csv("C:\\Users\\csjja\\Desktop\\ChosunSL-main\\politeness_data.csv", header=T)
summary(politness)
##    subject             gender             scenario   attitude        
##  Length:84          Length:84          Min.   :1   Length:84         
##  Class :character   Class :character   1st Qu.:2   Class :character  
##  Mode  :character   Mode  :character   Median :4   Mode  :character  
##                                        Mean   :4                     
##                                        3rd Qu.:6                     
##                                        Max.   :7                     
##                                                                      
##    frequency    
##  Min.   : 82.2  
##  1st Qu.:131.6  
##  Median :203.9  
##  Mean   :193.6  
##  3rd Qu.:248.6  
##  Max.   :306.8  
##  NA's   :1


  1. scenario를 factor로 다시 지정
politness$scenario<- as.factor(politness$scenario)
class(politness$scenario)
## [1] "factor"


  1. missing value 확인
which(is.na(politness$frequency)) #방법1
## [1] 39
which(!complete.cases(politness)) #방법2
## [1] 39


  1. 공손성(태도)와 피치 간의 관계를 평균을 통해 boxplot으로 보기
boxplot(frequency ~ attitude*gender, col=c("white","lightgray"),politness)


  1. 모델 만들기(ver1)
library(lme4)
## Warning: 패키지 'lme4'는 R 버전 4.2.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: Matrix
politeness.model <- lmer(frequency ~ attitude + (1|subject) + (1|scenario), data=politness)
summary(politeness.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: frequency ~ attitude + (1 | subject) + (1 | scenario)
##    Data: politness
## 
## REML criterion at convergence: 793.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2006 -0.5817 -0.0639  0.5625  3.4385 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  scenario (Intercept)  219     14.80   
##  subject  (Intercept) 4015     63.36   
##  Residual              646     25.42   
## Number of obs: 83, groups:  scenario, 7; subject, 6
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  202.588     26.754   7.572
## attitudepol  -19.695      5.585  -3.527
## 
## Correlation of Fixed Effects:
##             (Intr)
## attitudepol -0.103


  1. 모델 만들기(ver2)
politeness.model = lmer(frequency ~ attitude + gender + (1|subject) + (1|scenario), data=politness)
summary(politeness.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: frequency ~ attitude + gender + (1 | subject) + (1 | scenario)
##    Data: politness
## 
## REML criterion at convergence: 775.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2591 -0.6236 -0.0772  0.5388  3.4795 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  scenario (Intercept) 219.5    14.81   
##  subject  (Intercept) 615.6    24.81   
##  Residual             645.9    25.41   
## Number of obs: 83, groups:  scenario, 7; subject, 6
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  256.846     16.116  15.938
## attitudepol  -19.721      5.584  -3.532
## genderM     -108.516     21.013  -5.164
## 
## Correlation of Fixed Effects:
##             (Intr) atttdp
## attitudepol -0.173       
## genderM     -0.652  0.004


<추가 정보>