# install.packages("dplyr")
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.2에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# install.packages("ggplot2")
library(ggplot2)
# install.packages("lmerTest")
library(lmerTest)
## Warning: 패키지 'lmerTest'는 R 버전 4.2.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lme4
## Warning: 패키지 'lme4'는 R 버전 4.2.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: Matrix
##
## 다음의 패키지를 부착합니다: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
# install.packages("lme4")
library(lme4)
# install.packages("sciplot")
library(sciplot)
# Load the MLU table; the preview printed below shows the columns File, age,
# sex, X.utterances, X.words, mlu, sd and register.
# NOTE(review): this absolute Windows path only resolves on the original
# author's machine; adjust it before running elsewhere.
mlu_data <- read.csv(
  "C:\\Users\\csjja\\Desktop\\data1_mlu_by_age_register.csv",
  header = TRUE
)
head(mlu_data)
## File age sex X.utterances X.words mlu sd register
## 1 11_A1P04M.cha A1 M 559 1693 3.029 1.945 CDS
## 2 12_A2P04M.cha A2 M 481 1672 3.476 2.647 CDS
## 3 13_A0P04M.cha A0 M 565 1289 2.281 1.434 CDS
## 4 14_A2P05M.cha A2 M 722 2117 2.932 1.968 CDS
## 5 15_A1P05F.cha A1 F 381 1046 2.745 1.909 CDS
## 6 16_A1P06F.cha A1 F 613 1445 2.357 1.631 CDS
# Check how File was read in before converting it.
class(mlu_data$File)
## [1] "character"
# Store File as a factor; it is used as the grouping term (1 | File) in the
# mixed models further down.
mlu_data$File <- as.factor(mlu_data$File)

# Keep only the rows whose register is "CDS".
cds_data <- mlu_data %>%
  filter(register == "CDS")

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(cds_data)
}

# Linear model of mlu on age group within the CDS subset.
LM1 <- lm(mlu ~ age, data = cds_data)
summary(LM1)
##
## Call:
## lm(formula = mlu ~ age, data = cds_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76750 -0.22516 0.04017 0.25184 0.48717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.49750 0.09305 26.839 < 2e-16 ***
## ageA1 0.09332 0.13456 0.694 0.492985
## ageA2 0.49133 0.13160 3.734 0.000736 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3224 on 32 degrees of freedom
## Multiple R-squared: 0.3281, Adjusted R-squared: 0.2861
## F-statistic: 7.812 on 2 and 32 DF, p-value: 0.001726
# Result graph (revised version): distribution of mlu for each age group,
# one colour per box.
boxplot(
  mlu ~ age,
  data = cds_data,
  col = c("green", "blue", "red")
)
# First attempt at the graph, kept for reference:
# ggplot(data=cds_data, aes(x=age, y=mlu)) + geom_line()
# Modelling note for the CDS analysis above.
# First idea: dependent variable mlu, with File taken as the fixed effect
# (in the formula below File actually enters as a random intercept):
#   age.model <- lmer(mlu ~ age + (1|File), data = mlu_data)
# Running it reported: boundary (singular) fit: see help('isSingular')
# Model used instead: dependent variable = CDS mlu,
# fixed effect (predictor) = age.

# Filter, attempt 1: select both ADS registers in a single call.
# The first version joined the two equality tests with `&`. No row can equal
# both labels at once, so that condition always returned zero rows.
# `%in%` keeps every row that matches either label.
mlu_data %>%
  filter(register %in% c("ADS_Fam", "ADS_Exp")) %>%
  head()

# Filtering one register at a time also works.
famAds <- mlu_data %>%
  filter(register == "ADS_Fam")
expAds <- mlu_data %>%
  filter(register == "ADS_Exp")

# Option 1: stack the two subsets with bind_rows().
ads_data <- famAds %>%
  bind_rows(expAds)

# Option 2 (simpler): drop the CDS rows. This assignment overwrites option 1
# and is the ads_data used from here on.
ads_data <- mlu_data %>%
  filter(register != "CDS")
# Bar chart of mlu per File, filled by register.
ggplot(data = ads_data, mapping = aes(x = File, y = mlu, fill = register)) +
  geom_col()

# Simple linear model of mlu on register (File is not modelled here).
LM2 <- lm(formula = mlu ~ register, data = ads_data)
summary(LM2)
##
## Call:
## lm(formula = mlu ~ register, data = ads_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4942 -0.8750 -0.2530 0.5824 5.3118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1229 0.2451 12.741 < 2e-16 ***
## registerADS_Fam 1.7603 0.3466 5.078 3.17e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.45 on 68 degrees of freedom
## Multiple R-squared: 0.275, Adjusted R-squared: 0.2643
## F-statistic: 25.79 on 1 and 68 DF, p-value: 3.173e-06
# Mixed model: same fixed effect as LM2 plus a random intercept for each File.
LM3 <- lmer(
  formula = mlu ~ register + (1 | File),
  data = ads_data
)
summary(LM3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mlu ~ register + (1 | File)
## Data: ads_data
##
## REML criterion at convergence: 242.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4786 -0.5159 -0.2076 0.3433 2.5727
##
## Random effects:
## Groups Name Variance Std.Dev.
## File (Intercept) 0.9773 0.9886
## Residual 1.1254 1.0609
## Number of obs: 70, groups: File, 35
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.1229 0.2451 55.9201 12.741 < 2e-16 ***
## registerADS_Fam 1.7603 0.2536 34.0000 6.942 5.3e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## rgstrADS_Fm -0.517
# Normality check: Q-Q plot of the LM3 residuals.
# Original author's reading: the points follow a straight line, so the
# residuals were judged to be normally distributed.
qqnorm(residuals(LM3))
# Reference line so that departures from normality can be judged against it.
qqline(residuals(LM3))

# Result graph (revised version): mlu by register as box plots.
ggplot(data = ads_data, aes(x = register, y = mlu, fill = register)) +
  geom_boxplot()
# First attempt at the graph, kept for reference:
# ggplot(data=ads_data, aes(x=register, y=mlu)) + geom_line()
# Note on LM3 above: the p-value reported through lmerTest shows that the
# register effect is significant.

# Data import: utterance-level speaking-rate table.
# NOTE(review): this absolute Windows path only resolves on the original
# author's machine; adjust it before running elsewhere.
data2 <- read.table(
  "C:\\Users\\csjja\\Desktop\\data2_utterancelevel_@wp@sc@o_SpeakingRate_with_all_no_xxx.txt",
  header = TRUE
)
head(data2) # data check
## file_id U_id Speaker MOT_N_words CHI_N_words MOT_N_char CHI_N_char
## 1 11_A1P04M.cha 1 MOT 3 0 7 0
## 2 11_A1P04M.cha 2 MOT 6 0 13 0
## 3 11_A1P04M.cha 3 MOT 4 0 8 0
## 4 11_A1P04M.cha 4 MOT 1 0 1 0
## 5 11_A1P04M.cha 5 MOT 7 0 12 0
## 6 11_A1P04M.cha 6 MOT 6 0 10 0
## MOT_dur CHI_dur MOT_R_words CHI_R_words MOT_R_char CHI_R_char
## 1 1.252 0 2.396 0 5.591 0
## 2 2.428 0 2.471 0 5.354 0
## 3 1.012 0 3.953 0 7.905 0
## 4 0.309 0 3.236 0 3.236 0
## 5 2.971 0 2.356 0 4.039 0
## 6 2.827 0 2.122 0 3.537 0
## MOT_N_words_ads1 CHI_N_words_ads1 MOT_N_char_ads1 CHI_N_char_ads1
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## MOT_dur_ads1 CHI_dur_ads1 MOT_R_words_ads1 CHI_R_words_ads1 MOT_R_char_ads1
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## CHI_R_char_ads1 MOT_N_words_ads2 CHI_N_words_ads2 MOT_N_char_ads2
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## CHI_N_char_ads2 MOT_dur_ads2 CHI_dur_ads2 MOT_R_words_ads2 CHI_R_words_ads2
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## MOT_R_char_ads2 CHI_R_char_ads2
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# NA check: indices of the data2 rows that contain a missing value.
which(!complete.cases(data2))
## [1] 9279 27851 32082

# m_data is the data frame used for the analyses that follow.
# mlu_data holds each File once per register (LM3 above: 70 observations in
# 35 File groups for two registers). Selecting File and age without
# de-duplicating would repeat every data2 row once per register in the merge
# and inflate the number of observations, so keep one row per File.
age_data <- mlu_data %>%
  distinct(File, age)
# Each File must map to exactly one age group for the join to be one-to-many.
stopifnot(anyDuplicated(age_data$File) == 0)

# Rename data2's first column ("file_id") to "File" so both tables share a key.
names(data2)[1] <- "File"

# Full outer join on File so that every utterance row carries its age group.
m_data <- merge(x = age_data, y = data2, by = "File", all = TRUE)
# m_data # data2 with the age column attached
# Dependent variable: MOT_R_words (in the data2 preview it equals
# MOT_N_words / MOT_dur). Fixed effect: age; random intercept: File.
summary(
  lmer(MOT_R_words ~ age + (1 | File), data = m_data)
)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: MOT_R_words ~ age + (1 | File)
## Data: m_data
##
## REML criterion at convergence: 327699
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6307 -0.8745 -0.0920 0.7025 7.4301
##
## Random effects:
## Groups Name Variance Std.Dev.
## File (Intercept) 0.05965 0.2442
## Residual 1.65234 1.2854
## Number of obs: 98061, groups: File, 35
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.53399 0.07094 32.07641 21.623 < 2e-16 ***
## ageA1 0.07731 0.10253 32.01114 0.754 0.45635
## ageA2 -0.32411 0.10023 31.94762 -3.234 0.00284 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ageA1
## ageA1 -0.692
## ageA2 -0.708 0.490
# Dependent variable: MOT_R_char (in the data2 preview it equals
# MOT_N_char / MOT_dur). Fixed effect: age; random intercept: File.
summary(
  lmer(MOT_R_char ~ age + (1 | File), data = m_data)
)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: MOT_R_char ~ age + (1 | File)
## Data: m_data
##
## REML criterion at convergence: 459520.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6308 -0.9683 -0.0351 0.7725 5.8992
##
## Random effects:
## Groups Name Variance Std.Dev.
## File (Intercept) 0.2089 0.4571
## Residual 6.3379 2.5175
## Number of obs: 98061, groups: File, 35
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.2892 0.1328 32.1015 24.760 < 2e-16 ***
## ageA1 -0.1221 0.1920 32.0301 -0.636 0.529286
## ageA2 -0.7934 0.1877 31.9606 -4.228 0.000184 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ageA1
## ageA1 -0.692
## ageA2 -0.708 0.490
# Dependent variable: MOT_dur (utterance duration; presumably seconds, to be
# confirmed). Fixed effect: age; random intercept: File.
summary(
  lmer(MOT_dur ~ age + (1 | File), data = m_data)
)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: MOT_dur ~ age + (1 | File)
## Data: m_data
##
## REML criterion at convergence: 304610.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5725 -0.7440 -0.2200 0.4539 13.2857
##
## Random effects:
## Groups Name Variance Std.Dev.
## File (Intercept) 0.04896 0.2213
## Residual 1.30568 1.1427
## Number of obs: 98061, groups: File, 35
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.32835 0.06426 32.04234 20.673 < 2e-16 ***
## ageA1 -0.20814 0.09287 31.97954 -2.241 0.0321 *
## ageA2 -0.43338 0.09078 31.91843 -4.774 3.85e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ageA1
## ageA1 -0.692
## ageA2 -0.708 0.490
# Line plots with error bars, one per measure, across the age groups.
# Word speaking rate by age group.
lineplot.CI(x.factor = m_data$age, response = m_data$MOT_R_words)
# Syllable (character) speaking rate by age group.
lineplot.CI(x.factor = m_data$age, response = m_data$MOT_R_char)
# MOT_dur by age group (the original note labels this "utterance length per
# second").
lineplot.CI(x.factor = m_data$age, response = m_data$MOT_dur)
# <여기까지는 완료. 그런데 data2 문제 2의 데이터를 체크하는 과정을 잘 모르겠음.>
# summary(lmer(MOT_R_words ~ MOT_R_words_ads1 + (1|File), data=m_data)) #종속변수: 단어발화속도, 독립변수: 가족에게
# summary(lmer(MOT_R_words ~ MOT_R_words_ads2 + (1|File), data=m_data)) #종속변수: 단어발화속도, 독립변수: 실험자에게
# summary(lmer(MOT_R_char ~ MOT_R_words_ads1 + (1|File), data=m_data)) #종속변수: 음절발화속도, 독립변수: 가족에게
# summary(lmer(MOT_R_char ~ MOT_R_words_ads2 + (1|File), data=m_data)) #종속변수: 음절발화속도, 독립변수: 실험자에게
# ggplot(data=m_data, aes(x=MOT_R_words_ads1, y=MOT_R_words)) + geom_line()
#
# ggplot(data=m_data, aes(x=MOT_R_words_ads2, y=MOT_R_words)) + geom_line()
#
# ggplot(data=m_data, aes(x=MOT_R_words_ads1, y=MOT_R_char)) + geom_line()
#
# ggplot(data=m_data, aes(x=MOT_R_words_ads2, y=MOT_R_char)) + geom_line()