data and package import

# Read the sentence data CSV into `align`; the first row holds column names.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists — consider a project-relative path.
align <- read.csv(
  "C:\\Users\\csjja\\Desktop\\sentence_data_20220222.csv",
  header = TRUE
)
# install.packages("dplyr")
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.2에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# install.packages("lme4")
library(lme4)
## Warning: 패키지 'lme4'는 R 버전 4.2.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: Matrix
# Replace TextGrid file names in align$ID ("<n>_complete.TEXTGRID") with
# participant codes. Names are the file numbers, values the codes.
# NOTE(review): there is no entry for file 10, and the codes skip A1P03 —
# confirm whether a participant is missing from this table.
id_lookup <- c(
  "2" = "A0P01M", "3" = "A2P01F", "4" = "A1P01M", "5" = "A2P02M",
  "6" = "A0P02F", "7" = "A0P03F", "8" = "A1P02M", "9" = "A2P03F",
  "11" = "A1P04M", "12" = "A2P04M", "13" = "A0P04M", "14" = "A2P05M",
  "15" = "A1P05F", "16" = "A1P06F", "17" = "A2P06M", "18" = "A2P07M",
  "19" = "A2P08M", "20" = "A2P09F", "21" = "A0P05M", "22" = "A2P10F",
  "23" = "A1P07F", "24" = "A1P08M", "25" = "A1P09M", "26" = "A2P11M",
  "27" = "A0P06M", "28" = "A0P07M", "29" = "A0P08M", "30" = "A0P09F",
  "31" = "A0P10F", "32" = "A1P10F", "33" = "A2P12M", "34" = "A1P11F",
  "35" = "A0P11M", "36" = "A0P12M", "37" = "A1P12F"
)

# Every pattern is anchored at the start of the string and the dot is escaped,
# so a file number can never match inside a longer number and "." only
# matches a literal dot.
for (file_no in names(id_lookup)) {
  align$ID <- sub(
    paste0("^", file_no, "_complete\\.TEXTGRID"),
    id_lookup[[file_no]],
    align$ID
  )
}

# IDs that are still not a known participant code would otherwise flow
# silently into later steps that derive values from the ID text.
unmapped_ids <- setdiff(unique(align$ID), id_lookup)
if (length(unmapped_ids) > 0) {
  warning(
    "IDs without a participant code: ",
    paste(unmapped_ids, collapse = ", "),
    call. = FALSE
  )
}

data check

str(align) # structure of the data frame
## 'data.frame':    4209 obs. of  17 variables:
##  $ X                     : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ ID                    : chr  "A1P04M" "A1P04M" "A1P04M" "A1P04M" ...
##  $ left.right            : chr  "" "" "" "" ...
##  $ sent_end.touch_end    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_end.touch_start  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_start.touch_end  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_start.touch_start: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_end              : num  343 356 360 386 408 ...
##  $ sent_start            : num  340 356 359 385 405 ...
##  $ prev_sent_end         : num  340 343 356 360 386 ...
##  $ touch_align_sent_I_F  : chr  "" "" "" "" ...
##  $ touch_duration        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_end             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_middle          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_start           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_start           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_end             : int  NA NA NA NA NA NA NA NA NA NA ...
# Print the first rows of `align` for a quick visual check.
head(align)
##   X     ID left.right sent_end.touch_end sent_end.touch_start
## 1 0 A1P04M                            NA                   NA
## 2 1 A1P04M                            NA                   NA
## 3 2 A1P04M                            NA                   NA
## 4 3 A1P04M                            NA                   NA
## 5 4 A1P04M                            NA                   NA
## 6 5 A1P04M                            NA                   NA
##   sent_start.touch_end sent_start.touch_start sent_end sent_start prev_sent_end
## 1                   NA                     NA  343.289    339.937       339.937
## 2                   NA                     NA  356.184    356.014       343.289
## 3                   NA                     NA  360.407    358.757       356.184
## 4                   NA                     NA  385.616    384.796       360.407
## 5                   NA                     NA  407.716    404.996       385.616
## 6                   NA                     NA  425.172    423.382       407.716
##   touch_align_sent_I_F touch_duration touch_end touch_middle touch_start
## 1                                  NA        NA           NA          NA
## 2                                  NA        NA           NA          NA
## 3                                  NA        NA           NA          NA
## 4                                  NA        NA           NA          NA
## 5                                  NA        NA           NA          NA
## 6                                  NA        NA           NA          NA
##   align_start align_end
## 1          NA        NA
## 2          NA        NA
## 3          NA        NA
## 4          NA        NA
## 5          NA        NA
## 6          NA        NA

문제1. align_start와 align_end가 각각 어떤 경우에 0으로, 어떤 경우에 1로 코딩되어 있는지 파악하라.

# Recompute the alignment flags from the raw times: 1 when the sentence and
# touch boundaries differ by at most 0.5, 0 otherwise, NA when a time is
# missing. (Time unit is presumably seconds — confirm against the data source.)
align[["align_start_cal_check"]] <- with(
  align,
  ifelse(abs(sent_start - touch_start) <= 0.500, 1, 0)
)
align[["align_end_cal_check"]] <- with(
  align,
  ifelse(abs(sent_end - touch_end) <= 0.500, 1, 0)
)
str(align) # 4209 rows in total
## 'data.frame':    4209 obs. of  19 variables:
##  $ X                     : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ ID                    : chr  "A1P04M" "A1P04M" "A1P04M" "A1P04M" ...
##  $ left.right            : chr  "" "" "" "" ...
##  $ sent_end.touch_end    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_end.touch_start  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_start.touch_end  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_start.touch_start: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sent_end              : num  343 356 360 386 408 ...
##  $ sent_start            : num  340 356 359 385 405 ...
##  $ prev_sent_end         : num  340 343 356 360 386 ...
##  $ touch_align_sent_I_F  : chr  "" "" "" "" ...
##  $ touch_duration        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_end             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_middle          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ touch_start           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_start           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_end             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_start_cal_check : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ align_end_cal_check   : num  NA NA NA NA NA NA NA NA NA NA ...

문제2. 문장 시작점과 문장 끝점이 터치와 align된 비율을 나이대별로 구하라. 왼손과 오른손은 분리해서 구한다.

# The first two characters of the participant ID (e.g. "A1" in "A1P04M")
# are stored as the age label.
align[["age"]] <- substr(align[["ID"]], start = 1, stop = 2)
head(align)
##   X     ID left.right sent_end.touch_end sent_end.touch_start
## 1 0 A1P04M                            NA                   NA
## 2 1 A1P04M                            NA                   NA
## 3 2 A1P04M                            NA                   NA
## 4 3 A1P04M                            NA                   NA
## 5 4 A1P04M                            NA                   NA
## 6 5 A1P04M                            NA                   NA
##   sent_start.touch_end sent_start.touch_start sent_end sent_start prev_sent_end
## 1                   NA                     NA  343.289    339.937       339.937
## 2                   NA                     NA  356.184    356.014       343.289
## 3                   NA                     NA  360.407    358.757       356.184
## 4                   NA                     NA  385.616    384.796       360.407
## 5                   NA                     NA  407.716    404.996       385.616
## 6                   NA                     NA  425.172    423.382       407.716
##   touch_align_sent_I_F touch_duration touch_end touch_middle touch_start
## 1                                  NA        NA           NA          NA
## 2                                  NA        NA           NA          NA
## 3                                  NA        NA           NA          NA
## 4                                  NA        NA           NA          NA
## 5                                  NA        NA           NA          NA
## 6                                  NA        NA           NA          NA
##   align_start align_end align_start_cal_check align_end_cal_check age
## 1          NA        NA                    NA                  NA  A1
## 2          NA        NA                    NA                  NA  A1
## 3          NA        NA                    NA                  NA  A1
## 4          NA        NA                    NA                  NA  A1
## 5          NA        NA                    NA                  NA  A1
## 6          NA        NA                    NA                  NA  A1
# Problem 2: percentage of sentence-aligned touches per age label and hand.
#
# A touch counts as aligned when touch_align_sent_I_F is "Final" or
# "Initial". Every percentage uses the number of ALL aligned touches as its
# denominator, the same denominator the earlier hand-typed figures used.
is_aligned <- align$touch_align_sent_I_F %in% c("Final", "Initial")
n_aligned <- sum(is_aligned)
n_aligned

# Cross-tabulate the aligned touches by age label (rows) and hand (columns).
# Fixing the factor levels keeps the table 3 x 2 even when a cell is empty
# and leaves out rows whose age or hand label is not one of the listed values.
aligned_counts <- table(
  age = factor(align$age[is_aligned], levels = c("A0", "A1", "A2")),
  hand = factor(align$left.right[is_aligned], levels = c("Lhand", "Rhand"))
)
aligned_counts
# Counts recorded for the 2022-02-22 file (Lhand / Rhand):
#   A0: 277 / 641, A1: 78 / 179, A2: 36 / 119, out of 1399 aligned touches.
# NOTE(review): these six cells sum to 1330, so 69 aligned touches carry some
# other age or hand label — check the ID recoding.

# Build the result with numeric percentage columns. Combining the label and
# the numbers in one c() call would coerce the percentages to character.
merged <- data.frame(
  age = rownames(aligned_counts),
  Lhand = as.vector(aligned_counts[, "Lhand"]) / n_aligned * 100,
  Rhand = as.vector(aligned_counts[, "Rhand"]) / n_aligned * 100,
  row.names = paste0(rownames(aligned_counts), "_lr.df"),
  stringsAsFactors = FALSE
)
head(merged)
##          age            Lhand            Rhand
## A0_lr.df  A0 19.7998570407434 45.8184417441029
## A1_lr.df  A1 5.57541100786276  12.794853466762
## A2_lr.df  A2 2.57326661901358   8.506075768406

문제3. align된 비율을 왼손/오른손 여부와 나이로 예측할 수 있는지 통계 모델을 세워 분석하라.

# Keep only the rows whose touch is labelled as aligned to a sentence
# boundary ("Final" or "Initial").
align_new <- filter(
  align,
  touch_align_sent_I_F %in% c("Final", "Initial")
)

# Mixed-effects logistic regression of align_start on hand and age label,
# with a random intercept per participant ID (revised version of the model).
# NOTE(review): the recorded output lists ageA0, ageA1 and ageA2 next to the
# intercept, so `age` has a fourth level in align_new — check for IDs that
# were not recoded before interpreting the age effects.
summary(
  glmer(
    formula = align_start ~ left.right + age + (1 | ID),
    data = align_new,
    family = "binomial"
  )
)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: align_start ~ left.right + age + (1 | ID)
##    Data: align_new
## 
##      AIC      BIC   logLik deviance df.resid 
##   1376.7   1408.1   -682.3   1364.7     1393 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.6866 -0.5081 -0.4591 -0.3800  2.7866 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 0.101    0.3178  
## Number of obs: 1399, groups:  ID, 35
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -1.3103     0.4417  -2.966  0.00301 **
## left.rightRhand  -0.1710     0.1498  -1.141  0.25372   
## ageA0            -0.1865     0.4631  -0.403  0.68719   
## ageA1             0.2721     0.4799   0.567  0.57065   
## ageA2             0.3371     0.4908   0.687  0.49226   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) lft.rR ageA0  ageA1 
## lft.rghtRhn -0.139                     
## ageA0       -0.921 -0.101              
## ageA1       -0.889 -0.094  0.870       
## ageA2       -0.867 -0.106  0.852  0.823
# summary(lm(Lhand ~ age, data=merged))
# Linear model of the right-hand percentage on age label.
# `merged` may hold its percentage columns as character (it is assembled from
# vectors that mix a label with numbers), so the response is coerced to
# numeric before fitting; this is a no-op when the column is already numeric.
# NOTE(review): `merged` has one row per age label, so a model with age as a
# three-level predictor is saturated — the recorded output shows 0 residual
# degrees of freedom and NaN standard errors. This fit cannot test an age
# effect; use participant-level data if inference is needed.
summary(lm(
  Rhand ~ age,
  data = transform(merged, Rhand = as.numeric(Rhand))
))
## 
## Call:
## lm(formula = Rhand ~ age, data = merged)
## 
## Residuals:
## ALL 3 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    45.82        NaN     NaN      NaN
## ageA1         -33.02        NaN     NaN      NaN
## ageA2         -37.31        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 2 and 0 DF,  p-value: NA