2차 : 15P24N 총 39개의 용어에서 30개월 이상의 AoA를 빼서
P14N18 총 32개

1. Installing packages and importing data.

# rm(list=ls())
# install.packages('readxl')
library(readxl)
# install.packages('dplyr')
library(dplyr)
# install.packages("lme4")
library(lme4)
# install.packages("lmerTest")
library(lmerTest)
# install.packages("ggplot2")
library(ggplot2)
# install.packages("sciplot")
library(sciplot)
# install.packages("openxlsx")
library("openxlsx")
# rm(list=ls())

directory 설정

# Show the current working directory, then point it at the project data folder.
getwd()
## [1] "C:/Users/csjja/Desktop/pn_data"
# NOTE(review): hard-coded, machine-specific absolute path; the relative file
# names read later in this script are resolved against it.
setwd("C:\\Users\\csjja\\Desktop\\pn_data")
# List the files available in the data folder.
dir()
##  [1] "~$긍부정.xlsx"                           
##  [2] "1_cdi_merge_0416.csv"                    
##  [3] "1_df.csv"                                
##  [4] "1_pn_aoa.csv"                            
##  [5] "14P21N_0419.html"                        
##  [6] "14P21N_0419.Rmd"                         
##  [7] "230221_영어학세미나_프로젝트 계획서.pptx"
##  [8] "2303_final_pn_anlaysis_revise_code.R"    
##  [9] "29mOUT_P11N16.html"                      
## [10] "29mOUT_P11N16.Rmd"                       
## [11] "30mOUT_P14N18.html"                      
## [12] "30mOUT_P14N18.Rmd"                       
## [13] "aoa_join_230405.R"                       
## [14] "cdi_merge_0407.csv"                      
## [15] "cdi_merge_0411.csv"                      
## [16] "cdi_merge_0416.csv"                      
## [17] "df.csv"                                  
## [18] "ex_Data"                                 
## [19] "female_WG_CDI_comprehension_norms.csv"   
## [20] "female_WS_CDI_production_norms.csv"      
## [21] "final_CDI_result.xlsx"                   
## [22] "final_pnData_230411.html"                
## [23] "final_pnData_230411.Rmd"                 
## [24] "kr_aoa_with_percentilel(edited).xlsx"    
## [25] "male_WG_CDI_comprehension_norms.csv"     
## [26] "male_WS_CDI_production_norms.csv"        
## [27] "norm_Data"                               
## [28] "percentileCal_2023_hyeonah.R"            
## [29] "PercentileCalculation_20211110.R"        
## [30] "pic"                                     
## [31] "pn_aoa.csv"                              
## [32] "pn_newdata.R"                            
## [33] "pnValue_check.R"                         
## [34] "ReadMe"                                  
## [35] "rsconnect"                               
## [36] "SESCDI.csv"                              
## [37] "SESCDI_230407.csv"                       
## [38] "Ttest.pptx"                              
## [39] "Ttest.R"                                 
## [40] "wd_Data"                                 
## [41] "긍부정.xlsx"                             
## [42] "긍부정_forsave.xlsx"

2. CDI data import and clean

# cdi data import
# Load the raw CDI responses: the first row holds the column headers and the
# literal string "NA" marks missing cells.
cdi <- read_excel(
  "final_CDI_result.xlsx",
  sheet = "시트1",
  col_names = TRUE,
  na = "NA"
)

# Give the three identifying columns English names in a single call.
cdi <- rename(
  cdi,
  "subject" = "아동 이름",
  "Birthdate" = "아동 생일",
  "TestingDate" = "검사 날짜"
)

# Strip digits, then punctuation, then spaces from every column name.
names(cdi) <- gsub(
  " ", "",
  gsub("[[:punct:]]", "", gsub("[0-9]", "", names(cdi)))
)
names(cdi)
##  [1] "타임스탬프"     "아동성별"       "subject"        "Birthdate"     
##  [5] "TestingDate"    "아동연령"       "보호자연락처"   "보호자이메일"  
##  [9] "소리"           "탈것"           "장난감및문구류" "동물"          
## [13] "옷"             "가구및방안"     "음식"           "신체부위"      
## [17] "가정용품"       "외부사물"       "일상생활"       "장소"          
## [21] "양정도"         "사람"           "의문사"         "동사"          
## [25] "형용사"         "끝맺는말"       "조사"           "연결하는말"    
## [29] "위치"           "시간"           "대명사"         "돕는말"        
## [33] "표현점수"
# Merge the three word categories of interest into one space-separated string
# per child.
cdi_categ_words <- paste(cdi$`일상생활`, cdi$`동사`, cdi$`형용사`)

# Participant identifiers, coerced to character.
subject <- as.character(cdi$subject)
Birthdate <- as.character(cdi$Birthdate)
TestingDate <- as.character(cdi$TestingDate)

# Participants plus their merged category words.
df <- data.frame(subject, Birthdate, TestingDate, cdi_categ_words)

# NA check: total number of missing cells (0 in the rendered run).
sum(is.na(df))
## [1] 0

3. P(ositive)/N(egative) data import and clean

# Load the positive/negative word list: the first row holds the headers and the
# literal string "NA" marks missing cells.
pn <- read_excel(
  path = "긍부정.xlsx",
  sheet = "30mOUT_p14n18_prod",
  # range = "A1:D35",
  col_names = TRUE,
  na = "NA"
)
# Inspect the column types and the first few values.
str(pn)
## tibble [32 × 4] (S3: tbl_df/tbl/data.frame)
##  $ ...1          : num [1:32] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Words         : chr [1:32] "고마워" "놀아" "괜찮아" "귀여워" ...
##  $ SentiWord_Dict: num [1:32] 2 1 1 2 2 2 2 1 2 1 ...
##  $ P/N           : chr [1:32] "P" "P" "P" "P" ...
# List the distinct polarity labels in the word list.
unique(pn$`P/N`) # check the values of the variable
## [1] "P" "N"
# NOTE(review): `categ_merged` is not among the columns listed by str(df)
# further down, which would explain the empty logical(0) result printed here;
# `cdi_categ_words` may have been intended -- confirm.
is.na(unique(df$'categ_merged'))
## logical(0)
# Split the P/N data into positive and negative words.
# (1) Direct assignment -- used to calculate the Likert score.
# Each weight sits at the same position as its word.
positive <- c(
  "고마워", "놀아", "괜찮아", "귀여워", "맛있어", "사랑해", "예뻐",
  "웃어", "재미있어", "조용해", "좋아해", "춤춰", "도와", "안아"
)
positive_weights <- c(
  2, 1, 1, 2, 2, 2, 2,
  1, 2, 1, 2, 1, 1, 1
)
negative <- c(
  "더러워", "때려", "무거워", "무서워", "미워해", "숨어",
  "시끄러워", "싫어", "아파", "안돼", "없어", "울어",
  "추워", "혼나", "힘들어", "간지러워", "더워", "버려"
)
negative_weights <- c(
  -2, -1, -2, -2, -2, -1,
  -2, -2, -2, -1, -1, -1,
  -1, -2, -2, -1, -1, -2
)

# (2) Split by row range -- used to count the number of P/N words.
# NOTE(review): the 1:14 / 15:32 cut-offs assume the sheet lists the 14 positive
# words first, followed by the 18 negative words -- confirm against the sheet.
pn_pos <- pn[1:14, ]
pn_neg <- pn[15:32, ]

# For every child, keep the P/N words that occur among their CDI words.
# Commas and runs of spaces both act as token separators. Matched words are
# stored comma-separated in word-list order, or NA when nothing matches.
# vapply() handles a zero-row df safely (1:length() would iterate over 1 and 0)
# and avoids shadowing base::list with a local variable.
df$pn_words <- vapply(
  df$cdi_categ_words,
  function(raw_words) {
    tokens <- strsplit(gsub(" +", " ", gsub(",", " ", raw_words)), " ")[[1]]
    matched <- pn$Words[pn$Words %in% tokens]
    if (length(matched) == 0) {
      NA_character_
    } else {
      paste(matched, collapse = ",") # paste the matching words together
    }
  },
  character(1),
  USE.NAMES = FALSE
)

4. Calculating the number of P/N words and the P/N score.

# Normalised positive-minus-negative balance of the words a child knows.
#
# x: character vector of words (each word is counted once, via intersect()).
# Relies on the script-level vectors `positive` and `negative`.
#
# Returns (p - n) / (p + n), where p and n are the shares of the positive and
# negative word lists found in x: 1 = only positive words, -1 = only negative
# words, 0 = equal shares. Returns NaN when x contains no listed word (0 / 0).
pos_neg_wordcount <- function(x) {
  pos_prop <- length(intersect(x, positive)) / length(positive)
  neg_prop <- length(intersect(x, negative)) / length(negative)
  # The expression is already signed, so the former if/else after the
  # unconditional return() was unreachable and has been removed.
  (pos_prop - neg_prop) / (pos_prop + neg_prop)
}
# 
# Word-count balance per child, computed from the comma-separated pn_words.
# The column is addressed by name instead of by position (it was df[i, 5]), and
# vapply() copes with a zero-row df, which 1:length() did not.
# The value is kept as text, as before; the script converts it with
# as.numeric() afterwards.
df$pn_wordcount <- vapply(
  strsplit(as.character(df$pn_words), ","),
  function(words) paste(pos_neg_wordcount(words)),
  character(1)
)
# Weighted (Likert-style) positive-minus-negative balance of a child's words.
#
# x: character vector of words; a repeated word is counted once.
# Relies on the script-level vectors `positive`, `positive_weights`,
# `negative` and `negative_weights`.
#
# Returns (p - n) / (p + n), where p and n are the matched share of the total
# positive weight and of the total absolute negative weight. Returns NaN when
# x contains no listed word (0 / 0).
pos_neg_index <- function(x) {
  # De-duplicate so a repeated word is not weighted twice, consistent with the
  # intersect()-based pos_neg_wordcount().
  x <- unique(x)
  pos_prop <- sum(positive_weights[match(x, positive)], na.rm = TRUE) /
    sum(positive_weights)
  neg_prop <- abs(sum(negative_weights[match(x, negative)], na.rm = TRUE)) /
    abs(sum(negative_weights))
  # Already signed: the former if/else computed the same value in both branches.
  (pos_prop - neg_prop) / (pos_prop + neg_prop)
}
# Weighted P/N score per child, computed from the comma-separated pn_words.
# The column is addressed by name instead of by position (it was df[i, 5]), and
# vapply() copes with a zero-row df, which 1:length() did not.
# The value is kept as text, as before; the script converts it with
# as.numeric() afterwards.
df$pn_score <- vapply(
  strsplit(as.character(df$pn_words), ","),
  function(words) paste(pos_neg_index(words)),
  character(1)
)
# Parse both indices as numbers, then re-format them as text rounded to two
# decimal places.
df$pn_wordcount <- sprintf("%.2f", as.numeric(df$pn_wordcount))
df$pn_score <- sprintf("%.2f", as.numeric(df$pn_score))

# Structure check; the rendered output below reports 262 observations.
str(df)
## 'data.frame':    262 obs. of  7 variables:
##  $ subject        : chr  "추제니" "양리온(P02)" "김서하" "전시우(P06)" ...
##  $ Birthdate      : chr  "2020-07-06" "2018-04-03" "2018-05-30" "2017-11-11" ...
##  $ TestingDate    : chr  "2022-05-27" "2020-02-19" "2020-02-20" "2020-05-02" ...
##  $ cdi_categ_words: chr  "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 하지마 가, 가리켜, 가져, 간지"| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마 가, 간지럽혀, "| __truncated__ "네/응, 빠이빠이, 아니(야), 안녕 (통에)넣어, 마셔, 박수쳐, 뽀뽀해, 사랑해, 앉아, 일어나/일어서, (잠)자 더러워, "| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마 가, 가리켜, 가"| __truncated__ ...
##  $ pn_words       : chr  "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,좋아해,춤춰,도와,안아,더러워,때려,숨어,싫어,아파,안"| __truncated__ "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,조용해,좋아해,춤춰,도와,안아,더러워,때려,무거워,무서"| __truncated__ "사랑해,더러워,아파,추워" "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,조용해,좋아해,춤춰,도와,안아,더러워,때려,무거워,무서"| __truncated__ ...
##  $ pn_wordcount   : chr  "0.16" "0.03" "-0.40" "0.00" ...
##  $ pn_score       : chr  "0.19" "0.04" "-0.30" "0.00" ...

5. CDI percentile data import

# Read text files as UTF-8 from here on.
options(encoding = "UTF-8")
# CDI percentile table; the first line of the file holds the column names.
# `header = TRUE` is spelled out instead of the partial match `head=T`.
cdi_percentile <- read.csv("cdi_merge_0416.csv", header = TRUE) # 264 obs

# NA check
sum(is.na(cdi_percentile))
## [1] 0
# Check that the identifier columns are stored the same way in both tables.
mode(cdi_percentile$TestingDate) # character
## [1] "character"
mode(cdi_percentile$Birthdate) # character
## [1] "character"
mode(cdi_percentile$subject) # character
## [1] "character"
mode(df$TestingDate) # character
## [1] "character"
mode(df$Birthdate) # character
## [1] "character"
mode(df$subject) # character
## [1] "character"
# `key` is not a documented left_join() argument, so it never selected the join
# column. Spell out the natural join (every column the two tables share), which
# is what the default `by = NULL` falls back to, so the keys are explicit and
# no stray argument is passed.
df <- left_join(
  df,
  cdi_percentile,
  by = intersect(names(df), names(cdi_percentile))
)

6. Analysis

정규분포 확인(not working with Rmarkdown)

#정규분포 확인
# qqnorm(df$pn_wordcount, ylab="pn_wordcount") ; qqline(df$pn_wordcount, col='red')
# qqnorm(df$pn_score, ylab="pn_score") ; qqline(df$pn_score, col='blue')

Hypothesis 1: 감정어 개수에 따른 백분위 변화

  • dependent var.: CDI percentile
  • fixed effect: number of P/N words
  • 긍정단어 비중(pn_wordcount)이 높은 아동일수록 cdi percentile이 높다. **
  • 즉, 부정단어 대비 긍정단어를 많이 알수록 백분위가 증가한다 (기울기 14.32, p = 0.005).
# Disabled alternative: log-transformed predictor.
# lm(eachPercentile ~ log(pn_wordcount + 0.000001) , data=df) -> lm_fit2;summary(lm_fit2)
# Both indices were stored as formatted text; convert them back to numbers.
df$pn_score <- as.numeric(df$pn_score)
df$pn_wordcount <- as.numeric(df$pn_wordcount)
# Simple linear regression of CDI percentile on the word-count index.
lm_fit1 <- lm(eachPercentile ~ pn_wordcount, data = df)
summary(lm_fit1)
## 
## Call:
## lm(formula = eachPercentile ~ pn_wordcount, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.805 -23.328   0.564  25.753  42.645 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    56.355      1.881  29.956  < 2e-16 ***
## pn_wordcount   14.321      5.094   2.811  0.00537 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.25 on 224 degrees of freedom
##   (결측으로 인하여 36개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.03408,    Adjusted R-squared:  0.02977 
## F-statistic: 7.903 on 1 and 224 DF,  p-value: 0.005372
# List the components stored in the fitted model object and its class.
attributes(lm_fit1)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "lm"
coef(lm_fit1) # slope and intercept
##  (Intercept) pn_wordcount 
##     56.35522     14.32141
  • graph
# Scatter plot of CDI percentile against the word-count index; point size also
# tracks the index. The colour is a fixed setting, so it sits outside aes():
# inside aes() the string "red" was treated as a data value, not a colour.
# Columns are referenced by bare name rather than through df$.
ggplot(data = df, aes(x = pn_wordcount, y = eachPercentile)) +
  geom_point(aes(size = pn_wordcount), color = "red")

# Base-graphics scatter plot with the fitted regression line drawn on top.
plot(
  x = df$pn_wordcount,
  y = df$eachPercentile,
  main = "감정어개수 변화에 따른 CDI percentile 변화",
  xlab = "감정어 개수",
  ylab = "CDI percentile"
)
abline(coef(lm_fit1))

# Diagnostic plots for the fitted model.
plot(lm_fit1)

#잔차들이(residuals) 특정한 패턴이 없이 0 근처로 모여있으면 모여있을수록 좋음. 
#0 근처로 모여있다는 것은 예측 값들이 회귀분석 직선을 중심으로 위아래로 잘 퍼져있다는 것이며, 특정 예측값들에 영향을 받지 않고 골고루 분포하고 있을수록 세운 회귀분석 식이 잘 작동한다는 것을 나타냄.

Hypothesis 2: 감정점수에 따른 백분위 변화

  • dependent var.: CDI percentile
  • fixed effect: P/N score
  • 긍정점수가 높은 아동일수록 cdi percentile이 높다 (기울기 12.06, p = 0.017). *
# Disabled alternative: log-transformed predictor.
# lm(eachPercentile ~ log(pn_score + 0.000001) , data=df) -> lm_fit4;summary(lm_fit4)
# Make sure the score is numeric before modelling.
df$pn_score <- as.numeric(df$pn_score)
# Simple linear regression of CDI percentile on the weighted P/N score.
lm_fit3 <- lm(eachPercentile ~ pn_score, data = df)
summary(lm_fit3)
## 
## Call:
## lm(formula = eachPercentile ~ pn_score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.585 -23.851   1.596  26.133  42.926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   56.074      1.901  29.497   <2e-16 ***
## pn_score      12.057      5.002   2.411   0.0167 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.38 on 224 degrees of freedom
##   (결측으로 인하여 36개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.02528,    Adjusted R-squared:  0.02093 
## F-statistic: 5.811 on 1 and 224 DF,  p-value: 0.01674
# List the components stored in the fitted model object and its class.
attributes(lm_fit3)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "lm"
coef(lm_fit3) # slope and intercept
## (Intercept)    pn_score 
##    56.07422    12.05744
  • graph
# Scatter plot of CDI percentile against the weighted P/N score; point size
# also tracks the score. The colour is a fixed setting, so it sits outside
# aes(): inside aes() the string "blue" was treated as a data value, not a
# colour. Columns are referenced by bare name rather than through df$.
ggplot(data = df, aes(x = pn_score, y = eachPercentile)) +
  geom_point(aes(size = pn_score), color = "blue")

# Base-graphics scatter plot with the fitted regression line drawn on top.
plot(
  x = df$pn_score,
  y = df$eachPercentile,
  main = "감정어 점수 변화에 따른 CDI percentile 변화",
  xlab = "감정어 점수",
  ylab = "CDI percentile"
)
abline(coef(lm_fit3))

# Diagnostic plots for the fitted model.
plot(lm_fit3)

Hypothesis 3: 성별에 따른 백분위 변화

  • dependent var.: 표현낱말 백분위
  • fixed effect: Gender
  • no difference
# Disabled alternative: log-transformed response.
# summary(lm(log(eachPercentile +0.00001) ~ Gender , data=df))
# Linear model of CDI percentile on gender.
lm_fit5 <- lm(eachPercentile ~ Gender, data = df)
summary(lm_fit5)
## 
## Call:
## lm(formula = eachPercentile ~ Gender, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.631 -27.400   1.293  30.293  47.293 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   53.631      2.716  19.743   <2e-16 ***
## GenderM       -2.924      3.716  -0.787    0.432    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30 on 260 degrees of freedom
## Multiple R-squared:  0.002376,   Adjusted R-squared:  -0.001461 
## F-statistic: 0.6191 on 1 and 260 DF,  p-value: 0.4321

Hypothesis 4: 성별에 따른 긍정단어개수 변화

  • dependent var.: pn_wordcount
  • fixed effect: Gender
  • no difference
# Disabled alternative: log-transformed response.
# summary(lm(log(pn_wordcount +0.000001) ~ Gender , data=df))
# Linear model of the word-count index on gender.
lm_fit6 <- lm(pn_wordcount ~ Gender, data = df)
summary(lm_fit6)
## 
## Call:
## lm(formula = pn_wordcount ~ Gender, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02127 -0.08965 -0.01127  0.10422  0.98422 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.021273   0.035323   0.602    0.548
## GenderM     -0.005497   0.049304  -0.111    0.911
## 
## Residual standard error: 0.3705 on 224 degrees of freedom
##   (결측으로 인하여 36개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  5.549e-05,  Adjusted R-squared:  -0.004409 
## F-statistic: 0.01243 on 1 and 224 DF,  p-value: 0.9113

Hypothesis 5: 성별에 따른 긍정점수 변화

  • dependent var.: pn score
  • fixed effect: Gender
  • no difference
# Disabled alternative: log-transformed response.
# summary(lm(log(pn_score +0.0000001) ~ Gender , data=df))
# Linear model of the weighted P/N score on gender.
lm_fit7 <- lm(pn_score ~ Gender, data = df)
summary(lm_fit7)
## 
## Call:
## lm(formula = pn_score ~ Gender, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05145 -0.09931 -0.00931  0.10069  0.96069 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.05145    0.03613   1.424    0.156
## GenderM     -0.01214    0.05044  -0.241    0.810
## 
## Residual standard error: 0.379 on 224 degrees of freedom
##   (결측으로 인하여 36개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.0002588,  Adjusted R-squared:  -0.004204 
## F-statistic: 0.05798 on 1 and 224 DF,  p-value: 0.8099