Based on the mothers' responses. Reference set: P17/N15, i.e., 17 positive and 15 negative words, 32 terms in total.
# rm(list=ls())
# install.packages('readxl')
library(readxl)
# install.packages('dplyr')
library(dplyr)
# install.packages("lme4")
library(lme4)
# install.packages("lmerTest")
library(lmerTest)
# install.packages("ggplot2")
library(ggplot2)
# install.packages("sciplot")
library(sciplot)
# install.packages("openxlsx")
library("openxlsx")
# cdi data import
cdi <- read_excel("final_CDI_result.xlsx",
sheet = "시트1",
col_names = TRUE, # TRUE to use the first row as column names
na = "NA") # Character vector of strings to use for missing values
# rename key columns to English
cdi <- rename(cdi, "subject" = "아동 이름")
cdi <- rename(cdi, "Birthdate" = "아동 생일")
cdi <- rename(cdi, "TestingDate" = "검사 날짜")
# strip digits, punctuation, and spaces from the remaining column names
names(cdi) <- gsub("[0-9]", "", names(cdi))
names(cdi) <- gsub("[[:punct:]]", "", names(cdi))
names(cdi) <- gsub(" ", "", names(cdi))
names(cdi)
## [1] "타임스탬프" "아동성별" "subject" "Birthdate"
## [5] "TestingDate" "아동연령" "보호자연락처" "보호자이메일"
## [9] "소리" "탈것" "장난감및문구류" "동물"
## [13] "옷" "가구및방안" "음식" "신체부위"
## [17] "가정용품" "외부사물" "일상생활" "장소"
## [21] "양정도" "사람" "의문사" "동사"
## [25] "형용사" "끝맺는말" "조사" "연결하는말"
## [29] "위치" "시간" "대명사" "돕는말"
## [33] "표현점수"
# df data: combine the three categories (daily routines, verbs, adjectives) in each row into one string
cdi_categ_words <- paste0(cdi$일상생활, ",", cdi$동사, ",", cdi$형용사, ",") # paste with a comma at the end of each cell
cdi_categ_words <- sub(",$", "", cdi_categ_words) #remove the very last comma
as.character(cdi$`subject`) -> subject # convert subject to character
as.character(cdi$`Birthdate`) -> Birthdate # convert Birthdate to character
as.character(cdi$TestingDate) -> TestingDate # convert TestingDate to character
data.frame(subject, Birthdate, TestingDate) -> df # participants
data.frame(df, cdi_categ_words) -> df # participants + words from the three categories
#na check
sum(is.na(df)) #0
## [1] 0
# Collection of words with a P value of 1 or higher (N value of -1)
pn <- read_excel("230516_긍부정_finalDataset.xlsx",
sheet = "standard1",
# range = "A1:D35",
col_names = TRUE,
na = "NA")
str(pn)
## tibble [32 × 4] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:32] 6 12 19 27 33 39 54 72 78 83 ...
## $ Words : chr [1:32] "고마워" "사랑해" "안아" "좋아해" ...
## $ SentiWord_Dict: num [1:32] 2.6 2.7 2.3 2.58 2.33 ...
## $ P/N : chr [1:32] "P" "P" "P" "P" ...
unique(pn$`P/N`) # check the P/N labels
## [1] "P" "N"
is.na(unique(df$'cdi_categ_words'))
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# split and assign the pn data (positive/negative words)
# (1) assign by row range - to count the number of P/N words
pn_pos <- pn[1:17,]
pn_neg <- pn[18:32,]
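A quick sanity check (not part of the original pipeline) that this fixed row split matches the P/N labels read in above:
table(pn$`P/N`)            # expect P = 17, N = 15
all(pn$`P/N`[1:17] == "P") # TRUE only if rows 1-17 are the positive words
all(pn$`P/N`[18:32] == "N")# TRUE only if rows 18-32 are the negative words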
# (2) direct assignment - to calculate the Likert-type score
positive <- pn_pos$Words                  # positive reference words
positive_weights <- pn_pos$SentiWord_Dict # their sentiment weights
negative <- pn_neg$Words                  # negative reference words
negative_weights <- pn_neg$SentiWord_Dict # their sentiment weights
# match each child's reported words against the P/N list and store them in df$pn_words
for (i in 1:length(df$subject)){
  vec <- gsub(',', ' ', df$cdi_categ_words[i]) # commas -> spaces
  vec <- gsub(' +', ' ', vec)                  # collapse repeated spaces
  list <- strsplit(vec, ' ')                   # split into individual words
  if (length(pn$Words[which(as.vector(pn$Words) %in% list[[1]])]) == 0) {
    df$pn_words[i] <- NA
  } else {
    df$pn_words[i] <- paste(pn$Words[which(pn$Words %in% list[[1]])], collapse = ",") # paste the matching words
  }
}
pos_neg_wordcount <- function(x){
  # share of the positive / negative reference lists found in x
  pos_prop <- length(intersect(x, positive)) / length(positive)
  neg_prop <- length(intersect(x, negative)) / length(negative)
  # normalized difference, ranging from -1 (all negative) to 1 (all positive)
  return((pos_prop - neg_prop) / (pos_prop + neg_prop))
}
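A minimal toy check of this index, using made-up word lists rather than the actual P/N set:
toy_positive <- c("a", "b", "c") # pretend positive reference list
toy_negative <- c("d", "e")      # pretend negative reference list
toy_child <- c("a", "d", "z")    # pretend words reported for one child
pos_prop <- length(intersect(toy_child, toy_positive)) / length(toy_positive) # 1/3
neg_prop <- length(intersect(toy_child, toy_negative)) / length(toy_negative) # 1/2
(pos_prop - neg_prop) / (pos_prop + neg_prop)                                 # -0.2, slightly negative-leaning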
#
for (i in 1:length(df$subject)){
  df[i,5] -> ah                                 # column 5 = pn_words
  unlist(strsplit(as.character(ah), ',')) -> ah # split into a word vector
  pos_neg_wordcount(ah) -> ah
  df$pn_wordcount[i] <- paste(ah)
}
pos_neg_index <- function(x){
  # weighted share of positive / negative sentiment captured by the words in x
  pos_prop <- sum(positive_weights[match(x, positive)], na.rm = TRUE) / sum(positive_weights)
  neg_prop <- abs(sum(negative_weights[match(x, negative)], na.rm = TRUE)) / abs(sum(negative_weights))
  # normalized difference, ranging from -1 (fully negative) to 1 (fully positive)
  return((pos_prop - neg_prop) / (pos_prop + neg_prop))
}
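The same toy case for the weighted version, with illustrative weights (not the SentiWord_Dict values):
toy_positive <- c("a", "b", "c"); toy_pos_w <- c(2.5, 2.0, 1.5) # pretend positive words and weights
toy_negative <- c("d", "e");      toy_neg_w <- c(-2.0, -1.0)    # pretend negative words and weights
toy_child <- c("a", "d", "z")
pos_prop <- sum(toy_pos_w[match(toy_child, toy_positive)], na.rm = TRUE) / sum(toy_pos_w)           # 2.5/6
neg_prop <- abs(sum(toy_neg_w[match(toy_child, toy_negative)], na.rm = TRUE)) / abs(sum(toy_neg_w)) # 2/3
(pos_prop - neg_prop) / (pos_prop + neg_prop)                                                       # about -0.23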
for (i in 1:length(df$subject)){
  df[i,5] -> a                                # column 5 = pn_words
  unlist(strsplit(as.character(a), ',')) -> a # split into a word vector
  pos_neg_index(a) -> a
  df$pn_score[i] <- paste(a)
}
as.numeric(df$pn_wordcount) -> df$pn_wordcount
as.numeric(df$pn_score) -> df$pn_score
df$pn_wordcount <- sprintf("%.2f", df$pn_wordcount) # keep two decimal places (note: sprintf returns character)
df$pn_score <- sprintf("%.2f", df$pn_score) # keep two decimal places
str(df) #262 obs
## 'data.frame': 262 obs. of 7 variables:
## $ subject : chr "추제니" "양리온(P02)" "김서하" "전시우(P06)" ...
## $ Birthdate : chr "2020-07-06" "2018-04-03" "2018-05-30" "2017-11-11" ...
## $ TestingDate : chr "2022-05-27" "2020-02-19" "2020-02-20" "2020-05-02" ...
## $ cdi_categ_words: chr "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 하지마,가, 가리켜, 가져, 간지"| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마,가, 간지럽혀, "| __truncated__ "네/응, 빠이빠이, 아니(야), 안녕,(통에)넣어, 마셔, 박수쳐, 뽀뽀해, 사랑해, 앉아, 일어나/일어서, (잠)자,더러워, "| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마,가, 가리켜, 가"| __truncated__ ...
## $ pn_words : chr "고마워,사랑해,안아,좋아해,놀아,간지럽혀,도와,숨어,웃어,춤춰,간지러워,맛있어,예뻐,괜찮아,귀여워,재미있어,커,안돼"| __truncated__ "고마워,사랑해,안아,좋아해,놀아,간지럽혀,도와,숨어,웃어,춤춰,간지러워,맛있어,예뻐,괜찮아,귀여워,재미있어,안돼,울"| __truncated__ "사랑해,더러워,아파" "고마워,사랑해,안아,좋아해,놀아,간지럽혀,도와,숨어,웃어,춤춰,간지러워,맛있어,예뻐,괜찮아,귀여워,재미있어,커,안돼"| __truncated__ ...
## $ pn_wordcount : chr "0.20" "0.04" "-0.39" "0.00" ...
## $ pn_score : chr "0.19" "0.06" "-0.29" "0.00" ...
options(encoding = 'UTF-8')
cdi_percentile <- read.csv("cdi_merge_0416.csv", header = TRUE) #264 obs
# check NAs
sum(is.na(cdi_percentile))
## [1] 0
#check mode
mode(cdi_percentile$TestingDate) #character
## [1] "character"
mode(cdi_percentile$Birthdate) #character
## [1] "character"
mode(cdi_percentile$subject) #character
## [1] "character"
mode(df$TestingDate) #character
## [1] "character"
mode(df$Birthdate) #character
## [1] "character"
mode(df$subject) #character
## [1] "character"
df$Index <- 1:nrow(df) # add a row index to join on
df <- left_join(df, cdi_percentile, by = "Index")
mode(df$subject)
## [1] "NULL"
mode(cdi_percentile$subject)
## [1] "character"
# check for normality
# qqnorm(df$pn_wordcount, ylab="pn_wordcount") ; qqline(df$pn_wordcount, col='red')
# qqnorm(df$pn_score, ylab="pn_score") ; qqline(df$pn_score, col='blue')
# lm(eachPercentile ~ log(pn_wordcount + 0.000001) , data=df) -> lm_fit2;summary(lm_fit2)
as.numeric(df$pn_score)->df$pn_score
df$pn_wordcount <- as.numeric(df$pn_wordcount)
mode(df$AgeAtCDI)
## [1] "numeric"
#final result
summary(lm(eachPercentile ~ pn_wordcount , data=df))
##
## Call:
## lm(formula = eachPercentile ~ pn_wordcount, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.703 -22.142 0.772 25.880 42.892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.108 1.873 29.95 < 2e-16 ***
## pn_wordcount 14.656 4.806 3.05 0.00257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.1 on 225 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.03969, Adjusted R-squared: 0.03543
## F-statistic: 9.3 on 1 and 225 DF, p-value: 0.002566
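To read the pn_wordcount coefficient concretely (arithmetic on the estimates printed above, for illustration only):
# predicted eachPercentile at a few values of the word-count index
56.108 + 14.656 * c(-0.5, 0, 0.5) # about 48.8, 56.1, 63.4 percentile points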
summary(lm(eachPercentile ~ pn_wordcount + Gender , data=df)) #with Gender
##
## Call:
## lm(formula = eachPercentile ~ pn_wordcount + Gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.609 -22.140 0.681 25.836 42.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.0125 2.6800 20.901 < 2e-16 ***
## pn_wordcount 14.6581 4.8167 3.043 0.00262 **
## GenderM 0.1859 3.7398 0.050 0.96041
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.17 on 224 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.0397, Adjusted R-squared: 0.03113
## F-statistic: 4.631 on 2 and 224 DF, p-value: 0.0107
summary(lm(eachPercentile ~ pn_wordcount + AgeAtCDI, data = df))
##
## Call:
## lm(formula = eachPercentile ~ pn_wordcount + AgeAtCDI, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.231 -21.884 0.716 26.529 43.643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.7211 10.9149 5.563 7.52e-08 ***
## pn_wordcount 14.3106 4.8816 2.932 0.00372 **
## AgeAtCDI -0.1818 0.4238 -0.429 0.66828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.15 on 224 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.04048, Adjusted R-squared: 0.03192
## F-statistic: 4.725 on 2 and 224 DF, p-value: 0.009771
summary(lm(pn_wordcount ~ AgeAtCDI, data = df)) # word-count sentiment index as a function of age -> each additional month of age predicts a ~0.014 decrease in the index
##
## Call:
## lm(formula = pn_wordcount ~ AgeAtCDI, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.12466 -0.14016 0.02911 0.11809 0.98993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.398398 0.146677 2.716 0.00712 **
## AgeAtCDI -0.014324 0.005708 -2.509 0.01280 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3845 on 225 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.02722, Adjusted R-squared: 0.0229
## F-statistic: 6.297 on 1 and 225 DF, p-value: 0.0128
summary(lm(AgeAtCDI ~ pn_wordcount, data = df)) # age as a function of the word-count index -> a one-unit increase in the index corresponds to ~1.9 months younger age
##
## Call:
## lm(formula = AgeAtCDI ~ pn_wordcount, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1623 -3.4334 -0.2011 2.8668 11.5283
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.3717 0.2952 85.943 <2e-16 ***
## pn_wordcount -1.9006 0.7574 -2.509 0.0128 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.429 on 225 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.02722, Adjusted R-squared: 0.0229
## F-statistic: 6.297 on 1 and 225 DF, p-value: 0.0128
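Reading the two age models above in concrete units (simple arithmetic on the printed estimates, not an additional analysis):
-0.014324 * 12 # change in pn_wordcount predicted across a 12-month age difference (about -0.17)
-1.9006 * 0.1  # change in AgeAtCDI (months) predicted for a 0.1 increase in pn_wordcount (about -0.19)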
# model comparison
# anova(lm_fit1,lm_fit1_1)
# attributes(lm_fit1)
# lm_fit1$coefficients # slope and intercept
#fix encoding
# par(family="AppleGothic")
# install.packages("extrafont")
# library(extrafont)
# font_import()
ggplot(data = df, aes(x = pn_wordcount, y = eachPercentile)) +
  geom_point(aes(size = pn_wordcount), color = "red") +
  annotate("text", x = 0.18, y = 110, label = "Vocabulary development by positive-word use", size = 4, family = "AppleSDGothicNeo-Regular")
plot(df$pn_wordcount, df$eachPercentile,
     xlab = "Word-count-based lexical sentiment value",
     ylab = "Language development outcome (percentile)",
     main = "Language development outcome by word-count-based lexical sentiment value",
     family = "AppleSDGothicNeo-Regular")
# abline(lm_fit1$coefficients)
# plot(lm_fit1)
# Residuals should cluster near 0 with no particular pattern.
# Clustering near 0 means the predictions scatter evenly above and below the regression line; the more evenly they spread, without being driven by particular fitted values, the better the fitted regression works.
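A minimal sketch of that residual check, assuming the pn_wordcount model above is refit and stored locally (lm_fit1 is a name used only in the commented-out lines here):
lm_fit1 <- lm(eachPercentile ~ pn_wordcount, data = df)
plot(fitted(lm_fit1), resid(lm_fit1),
     xlab = "Fitted values", ylab = "Residuals",
     main = "Residuals vs fitted: look for an even scatter around 0")
abline(h = 0, lty = 2)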
# lm(eachPercentile ~ log(pn_score + 0.000001) , data=df) -> lm_fit4;summary(lm_fit4)
df$pn_score <- as.numeric(df$pn_score)
#final result
summary(lm(eachPercentile ~ pn_score, data=df))
##
## Call:
## lm(formula = eachPercentile ~ pn_score, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.11 -22.49 1.11 25.92 42.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.019 1.878 29.832 < 2e-16 ***
## pn_score 14.511 4.847 2.994 0.00306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.12 on 225 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.03831, Adjusted R-squared: 0.03404
## F-statistic: 8.963 on 1 and 225 DF, p-value: 0.003062
summary(lm(eachPercentile ~ pn_score + Gender, data=df))
##
## Call:
## lm(formula = eachPercentile ~ pn_score + Gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.022 -22.486 1.203 25.827 43.073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.9266 2.6843 20.834 < 2e-16 ***
## pn_score 14.5124 4.8577 2.988 0.00313 **
## GenderM 0.1814 3.7425 0.048 0.96138
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.19 on 224 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.03832, Adjusted R-squared: 0.02973
## F-statistic: 4.463 on 2 and 224 DF, p-value: 0.01257
summary(lm(eachPercentile ~ pn_score + AgeAtCDI, data = df))
##
## Call:
## lm(formula = eachPercentile ~ pn_score + AgeAtCDI, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.64 -22.68 1.29 26.54 43.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.7425 10.9277 5.559 7.7e-08 ***
## pn_score 14.1550 4.9227 2.875 0.00442 **
## AgeAtCDI -0.1861 0.4241 -0.439 0.66126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.17 on 224 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.03914, Adjusted R-squared: 0.03056
## F-statistic: 4.562 on 2 and 224 DF, p-value: 0.01143
summary(lm(pn_score ~ AgeAtCDI + Gender, data = df)) # scale-based sentiment score as a function of age -> each additional month of age predicts a ~0.014 decrease in the score (Gender not significant)
##
## Call:
## lm(formula = pn_score ~ AgeAtCDI + Gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.12673 -0.14356 0.01709 0.11570 0.97949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.399645 0.146235 2.733 0.00678 **
## AgeAtCDI -0.014282 0.005711 -2.501 0.01311 *
## GenderM 0.008033 0.051077 0.157 0.87518
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3824 on 224 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.02722, Adjusted R-squared: 0.01853
## F-statistic: 3.133 on 2 and 224 DF, p-value: 0.04548
summary(lm(AgeAtCDI ~ pn_score, data = df)) # age as a function of the scale-based score -> a one-unit increase in the score corresponds to ~1.9 months younger age
##
## Call:
## lm(formula = AgeAtCDI ~ pn_score, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1856 -3.4315 -0.2044 2.8638 11.5156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.3844 0.2957 85.831 <2e-16 ***
## pn_score -1.9113 0.7633 -2.504 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.429 on 225 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.02711, Adjusted R-squared: 0.02278
## F-statistic: 6.269 on 1 and 225 DF, p-value: 0.01299
# model comparison
# lm(eachPercentile ~ pn_score + Gender, data=df)-> lm_fit3_1;summary(lm_fit3_1)
# anova(lm_fit3,lm_fit3_1)
# attributes(lm_fit3)
# lm_fit3$coefficients # slope and intercept
ggplot(data = df, aes(x = pn_score, y = eachPercentile)) + geom_point(aes(size = pn_score), color = "blue")
plot(df$pn_score, df$eachPercentile,
     xlab = "Scale-based lexical sentiment value",
     ylab = "Language development outcome (percentile)",
     main = "Language development outcome by scale-based lexical sentiment value",
     family = "AppleSDGothicNeo-Regular")
# abline(lm_fit3$coefficients)
# plot(lm_fit3)