final_pnData_230411/2306

1차: 15P24N 총 39개의 용어

1. Installing package and importing data.

package import

# NOTE(review): this removes every object from the global environment, which
# also wipes anything already defined when the script is sourced into a live
# session. Consider running the script in a fresh R session instead.
rm(list=ls())
# Packages are attached in this order; later packages can mask functions of
# earlier ones, so the order is left untouched.
# install.packages('readxl')
library(readxl)   # read_excel() for the .xlsx inputs
# install.packages('dplyr')
library(dplyr)    # rename(), left_join()
# install.packages("lme4")
library(lme4)
# install.packages("lmerTest")
library(lmerTest)
# install.packages("ggplot2")
library(ggplot2)  # ggplot() scatter plots
# install.packages("sciplot")
library(sciplot)
# install.packages("openxlsx")
library("openxlsx")
# install.packages("xtable")
library(xtable)

directory 설정

2. CDI data import and clean

# Load the raw CDI questionnaire export.
cdi <- read_excel(
  path = "final_CDI_result.xlsx",
  sheet = "시트1",
  col_names = TRUE, # the first row holds the column headers
  na = "NA"         # cells containing the text "NA" are read as missing
)

# Give the three identifying columns English names in a single rename() call.
cdi <- rename(
  cdi,
  "subject" = "아동 이름",
  "Birthdate" = "아동 생일",
  "TestingDate" = "검사 날짜"
)

# Strip digits, punctuation and spaces from every column name in one pass,
# then print the cleaned names for inspection.
names(cdi) <- gsub("[0-9[:punct:] ]", "", names(cdi))
names(cdi)

##  [1] "타임스탬프"     "아동성별"       "subject"        "Birthdate"     
##  [5] "TestingDate"    "아동연령"       "보호자연락처"   "보호자이메일"  
##  [9] "소리"           "탈것"           "장난감및문구류" "동물"          
## [13] "옷"             "가구및방안"     "음식"           "신체부위"      
## [17] "가정용품"       "외부사물"       "일상생활"       "장소"          
## [21] "양정도"         "사람"           "의문사"         "동사"          
## [25] "형용사"         "끝맺는말"       "조사"           "연결하는말"    
## [29] "위치"           "시간"           "대명사"         "돕는말"        
## [33] "표현점수"

df data: Combine the three categories in each row into one

# One comma-separated string per child that concatenates the three CDI
# categories used in this analysis (daily routines, verbs, adjectives).
# paste(sep = ",") joins them directly, so no trailing comma has to be
# appended and stripped again.
cdi_categ_words <- paste(cdi$일상생활, cdi$동사, cdi$형용사, sep = ",")

# Identifier columns, stored as character vectors.
subject <- as.character(cdi$subject)
Birthdate <- as.character(cdi$Birthdate)
TestingDate <- as.character(cdi$TestingDate)

# Participant table: identifiers plus the combined word list.
df <- data.frame(subject, Birthdate, TestingDate, cdi_categ_words)

# Count missing cells in the participant table (the original run recorded 0).
sum(is.na(df))

## [1] 0

3. P(ositive)/N(egative) data import and clean

# Read the positive/negative word lexicon from sheet "15P24N"; cells holding
# the text "NA" are treated as missing. str() prints the resulting structure.
pn <- read_excel(
  path = "230516_긍부정_finalDataset.xlsx",
  sheet = "15P24N",
  col_names = TRUE,
  na = "NA"
)
str(pn)

## tibble [39 × 4] (S3: tbl_df/tbl/data.frame)
##  $ ...1          : num [1:39] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Words         : chr [1:39] "고마워" "놀아" "괜찮아" "귀여워" ...
##  $ SentiWord_Dict: num [1:39] 2 1 1 2 2 2 2 1 2 1 ...
##  $ P/N           : chr [1:39] "P" "P" "P" "P" ...

unique(pn[["P/N"]]) # list the distinct category labels in the lexicon

## [1] "P" "N"

# Element-wise missingness of the distinct combined word strings.
is.na(unique(df[["cdi_categ_words"]]))

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Split the lexicon into positive and negative entries.
# (1) Slice by row range; the slices are reused to count P/N words.
# NOTE(review): this assumes the sheet lists the 15 positive words first and
# the 24 negative words after them -- confirm against the `P/N` column.
pn_pos <- pn[1:15, ]
pn_neg <- pn[16:39, ]
# (2) Pull the word (column 2) and weight (column 3) columns out as plain
# vectors for the score calculation. `[[` extracts the column directly; the
# previous eval(parse(text = ...)) only worked because the one-column tibble
# was deparsed to R source text and evaluated again.
positive <- pn_pos[[2]]
positive_weights <- pn_pos[[3]]
negative <- pn_neg[[2]]
negative_weights <- pn_neg[[3]]

# Hard-coded copies of the four vectors, kept for reference:
# positive <- c("고마워","놀아","괜찮아","귀여워","맛있어","사랑해","예뻐","웃어","재미있어","조용해","좋아해","춤춰","도와","안아","커")
# positive_weights <- c(2,1,1,2,2,2,2,1,2,1,2,1,1,1,1)
# negative <- c("나빠","더러워","때려","맛없어","무거워","무서워","미워해","숨어","시끄러워","싫어","심심해","아파","안돼","없어","울어","추워","혼나","힘들어","물어","간지러워","작아","간지럽혀","더워","버려")
# negative_weights <- c(-2,-2,-1,-1,-2,-2,-2,-1,-2,-2,-1,-2,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-2)

Pasting corresponding CDI words in P/N.
For statement: if there is a matching word, paste it and put it into df$pn_words.

# For every child, collect the lexicon words that occur in the child's CDI
# word string and store them comma-separated in df$pn_words (NA if none match).
# Commas become spaces, runs of spaces are collapsed, and the string is split
# on single spaces before matching. Matches are listed in lexicon order.
# vapply() over the split strings also handles a zero-row df, which the
# previous `1:length(...)` loop did not.
df$pn_words <- vapply(
  strsplit(gsub(" +", " ", gsub(",", " ", df$cdi_categ_words)), " "),
  function(tokens) {
    matched <- pn$Words[pn$Words %in% tokens]
    if (length(matched) == 0) {
      NA_character_
    } else {
      paste(matched, collapse = ",")
    }
  },
  character(1)
)

4. Calculating the number of P/N words and the P/N score.

function for wordcount

# Word-count based valence index for one child.
#
# x: character vector of words the child knows.
# Reads the global vectors `positive` and `negative` (the lexicon words).
#
# Returns (p - n) / (p + n), where p is the share of positive lexicon words
# found in x and n is the share of negative lexicon words found in x. The
# result lies in [-1, 1]; it is NaN (0 / 0) when x contains no lexicon word.
# intersect() ignores repeated words in x.
#
# The former if/else after the first return() was unreachable, and both of its
# branches reduced to this same expression, so it has been removed.
pos_neg_wordcount <- function(x) {
  pos_prop <- length(intersect(x, positive)) / length(positive)
  neg_prop <- length(intersect(x, negative)) / length(negative)
  (pos_prop - neg_prop) / (pos_prop + neg_prop)
}

FOR statement: Calculate number of words for each individual.

# 
# Score every child's matched-word string with pos_neg_wordcount().
# The column is addressed by name (pn_words) instead of by position 5, and
# vapply() replaces the `1:length(...)` loop so a zero-row df is handled.
# The result is stored as character, as before; it is converted to numeric in
# the "Adjusting output results" step.
df$pn_wordcount <- as.character(vapply(
  strsplit(as.character(df$pn_words), ","),
  pos_neg_wordcount,
  numeric(1)
))

function for P/N score

# Weighted valence score for one child.
#
# x: character vector of words the child knows.
# Reads the global vectors `positive` / `positive_weights` and
# `negative` / `negative_weights` (lexicon words and their weights).
#
# Returns (p - n) / (p + n), where p is the summed weight of the positive
# lexicon words found in x divided by the total positive weight, and n is the
# same ratio for the negative words (absolute values, so the sign convention
# of the negative weights does not matter). The result is NaN (0 / 0) when x
# contains no lexicon word.
#
# Repeated words in x are counted once, matching pos_neg_wordcount(), which
# uses intersect(). The former if/else had two algebraically identical
# branches and has been collapsed into one expression.
pos_neg_index <- function(x) {
  x <- unique(x)
  pos_prop <- sum(positive_weights[match(x, positive)], na.rm = TRUE) /
    sum(positive_weights)
  neg_prop <- abs(sum(negative_weights[match(x, negative)], na.rm = TRUE)) /
    abs(sum(negative_weights))
  (pos_prop - neg_prop) / (pos_prop + neg_prop)
}

FOR statement: Calculate index for each individual.

# Score every child's matched-word string with pos_neg_index().
# The column is addressed by name (pn_words) instead of by position 5, and
# vapply() replaces the `1:length(...)` loop so a zero-row df is handled.
# The result is stored as character, as before; it is converted to numeric in
# the "Adjusting output results" step.
df$pn_score <- as.character(vapply(
  strsplit(as.character(df$pn_words), ","),
  pos_neg_index,
  numeric(1)
))

Adjusting output results.

# Convert both indices to numeric and round them to two decimal places.
# round() keeps the columns numeric; the previous sprintf("%.2f", ...) turned
# them into strings that had to be converted back before every model fit.
# NaN (children with no lexicon word) stays NaN.
df$pn_wordcount <- round(as.numeric(df$pn_wordcount), 2)
df$pn_score <- round(as.numeric(df$pn_score), 2)
str(df) # inspect the structure of the participant table

## 'data.frame':    262 obs. of  7 variables:
##  $ subject        : chr  "추제니" "양리온(P02)" "김서하" "전시우(P06)" ...
##  $ Birthdate      : chr  "2020-07-06" "2018-04-03" "2018-05-30" "2017-11-11" ...
##  $ TestingDate    : chr  "2022-05-27" "2020-02-19" "2020-02-20" "2020-05-02" ...
##  $ cdi_categ_words: chr  "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 하지마,가, 가리켜, 가져, 간지"| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마,가, 간지럽혀, "| __truncated__ "네/응, 빠이빠이, 아니(야), 안녕,(통에)넣어, 마셔, 박수쳐, 뽀뽀해, 사랑해, 앉아, 일어나/일어서, (잠)자,더러워, "| __truncated__ "고마워, 네/응, 돼, 만세, 목욕, 빠이빠이, 쉬, 아니(야), 안녕, 안돼, 양치, 응가/똥, 화이팅, 하지마,가, 가리켜, 가"| __truncated__ ...
##  $ pn_words       : chr  "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,좋아해,춤춰,도와,안아,커,더러워,때려,맛없어,숨어,싫"| __truncated__ "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,조용해,좋아해,춤춰,도와,안아,더러워,때려,맛없어,무거"| __truncated__ "사랑해,더러워,아파,추워" "고마워,놀아,괜찮아,귀여워,맛있어,사랑해,예뻐,웃어,재미있어,조용해,좋아해,춤춰,도와,안아,커,나빠,더러워,때려,맛"| __truncated__ ...
##  $ pn_wordcount   : chr  "0.14" "0.06" "-0.30" "0.00" ...
##  $ pn_score       : chr  "0.18" "0.07" "-0.22" "0.00" ...

5. CDI percentile data import

# Use UTF-8 as the default encoding for connections, then read the CDI
# percentile table. `header = TRUE` is spelled out in full: the previous
# `head=T` relied on partial argument matching and on the reassignable `T`.
options(encoding = "UTF-8")
cdi_percentile <- read.csv("cdi_merge_0416.csv", header = TRUE) # author's note: 264 obs

# Count missing cells in the percentile table.
sum(is.na(cdi_percentile))

## [1] 0

# Check the storage mode of the TestingDate, Birthdate and subject columns in
# both tables (cdi_percentile first, then df). Every recorded result below is
# "character".
mode(cdi_percentile$TestingDate) #character

## [1] "character"

mode(cdi_percentile$Birthdate) #character

## [1] "character"

mode(cdi_percentile$subject) #character

## [1] "character"

mode(df$TestingDate) #character

## [1] "character"

mode(df$Birthdate) #character

## [1] "character"

mode(df$subject) #character

## [1] "character"

Creating final BIG data-> df

# Attach the percentile table to the participant table by row position.
# NOTE(review): `Index` is only the row number of df, so this join is correct
# only if cdi_percentile$Index refers to the same children in the same order.
# Confirm this against the source files.
df$Index <- seq_len(nrow(df))

# Keep df's own column names on a name clash and mark the percentile copy with
# ".percentile". With the default ".x"/".y" suffixes `subject` was renamed and
# `df$subject` came back as NULL.
df <- left_join(df, cdi_percentile, by = "Index",
                suffix = c("", ".percentile"))

# Report rows whose subject name differs between the two tables, since that is
# the visible symptom of a misaligned Index.
if ("subject.percentile" %in% names(df)) {
  n_mismatch <- sum(
    !is.na(df$subject) & !is.na(df$subject.percentile) &
      df$subject != df$subject.percentile
  )
  if (n_mismatch > 0) {
    warning(n_mismatch, " row(s) have different subject names in df and ",
            "cdi_percentile; check that Index aligns the two tables.",
            call. = FALSE)
  }
}
mode(df$subject)

## [1] "NULL"

# For comparison: storage mode of `subject` in the un-joined percentile table.
mode(cdi_percentile$subject)

## [1] "character"

6. Analysis

정규분포 확인(not working with Rmarkdown)

#정규분포 확인
# qqnorm(df$pn_wordcount, ylab="pn_wordcount") ; qqline(df$pn_wordcount, col='red')
# qqnorm(df$pn_score, ylab="pn_score") ; qqline(df$pn_score, col='blue')

Hypothesis 1: 감정어 개수에 따른 백분위 변화

dependent var.: CDI percentile
fixed effect: number of P/N words
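가설: 긍정단어를 많이 아는 아동일수록 cdi percentile이 낮다. ***
즉, 긍정단어를 많이 알수록 백분위가 감소한다.
(주의: 아래 회귀 결과에서 pn_wordcount의 계수는 양수(10.985, p = 0.035)이므로, 추정된 방향은 위 서술과 반대로 pn_wordcount가 높을수록 백분위가 증가한다.)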

# lm(eachPercentile ~ log(pn_wordcount + 0.000001) , data=df) -> lm_fit2;summary(lm_fit2)
# Make sure both indices are numeric before fitting the models
# (left-assignment used consistently).
df$pn_score <- as.numeric(df$pn_score)
df$pn_wordcount <- as.numeric(df$pn_wordcount)
mode(df$AgeAtCDI) # storage mode of the age covariate

## [1] "numeric"

# Final model for hypothesis 1: CDI percentile (eachPercentile) regressed on
# the word-count valence index (pn_wordcount).
summary(lm(eachPercentile ~ pn_wordcount , data=df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_wordcount, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.062 -24.734   1.419  26.469  43.188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    55.812      1.924  29.002   <2e-16 ***
## pn_wordcount   10.985      5.191   2.116   0.0354 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.4 on 225 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.01951,    Adjusted R-squared:  0.01516 
## F-statistic: 4.478 on 1 and 225 DF,  p-value: 0.03543

# Same model with Gender added as a covariate.
summary(lm(eachPercentile ~ pn_wordcount + Gender , data=df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_wordcount + Gender, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.039 -24.739   1.442  26.447  43.211 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.78895    2.72751  20.454   <2e-16 ***
## pn_wordcount 10.98464    5.20281   2.111   0.0359 *  
## GenderM       0.04456    3.77889   0.012   0.9906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.46 on 224 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.01951,    Adjusted R-squared:  0.01076 
## F-statistic: 2.229 on 2 and 224 DF,  p-value: 0.11

# Same model with AgeAtCDI added as a covariate.
summary(lm(eachPercentile ~ pn_wordcount + AgeAtCDI, data = df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_wordcount + AgeAtCDI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.411 -24.760   0.751  27.043  43.941 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   60.5319    11.2650   5.373 1.93e-07 ***
## pn_wordcount  10.4429     5.3545   1.950   0.0524 .  
## AgeAtCDI      -0.1849     0.4349  -0.425   0.6710    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.45 on 224 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.0203, Adjusted R-squared:  0.01156 
## F-statistic: 2.321 on 2 and 224 DF,  p-value: 0.1005

# pn_wordcount regressed on age. The AgeAtCDI slope (about -0.019 in the
# rendered output) is the change in pn_wordcount per one-unit increase in
# AgeAtCDI -- i.e. the index falls as age rises -- not a change in age per
# unit of the index.
summary(lm(pn_wordcount ~ AgeAtCDI, data = df))

## 
## Call:
## lm(formula = pn_wordcount ~ AgeAtCDI, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19462 -0.09383  0.02237  0.11605  0.90958 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.564064   0.135120   4.175 4.27e-05 ***
## AgeAtCDI    -0.019332   0.005259  -3.676 0.000296 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3542 on 225 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.05667,    Adjusted R-squared:  0.05247 
## F-statistic: 13.52 on 1 and 225 DF,  p-value: 0.0002959

# Age regressed on pn_wordcount (the reverse regression). The pn_wordcount
# slope (about -2.93 in the rendered output) is the change in AgeAtCDI per
# one-unit increase in pn_wordcount, not a change in the index per unit of age.
summary(lm(AgeAtCDI ~ pn_wordcount, data = df))

## 
## Call:
## lm(formula = AgeAtCDI ~ pn_wordcount, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3441 -3.5487 -0.2498  2.8885 11.3771 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   25.5229     0.2956  86.352  < 2e-16 ***
## pn_wordcount  -2.9311     0.7973  -3.676 0.000296 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.361 on 225 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.05667,    Adjusted R-squared:  0.05247 
## F-statistic: 13.52 on 1 and 225 DF,  p-value: 0.0002959

# model comparison

# anova(lm_fit1,lm_fit1_1)
# attributes(lm_fit1)
# lm_fit1$coefficients #기울기와 절편

graph

# Font / encoding fixes tried for Korean labels (kept for reference)
# par(family="AppleGothic")
# install.packages("extrafont")
# library(extrafont)
# font_import()

# Scatter plot of CDI percentile against the word-count valence index.
# Columns are referenced by bare name inside aes() (they are looked up in
# `data`), and the colour is set outside aes() so the points are drawn in red;
# inside aes() the string "red" was mapped as a data value and produced a
# legend instead of red points.
ggplot(data = df, aes(x = pn_wordcount, y = eachPercentile)) +
  geom_point(aes(size = pn_wordcount), color = "red") +
  annotate("text", x = 0.18, y = 110, label = "긍정단어에 따른 어휘발달정도",
           size = 4, family = "AppleSDGothicNeo-Regular")

# Base-graphics version of the same scatter plot.
plot(df$pn_wordcount, df$eachPercentile,
     xlab = "어휘수 기반 어휘감성가",
     ylab = "언어발달결과",
     main = "어휘수 기반 어휘감성가 변화에 따른 언어발달결과 변화",
     family = "AppleSDGothicNeo-Regular")

# abline(lm_fit1$coefficients)
# plot(lm_fit1)
# The more the residuals cluster around 0 without any particular pattern, the better.
# Clustering around 0 means the predicted values are spread evenly above and below the regression line; the more evenly they are distributed, without being driven by particular predicted values, the better the fitted regression equation works.

Hypothesis 2: 감정점수에 따른 백분위 변화

dependent var.: CDI percentile
fixed effect: P/N score
가설: 긍정점수가 높은 아동일수록 cdi percentile이 낮다. ***
(주의: 아래 회귀 결과에서 pn_score의 계수는 양수(9.617)이고 p = 0.06으로 5% 수준에서 유의하지 않으므로, 위 서술은 결과로 뒷받침되지 않는다.)

# lm(eachPercentile ~ log(pn_score + 0.000001) , data=df) -> lm_fit4;summary(lm_fit4)
# Make sure pn_score is numeric before fitting.
df$pn_score <- as.numeric(df$pn_score)

# Final model for hypothesis 2: CDI percentile (eachPercentile) regressed on
# the weighted valence score (pn_score).
summary(lm(eachPercentile ~ pn_score, data=df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.541 -24.606   0.941  26.537  43.229 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   55.771      1.943  28.704   <2e-16 ***
## pn_score       9.617      5.087   1.891     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.45 on 225 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.01564,    Adjusted R-squared:  0.01126 
## F-statistic: 3.574 on 1 and 225 DF,  p-value: 0.05997

# Same model with Gender added as a covariate.
summary(lm(eachPercentile ~ pn_score + Gender, data=df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_score + Gender, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.487 -24.552   0.995  26.486  43.283 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.7166     2.7455  20.293   <2e-16 ***
## pn_score      9.6170     5.0981   1.886   0.0605 .  
## GenderM       0.1055     3.7863   0.028   0.9778    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.52 on 224 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.01564,    Adjusted R-squared:  0.006851 
## F-statistic:  1.78 on 2 and 224 DF,  p-value: 0.1711

# Same model with AgeAtCDI added as a covariate.
summary(lm(eachPercentile ~ pn_score + AgeAtCDI, data = df))

## 
## Call:
## lm(formula = eachPercentile ~ pn_score + AgeAtCDI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.850 -25.285   0.511  27.020  44.255 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  61.2496    11.2792   5.430 1.46e-07 ***
## pn_score      9.0226     5.2359   1.723   0.0862 .  
## AgeAtCDI     -0.2144     0.4348  -0.493   0.6224    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.5 on 224 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.0167, Adjusted R-squared:  0.007925 
## F-statistic: 1.903 on 2 and 224 DF,  p-value: 0.1516

# pn_score regressed on age and gender. The AgeAtCDI slope (about -0.019 in
# the rendered output) is the change in pn_score per one-unit increase in
# AgeAtCDI with Gender held fixed -- i.e. the score falls as age rises -- not
# a change in age per unit of score.
summary(lm(pn_score ~ AgeAtCDI + Gender, data = df))

## 
## Call:
## lm(formula = pn_score ~ AgeAtCDI + Gender, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20040 -0.08982  0.02762  0.12220  0.88593 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.569719   0.139049   4.097 5.84e-05 ***
## AgeAtCDI    -0.019326   0.005431  -3.559 0.000455 ***
## GenderM      0.017834   0.048567   0.367 0.713810    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3636 on 224 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.05352,    Adjusted R-squared:  0.04506 
## F-statistic: 6.333 on 2 and 224 DF,  p-value: 0.002112

# Age regressed on pn_score (the reverse regression). The pn_score slope
# (about -2.77 in the rendered output) is the change in AgeAtCDI per one-unit
# increase in pn_score, not a change in the score per unit of age.
summary(lm(AgeAtCDI ~ pn_score, data = df))

## 
## Call:
## lm(formula = AgeAtCDI ~ pn_score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2131 -3.5645 -0.3139  2.7048 11.3477 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.5523     0.2984  85.627  < 2e-16 ***
## pn_score     -2.7709     0.7813  -3.547 0.000475 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.37 on 225 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.05295,    Adjusted R-squared:  0.04874 
## F-statistic: 12.58 on 1 and 225 DF,  p-value: 0.0004747

# model comparison
# lm(eachPercentile ~ pn_score + Gender, data=df)-> lm_fit3_1;summary(lm_fit3_1)
# anova(lm_fit3,lm_fit3_1)
# attributes(lm_fit3)
# lm_fit3$coefficients #기울기와 절편

graph

# Scatter plot of CDI percentile against the weighted valence score.
# Columns are referenced by bare name inside aes() (they are looked up in
# `data`), and the colour is set outside aes() so the points are drawn in
# blue; inside aes() the string "blue" was mapped as a data value and produced
# a legend instead of blue points.
ggplot(data = df, aes(x = pn_score, y = eachPercentile)) +
  geom_point(aes(size = pn_score), color = "blue")

# Base-graphics version of the same scatter plot. The x-axis label now names
# the plotted variable; it previously repeated the plot title.
plot(df$pn_score, df$eachPercentile,
     xlab = "척도 기반 어휘감성가",
     ylab = "언어발달결과",
     main = "척도 기반 어휘감성가 변화에 따른 언어발달결과 변화",
     family = "AppleSDGothicNeo-Regular")

# abline(lm_fit3$coefficients)
# plot(lm_fit3)

final_pnData_230411/2306

정현아

2023-08-07

1. Installing package and importing data.

directory 설정

2. CDI data import and clean

3. P(ositive)/N(egative) data import and clean

4. Calculating the number of P/N words and the P/N score.

5. CDI percentile data import

6. Analysis

정규분포 확인(not working with Rmarkdown)

Hypothesis 1: 감정어 개수에 따른 백분위 변화

Hypothesis 2: 감정점수에 따른 백분위 변화