`mlu_data`로 지정한 다음 카피본을 만든다.

library("readxl")
# Import the workbook; `mlu` keeps the raw import and `mlu_data` is the
# plain data.frame copy that the rest of the analysis works on.
mlu <- readxl::read_excel(path = "mlu.xlsx")
mlu_data <- as.data.frame(mlu)
# Per-column overview of the working copy.
summary(mlu_data)
## File age utterances_mlu words_mlu
## Length:35 Length:35 Min. :323.0 Min. : 813
## Class :character Class :character 1st Qu.:561.0 1st Qu.:1368
## Mode :character Mode :character Median :621.0 Median :1716
## Mean :631.8 Mean :1710
## 3rd Qu.:716.0 3rd Qu.:2060
## Max. :890.0 Max. :2766
##
## DurationTime DurationSec Types_freq Token_freq
## Min. :1899-12-31 00:08:47 Min. : 527.0 Min. : 378.0 Min. : 832
## 1st Qu.:1899-12-31 00:15:17 1st Qu.: 940.5 1st Qu.: 567.5 1st Qu.:1446
## Median :1899-12-31 00:17:39 Median :1060.5 Median : 694.0 Median :1798
## Mean :1899-12-31 00:17:57 Mean :1088.1 Mean : 754.8 Mean :1778
## 3rd Qu.:1899-12-31 00:20:45 3rd Qu.:1245.5 3rd Qu.: 775.5 3rd Qu.:2134
## Max. :1899-12-31 00:29:22 Max. :1762.0 Max. :4014.0 Max. :2827
## NA's :1
`age` 칼럼에는 몇 개의 요인이 있는가?

class(mlu_data$age)
## [1] "character"
# Number of elements (rows) in the age column -- this is not the number of
# distinct age groups.
length(mlu_data$age)
## [1] 35
# Row count per distinct age value; each distinct value gets one column.
table(mlu_data$age)
##
## A0 A1 A2
## 12 11 12
`utterances_mlu`를 `utterances`로, `words_mlu`를 `words`로 이름을 바꾸라.

library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Rename both columns in one call (new_name = old_name), then re-check the
# column summary to confirm the new names.
mlu_data <- dplyr::rename(
  mlu_data,
  utterances = utterances_mlu,
  words = words_mlu
)
summary(mlu_data)
## File age utterances words
## Length:35 Length:35 Min. :323.0 Min. : 813
## Class :character Class :character 1st Qu.:561.0 1st Qu.:1368
## Mode :character Mode :character Median :621.0 Median :1716
## Mean :631.8 Mean :1710
## 3rd Qu.:716.0 3rd Qu.:2060
## Max. :890.0 Max. :2766
##
## DurationTime DurationSec Types_freq Token_freq
## Min. :1899-12-31 00:08:47 Min. : 527.0 Min. : 378.0 Min. : 832
## 1st Qu.:1899-12-31 00:15:17 1st Qu.: 940.5 1st Qu.: 567.5 1st Qu.:1446
## Median :1899-12-31 00:17:39 Median :1060.5 Median : 694.0 Median :1798
## Mean :1899-12-31 00:17:57 Mean :1088.1 Mean : 754.8 Mean :1778
## 3rd Qu.:1899-12-31 00:20:45 3rd Qu.:1245.5 3rd Qu.: 775.5 3rd Qu.:2134
## Max. :1899-12-31 00:29:22 Max. :1762.0 Max. :4014.0 Max. :2827
## NA's :1
`mlu`라는 파생변수를 생성하라.

mlu_data$mlu <- (mlu_data$utterances/mlu_data$words)
`mlu`의 평균 및 quartile 값을 파악하라.

summary(mlu_data$mlu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2877 0.3429 0.3642 0.3789 0.4087 0.5781
`mlu` 값을 4개의 등급으로 나누어 가장 문장 길이가 긴 그룹을 A, 그다음 순서대로 B, C, D로 구분하여 `grade` 칼럼을 생성하라.

# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.2877 0.3429 0.3642 0.3789 0.4087 0.5781
# Grade mlu into quartile bands: "A" = values at or above the 3rd quartile,
# then "B" (>= median), "C" (>= 1st quartile), and "D" for the rest.
# The cut points are computed with quantile() instead of being hand-copied
# from rounded summary() output, so a value lying between a true quartile and
# its 4-digit rounding is not misgraded, and the cuts follow the data if it
# changes.
# NOTE(review): mlu is computed earlier as utterances / words, so a larger
# value means fewer words per utterance; confirm that "A" is really meant to
# be the high end of this ratio.
mlu_cuts <- quantile(mlu_data$mlu, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
mlu_data$grade <- ifelse(mlu_data$mlu >= mlu_cuts[["75%"]], "A",
  ifelse(mlu_data$mlu >= mlu_cuts[["50%"]], "B",
    ifelse(mlu_data$mlu >= mlu_cuts[["25%"]], "C", "D")))
# Number of rows that landed in each grade.
table(mlu_data$grade)
##
## A B C D
## 9 9 8 9
`age`와 `mlu` 사이의 빈도분포를 `table` 명령어를 이용하여 구하라.

summary(table(mlu_data$age, mlu_data$mlu))
## Number of cases in table: 35
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 70, df = 68, p-value = 0.4102
## Chi-squared approximation may be incorrect
`qplot`을 이용하여 각 나이 그룹을 X축으로 하여 mlu 값의 분포를 그래프로 그려라.

library("ggplot2")
# Cross-tabulate age group (rows) against the raw mlu values (columns); every
# distinct mlu value becomes its own column.
table(mlu_data$age, mlu_data$mlu)
##
## 0.287679425837321 0.298162729658793 0.299349240780911 0.314020344980097
## A0 0 0 0 0
## A1 0 0 0 0
## A2 1 1 1 1
##
## 0.330183106910809 0.330522765598651 0.335487077534791 0.336656891495601
## A0 0 0 0 1
## A1 1 0 0 0
## A2 0 1 1 0
##
## 0.341048653755314 0.344771241830065 0.350963914977756 0.352684144818976
## A0 0 1 0 1
## A1 0 0 0 0
## A2 1 0 1 0
##
## 0.354716981132075 0.355271565495208 0.356581017304667 0.358195623046003
## A0 1 0 0 1
## A1 0 0 1 0
## A2 0 1 0 0
##
## 0.362921348314607 0.364244741873805 0.371415566681839 0.37212449255751
## A0 0 0 0 0
## A1 0 1 1 0
## A2 1 0 0 1
##
## 0.37568058076225 0.37881679389313 0.380122950819672 0.381700054141852
## A0 0 0 1 1
## A1 1 1 0 0
## A2 0 0 0 0
##
## 0.393457117595049 0.405594405594406 0.411764705882353 0.416149068322981
## A0 0 0 0 0
## A1 1 0 1 1
## A2 0 1 0 0
##
## 0.424221453287197 0.438759689922481 0.452579034941764 0.452936444086887
## A0 0 1 0 1
## A1 1 0 1 0
## A2 0 0 0 0
##
## 0.469242902208202 0.485585585585586 0.578105781057811
## A0 1 1 1
## A1 0 0 0
## A2 0 0 0
# Plot mlu (y) against age group (x). Passing the data frame via `data =`
# keeps the axis labels as the bare column names instead of
# "mlu_data$age" / "mlu_data$mlu".
# NOTE(review): qplot() is deprecated as of ggplot2 3.4.0; the equivalent is
# ggplot(mlu_data, aes(age, mlu)) + geom_point().
qplot(age, mlu, data = mlu_data)