# Load the mpg dataset shipped with ggplot2 and convert it to a plain data.frame.
mpg <- as.data.frame(ggplot2::mpg)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compare mean highway mileage (hwy) by engine displacement (displ).
# mpg_a keeps displ <= 4, mpg_b keeps displ >= 5; rows strictly between
# 4 and 5 belong to neither subset.
mpg_a <- filter(mpg, displ <= 4)
mpg_b <- filter(mpg, displ >= 5)
mean(mpg_a[["hwy"]])
## [1] 25.96319
mean(mpg_b[["hwy"]])
## [1] 18.07895
displ이 4 이하인 자동차의 hwy 평균이 displ이 5 이상인 자동차의 hwy 평균보다 더 높다.
# Compare mean city mileage (cty) between the audi and toyota rows.
mpg_audi <- filter(mpg, manufacturer == "audi")
mpg_toyota <- filter(mpg, manufacturer == "toyota")
mean(mpg_audi[["cty"]])
## [1] 17.61111
mean(mpg_toyota[["cty"]])
## [1] 18.52941
‘toyota’의 cty 평균이 ‘audi’보다 더 높다.
# Pool the rows of the three manufacturers and take one overall hwy mean
# (a single mean over all pooled rows, not a mean of per-manufacturer means).
mpg_new <- filter(mpg, manufacturer %in% c("chevrolet", "ford", "honda"))
mean(mpg_new[["hwy"]])
## [1] 22.50943
‘chevrolet’, ‘ford’, ‘honda’의 hwy 전체 평균은 22.50943이다.
# Reload mpg and keep only the vehicle class and city mileage columns.
mpg <- as.data.frame(ggplot2::mpg)
df <- select(mpg, class, cty)
head(df)
## class cty
## 1 compact 18
## 2 compact 21
## 3 compact 20
## 4 compact 21
## 5 compact 16
## 6 compact 18
# Compare mean city mileage between the "suv" and "compact" classes.
df_suv <- filter(df, class == "suv")
df_compact <- filter(df, class == "compact")
mean(df_suv[["cty"]])
## [1] 13.5
mean(df_compact[["cty"]])
## [1] 20.12766
‘compact’의 평균 cty가 ‘suv’보다 더 높은 것을 알 수 있다.
# Show the five audi rows with the highest highway mileage.
mpg <- as.data.frame(ggplot2::mpg)
head(
  arrange(filter(mpg, manufacturer == "audi"), desc(hwy)),
  5
)
## manufacturer model displ year cyl trans drv cty hwy fl class
## 1 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
## 2 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
## 3 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
## 4 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
## 5 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28 p compact
# Build mpg_new as a copy of mpg with two derived columns:
#   total = cty + hwy, mean = total / 2
# then show the three rows with the highest `mean`.
mpg <- as.data.frame(ggplot2::mpg)
mpg_new <- mpg %>%
  mutate(total = cty + hwy) %>%
  mutate(mean = total / 2)
head(arrange(mpg_new, desc(mean)), 3)
## manufacturer model displ year cyl trans drv cty hwy fl class
## 1 volkswagen new beetle 1.9 1999 4 manual(m5) f 35 44 d subcompact
## 2 volkswagen jetta 1.9 1999 4 manual(m5) f 33 44 d compact
## 3 volkswagen new beetle 1.9 1999 4 auto(l4) f 29 41 d subcompact
## total mean
## 1 79 39.5
## 2 77 38.5
## 3 70 35.0
# Same result as the stepwise version, in one pipeline and without
# storing the derived columns: add total and mean, sort by mean
# descending, keep the top three rows.
mpg %>%
  mutate(total = cty + hwy, mean = total / 2) %>%
  arrange(desc(mean)) %>%
  head(n = 3)
## manufacturer model displ year cyl trans drv cty hwy fl class
## 1 volkswagen new beetle 1.9 1999 4 manual(m5) f 35 44 d subcompact
## 2 volkswagen jetta 1.9 1999 4 manual(m5) f 33 44 d compact
## 3 volkswagen new beetle 1.9 1999 4 auto(l4) f 29 41 d subcompact
## total mean
## 1 79 39.5
## 2 77 38.5
## 3 70 35.0
# Mean city mileage per vehicle class.
mpg <- as.data.frame(ggplot2::mpg)
summarise(group_by(mpg, class), mean_cty = mean(cty))
## # A tibble: 7 × 2
## class mean_cty
## <chr> <dbl>
## 1 2seater 15.4
## 2 compact 20.1
## 3 midsize 18.8
## 4 minivan 15.8
## 5 pickup 13
## 6 subcompact 20.4
## 7 suv 13.5
# Mean city mileage per vehicle class, highest first.
arrange(
  summarise(group_by(mpg, class), mean_cty = mean(cty)),
  desc(mean_cty)
)
## # A tibble: 7 × 2
## class mean_cty
## <chr> <dbl>
## 1 subcompact 20.4
## 2 compact 20.1
## 3 midsize 18.8
## 4 minivan 15.8
## 5 2seater 15.4
## 6 suv 13.5
## 7 pickup 13
# The three manufacturers with the highest mean highway mileage.
head(
  arrange(
    summarise(group_by(mpg, manufacturer), mean_hwy = mean(hwy)),
    desc(mean_hwy)
  ),
  3
)
## # A tibble: 3 × 2
## manufacturer mean_hwy
## <chr> <dbl>
## 1 honda 32.6
## 2 volkswagen 29.2
## 3 hyundai 26.9
hwy 평균이 가장 높은 회사 3곳은 ‘honda’, ‘volkswagen’, ‘hyundai’이다.
# Number of "compact"-class rows per manufacturer, largest count first.
arrange(
  summarise(
    group_by(filter(mpg, class == "compact"), manufacturer),
    count = n()
  ),
  desc(count)
)
## # A tibble: 5 × 2
## manufacturer count
## <chr> <int>
## 1 audi 15
## 2 volkswagen 14
## 3 toyota 12
## 4 subaru 4
## 5 nissan 2
‘audi’ 회사에서 ‘compact’(경차) 차종을 가장 많이 생산하는 것을 알 수 있다.
# Lookup table mapping each fuel-type code (fl) to a price (price_fl).
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
fuel <- data.frame(
  fl = c("c", "d", "e", "p", "r"),
  price_fl = c(2.35, 2.38, 2.11, 2.76, 2.22),
  stringsAsFactors = FALSE
)
fuel
## fl price_fl
## 1 c 2.35
## 2 d 2.38
## 3 e 2.11
## 4 p 2.76
## 5 r 2.22
# Reload mpg and attach price_fl from the fuel lookup table by matching on
# fl; a left join keeps every mpg row. Then preview the joined columns.
mpg <- as.data.frame(ggplot2::mpg) %>%
  left_join(fuel, by = "fl")
head(select(mpg, model, fl, price_fl), 5)
## model fl price_fl
## 1 a4 p 2.76
## 2 a4 p 2.76
## 3 a4 p 2.76
## 4 a4 p 2.76
## 5 a4 p 2.76
library(readxl)
# Read the second sheet of the mlu workbook.
# Assignment uses `<-` so the target name is visible at the start of the line.
# NOTE(review): the path is absolute and machine-specific, so it only
# resolves on the original author's PC — adjust it before re-running.
mlu <- read_excel("C:\\Users\\user\\OneDrive\\바탕 화면\\R\\mlu.xls", sheet = 2)
mlu
## # A tibble: 35 × 10
## File age utterances_mlu words_mlu DurationTime DurationSec Types_freq
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 13_A0P04M… A0 566 1290 "00:17:35" 1055 580
## 2 21_A0P05M… A0 565 1602 "00:20:44" 1244 737
## 3 27_A0P06M… A0 470 813 "00:12:07" 727 378
## 4 28_A0P07M… A0 371 976 "00:11:53" 713 419
## 5 29_A0P08M… A0 802 2239 "00:24:45" 1485 814
## 6 2_A0P01M.… A0 563 1243 "00:12:06\"" NA 425
## 7 30_A0P09F… A0 574 1705 "00:21:56" 1316 828
## 8 31_A0P10F… A0 539 1110 "00:10:54" 654 426
## 9 35_A0P11M… A0 705 1847 "00:20:46" 1246 622
## 10 36_A0P12M… A0 752 2120 "00:29:22" 1762 1014
## # ℹ 25 more rows
## # ℹ 3 more variables: Token_freq <dbl>, mlu <dbl>, `token/type` <dbl>
library(dplyr)
# Rows whose utterances_mlu value is at most 500.
filter(mlu, utterances_mlu <= 500)
## # A tibble: 5 × 10
## File age utterances_mlu words_mlu DurationTime DurationSec Types_freq
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 27_A0P06M.… A0 470 813 00:12:07 727 378
## 2 28_A0P07M.… A0 371 976 00:11:53 713 419
## 3 15_A1P05F.… A1 381 1046 00:15:14 914 555
## 4 12_A2P04M.… A2 481 1672 00:17:01 1021 921
## 5 18_A2P07M.… A2 323 890 00:08:47 527 476
## # ℹ 3 more variables: Token_freq <dbl>, mlu <dbl>, `token/type` <dbl>
500문장 이하를 말한 엄마가 5명 있다는 것을 알 수 있다.
# Drop the two duration columns and keep the rest as new_mlu.
# Assignment uses `<-` so the target name is visible at the start of the line.
new_mlu <- mlu %>%
  select(-DurationTime, -DurationSec)
new_mlu
## # A tibble: 35 × 8
## File age utterances_mlu words_mlu Types_freq Token_freq mlu `token/type`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 13_A… A0 566 1290 580 1346 2.28 2.32
## 2 21_A… A0 565 1602 737 1606 2.84 2.18
## 3 27_A… A0 470 813 378 832 1.73 2.20
## 4 28_A… A0 371 976 419 979 2.63 2.34
## 5 29_A… A0 802 2239 814 2253 2.79 2.77
## 6 2_A0… A0 563 1243 425 1263 2.21 2.97
## 7 30_A… A0 574 1705 828 1712 2.97 2.07
## 8 31_A… A0 539 1110 426 1124 2.06 2.64
## 9 35_A… A0 705 1847 622 1860 2.62 2.99
## 10 36_A… A0 752 2120 1014 2599 2.82 2.56
## # ℹ 25 more rows
# Mean of the mlu column per age group. Inside summarise(), `mlu` refers to
# the column; in group_by(), it refers to the data frame of the same name.
summarise(group_by(mlu, age), mean_mlu = mean(mlu))
## # A tibble: 3 × 2
## age mean_mlu
## <chr> <dbl>
## 1 A0 2.50
## 2 A1 2.59
## 3 A2 2.99
각 나이대 별 mlu의 평균은 A0은 2.50, A1은 2.59, A2는 2.99임을 알 수 있다.
# Mean of the `token/type` column per age group. The column name contains
# a slash, so it has to be quoted with backticks.
summarise(group_by(mlu, age), mean_talk = mean(`token/type`))
## # A tibble: 3 × 2
## age mean_talk
## <chr> <dbl>
## 1 A0 2.57
## 2 A1 2.74
## 3 A2 2.66
각 그룹별 token/type 비율의 평균은 A0은 2.57, A1은 2.74, A2는 2.66임을 알 수 있다.