R Midterm

2주차 과제

# Midterm scores of the five students.
test <- c(80, 60, 70, 50, 90)

# Auto-print the class average.
mean(x = test)

## [1] 70

(mean.test <- mean(test))

## [1] 70

3주차 과제

Q2. Find out about the function ‘rep’ by typing ?rep and generate a vector containing 10 repetitions of the word “noun”, and another vector containing 10 repetitions of the word “noun” and 20 repetitions of “verb”.

우선 ?rep을 작동시켜보면 아래와 같은 구조와 예시가 나온다.

#rep(x, times = 1, length.out = NA, each = 1)

rep(1:4, 2)

## [1] 1 2 3 4 1 2 3 4

rep(1:4, each = 2)       # not the same.

## [1] 1 1 2 2 3 3 4 4

rep(1:4, c(2,2,2,2))     # same as second.

## [1] 1 1 2 2 3 3 4 4

rep(1:4, c(2,1,2,1))

## [1] 1 1 2 3 3 4

# `length.out` is written out in full; the abbreviated `len` only works
# through R's partial argument matching.
rep(1:4, each = 2, length.out = 4)    # first 4 only.

## [1] 1 1 2 2

rep(1:4, each = 2, length.out = 10)   # 8 integers plus two recycled 1's.

##  [1] 1 1 2 2 3 3 4 4 1 1

rep(1:4, each = 2, times = 3)  # length 24, 3 complete replications

##  [1] 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4

즉, rep(x, times = 1, length.out = NA, each = 1)에서 각각 의미하는 것은
x: vector
times: 반복 횟수
length.out: 길이 제한
each: x안에 들어 있는 각 element의 반복 횟수

그러므로 “noun”을 10번, “verb”를 20번 반복하는 vector를 만들려면 아래와 같이 입력하면 된다.

# Build 10 x "noun" followed by 20 x "verb" with one vectorised rep() call,
# then print the result.
nounverb <- rep(c("noun", "verb"), times = c(10, 20))
print(nounverb)

##  [1] "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun"
## [11] "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb"
## [21] "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb"

Q3. Find out what the function ‘seq’ does, and generate a user-defined regular sequence.

우선 ?seq를 작동시켜보면 아래와 같은 구조와 예시가 나온다.

#seq(from = 1, to = 1, by = ((to - from)/(length.out - 1)), length.out = NULL)

seq(0, 1, length.out = 11)

##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0

seq(1, 9, by = 2)     # matches 'end'

## [1] 1 3 5 7 9

seq(1, 9, by = pi)    # stays below 'end'

## [1] 1.000000 4.141593 7.283185

seq(1, 6, by = 3)

## [1] 1 4

seq(1.575, 5.125, by = 0.05)

##  [1] 1.575 1.625 1.675 1.725 1.775 1.825 1.875 1.925 1.975 2.025 2.075 2.125
## [13] 2.175 2.225 2.275 2.325 2.375 2.425 2.475 2.525 2.575 2.625 2.675 2.725
## [25] 2.775 2.825 2.875 2.925 2.975 3.025 3.075 3.125 3.175 3.225 3.275 3.325
## [37] 3.375 3.425 3.475 3.525 3.575 3.625 3.675 3.725 3.775 3.825 3.875 3.925
## [49] 3.975 4.025 4.075 4.125 4.175 4.225 4.275 4.325 4.375 4.425 4.475 4.525
## [61] 4.575 4.625 4.675 4.725 4.775 4.825 4.875 4.925 4.975 5.025 5.075 5.125

seq(17) # same as 1:17, or even better seq_len(17)

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

즉, seq(from = 1, to = 1, by = ((to - from)/(length.out - 1)), length.out = NULL)에서 의미하는 것은
from, to: sequence의 시작과 끝
by: sequence의 증가량
length.out: sequence의 길이

그러므로 아래와 같은 seq를 만들 수 있다.

# User-defined regular sequence: 0 to 100 in steps of 20.
seq(from = 0, to = 100, by = 20)

## [1]   0  20  40  60  80 100

4주차 과제

Q1. p112 혼자서 해보기

# Install ggplot2 and dplyr only when they are missing, so re-running the
# script does not reinstall them (and hit the network) every time.
for (pkg in c("ggplot2", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Rename cty/hwy on a copy so the original `mpg` object is left untouched.
mpg.new <- mpg %>%
  rename(city = cty, highway = hwy)
head(mpg.new, n = 6)

## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans    drv    city highway fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>    <chr> <int>   <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5) f        18      29 p     compa~
## 2 audi         a4      1.8  1999     4 manual(~ f        21      29 p     compa~
## 3 audi         a4      2    2008     4 manual(~ f        20      31 p     compa~
## 4 audi         a4      2    2008     4 auto(av) f        21      30 p     compa~
## 5 audi         a4      2.8  1999     6 auto(l5) f        16      26 p     compa~
## 6 audi         a4      2.8  1999     6 manual(~ f        18      26 p     compa~

Q2. p123 분석 도전!

midwest <- as.data.frame(ggplot2::midwest)
dim(midwest) #행, 열 개수 파악하기

## [1] 437  28

str(midwest) #데이터 프레임 구조 확인

## 'data.frame':    437 obs. of  28 variables:
##  $ PID                 : int  561 562 563 564 565 566 567 568 569 570 ...
##  $ county              : chr  "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
##  $ state               : chr  "IL" "IL" "IL" "IL" ...
##  $ area                : num  0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
##  $ poptotal            : int  66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
##  $ popdensity          : num  1271 759 681 1812 324 ...
##  $ popwhite            : int  63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
##  $ popblack            : int  1702 3496 429 127 547 50 1 111 16 16559 ...
##  $ popamerindian       : int  98 19 35 46 14 65 8 30 8 331 ...
##  $ popasian            : int  249 48 16 150 5 195 15 61 23 8033 ...
##  $ popother            : int  124 9 34 1139 6 221 0 84 6 1596 ...
##  $ percwhite           : num  96.7 66.4 96.6 95.3 90.2 ...
##  $ percblack           : num  2.575 32.9 2.862 0.412 9.373 ...
##  $ percamerindan       : num  0.148 0.179 0.233 0.149 0.24 ...
##  $ percasian           : num  0.3768 0.4517 0.1067 0.4869 0.0857 ...
##  $ percother           : num  0.1876 0.0847 0.2268 3.6973 0.1028 ...
##  $ popadults           : int  43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
##  $ perchsd             : num  75.1 59.7 69.3 75.5 68.9 ...
##  $ percollege          : num  19.6 11.2 17 17.3 14.5 ...
##  $ percprof            : num  4.36 2.87 4.49 4.2 3.37 ...
##  $ poppovertyknown     : int  63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
##  $ percpovertyknown    : num  96.3 99.1 95 98.5 82.5 ...
##  $ percbelowpoverty    : num  13.15 32.24 12.07 7.21 13.52 ...
##  $ percchildbelowpovert: num  18 45.8 14 11.2 13 ...
##  $ percadultpoverty    : num  11.01 27.39 10.85 5.54 11.14 ...
##  $ percelderlypoverty  : num  12.44 25.23 12.7 6.22 19.2 ...
##  $ inmetro             : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ category            : chr  "AAR" "LHR" "AAR" "ALU" ...

summary(midwest) #데이터 프레임 요약

##       PID          county             state                area        
##  Min.   : 561   Length:437         Length:437         Min.   :0.00500  
##  1st Qu.: 670   Class :character   Class :character   1st Qu.:0.02400  
##  Median :1221   Mode  :character   Mode  :character   Median :0.03000  
##  Mean   :1437                                         Mean   :0.03317  
##  3rd Qu.:2059                                         3rd Qu.:0.03800  
##  Max.   :3052                                         Max.   :0.11000  
##     poptotal         popdensity          popwhite          popblack      
##  Min.   :   1701   Min.   :   85.05   Min.   :    416   Min.   :      0  
##  1st Qu.:  18840   1st Qu.:  622.41   1st Qu.:  18630   1st Qu.:     29  
##  Median :  35324   Median : 1156.21   Median :  34471   Median :    201  
##  Mean   :  96130   Mean   : 3097.74   Mean   :  81840   Mean   :  11024  
##  3rd Qu.:  75651   3rd Qu.: 2330.00   3rd Qu.:  72968   3rd Qu.:   1291  
##  Max.   :5105067   Max.   :88018.40   Max.   :3204947   Max.   :1317147  
##  popamerindian        popasian         popother        percwhite    
##  Min.   :    4.0   Min.   :     0   Min.   :     0   Min.   :10.69  
##  1st Qu.:   44.0   1st Qu.:    35   1st Qu.:    20   1st Qu.:94.89  
##  Median :   94.0   Median :   102   Median :    66   Median :98.03  
##  Mean   :  343.1   Mean   :  1310   Mean   :  1613   Mean   :95.56  
##  3rd Qu.:  288.0   3rd Qu.:   401   3rd Qu.:   345   3rd Qu.:99.07  
##  Max.   :10289.0   Max.   :188565   Max.   :384119   Max.   :99.82  
##    percblack       percamerindan        percasian        percother      
##  Min.   : 0.0000   Min.   : 0.05623   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 0.1157   1st Qu.: 0.15793   1st Qu.:0.1737   1st Qu.:0.09102  
##  Median : 0.5390   Median : 0.21502   Median :0.2972   Median :0.17844  
##  Mean   : 2.6763   Mean   : 0.79894   Mean   :0.4872   Mean   :0.47906  
##  3rd Qu.: 2.6014   3rd Qu.: 0.38362   3rd Qu.:0.5212   3rd Qu.:0.48050  
##  Max.   :40.2100   Max.   :89.17738   Max.   :5.0705   Max.   :7.52427  
##    popadults          perchsd        percollege        percprof      
##  Min.   :   1287   Min.   :46.91   Min.   : 7.336   Min.   : 0.5203  
##  1st Qu.:  12271   1st Qu.:71.33   1st Qu.:14.114   1st Qu.: 2.9980  
##  Median :  22188   Median :74.25   Median :16.798   Median : 3.8142  
##  Mean   :  60973   Mean   :73.97   Mean   :18.273   Mean   : 4.4473  
##  3rd Qu.:  47541   3rd Qu.:77.20   3rd Qu.:20.550   3rd Qu.: 4.9493  
##  Max.   :3291995   Max.   :88.90   Max.   :48.079   Max.   :20.7913  
##  poppovertyknown   percpovertyknown percbelowpoverty percchildbelowpovert
##  Min.   :   1696   Min.   :80.90    Min.   : 2.180   Min.   : 1.919      
##  1st Qu.:  18364   1st Qu.:96.89    1st Qu.: 9.199   1st Qu.:11.624      
##  Median :  33788   Median :98.17    Median :11.822   Median :15.270      
##  Mean   :  93642   Mean   :97.11    Mean   :12.511   Mean   :16.447      
##  3rd Qu.:  72840   3rd Qu.:98.60    3rd Qu.:15.133   3rd Qu.:20.352      
##  Max.   :5023523   Max.   :99.86    Max.   :48.691   Max.   :64.308      
##  percadultpoverty percelderlypoverty    inmetro         category        
##  Min.   : 1.938   Min.   : 3.547     Min.   :0.0000   Length:437        
##  1st Qu.: 7.668   1st Qu.: 8.912     1st Qu.:0.0000   Class :character  
##  Median :10.008   Median :10.869     Median :0.0000   Mode  :character  
##  Mean   :10.919   Mean   :11.389     Mean   :0.3432                     
##  3rd Qu.:13.182   3rd Qu.:13.412     3rd Qu.:1.0000                     
##  Max.   :43.312   Max.   :31.162     Max.   :1.0000

# Copy the data frame, rename two columns, and derive each county's Asian
# population share in percent.
midwest.new <- midwest %>%
  rename(total = poptotal, asian = popasian) %>%
  mutate(prop_asian = (asian / total) * 100)
hist(midwest.new$prop_asian) # histogram of the derived share

mean(midwest.new$prop_asian) # average share across counties

## [1] 0.4872462

# Flag each county "large" when its Asian share exceeds the mean share,
# otherwise "small".
midwest.new$mean_asian <- with(
  midwest.new,
  ifelse(prop_asian > mean(prop_asian), "large", "small")
)

table(midwest.new$mean_asian) # frequency of the two groups

## 
## large small 
##   119   318

qplot(midwest.new$mean_asian) # 막대그래프 제작

Q3. mlu 데이터 분석하기

# Install readxl only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}

library(readxl)

# NOTE(review): absolute, machine-specific path; the script only runs where
# this folder exists.
setwd("C:\\Users\\user\\Desktop\\R 실습용")
mlu_data <- read_excel("mlu.xlsx")
mlu_data.ori <- mlu_data # 1. keep an untouched copy of the raw data

table(mlu_data$age) # 2. number of rows in each level of the age column

## 
## A0 A1 A2 
## 12 11 12

# 3. Shorten the column names, then 4. derive mlu as words per utterance.
mlu_data <- mlu_data %>%
  rename(utterances = utterances_mlu, words = words_mlu) %>%
  mutate(mlu = words / utterances)

summary(mlu_data$mlu) # 5. mean and quartiles of the mlu column

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.730   2.447   2.745   2.696   2.916   3.476

# 6-1. Grade each row by the quartile its mlu falls into (D = lowest quarter,
# A = highest). The cut points are computed from the data instead of being
# copied from the rounded summary() printout, so a value lying between a true
# quartile and its 4-digit rounding is graded correctly and the code keeps
# working when the data changes.
mlu_q <- quantile(mlu_data$mlu, probs = c(0.25, 0.50, 0.75),
                  na.rm = TRUE, names = FALSE)
mlu_data$grade <- ifelse(mlu_data$mlu < mlu_q[1], "D",
                         ifelse(mlu_data$mlu < mlu_q[2], "C",
                                ifelse(mlu_data$mlu < mlu_q[3], "B", "A")))

# 6-2. Alternative: dplyr::ntile() splits the rows into 4 rank-based groups.
mlu_data$grade2 <- ntile(mlu_data$mlu, 4)

qplot(data = mlu_data, x = age, y = mlu) # 8. plot mlu against age

7주차 과제

Q1. p133 혼자서 해보기

1-1.

# Rows with engine displacement of 4 or less, extracted two equivalent ways.
mpg_lower4 <- mpg %>% filter(displ <= 4) # with dplyr::filter()
mpg_lower4 <- mpg[mpg$displ <= 4, ]      # with base-R row indexing
mean(mpg_lower4$hwy)

## [1] 25.96319

# Rows with engine displacement of 5 or more, extracted two equivalent ways.
mpg_upper5 <- mpg %>% filter(displ >= 5) # with dplyr::filter()
mpg_upper5 <- mpg[mpg$displ >= 5, ]      # with base-R row indexing
mean(mpg_upper5$hwy)

## [1] 18.07895

mpg$grade <- ifelse(mpg$displ <= 4, "lower4", ifelse(mpg$displ >= 5, "upper5", NA))
# Cars with displ <= 4 have a higher average highway mileage than cars with displ >= 5.

1-2.(파생변수와 tapply함수를 이용해서 평균 구하기)

# Label each car by displacement group (NA when 4 < displ < 5), then average
# hwy within each label.
mpg$grade <- ifelse(mpg$displ <= 4, "lower4", ifelse(mpg$displ >= 5, "upper5", NA))
tapply(X = mpg$hwy, INDEX = mpg$grade, FUN = mean)

##   lower4   upper5 
## 25.96319 18.07895

# Cars with displ <= 4 have a higher average highway mileage than cars with displ >= 5.

2-1.

# Rows whose manufacturer is audi, extracted two equivalent ways.
mpg_audi <- mpg %>% filter(manufacturer == "audi") # with dplyr::filter()
mpg_audi <- mpg[mpg$manufacturer == "audi", ]      # with base-R row indexing
mean(mpg_audi$cty)

## [1] 17.61111

# Rows whose manufacturer is toyota, extracted two equivalent ways.
mpg_toyota <- mpg %>% filter(manufacturer == "toyota") # with dplyr::filter()
mpg_toyota <- mpg[mpg$manufacturer == "toyota", ]      # with base-R row indexing
mean(mpg_toyota$cty)

## [1] 18.52941

# Toyota cars have a higher average city mileage than audi cars.

2-2.(tapply함수를 이용해서 평균 구하기)

tapply(mpg$cty, mpg$manufacturer, mean)

##       audi  chevrolet      dodge       ford      honda    hyundai       jeep 
##   17.61111   15.00000   13.13514   14.00000   24.44444   18.64286   13.50000 
## land rover    lincoln    mercury     nissan    pontiac     subaru     toyota 
##   11.50000   11.33333   13.25000   18.07692   17.00000   19.28571   18.52941 
## volkswagen 
##   20.92593

# 제조회사가 audi인 자동차보다 toyota인 자동차의 도시 연비가 평균적으로 더 높다.

# Rows made by chevrolet, ford or honda, extracted two equivalent ways.
mpg_ma <- mpg %>%
  filter(manufacturer %in% c("chevrolet", "ford", "honda")) # dplyr::filter()
mpg_ma <- mpg[mpg$manufacturer %in% c("chevrolet", "ford", "honda"), ] # base R
mean(mpg_ma$hwy)

## [1] 22.50943

Q2. p138 혼자서 해보기

# Keep only the class and cty columns, two equivalent ways.
mpg_cc <- mpg %>% select(class, cty) # with dplyr::select()
mpg_cc <- mpg[, c("class", "cty")]   # with base-R column indexing
head(mpg_cc)

## # A tibble: 6 x 2
##   class     cty
##   <chr>   <int>
## 1 compact    18
## 2 compact    21
## 3 compact    20
## 4 compact    21
## 5 compact    16
## 6 compact    18

2-1.

# City mileage of suv cars, extracted four equivalent ways. The base-R subset
# of mpg_cc builds its row mask from mpg_cc itself; building it from `mpg`
# only works while both objects have the same rows in the same order.
mpg_suv <- mpg_cc %>% filter(class == "suv")                     # filter() on mpg_cc
mpg_suv <- mpg_cc[mpg_cc$class == "suv", ]                       # base R on mpg_cc
mpg_suv <- mpg %>% select(class, cty) %>% filter(class == "suv") # select() + filter() on mpg
mpg_suv <- mpg[mpg$class == "suv", c("class", "cty")]            # base R on mpg
mean(mpg_suv$cty)

## [1] 13.5

# City mileage of compact cars, extracted the same four ways.
mpg_compact <- mpg_cc %>% filter(class == "compact")                     # filter() on mpg_cc
mpg_compact <- mpg_cc[mpg_cc$class == "compact", ]                       # base R on mpg_cc
mpg_compact <- mpg %>% select(class, cty) %>% filter(class == "compact") # select() + filter() on mpg
mpg_compact <- mpg[mpg$class == "compact", c("class", "cty")]            # base R on mpg
mean(mpg_compact$cty)

## [1] 20.12766

# Compact cars have a higher average city mileage than suv cars.

2-2. (tapply함수를 이용해서 평균 구하기)

tapply(mpg$cty, mpg$class, mean)

##    2seater    compact    midsize    minivan     pickup subcompact        suv 
##   15.40000   20.12766   18.75610   15.81818   13.00000   20.37143   13.50000

#자동차 종류가 suv인 자동차보다 compact인 자동차의 도시연비가 더 높다

Q3. p141 혼자서 해보기

# Audi rows only, then the five rows with the highest hwy (dplyr version).
mpg_audi <- mpg %>% filter(manufacturer == "audi")
mpg_audi %>%
  arrange(desc(hwy)) %>%
  head(5)

## # A tibble: 5 x 12
##   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class grade
##   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 audi         a4      2    2008     4 manu~ f        20    31 p     comp~ lowe~
## 2 audi         a4      2    2008     4 auto~ f        21    30 p     comp~ lowe~
## 3 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~ lowe~
## 4 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~ lowe~
## 5 audi         a4 q~   2    2008     4 manu~ 4        20    28 p     comp~ lowe~

# Same result without filter()/arrange(): subset the audi rows, then sort by
# descending hwy and keep the first five.
mpg_audi <- mpg[mpg$manufacturer == "audi", ]
hwy_rank <- order(-mpg_audi$hwy)
head(mpg_audi[hwy_rank, ], 5)

## # A tibble: 5 x 12
##   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class grade
##   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 audi         a4      2    2008     4 manu~ f        20    31 p     comp~ lowe~
## 2 audi         a4      2    2008     4 auto~ f        21    30 p     comp~ lowe~
## 3 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~ lowe~
## 4 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~ lowe~
## 5 audi         a4 q~   2    2008     4 manu~ 4        20    28 p     comp~ lowe~

Q4. p144 혼자서 해보기

# Add the sum (total) and the average (mean) of city and highway mileage.
mpg.new2 <- mpg %>%
  mutate(total = cty + hwy,
         mean = total / 2)

# Three cars with the best average mileage.
mpg.new2 %>%
  arrange(desc(mean)) %>%
  head(3)

## # A tibble: 3 x 14
##   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class grade
##   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 volkswagen   new ~   1.9  1999     4 manu~ f        35    44 d     subc~ lowe~
## 2 volkswagen   jetta   1.9  1999     4 manu~ f        33    44 d     comp~ lowe~
## 3 volkswagen   new ~   1.9  1999     4 auto~ f        29    41 d     subc~ lowe~
## # ... with 2 more variables: total <int>, mean <dbl>

mpg %>% mutate(mean = (cty + hwy)/2) %>% arrange(desc(mean)) %>% head(3)

## # A tibble: 3 x 13
##   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class grade
##   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 volkswagen   new ~   1.9  1999     4 manu~ f        35    44 d     subc~ lowe~
## 2 volkswagen   jetta   1.9  1999     4 manu~ f        33    44 d     comp~ lowe~
## 3 volkswagen   new ~   1.9  1999     4 auto~ f        29    41 d     subc~ lowe~
## # ... with 1 more variable: mean <dbl>

Q5. p150 혼자서 해보기

mpg %>% group_by(class) %>% summarise(mean_cty = mean(cty)) #class별 도시 연비의 평균 구하기(group_by, summarise사용)

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 7 x 2
##   class      mean_cty
##   <chr>         <dbl>
## 1 2seater        15.4
## 2 compact        20.1
## 3 midsize        18.8
## 4 minivan        15.8
## 5 pickup         13  
## 6 subcompact     20.4
## 7 suv            13.5

tapply(mpg$cty, mpg$class, mean) #class별 도시 연비의 평균 구하기(group_by, summarise사용X)

##    2seater    compact    midsize    minivan     pickup subcompact        suv 
##   15.40000   20.12766   18.75610   15.81818   13.00000   20.37143   13.50000

mpg %>% group_by(class) %>% 
  summarise(mean_cty = mean(cty)) %>% 
  arrange(desc(mean_cty)) #class별 도시 연비의 평균을 구한 후 정렬하기

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 7 x 2
##   class      mean_cty
##   <chr>         <dbl>
## 1 subcompact     20.4
## 2 compact        20.1
## 3 midsize        18.8
## 4 minivan        15.8
## 5 2seater        15.4
## 6 suv            13.5
## 7 pickup         13

mpg %>% group_by(class) %>% 
  summarise(mean_cty = mean(cty)) %>% 
  arrange(desc(mean_cty)) %>% 
  head(3) #class별 도시 연비의 평균을 구한 후 정렬 후 1~3위 출력

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   class      mean_cty
##   <chr>         <dbl>
## 1 subcompact     20.4
## 2 compact        20.1
## 3 midsize        18.8

# Number of compact models per manufacturer, most frequent first.
mpg %>%
  filter(class == "compact") %>%
  group_by(manufacturer) %>%
  summarise(compact_n = n()) %>%
  arrange(desc(compact_n))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 5 x 2
##   manufacturer compact_n
##   <chr>            <int>
## 1 audi                15
## 2 volkswagen          14
## 3 toyota              12
## 4 subaru               4
## 5 nissan               2

Q6. mlu 데이터 분석하기

mlu_data2 <- mlu_data.ori # 1. work on a copy of the raw data
# 2. Rows with at most 500 utterances; the printed dimensions show 5 such rows.
mlu_data2 %>%
  filter(utterances_mlu <= 500) %>%
  dim()

## [1] 5 8

# 3. Drop the DurationTime and DurationSec columns, which are not analysed.
mlu_data2 <- mlu_data2 %>% select(-DurationTime, -DurationSec)
# 4. Derive mlu (words per utterance) and average it within each age group.
mlu_data2 %>%
  mutate(mlu = words_mlu / utterances_mlu) %>%
  group_by(age) %>%
  summarise(mean_mlu = mean(mlu))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   age   mean_mlu
##   <chr>    <dbl>
## 1 A0        2.50
## 2 A1        2.59
## 3 A2        2.99

# 5. Derive TTR and average it within each age group.
# NOTE(review): TTR is computed as Token_freq / Types_freq here; a type-token
# ratio is conventionally types / tokens - confirm which one was intended.
mlu_data2 %>%
  mutate(TTR = Token_freq / Types_freq) %>%
  group_by(age) %>%
  summarise(mean_TTR = mean(TTR))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   age   mean_TTR
##   <chr>    <dbl>
## 1 A0        2.41
## 2 A1        2.74
## 3 A2        2.66

9주차 과제

Q1. p156 혼자서 해보기

# Lookup table: price per fuel-type code. `FALSE` is spelled out because `F`
# is an ordinary variable that can be reassigned.
fuel <- data.frame(fl = c("c", "d", "e", "p", "r"),
                   price_fl = c(2.35, 2.38, 2.11, 2.76, 2.22),
                   stringsAsFactors = FALSE)

#1
# Attach the fuel price to a copy of mpg, matching rows on the fl code.
mpg.new3 <- mpg
mpg.fuel <- mpg.new3 %>% left_join(fuel, by = "fl")

#2
mpg.fuel %>%
  select(model, fl, price_fl) %>%
  head(5)

## # A tibble: 5 x 3
##   model fl    price_fl
##   <chr> <chr>    <dbl>
## 1 a4    p         2.76
## 2 a4    p         2.76
## 3 a4    p         2.76
## 4 a4    p         2.76
## 5 a4    p         2.76

Q2. p160 분석 도전

#1
# On a copy, derive the share (in percent) of the population that is not adult.
midwest.new2 <- midwest %>%
  mutate(prop_popchild = (poptotal - popadults) / poptotal * 100)

#2
# Five counties with the highest non-adult share.
midwest.new2 %>%
  arrange(desc(prop_popchild)) %>%
  select(county, prop_popchild) %>%
  head(5)

##      county prop_popchild
## 1  ISABELLA      51.50117
## 2 MENOMINEE      50.59126
## 3    ATHENS      49.32073
## 4   MECOSTA      49.05918
## 5    MONROE      47.35818

#3
# Bucket counties by non-adult share: >= 40 "large", >= 30 "middle",
# otherwise "small"; then count the counties in each bucket.
midwest.new2 <- mutate(
  midwest.new2,
  grade_popchild = ifelse(prop_popchild >= 40, "large",
                          ifelse(prop_popchild >= 30, "middle", "small"))
)
midwest.new2 %>%
  group_by(grade_popchild) %>%
  summarise(n_county = n())

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   grade_popchild n_county
##   <chr>             <int>
## 1 large                32
## 2 middle              396
## 3 small                 9

#4
# Derive the Asian population share in percent, then list the ten counties
# with the lowest share.
midwest.new2 <- mutate(midwest.new2, prop_popasian = popasian / poptotal * 100)
midwest.new2 %>%
  select(state, county, prop_popasian) %>%
  arrange(prop_popasian) %>%
  head(10)

##    state    county prop_popasian
## 1     WI MENOMINEE    0.00000000
## 2     IN    BENTON    0.01059210
## 3     IN   CARROLL    0.01594981
## 4     OH    VINTON    0.02703190
## 5     WI      IRON    0.03250447
## 6     IL     SCOTT    0.05315379
## 7     IN      CLAY    0.06071645
## 8     MI    OSCODA    0.06375925
## 9     OH     PERRY    0.06654625
## 10    IL     PIATT    0.07074865

Q3. p170 혼자서 해보기

# Start from a fresh copy of mpg and blank out hwy in five rows so there are
# missing values to practise on.
mpg <- as.data.frame(ggplot2::mpg)
mpg.new4 <- mpg
mpg.new4$hwy[c(65, 124, 131, 153, 212)] <- NA

#1
# Count missing / non-missing values in drv.
table(is.na(mpg.new4$drv))

## 
## FALSE 
##   234

table(is.na(mpg.new4$hwy))

## 
## FALSE  TRUE 
##   229     5

#2
mpg.new4 %>% filter(!is.na(hwy)) %>% group_by(drv) %>% summarise(mean_hwy = mean(hwy))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   drv   mean_hwy
##   <chr>    <dbl>
## 1 4         19.2
## 2 f         28.2
## 3 r         21

Q4. p178 혼자서 해보기

#1
# Reload mpg and plant outliers: an invalid drv code "k" in four rows and
# four extreme cty values.
mpg <- as.data.frame(ggplot2::mpg)
mpg$drv[c(10, 14, 58, 93)] <- "k"
mpg$cty[c(29, 43, 129, 203)] <- c(3, 4, 39, 42)
mpg$drv <- ifelse(mpg$drv %in% "k", NA, mpg$drv) # turn the invalid code into NA
table(is.na(mpg$drv)) # check that the outliers became NA

## 
## FALSE  TRUE 
##   230     4

#2
str(mpg)

## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : num  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...

# Print the box-plot statistics of cty. The element is named `stats`; `$stat`
# only resolves through partial matching on the returned list.
mpg$cty <- as.numeric(mpg$cty)
boxplot(mpg$cty)$stats

##      [,1]
## [1,]    9
## [2,]   14
## [3,]   17
## [4,]   19
## [5,]   26

# Values outside the whisker range printed above (9 to 26) become NA.
mpg$cty <- ifelse(mpg$cty < 9 | mpg$cty > 26, NA, mpg$cty)
boxplot(mpg$cty)

#3
mpg %>% filter(!is.na(drv), !is.na(cty)) %>% group_by(drv) %>% summarise(mean_cty = mean(cty))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 2
##   drv   mean_cty
##   <chr>    <dbl>
## 1 4         14.2
## 2 f         19.5
## 3 r         14.0

Q5. p188 혼자서 해보기

#1
# Scatter plot of city vs highway mileage. ggplot() + geom_point() draws one
# point layer; qplot(x, y) already adds points, so appending geom_point() to
# it drew every point twice.
mpg <- as.data.frame(ggplot2::mpg)
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()

#2
# Total vs Asian population per county, zoomed in to counties with at most
# 500,000 residents and 10,000 Asian residents.
ggplot(data = midwest, aes(x = poptotal, y = popasian)) +
  geom_point() +
  xlim(0, 500000) +
  ylim(0, 10000)

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

Q6. p193 혼자서 해보기

#1
# Five manufacturers with the highest average city mileage among suv cars.
mpg_upper5 <- mpg %>%
  filter(class == "suv") %>%
  group_by(manufacturer) %>%
  summarise(mean_cty = mean(cty)) %>%
  arrange(desc(mean_cty)) %>%
  head(5)

## `summarise()` ungrouping output (override with `.groups` argument)

# Bar chart, bars ordered from highest to lowest mean_cty.
ggplot(data = mpg_upper5,
       aes(x = reorder(manufacturer, -mean_cty), y = mean_cty)) +
  geom_col()

#2
ggplot(data = mpg, aes(x = class)) + geom_bar()

Q7. p195 혼자서 해보기

#1
ggplot(data = economics, aes(x = date, y = psavert)) + geom_line()

Q8. p198 혼자서 해보기

# Keep the subcompact, compact and suv rows. `%in%` tests each row against the
# whole set; `class == c(...)` recycles the three names down the column, so
# each row is compared with only one of them and matching rows are silently
# dropped.
mpg.scs <- mpg %>% filter(class %in% c("subcompact", "compact", "suv"))
ggplot(data = mpg.scs, aes(x = class, y = cty)) + geom_boxplot()

11주차 과제

한국 복지 패널 데이터 분석 준비하기

# Install foreign only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}

library(foreign)
library(dplyr)
library(ggplot2)
library(readxl)

# NOTE(review): absolute, machine-specific path; the script only runs where
# this folder exists.
setwd("C:\\Users\\user\\Desktop\\R 실습용")
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
raw_welfare <- read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = TRUE)

## Warning in read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = T):
## Koweps_hpc10_2015_beta1.sav: Compression bias (0) is not the usual value of 100

welfare <- raw_welfare

dim(welfare)

## [1] 16664   957

# Give the survey columns used in this analysis readable names.
welfare <- welfare %>%
  rename(
    sex = h10_g3,           # sex
    birth = h10_g4,         # year of birth
    marriage = h10_g10,     # marital status
    religion = h10_g11,     # religion
    income = p1002_8aq1,    # monthly wage
    code_job = h10_eco9,    # job code
    code_region = h10_reg7  # region code
  )

성별에 따른 월급 차이

#성별 변수 검토
class(welfare$sex)

## [1] "numeric"

table(welfare$sex)

## 
##    1    2 
## 7578 9086

# Recode sex from the numeric codes to labels: 1 -> "male", otherwise "female".
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
qplot(welfare$sex)

#월급 변수 검토
class(welfare$income)

## [1] "numeric"

summary(welfare$income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   122.0   192.5   241.6   316.6  2400.0   12030

qplot(welfare$income) + xlim(0, 1000)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 12051 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

# Treat income values of 0 and 9999 as missing, then count missing rows.
welfare$income <- ifelse(welfare$income %in% c(0, 9999), NA, welfare$income)
table(is.na(welfare$income))

## 
## FALSE  TRUE 
##  4620 12044

#성별에 따른 월급 차이 분석하기 
welfare %>% filter(!is.na(income)) %>% group_by(sex) %>% summarise(mean_income = mean(income)) -> sex_income

## `summarise()` ungrouping output (override with `.groups` argument)

sex_income

## # A tibble: 2 x 2
##   sex    mean_income
##   <chr>        <dbl>
## 1 female        163.
## 2 male          312.

ggplot(data = sex_income, aes(x = sex, y = mean_income)) + geom_col()

나이 및 연령대에 따른 월급 차이

#나이 변수 검토
class(welfare$birth)

## [1] "numeric"

summary(welfare$birth)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1907    1946    1966    1968    1988    2014

qplot(welfare$birth)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

welfare$age <- 2015 - welfare$birth + 1 #나이 파생변수 만들기
summary(welfare$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   28.00   50.00   48.43   70.00  109.00

qplot(welfare$age)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#나이에 따른 월급 차이 분석하기 
welfare %>% filter(!is.na(income)) %>% group_by(age) %>% summarise(mean_income = mean(income)) -> age_income

## `summarise()` ungrouping output (override with `.groups` argument)

age_income

## # A tibble: 69 x 2
##      age mean_income
##    <dbl>       <dbl>
##  1    20        121.
##  2    21        106.
##  3    22        130.
##  4    23        142.
##  5    24        134.
##  6    25        145.
##  7    26        158.
##  8    27        188.
##  9    28        205.
## 10    29        189.
## # ... with 59 more rows

ggplot(data = age_income, aes(x = age, y = mean_income)) + geom_line()

# Income by age group: derive ageg ("young" below 30, "middle" up to 59,
# "old" otherwise) and show how many rows fall in each group.
welfare <- welfare %>%
  mutate(ageg = ifelse(age < 30, "young",
                       ifelse(age <= 59, "middle", "old")))
table(welfare$ageg)

## 
## middle    old  young 
##   6049   6281   4334

qplot(welfare$ageg)

welfare %>% filter(!is.na(income)) %>% group_by(ageg) %>% summarise(mean_income = mean(income)) -> ageg_income

## `summarise()` ungrouping output (override with `.groups` argument)

ageg_income

## # A tibble: 3 x 2
##   ageg   mean_income
##   <chr>        <dbl>
## 1 middle        282.
## 2 old           125.
## 3 young         164.

ggplot(data = ageg_income, aes(x = ageg, y = mean_income)) + geom_col() + scale_x_discrete(limit = c("young", "middle", "old"))

성별 및 연령대, 나이에 따른 월급 차이

# 성별 및 연령대에 따른 월급 차이
welfare %>% filter(!is.na(income)) %>% group_by(ageg, sex) %>% summarise(mean_income = mean(income)) -> sex_income

## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)

sex_income

## # A tibble: 6 x 3
## # Groups:   ageg [3]
##   ageg   sex    mean_income
##   <chr>  <chr>        <dbl>
## 1 middle female       188. 
## 2 middle male         353. 
## 3 old    female        81.5
## 4 old    male         174. 
## 5 young  female       160. 
## 6 young  male         171.

ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) + geom_col(position = "dodge") + scale_x_discrete(limit = c("young", "middle", "old"))

# Income by sex and age: mean income of each (age, sex) pair, missing
# incomes excluded.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))

## `summarise()` regrouping output by 'age' (override with `.groups` argument)

# Print the table just built; `sex_income` is the earlier age-group table.
sex_age

## # A tibble: 6 x 3
## # Groups:   ageg [3]
##   ageg   sex    mean_income
##   <chr>  <chr>        <dbl>
## 1 middle female       188. 
## 2 middle male         353. 
## 3 old    female        81.5
## 4 old    male         174. 
## 5 young  female       160. 
## 6 young  male         171.

ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) + geom_line()

직업별 월급 차이

class(welfare$code_job)

## [1] "numeric"

table(welfare$code_job)

## 
##  111  120  131  132  133  134  135  139  141  149  151  152  153  159  211  212 
##    2   16   10   11    9    3    7   10   35   20   26   18   15   16    8    4 
##  213  221  222  223  224  231  232  233  234  235  236  237  239  241  242  243 
##    3   17   31   12    4   41    5    3    6   48   14    2   29   12    4   63 
##  244  245  246  247  248  251  252  253  254  259  261  271  272  273  274  281 
##    4   33   59   77   38   14  111   24   67  109    4   15   11    4   36   17 
##  283  284  285  286  289  311  312  313  314  320  330  391  392  399  411  412 
##    8   10   26   16    5  140  260  220   84   75   15    4   13   87   47   12 
##  421  422  423  429  431  432  441  442  510  521  522  530  611  612  613  620 
##  124   71    5   14   20   33  154  197  192  353    5  106 1320   11   40    2 
##  630  710  721  722  730  741  742  743  751  752  753  761  762  771  772  773 
##   20   29   30   22   16   27    3   34   34    5   49   69   27   11   61   86 
##  774  780  791  792  799  811  812  819  821  822  823  831  832  841  842  843 
##    7   17    5   21   45   16    1    6    9    9   23    5   17   32   10    4 
##  851  852  853  854  855  861  862  863  864  871  873  874  875  876  881  882 
##   19   13    7   33    9    3   14   17   31    2  257   34   37    2    2    3 
##  891  892  899  910  921  922  930  941  942  951  952  953  991  992  999 1011 
##    8   19   16  102   31   74  289  325   99  125  122   73   45   12  141    2 
## 1012 
##   17

library(readxl)

# Data preprocessing: read the job-code lookup table from the second sheet of
# the codebook workbook, treating the first row as column names.
# NOTE(review): setwd() ties the script to one machine; kept because later
# reads rely on the working directory.
setwd("C:\\Users\\user\\Desktop\\R 실습용")
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
list_job <- read_excel("Koweps_Codebook.xlsx", sheet = 2, col_names = TRUE)
head(list_job)

## # A tibble: 6 x 2
##   code_job job                                
##      <dbl> <chr>                              
## 1      111 의회의원 고위공무원 및 공공단체임원
## 2      112 기업고위임원                       
## 3      120 행정 및 경영지원 관리자            
## 4      131 연구 교육 및 법률 관련 관리자      
## 5      132 보험 및 금융 관리자                
## 6      133 보건 및 사회복지 관련 관리자

# Number of rows and columns in the job lookup table.
c(nrow(list_job), ncol(list_job))

## [1] 149   2

# Attach the job name to each row via the job code.
# The key must be passed as `by`; `id` is not a left_join() argument, so the
# original call never specified the key and fell back to a natural join
# (newer dplyr releases reject such unused arguments outright).
welfare <- left_join(welfare, list_job, by = "code_job")

## Joining, by = "code_job"

# Show code/name pairs for the first ten rows that have a job code.
welfare %>%
  select(code_job, job) %>%
  filter(!is.na(code_job)) %>%
  head(10)

##    code_job                                job
## 1       942                   경비원 및 검표원
## 2       762                             전기공
## 3       530 방문 노점 및 통신 판매 관련 종사자
## 4       999        기타 서비스관련 단순 종사원
## 5       312                    경영관련 사무원
## 6       254             문리 기술 및 예능 강사
## 7       510                        영업 종사자
## 8       530 방문 노점 및 통신 판매 관련 종사자
## 9       286   스포츠 및 레크레이션 관련 전문가
## 10      521                   매장 판매 종사자

# Mean income per job, sorted from highest to lowest; rows missing either
# the job name or the income are dropped first. Used for the top-10 and
# bottom-10 comparisons.
job_income <- welfare %>%
  filter(!is.na(job), !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income)) %>%
  arrange(desc(mean_income))

## `summarise()` ungrouping output (override with `.groups` argument)

# First and last ten rows of job_income.
top10 <- head(job_income, 10)
bottom10 <- tail(job_income, 10)

# Horizontal bars ordered by mean income.
ggplot(top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

# Same chart for the last ten rows, with the value axis fixed to 0-850.
ggplot(bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  ylim(0, 850) +
  coord_flip()

성별 직업 빈도

# Number of male respondents per job, most common first.
male_job <- welfare %>%
  filter(sex == "male", !is.na(job)) %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

## `summarise()` ungrouping output (override with `.groups` argument)

# Ten most frequent jobs among men.
male_top10 <- head(male_job, 10)

# Number of female respondents per job, most common first.
female_job <- welfare %>%
  filter(sex == "female", !is.na(job)) %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

## `summarise()` ungrouping output (override with `.groups` argument)

# Ten most frequent jobs among women.
female_top10 <- head(female_job, 10)

# Horizontal bar charts of the ten most frequent jobs: men, then women.
ggplot(male_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

ggplot(female_top10, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

종교 유무에 따른 이혼율

# Check how the religion column is stored.
class(welfare[["religion"]])

## [1] "numeric"

# Frequency of each religion code.
table(welfare[["religion"]])

## 
##    1    2 
## 8047 8617

# Check how the marriage column is stored.
class(welfare[["marriage"]])

## [1] "numeric"

# Frequency of each marriage code.
table(welfare[["marriage"]])

## 
##    0    1    2    3    4    5    6 
## 2861 8431 2117  712   84 2433   26

# Data preprocessing: recode religion so that code 1 becomes "yes" and every
# other non-missing code becomes "no".
welfare <- welfare %>%
  mutate(religion = ifelse(religion == 1, "yes", "no"))
qplot(welfare$religion)

# Derive group_marriage: code 1 -> "marriage", code 3 -> "divorce",
# everything else -> NA.
welfare <- welfare %>%
  mutate(
    group_marriage = ifelse(
      marriage == 1, "marriage",
      ifelse(marriage == 3, "divorce", NA)
    )
  )
table(welfare$group_marriage)

## 
##  divorce marriage 
##      712     8431

# Quick plot of how the rows are distributed across group_marriage values.
qplot(welfare$group_marriage)

# Counts per religion x marital group. summarise() leaves the result grouped
# by religion, so tot_group is the per-religion total and pct is each group's
# percentage share of it, rounded to one decimal.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'religion' (override with `.groups` argument)

# Keep only the divorce rows and the columns needed for plotting.
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

# Draw the graph: divorce percentage by religion.
ggplot(divorce, aes(x = religion, y = pct)) +
  geom_col()

# Divorce rate by age group: counts per ageg x marital group, then each
# group's percentage share of its age group's total (one decimal).
ageg_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)

# Divorce rows only, then a bar chart of the percentage per age group.
ageg_divorce <- ageg_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, pct)
ggplot(ageg_divorce, aes(x = ageg, y = pct)) +
  geom_col()

# Divorce rate by age group and religion: counts per
# ageg x religion x marital group, then each group's percentage share of its
# (ageg, religion) total, rounded to one decimal.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

## `summarise()` regrouping output by 'ageg', 'religion' (override with `.groups` argument)

# Divorce rows only, plotted as side-by-side bars per age group, filled by
# religion.
ageg_religion_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
ggplot(ageg_religion_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")

지역별 연령대 비율

# Check how the region-code column is stored.
class(welfare[["code_region"]])

## [1] "numeric"

# Frequency of each region code.
table(welfare[["code_region"]])

## 
##    1    2    3    4    5    6    7 
## 2486 3711 2785 2036 1467 1257 2922

# Data preprocessing: build a lookup of region names for codes 1-7 and attach
# the names to welfare via code_region.
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "서울", "수도권(인천/경기)", "부산/경남/울산", "대구/경북",
    "대전/충남", "강원/충북", "광주/전남/전북/제주도"
  )
)
welfare <- left_join(welfare, list_region, by = "code_region")

# Counts per region x age group, then each age group's percentage share of
# its region's total, rounded to two decimals.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 2)
  )

## `summarise()` regrouping output by 'region' (override with `.groups` argument)

# Draw the graph: horizontal stacked bars, one per region, filled by age group.
ggplot(region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()

# Sort the regions by the share of the "old" group: take the "old" rows in
# ascending pct order and use their region names as the axis limits.
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)
order <- list_order_old[["region"]]
ggplot(region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

# Arrange the bar colours in age-group order by turning ageg into a factor
# with an explicit level order.
# `levels` is spelled in full; the original `level =` only worked through
# partial argument matching.
region_ageg$ageg <- factor(
  region_ageg$ageg,
  levels = c("old", "middle", "young")
)
ggplot(region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

Vowel and coda duration as a function of speaking rate

# NOTE(review): setwd() ties the script to one machine; kept because the
# relative file name below depends on it.
setwd("C:\\Users\\user\\Desktop\\R 실습용")
coda <- read.delim(file = "all_data.txt")

# Mean duration of the vowel phonemes per voice x speed.
# `%in%` tests set membership. The original `phoneme == c(...)` recycled the
# three labels down the column, so a row matched only when its phoneme
# happened to equal the label aligned with its row position, silently
# dropping most matching rows.
V_speed <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(voice, speed) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'voice' (override with `.groups` argument)

# Mean duration of the coda phonemes per voice x speed.
# `%in%` replaces `== c("K", "G")`, which recycled the two labels down the
# column and kept only rows whose phoneme matched the label aligned with the
# row position.
coda_speed <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(voice, speed) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'voice' (override with `.groups` argument)

# Vowel duration by speed, bars side by side per voice.
ggplot(V_speed, aes(x = speed, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

# Coda duration by speed, bars side by side per voice.
ggplot(coda_speed, aes(x = speed, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

Vowel and coda duration as a function of sentence position

# Mean duration of the vowel phonemes per position x voice.
# `%in%` replaces `== c(...)`, which recycled the labels down the column and
# silently dropped rows whose phoneme did not line up with the recycled label.
V_pos <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(position, voice) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'position' (override with `.groups` argument)

# Mean duration of the coda phonemes per position x voice.
# `%in%` replaces `== c("K", "G")`, which recycled the labels down the column
# and silently dropped rows whose phoneme did not line up with the recycled
# label.
coda_pos <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(position, voice) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'position' (override with `.groups` argument)

# Vowel duration by position, bars side by side per voice.
ggplot(V_pos, aes(x = position, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

# Coda duration by position, bars side by side per voice.
ggplot(coda_pos, aes(x = position, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

Vowel and coda duration as a function of vowel height

# Mean duration of the vowel phonemes per height x voice.
# `%in%` replaces `== c(...)`, which recycled the labels down the column and
# silently dropped rows whose phoneme did not line up with the recycled label.
V_height <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(height, voice) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'height' (override with `.groups` argument)

# Mean duration of the coda phonemes per height x voice.
# `%in%` replaces `== c("K", "G")`, which recycled the labels down the column
# and silently dropped rows whose phoneme did not line up with the recycled
# label.
coda_height <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(height, voice) %>%
  summarise(mean_duration = mean(duration))

## `summarise()` regrouping output by 'height' (override with `.groups` argument)

# Vowel duration by height, bars side by side per voice.
ggplot(V_height, aes(x = height, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

# Coda duration by height, bars side by side per voice.
ggplot(coda_height, aes(x = height, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")

V-to-C ratio under varying speaking rate in CVC words ending in [-voice] and [+voice] stops

# Mean vowel duration per voice x speed, stored as V_mean_duration so it can
# sit next to the coda mean after the join.
# `%in%` replaces `== c(...)`, which recycled the labels down the column and
# silently dropped rows whose phoneme did not line up with the recycled label.
V_speed <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(voice, speed) %>%
  summarise(V_mean_duration = mean(duration))

## `summarise()` regrouping output by 'voice' (override with `.groups` argument)

# Mean coda duration per voice x speed, stored as coda_mean_duration.
# `%in%` replaces `== c("K", "G")`, which recycled the labels down the column
# and silently dropped rows whose phoneme did not line up with the recycled
# label.
coda_speed <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(voice, speed) %>%
  summarise(coda_mean_duration = mean(duration))

## `summarise()` regrouping output by 'voice' (override with `.groups` argument)

# Pair the vowel and coda means per voice x speed and compute the
# vowel-to-coda duration ratio.
VC_ratio <- V_speed %>%
  left_join(coda_speed, by = c("voice", "speed")) %>%
  mutate(VC_ratio = V_mean_duration / coda_mean_duration)

# Ratio per voice, bars side by side per speed.
ggplot(VC_ratio, aes(x = voice, y = VC_ratio, fill = speed)) +
  geom_col(position = "dodge")

## 12주차 과제

# Install the packages only when they are not already available, instead of
# downloading and reinstalling them on every run of the script.
for (pkg in c("stringr", "wordcloud")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(KoNLP)

## Checking user defined dictionary!

library(stringr)
library(dplyr)
library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)

국정원 트윗 텍스트 마이닝

# Clean the data: read the tweets, give the columns short ASCII names, and
# replace every non-word character in the tweet text with a space.
# NOTE(review): setwd() ties the script to one machine; kept because the
# relative file name below depends on it.
setwd("C:\\Users\\user\\Desktop\\R 실습용")
# TRUE/FALSE are spelled out because `T`/`F` are reassignable variables.
twitter <- read.csv(
  "twitter.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
twitter <- rename(twitter, no = 번호, id = 계정이름, date = 작성일, tw = 내용)
twitter$tw <- str_replace_all(twitter$tw, "\\W", " ")

# Find the most frequently used words: extract nouns from every tweet, count
# them, keep words of at least two characters, and take the 20 most frequent.
nouns <- extractNoun(twitter$tw)
wordcount <- table(unlist(nouns))
# FALSE is spelled out because `F` is a reassignable variable.
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)

# Word-frequency chart: horizontal bars in ascending frequency order, each
# labelled with its count.
order <- arrange(top_20, freq)$word
# `limits` is spelled in full; the original `limit =` only reached the scale's
# `limits` parameter through partial argument matching.
ggplot(data = top_20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = freq), hjust = -0.3)

# Make a word cloud with the 8-colour "Dark2" palette.
pal <- brewer.pal(8, "Dark2")
# Fixed seed so the layout is reproducible.
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  # FALSE is spelled out because `F` is a reassignable variable.
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)

# Same word cloud drawn with entries 5-9 of the 9-colour "Blues" palette.
pal <- brewer.pal(9, "Blues")[5:9]
# Fixed seed so the layout is reproducible.
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  # FALSE is spelled out because `F` is a reassignable variable.
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)