Q1
# Five test scores used by the exercises that follow.
test <- c(80, 60, 70, 50, 90)
Q2
mean(test)
## [1] 70
Q3
(mean.test <- mean(test))
## [1] 70
Q2. Find out about the function 'rep' by typing ?rep and generate a vector containing 10 repetitions of the word 'noun', and another vector containing 10 repetitions of the word 'noun' and 20 repetitions of 'verb'.
μ°μ ?repμ μλμμΌλ³΄λ©΄ μλμ κ°μ ꡬ쑰μ μμκ° λμ¨λ€.
#rep(x, times = 1, length.out = NA, each = 1)
rep(1:4, 2)
## [1] 1 2 3 4 1 2 3 4
rep(1:4, each = 2) # not the same.
## [1] 1 1 2 2 3 3 4 4
rep(1:4, c(2,2,2,2)) # same as second.
## [1] 1 1 2 2 3 3 4 4
rep(1:4, c(2,1,2,1))
## [1] 1 1 2 3 3 4
rep(1:4, each = 2, len = 4) # first 4 only.
## [1] 1 1 2 2
rep(1:4, each = 2, len = 10) # 8 integers plus two recycled 1's.
## [1] 1 1 2 2 3 3 4 4 1 1
rep(1:4, each = 2, times = 3) # length 24, 3 complete replications
## [1] 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4
μ¦, rep(x, times = 1, length.out = NA, each = 1)μμ κ°κ° μλ―Ένλ κ²μ
x: vector
times: λ°λ³΅ νμ
length.out: κΈΈμ΄ μ ν
each: xμμ λ€μ΄ μλ κ° elementμ λ°λ³΅ νμ
κ·Έλ¬λ―λ‘ βnounβμ 10λ², βverbβλ₯Ό 20λ² λ°λ³΅νλ vectorλ₯Ό λ§λ€λ €λ©΄ μλμ κ°μ΄ μ λ ₯νλ©΄ λλ€.
# 10 x "noun" followed by 20 x "verb": a per-element `times` vector builds both
# runs in a single rep() call. The outer parentheses print the result.
(nounverb <- rep(c("noun", "verb"), times = c(10, 20)))
## [1] "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun" "noun"
## [11] "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb"
## [21] "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb" "verb"
Q3. Find out what the function 'seq' does, and generate a user-defined regular sequence.
우선 ?seq을 작동시켜보면 아래와 같은 구조와 예시가 나온다.
#seq(from = 1, to = 1, by = ((to - from)/(length.out - 1)), length.out = NULL)
seq(0, 1, length.out = 11)
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
seq(1, 9, by = 2) # matches 'end'
## [1] 1 3 5 7 9
seq(1, 9, by = pi) # stays below 'end'
## [1] 1.000000 4.141593 7.283185
seq(1, 6, by = 3)
## [1] 1 4
seq(1.575, 5.125, by = 0.05)
## [1] 1.575 1.625 1.675 1.725 1.775 1.825 1.875 1.925 1.975 2.025 2.075 2.125
## [13] 2.175 2.225 2.275 2.325 2.375 2.425 2.475 2.525 2.575 2.625 2.675 2.725
## [25] 2.775 2.825 2.875 2.925 2.975 3.025 3.075 3.125 3.175 3.225 3.275 3.325
## [37] 3.375 3.425 3.475 3.525 3.575 3.625 3.675 3.725 3.775 3.825 3.875 3.925
## [49] 3.975 4.025 4.075 4.125 4.175 4.225 4.275 4.325 4.375 4.425 4.475 4.525
## [61] 4.575 4.625 4.675 4.725 4.775 4.825 4.875 4.925 4.975 5.025 5.075 5.125
seq(17) # same as 1:17, or even better seq_len(17)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
μ¦, seq(from = 1, to = 1, by = ((to - from)/(length.out - 1)), length.out = NULL)μμ μλ―Ένλ κ²μ
from, to: sequenceμ μμκ³Ό λ
by: sequenceμ μ¦κ°λ
length.out: sequenceμ κΈΈμ΄
κ·Έλ¬λ―λ‘ μλμ κ°μ seqλ₯Ό λ§λ€ μ μλ€.
seq(0, 100, by = 20)
## [1] 0 20 40 60 80 100
Q1. p112 νΌμμ ν΄λ³΄κΈ°
# Install ggplot2 and dplyr only when they are missing, so re-running the
# script does not reinstall them every time, then attach both.
for (pkg in c("ggplot2", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Copy mpg and give the two fuel-economy columns longer names in one step.
mpg.new <- mpg %>%
  rename(city = cty, highway = hwy)
head(mpg.new)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv city highway fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa~
## 2 audi a4 1.8 1999 4 manual(~ f 21 29 p compa~
## 3 audi a4 2 2008 4 manual(~ f 20 31 p compa~
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa~
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa~
## 6 audi a4 2.8 1999 6 manual(~ f 18 26 p compa~
Q2. p123 λΆμ λμ !
# Load ggplot2's midwest data as a plain data.frame.
midwest <- as.data.frame(ggplot2::midwest)
# Number of rows and columns. The trailing comment is kept on one line; its
# tail had spilled onto a separate line, which is not valid R.
dim(midwest)
## [1] 437 28
str(midwest) #λ°μ΄ν° νλ μ ꡬ쑰 νμΈ
## 'data.frame': 437 obs. of 28 variables:
## $ PID : int 561 562 563 564 565 566 567 568 569 570 ...
## $ county : chr "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
## $ state : chr "IL" "IL" "IL" "IL" ...
## $ area : num 0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
## $ poptotal : int 66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
## $ popdensity : num 1271 759 681 1812 324 ...
## $ popwhite : int 63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
## $ popblack : int 1702 3496 429 127 547 50 1 111 16 16559 ...
## $ popamerindian : int 98 19 35 46 14 65 8 30 8 331 ...
## $ popasian : int 249 48 16 150 5 195 15 61 23 8033 ...
## $ popother : int 124 9 34 1139 6 221 0 84 6 1596 ...
## $ percwhite : num 96.7 66.4 96.6 95.3 90.2 ...
## $ percblack : num 2.575 32.9 2.862 0.412 9.373 ...
## $ percamerindan : num 0.148 0.179 0.233 0.149 0.24 ...
## $ percasian : num 0.3768 0.4517 0.1067 0.4869 0.0857 ...
## $ percother : num 0.1876 0.0847 0.2268 3.6973 0.1028 ...
## $ popadults : int 43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
## $ perchsd : num 75.1 59.7 69.3 75.5 68.9 ...
## $ percollege : num 19.6 11.2 17 17.3 14.5 ...
## $ percprof : num 4.36 2.87 4.49 4.2 3.37 ...
## $ poppovertyknown : int 63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
## $ percpovertyknown : num 96.3 99.1 95 98.5 82.5 ...
## $ percbelowpoverty : num 13.15 32.24 12.07 7.21 13.52 ...
## $ percchildbelowpovert: num 18 45.8 14 11.2 13 ...
## $ percadultpoverty : num 11.01 27.39 10.85 5.54 11.14 ...
## $ percelderlypoverty : num 12.44 25.23 12.7 6.22 19.2 ...
## $ inmetro : int 0 0 0 1 0 0 0 0 0 1 ...
## $ category : chr "AAR" "LHR" "AAR" "ALU" ...
summary(midwest) #λ°μ΄ν° νλ μ μμ½
## PID county state area
## Min. : 561 Length:437 Length:437 Min. :0.00500
## 1st Qu.: 670 Class :character Class :character 1st Qu.:0.02400
## Median :1221 Mode :character Mode :character Median :0.03000
## Mean :1437 Mean :0.03317
## 3rd Qu.:2059 3rd Qu.:0.03800
## Max. :3052 Max. :0.11000
## poptotal popdensity popwhite popblack
## Min. : 1701 Min. : 85.05 Min. : 416 Min. : 0
## 1st Qu.: 18840 1st Qu.: 622.41 1st Qu.: 18630 1st Qu.: 29
## Median : 35324 Median : 1156.21 Median : 34471 Median : 201
## Mean : 96130 Mean : 3097.74 Mean : 81840 Mean : 11024
## 3rd Qu.: 75651 3rd Qu.: 2330.00 3rd Qu.: 72968 3rd Qu.: 1291
## Max. :5105067 Max. :88018.40 Max. :3204947 Max. :1317147
## popamerindian popasian popother percwhite
## Min. : 4.0 Min. : 0 Min. : 0 Min. :10.69
## 1st Qu.: 44.0 1st Qu.: 35 1st Qu.: 20 1st Qu.:94.89
## Median : 94.0 Median : 102 Median : 66 Median :98.03
## Mean : 343.1 Mean : 1310 Mean : 1613 Mean :95.56
## 3rd Qu.: 288.0 3rd Qu.: 401 3rd Qu.: 345 3rd Qu.:99.07
## Max. :10289.0 Max. :188565 Max. :384119 Max. :99.82
## percblack percamerindan percasian percother
## Min. : 0.0000 Min. : 0.05623 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.1157 1st Qu.: 0.15793 1st Qu.:0.1737 1st Qu.:0.09102
## Median : 0.5390 Median : 0.21502 Median :0.2972 Median :0.17844
## Mean : 2.6763 Mean : 0.79894 Mean :0.4872 Mean :0.47906
## 3rd Qu.: 2.6014 3rd Qu.: 0.38362 3rd Qu.:0.5212 3rd Qu.:0.48050
## Max. :40.2100 Max. :89.17738 Max. :5.0705 Max. :7.52427
## popadults perchsd percollege percprof
## Min. : 1287 Min. :46.91 Min. : 7.336 Min. : 0.5203
## 1st Qu.: 12271 1st Qu.:71.33 1st Qu.:14.114 1st Qu.: 2.9980
## Median : 22188 Median :74.25 Median :16.798 Median : 3.8142
## Mean : 60973 Mean :73.97 Mean :18.273 Mean : 4.4473
## 3rd Qu.: 47541 3rd Qu.:77.20 3rd Qu.:20.550 3rd Qu.: 4.9493
## Max. :3291995 Max. :88.90 Max. :48.079 Max. :20.7913
## poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert
## Min. : 1696 Min. :80.90 Min. : 2.180 Min. : 1.919
## 1st Qu.: 18364 1st Qu.:96.89 1st Qu.: 9.199 1st Qu.:11.624
## Median : 33788 Median :98.17 Median :11.822 Median :15.270
## Mean : 93642 Mean :97.11 Mean :12.511 Mean :16.447
## 3rd Qu.: 72840 3rd Qu.:98.60 3rd Qu.:15.133 3rd Qu.:20.352
## Max. :5023523 Max. :99.86 Max. :48.691 Max. :64.308
## percadultpoverty percelderlypoverty inmetro category
## Min. : 1.938 Min. : 3.547 Min. :0.0000 Length:437
## 1st Qu.: 7.668 1st Qu.: 8.912 1st Qu.:0.0000 Class :character
## Median :10.008 Median :10.869 Median :0.0000 Mode :character
## Mean :10.919 Mean :11.389 Mean :0.3432
## 3rd Qu.:13.182 3rd Qu.:13.412 3rd Qu.:1.0000
## Max. :43.312 Max. :31.162 Max. :1.0000
# Copy midwest, shorten two column names, and derive the Asian share of the
# total population as a percentage.
midwest.new <- midwest %>%
  rename(total = poptotal, asian = popasian) %>%
  mutate(prop_asian = (asian / total) * 100)
hist(midwest.new$prop_asian) # histogram of the derived share
mean(midwest.new$prop_asian) # overall mean of the derived share
## [1] 0.4872462
# Label each county "large" or "small" depending on whether its Asian share is
# above the overall mean, then tabulate the two labels.
midwest.new <- midwest.new %>%
  mutate(mean_asian = ifelse(prop_asian > mean(prop_asian), "large", "small"))
table(midwest.new$mean_asian)
##
## large small
## 119 318
qplot(midwest.new$mean_asian) # λ§λκ·Έλν μ μ
Q3. mlu λ°μ΄ν° λΆμνκΈ°
# Install readxl only when it is missing, then attach it.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)
setwd("C:\\Users\\user\\Desktop\\R μ€μ΅μ©")
# Read the MLU workbook from the current working directory.
mlu_data <- read_excel("mlu.xlsx")
mlu_data.ori <- mlu_data #1. keep an untouched copy of the import
table(mlu_data$age) #2. frequency of each value in the age column
##
## A0 A1 A2
## 12 11 12
# 3. Shorten the two count column names, 4. derive mlu as words per utterance,
# 5. print the mean and quartiles of mlu.
mlu_data <- mlu_data %>%
  rename(utterances = utterances_mlu, words = words_mlu) %>%
  mutate(mlu = words / utterances)
summary(mlu_data$mlu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.730 2.447 2.745 2.696 2.916 3.476
# 6-1. Grade each row by the quartile its mlu falls in (D = lowest, A = highest).
# The cut-offs are computed with quantile() rather than copied from the rounded
# summary() printout, so rows close to a quartile boundary are classified by
# the exact value and the code keeps working if the data change.
mlu_cut <- quantile(mlu_data$mlu, probs = c(0.25, 0.50, 0.75), na.rm = TRUE)
mlu_data$grade <- ifelse(mlu_data$mlu < mlu_cut[[1]], "D",
                  ifelse(mlu_data$mlu < mlu_cut[[2]], "C",
                  ifelse(mlu_data$mlu < mlu_cut[[3]], "B", "A")))
# 6-2. Alternative: dplyr::ntile() splits the rows into 4 rank-based groups.
mlu_data$grade2 <- ntile(mlu_data$mlu, 4)
qplot(data = mlu_data, x = age, y = mlu) #8. ageμ mluμ¬μ΄μ κ΄κ³λ₯Ό κ·Έλνλ‘ κ·Έλ €λ³΄κΈ°
Q1. p133 νΌμμ ν΄λ³΄κΈ°
1-1.
# Cars with displacement of 4 or less, first with filter() ...
mpg_lower4 <- filter(mpg, displ <= 4)
# ... then the same subset with base-R indexing.
mpg_lower4 <- mpg[mpg$displ <= 4, ]
mean(mpg_lower4$hwy)
## [1] 25.96319
# Cars with displacement of 5 or more, first with filter() ...
mpg_upper5 <- filter(mpg, displ >= 5)
# ... then the same subset with base-R indexing.
mpg_upper5 <- mpg[mpg$displ >= 5, ]
mean(mpg_upper5$hwy)
## [1] 18.07895
ifelse(mpg$displ <= 4, "lower4", ifelse(mpg$displ >= 5, "upper5", NA)) -> mpg$grade
# λ°°κΈ°λμ΄ 5 μ΄μμΈ μλμ°¨λ³΄λ€ 4 μ΄μμΈ μλμ°¨μ μ°λΉκ° νκ· μ μΌλ‘ λ λλ€.
1-2.(νμλ³μμ tapplyν¨μλ₯Ό μ΄μ©ν΄μ νκ· κ΅¬νκΈ°)
# Tag each car as "lower4" (displ <= 4) or "upper5" (displ >= 5); anything in
# between stays NA. Then average hwy within each tag.
mpg$grade <- ifelse(mpg$displ <= 4, "lower4",
                    ifelse(mpg$displ >= 5, "upper5", NA))
with(mpg, tapply(hwy, grade, mean))
## lower4 upper5
## 25.96319 18.07895
# λ°°κΈ°λμ΄ 5 μ΄μμΈ μλμ°¨λ³΄λ€ 4 μ΄μμΈ μλμ°¨μ μ°λΉκ° νκ· μ μΌλ‘ λ λλ€.
2-1.
# Audi rows, first with filter() ...
mpg_audi <- filter(mpg, manufacturer == "audi")
# ... then the same rows with base-R indexing.
mpg_audi <- mpg[mpg$manufacturer == "audi", ]
mean(mpg_audi$cty)
## [1] 17.61111
# Mean city mileage of the toyota rows (the subset is built twice on purpose:
# once with filter(), once with base-R indexing).
mpg %>% filter(manufacturer == "toyota") -> mpg_toyota # rows whose manufacturer is toyota (using filter)
mpg[mpg$manufacturer == "toyota",] -> mpg_toyota # rows whose manufacturer is toyota (without filter)
mean(mpg_toyota$cty)
## [1] 18.52941
# μ μ‘°νμ¬κ° audiμΈ μλμ°¨λ³΄λ€ toyotaμΈ μλμ°¨μ λμ μ°λΉκ° νκ· μ μΌλ‘ λ λλ€.
2-2.(tapplyν¨μλ₯Ό μ΄μ©ν΄μ νκ· κ΅¬νκΈ°)
tapply(mpg$cty, mpg$manufacturer, mean)
## audi chevrolet dodge ford honda hyundai jeep
## 17.61111 15.00000 13.13514 14.00000 24.44444 18.64286 13.50000
## land rover lincoln mercury nissan pontiac subaru toyota
## 11.50000 11.33333 13.25000 18.07692 17.00000 19.28571 18.52941
## volkswagen
## 20.92593
# μ μ‘°νμ¬κ° audiμΈ μλμ°¨λ³΄λ€ toyotaμΈ μλμ°¨μ λμ μ°λΉκ° νκ· μ μΌλ‘ λ λλ€.
# Rows made by chevrolet, ford or honda, first with filter() ...
mpg_ma <- filter(mpg, manufacturer %in% c("chevrolet", "ford", "honda"))
# ... then the same rows with base-R indexing.
mpg_ma <- mpg[mpg$manufacturer %in% c("chevrolet", "ford", "honda"), ]
mean(mpg_ma$hwy)
## [1] 22.50943
Q2. p138 νΌμμ ν΄λ³΄κΈ°
# Keep only the class and cty columns, first with select() ...
mpg_cc <- select(mpg, class, cty)
# ... then the same columns with base-R indexing.
mpg_cc <- mpg[, c("class", "cty")]
head(mpg_cc)
## # A tibble: 6 x 2
## class cty
## <chr> <int>
## 1 compact 18
## 2 compact 21
## 3 compact 20
## 4 compact 21
## 5 compact 16
## 6 compact 18
2-1.
# City mileage of SUVs, built four equivalent ways. Each comment stays on a
# single line; previously the comment tails had spilled onto lines of their
# own, which is not valid R.

# From the class/cty subset, using filter().
mpg_suv <- mpg_cc %>% filter(class == "suv")
# Same without filter(). mpg_cc is indexed by its own class column rather than
# mpg$class, so the subset no longer relies on the two tables sharing row order.
mpg_suv <- mpg_cc[mpg_cc$class == "suv", ]
# From the full table, using select() and filter().
mpg_suv <- mpg %>% select(class, cty) %>% filter(class == "suv")
# From the full table, using base-R indexing only.
mpg_suv <- mpg[mpg$class == "suv", c("class", "cty")]
mean(mpg_suv$cty)
## [1] 13.5
# City mileage of compact cars, built four equivalent ways. The comments now
# say "compact" (they had been copied from the suv block) and each stays on a
# single line.

# From the class/cty subset, using filter().
mpg_compact <- mpg_cc %>% filter(class == "compact")
# Same without filter(). mpg_cc is indexed by its own class column rather than
# mpg$class, so the subset no longer relies on the two tables sharing row order.
mpg_compact <- mpg_cc[mpg_cc$class == "compact", ]
# From the full table, using select() and filter().
mpg_compact <- mpg %>% select(class, cty) %>% filter(class == "compact")
# From the full table, using base-R indexing only.
mpg_compact <- mpg[mpg$class == "compact", c("class", "cty")]
mean(mpg_compact$cty)
## [1] 20.12766
#자동차 종류가 suv인 자동차보다 compact인 자동차의 도시연비가 더 높다
2-2. (tapplyν¨μλ₯Ό μ΄μ©ν΄μ νκ· κ΅¬νκΈ°)
tapply(mpg$cty, mpg$class, mean)
## 2seater compact midsize minivan pickup subcompact suv
## 15.40000 20.12766 18.75610 15.81818 13.00000 20.37143 13.50000
#자동차 종류가 suv인 자동차보다 compact인 자동차의 도시연비가 더 높다
Q3. p141 νΌμμ ν΄λ³΄κΈ°
# Audi rows, then the five with the highest highway mileage.
mpg_audi <- mpg %>% filter(manufacturer == "audi")
mpg_audi %>%
  arrange(desc(hwy)) %>%
  head(5)
## # A tibble: 5 x 12
## manufacturer model displ year cyl trans drv cty hwy fl class grade
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 audi a4 2 2008 4 manu~ f 20 31 p comp~ lowe~
## 2 audi a4 2 2008 4 auto~ f 21 30 p comp~ lowe~
## 3 audi a4 1.8 1999 4 auto~ f 18 29 p comp~ lowe~
## 4 audi a4 1.8 1999 4 manu~ f 21 29 p comp~ lowe~
## 5 audi a4 q~ 2 2008 4 manu~ 4 20 28 p comp~ lowe~
# Base-R version of the audi top-5 query.
mpg[mpg$manufacturer == "audi",] -> mpg_audi # rows whose manufacturer is audi (without filter)
head(mpg_audi[order(-mpg_audi$hwy),],5) # sort by descending hwy and keep the first 5 (without arrange)
## # A tibble: 5 x 12
## manufacturer model displ year cyl trans drv cty hwy fl class grade
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 audi a4 2 2008 4 manu~ f 20 31 p comp~ lowe~
## 2 audi a4 2 2008 4 auto~ f 21 30 p comp~ lowe~
## 3 audi a4 1.8 1999 4 auto~ f 18 29 p comp~ lowe~
## 4 audi a4 1.8 1999 4 manu~ f 21 29 p comp~ lowe~
## 5 audi a4 q~ 2 2008 4 manu~ 4 20 28 p comp~ lowe~
Q4. p144 νΌμμ ν΄λ³΄κΈ°
# Copy mpg and add two derived columns in one mutate(): total = cty + hwy,
# and mean = total / 2 (mutate() lets a later column use an earlier one).
mpg.new2 <- mpg %>%
  mutate(total = cty + hwy,
         mean = total / 2)
# The three cars with the highest combined mean mileage.
mpg.new2 %>%
  arrange(desc(mean)) %>%
  head(3)
## # A tibble: 3 x 14
## manufacturer model displ year cyl trans drv cty hwy fl class grade
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 volkswagen new ~ 1.9 1999 4 manu~ f 35 44 d subc~ lowe~
## 2 volkswagen jetta 1.9 1999 4 manu~ f 33 44 d comp~ lowe~
## 3 volkswagen new ~ 1.9 1999 4 auto~ f 29 41 d subc~ lowe~
## # ... with 2 more variables: total <int>, mean <dbl>
mpg %>% mutate(mean = (cty + hwy)/2) %>% arrange(desc(mean)) %>% head(3)
## # A tibble: 3 x 13
## manufacturer model displ year cyl trans drv cty hwy fl class grade
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <chr>
## 1 volkswagen new ~ 1.9 1999 4 manu~ f 35 44 d subc~ lowe~
## 2 volkswagen jetta 1.9 1999 4 manu~ f 33 44 d comp~ lowe~
## 3 volkswagen new ~ 1.9 1999 4 auto~ f 29 41 d subc~ lowe~
## # ... with 1 more variable: mean <dbl>
Q5. p150 νΌμμ ν΄λ³΄κΈ°
mpg %>% group_by(class) %>% summarise(mean_cty = mean(cty)) #classλ³ λμ μ°λΉμ νκ· κ΅¬νκΈ°(group_by, summariseμ¬μ©)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
## class mean_cty
## <chr> <dbl>
## 1 2seater 15.4
## 2 compact 20.1
## 3 midsize 18.8
## 4 minivan 15.8
## 5 pickup 13
## 6 subcompact 20.4
## 7 suv 13.5
tapply(mpg$cty, mpg$class, mean) #classλ³ λμ μ°λΉμ νκ· κ΅¬νκΈ°(group_by, summariseμ¬μ©X)
## 2seater compact midsize minivan pickup subcompact suv
## 15.40000 20.12766 18.75610 15.81818 13.00000 20.37143 13.50000
mpg %>% group_by(class) %>%
summarise(mean_cty = mean(cty)) %>%
arrange(desc(mean_cty)) #classλ³ λμ μ°λΉμ νκ· μ ꡬν ν μ λ ¬νκΈ°
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
## class mean_cty
## <chr> <dbl>
## 1 subcompact 20.4
## 2 compact 20.1
## 3 midsize 18.8
## 4 minivan 15.8
## 5 2seater 15.4
## 6 suv 13.5
## 7 pickup 13
mpg %>% group_by(class) %>%
summarise(mean_cty = mean(cty)) %>%
arrange(desc(mean_cty)) %>%
head(3) #classλ³ λμ μ°λΉμ νκ· μ ꡬν ν μ λ ¬ ν 1~3μ μΆλ ₯
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## class mean_cty
## <chr> <dbl>
## 1 subcompact 20.4
## 2 compact 20.1
## 3 midsize 18.8
# Number of compact models per manufacturer, most models first.
mpg %>%
  filter(class == "compact") %>%
  group_by(manufacturer) %>%
  summarise(compact_n = n()) %>%
  arrange(desc(compact_n))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 2
## manufacturer compact_n
## <chr> <int>
## 1 audi 15
## 2 volkswagen 14
## 3 toyota 12
## 4 subaru 4
## 5 nissan 2
Q6. mlu λ°μ΄ν° λΆμνκΈ°
# 1. Work on a fresh copy of the untouched import.
mlu_data2 <- mlu_data.ori
# 2. Rows with 500 or fewer utterances; dim() prints the row count first, then
# the column count. The comment stays on comment lines; its tail had spilled
# onto separate lines, which is not valid R.
dim(mlu_data2 %>% filter(utterances_mlu <= 500))
## [1] 5 8
# 3. Drop the two duration columns, which the analysis does not use.
mlu_data2 <- mlu_data2 %>% select(-DurationTime, -DurationSec)
# 4. Derive mlu (words per utterance) and average it within each age group.
mlu_data2 %>%
  mutate(mlu = words_mlu / utterances_mlu) %>%
  group_by(age) %>%
  summarise(mean_mlu = mean(mlu))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## age mean_mlu
## <chr> <dbl>
## 1 A0 2.50
## 2 A1 2.59
## 3 A2 2.99
# 5. Derive TTR and average it within each age group.
# NOTE(review): this divides Token_freq by Types_freq. A type-token ratio is
# normally types / tokens (and therefore <= 1); if the columns mean what their
# names suggest, the ratio is inverted. Confirm against the data dictionary.
mlu_data2 %>%
  mutate(TTR = Token_freq / Types_freq) %>%
  group_by(age) %>%
  summarise(mean_TTR = mean(TTR))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## age mean_TTR
## <chr> <dbl>
## 1 A0 2.41
## 2 A1 2.74
## 3 A2 2.66
Q1. p156 νΌμμ ν΄λ³΄κΈ°
# Lookup table: fuel-type code -> price. `FALSE` is spelled out because `F` is
# an ordinary variable that can be reassigned; fl must stay character so it
# joins against mpg$fl.
fuel <- data.frame(fl = c("c", "d", "e", "p", "r"),
                   price_fl = c(2.35, 2.38, 2.11, 2.76, 2.22),
                   stringsAsFactors = FALSE)
#1
mpg.new3 <- mpg # working copy
# Attach the fuel price to every car by matching on the fuel-type code.
mpg.fuel <- mpg.new3 %>% left_join(fuel, by = "fl")
#2
mpg.fuel %>%
  select(model, fl, price_fl) %>%
  head(5)
## # A tibble: 5 x 3
## model fl price_fl
## <chr> <chr> <dbl>
## 1 a4 p 2.76
## 2 a4 p 2.76
## 3 a4 p 2.76
## 4 a4 p 2.76
## 5 a4 p 2.76
Q2. p160 λΆμ λμ
#1
# Copy midwest and add the share of non-adults in the total population (%).
midwest.new2 <- midwest %>%
  mutate(prop_popchild = (poptotal - popadults) / poptotal * 100)
#2
# The five counties with the largest non-adult share.
midwest.new2 %>%
  arrange(desc(prop_popchild)) %>%
  select(county, prop_popchild) %>%
  head(5)
## county prop_popchild
## 1 ISABELLA 51.50117
## 2 MENOMINEE 50.59126
## 3 ATHENS 49.32073
## 4 MECOSTA 49.05918
## 5 MONROE 47.35818
#3
midwest.new2 <- midwest.new2 %>%
mutate(grade_popchild = ifelse(prop_popchild >= 40, "large",
ifelse(prop_popchild >= 30, "middle", "small")))
midwest.new2 %>%
group_by(grade_popchild) %>%
summarise(n_county = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## grade_popchild n_county
## <chr> <int>
## 1 large 32
## 2 middle 396
## 3 small 9
#4
# Asian share of the total population (%), then the ten counties where it is
# lowest.
midwest.new2 <- mutate(midwest.new2, prop_popasian = popasian / poptotal * 100)
midwest.new2 %>%
  select(state, county, prop_popasian) %>%
  arrange(prop_popasian) %>%
  head(10)
## state county prop_popasian
## 1 WI MENOMINEE 0.00000000
## 2 IN BENTON 0.01059210
## 3 IN CARROLL 0.01594981
## 4 OH VINTON 0.02703190
## 5 WI IRON 0.03250447
## 6 IL SCOTT 0.05315379
## 7 IN CLAY 0.06071645
## 8 MI OSCODA 0.06375925
## 9 OH PERRY 0.06654625
## 10 IL PIATT 0.07074865
Q3. p170 νΌμμ ν΄λ³΄κΈ°
mpg <- as.data.frame(ggplot2::mpg)
mpg.new4 <- mpg
mpg.new4[c(65, 124, 131, 153, 212), "hwy"] <- NA
#1
table(is.na(mpg.new4$drv))
##
## FALSE
## 234
table(is.na(mpg.new4$hwy))
##
## FALSE TRUE
## 229 5
#2
mpg.new4 %>% filter(!is.na(hwy)) %>% group_by(drv) %>% summarise(mean_hwy = mean(hwy))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## drv mean_hwy
## <chr> <dbl>
## 1 4 19.2
## 2 f 28.2
## 3 r 21
Q4. p178 νΌμμ ν΄λ³΄κΈ°
#1
# Start from a fresh copy of mpg and plant the exercise's outliers: an
# impossible drive type "k" and four extreme city-mileage values.
mpg <- as.data.frame(ggplot2::mpg)
mpg$drv[c(10, 14, 58, 93)] <- "k"
mpg$cty[c(29, 43, 129, 203)] <- c(3, 4, 39, 42)
# Turn the impossible drive type into NA, then confirm the NA count.
mpg$drv[mpg$drv %in% "k"] <- NA
table(is.na(mpg$drv))
##
## FALSE TRUE
## 230 4
#2
str(mpg)
## 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : num 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...
mpg$cty <- as.numeric(mpg$cty)
# Five-number summary behind the boxplot. The component is named `stats`;
# `$stat` only worked through partial name matching on the returned list.
boxplot(mpg$cty)$stats
## [,1]
## [1,] 9
## [2,] 14
## [3,] 17
## [4,] 19
## [5,] 26
# Replace city-mileage values outside the boxplot whiskers with NA. The whisker
# ends are taken from boxplot.stats() (the same computation boxplot() uses)
# instead of being hard-coded from printed output, so the limits follow the data.
cty_fence <- boxplot.stats(mpg$cty)$stats[c(1, 5)]
mpg$cty <- ifelse(mpg$cty < cty_fence[1] | mpg$cty > cty_fence[2], NA, mpg$cty)
boxplot(mpg$cty)
#3
mpg %>% filter(!is.na(drv), !is.na(cty)) %>% group_by(drv) %>% summarise(mean_cty = mean(cty))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## drv mean_cty
## <chr> <dbl>
## 1 4 14.2
## 2 f 19.5
## 3 r 14.0
Q5. p188 νΌμμ ν΄λ³΄κΈ°
#1
mpg <- as.data.frame(ggplot2::mpg)
# Scatter plot of city vs highway mileage. qplot() with x and y already draws a
# point layer, so adding geom_point() on top drew every point twice; one
# explicit ggplot() + geom_point() layer draws them once.
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()
#2
# Total vs Asian population per county, limited to the dense region of the
# plot. A single point layer replaces qplot() + geom_point(), which drew the
# points twice (and so produced the "Removed ... rows" warning twice).
ggplot(data = midwest, aes(x = poptotal, y = popasian)) +
  geom_point() +
  xlim(0, 500000) +
  ylim(0, 10000)
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
Q6. p193 νΌμμ ν΄λ³΄κΈ°
#1
mpg %>% filter(class == "suv") %>% group_by(manufacturer) %>% summarise(mean_cty = mean(cty)) %>% arrange(desc(mean_cty)) %>% head(5) -> mpg_upper5
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(data = mpg_upper5, aes(x = reorder(manufacturer, -mean_cty), y = mean_cty)) + geom_col()
#2
ggplot(data = mpg, aes(x = class)) + geom_bar()
Q7. p195 νΌμμ ν΄λ³΄κΈ°
#1
ggplot(data = economics, aes(x = date, y = psavert)) + geom_line()
Q8. p198 νΌμμ ν΄λ³΄κΈ°
# Keep the subcompact, compact and suv rows. `%in%` tests set membership;
# `class == c(...)` recycled the three names down the column and compared row i
# only with name ((i - 1) %% 3) + 1, silently dropping about two thirds of the
# matching rows.
mpg.scs <- mpg %>% filter(class %in% c("subcompact", "compact", "suv"))
ggplot(data = mpg.scs, aes(x = class, y = cty)) + geom_boxplot()
νκ΅ λ³΅μ§ ν¨λ λ°μ΄ν° λΆμ μ€λΉνκΈ°
# Install foreign only when it is missing, then attach the packages the welfare
# analysis uses.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
library(dplyr)
library(ggplot2)
library(readxl)
setwd("C:\\Users\\user\\Desktop\\R μ€μ΅μ©")
# Read the SPSS file as a data.frame. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
raw_welfare <- read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = TRUE)
## Warning in read.spss(file = "Koweps_hpc10_2015_beta1.sav", to.data.frame = T):
## Koweps_hpc10_2015_beta1.sav: Compression bias (0) is not the usual value of 100
welfare <- raw_welfare
dim(welfare)
## [1] 16664 957
# Give the coded survey columns readable names. Every trailing comment stays on
# its own argument line; two of them had spilled onto separate lines inside the
# call, which is a syntax error.
welfare <- rename(welfare,
                  sex = h10_g3,            # sex
                  birth = h10_g4,          # year of birth
                  marriage = h10_g10,      # marital status
                  religion = h10_g11,      # religion
                  income = p1002_8aq1,     # monthly wage
                  code_job = h10_eco9,     # occupation code
                  code_region = h10_reg7)  # region code
μ±λ³μ λ°λ₯Έ μκΈ μ°¨μ΄
#μ±λ³ λ³μ κ²ν
class(welfare$sex)
## [1] "numeric"
table(welfare$sex)
##
## 1 2
## 7578 9086
ifelse(welfare$sex == 1, "male", "female") -> welfare$sex
qplot(welfare$sex)
#μκΈ λ³μ κ²ν
class(welfare$income)
## [1] "numeric"
summary(welfare$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 122.0 192.5 241.6 316.6 2400.0 12030
qplot(welfare$income) + xlim(0, 1000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12051 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
#κ²°μΈ‘μΉ μ κ±°νκΈ°
ifelse(welfare$income %in% c(0, 9999), NA, welfare$income) -> welfare$income
table(is.na(welfare$income))
##
## FALSE TRUE
## 4620 12044
#μ±λ³μ λ°λ₯Έ μκΈ μ°¨μ΄ λΆμνκΈ°
welfare %>% filter(!is.na(income)) %>% group_by(sex) %>% summarise(mean_income = mean(income)) -> sex_income
## `summarise()` ungrouping output (override with `.groups` argument)
sex_income
## # A tibble: 2 x 2
## sex mean_income
## <chr> <dbl>
## 1 female 163.
## 2 male 312.
ggplot(data = sex_income, aes(x = sex, y = mean_income)) + geom_col()
λμ΄ λ° μ°λ Ήλμ λ°λ₯Έ μκΈ μ°¨μ΄
#λμ΄ λ³μ κ²ν
class(welfare$birth)
## [1] "numeric"
summary(welfare$birth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1907 1946 1966 1968 1988 2014
qplot(welfare$birth)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
welfare$age <- 2015 - welfare$birth + 1 #λμ΄ νμλ³μ λ§λ€κΈ°
summary(welfare$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 28.00 50.00 48.43 70.00 109.00
qplot(welfare$age)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#λμ΄μ λ°λ₯Έ μκΈ μ°¨μ΄ λΆμνκΈ°
welfare %>% filter(!is.na(income)) %>% group_by(age) %>% summarise(mean_income = mean(income)) -> age_income
## `summarise()` ungrouping output (override with `.groups` argument)
age_income
## # A tibble: 69 x 2
## age mean_income
## <dbl> <dbl>
## 1 20 121.
## 2 21 106.
## 3 22 130.
## 4 23 142.
## 5 24 134.
## 6 25 145.
## 7 26 158.
## 8 27 188.
## 9 28 205.
## 10 29 189.
## # ... with 59 more rows
ggplot(data = age_income, aes(x = age, y = mean_income)) + geom_line()
#μ°λ Ήλμ λ°λ₯Έ μκΈ μ°¨μ΄ λΆμνκΈ°
welfare$ageg <- ifelse(welfare$age < 30, "young",
ifelse(welfare$age <= 59, "middle", "old")) #μ°λ Ήλ νμλ³μ λ§λ€κΈ°
table(welfare$ageg)
##
## middle old young
## 6049 6281 4334
qplot(welfare$ageg)
welfare %>% filter(!is.na(income)) %>% group_by(ageg) %>% summarise(mean_income = mean(income)) -> ageg_income
## `summarise()` ungrouping output (override with `.groups` argument)
ageg_income
## # A tibble: 3 x 2
## ageg mean_income
## <chr> <dbl>
## 1 middle 282.
## 2 old 125.
## 3 young 164.
# Mean income per age group, bars ordered young -> middle -> old. The argument
# is spelled `limits`; `limit` only worked through partial argument matching
# after being forwarded via `...`.
ggplot(data = ageg_income, aes(x = ageg, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
μ±λ³ λ° μ°λ Ήλ, λμ΄μ λ°λ₯Έ μκΈ μ°¨μ΄
# μ±λ³ λ° μ°λ Ήλμ λ°λ₯Έ μκΈ μ°¨μ΄
welfare %>% filter(!is.na(income)) %>% group_by(ageg, sex) %>% summarise(mean_income = mean(income)) -> sex_income
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
sex_income
## # A tibble: 6 x 3
## # Groups: ageg [3]
## ageg sex mean_income
## <chr> <chr> <dbl>
## 1 middle female 188.
## 2 middle male 353.
## 3 old female 81.5
## 4 old male 174.
## 5 young female 160.
## 6 young male 171.
# Mean income per age group, split by sex, bars ordered young -> middle -> old.
# The argument is spelled `limits`; `limit` only worked through partial
# argument matching after being forwarded via `...`.
ggplot(data = sex_income, aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))
# μ±λ³ λ° λμ΄μ λ°λ₯Έ μκΈ μ°¨μ΄
welfare %>% filter(!is.na(income)) %>% group_by(age, sex) %>% summarise(mean_income = mean(income)) -> sex_age
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
# Print the age-by-sex summary that was just stored in sex_age. The original
# printed sex_income here, i.e. the age-group table from the previous step, so
# the echoed table that follows in this document still shows that older result.
sex_age
## # A tibble: 6 x 3
## # Groups: ageg [3]
## ageg sex mean_income
## <chr> <chr> <dbl>
## 1 middle female 188.
## 2 middle male 353.
## 3 old female 81.5
## 4 old male 174.
## 5 young female 160.
## 6 young male 171.
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) + geom_line()
μ§μ λ³ μκΈ μ°¨μ΄
class(welfare$code_job)
## [1] "numeric"
table(welfare$code_job)
##
## 111 120 131 132 133 134 135 139 141 149 151 152 153 159 211 212
## 2 16 10 11 9 3 7 10 35 20 26 18 15 16 8 4
## 213 221 222 223 224 231 232 233 234 235 236 237 239 241 242 243
## 3 17 31 12 4 41 5 3 6 48 14 2 29 12 4 63
## 244 245 246 247 248 251 252 253 254 259 261 271 272 273 274 281
## 4 33 59 77 38 14 111 24 67 109 4 15 11 4 36 17
## 283 284 285 286 289 311 312 313 314 320 330 391 392 399 411 412
## 8 10 26 16 5 140 260 220 84 75 15 4 13 87 47 12
## 421 422 423 429 431 432 441 442 510 521 522 530 611 612 613 620
## 124 71 5 14 20 33 154 197 192 353 5 106 1320 11 40 2
## 630 710 721 722 730 741 742 743 751 752 753 761 762 771 772 773
## 20 29 30 22 16 27 3 34 34 5 49 69 27 11 61 86
## 774 780 791 792 799 811 812 819 821 822 823 831 832 841 842 843
## 7 17 5 21 45 16 1 6 9 9 23 5 17 32 10 4
## 851 852 853 854 855 861 862 863 864 871 873 874 875 876 881 882
## 19 13 7 33 9 3 14 17 31 2 257 34 37 2 2 3
## 891 892 899 910 921 922 930 941 942 951 952 953 991 992 999 1011
## 8 19 16 102 31 74 289 325 99 125 122 73 45 12 141 2
## 1012
## 17
library(readxl)
#λ°μ΄ν° μ μ²λ¦¬
setwd("C:\\Users\\user\\Desktop\\R μ€μ΅μ©")
# Occupation code -> occupation name lookup from the second sheet of the
# codebook. `TRUE` is spelled out because `T` is a reassignable variable.
list_job <- read_excel("Koweps_Codebook.xlsx", col_names = TRUE, sheet = 2)
head(list_job)
## # A tibble: 6 x 2
## code_job job
## <dbl> <chr>
## 1 111 μνμμ κ³ μ곡무μ λ° κ³΅κ³΅λ¨μ²΄μμ
## 2 112 κΈ°μ
κ³ μμμ
## 3 120 νμ λ° κ²½μμ§μ κ΄λ¦¬μ
## 4 131 μ°κ΅¬ κ΅μ‘ λ° λ²λ₯ κ΄λ ¨ κ΄λ¦¬μ
## 5 132 보ν λ° κΈμ΅ κ΄λ¦¬μ
## 6 133 보건 λ° μ¬νλ³΅μ§ κ΄λ ¨ κ΄λ¦¬μ
dim(list_job)
## [1] 149 2
# Attach the occupation name to every respondent. The join key argument is
# `by`; `id =` is not an argument of left_join(), so it was swallowed by `...`
# and the join silently fell back to guessing the key ("Joining, by = ...").
welfare <- left_join(welfare, list_job, by = "code_job")
## Joining, by = "code_job"
welfare %>% filter(!is.na(code_job)) %>% select(code_job, job) %>% head(10)
## code_job job
## 1 942 κ²½λΉμ λ° κ²νμ
## 2 762 μ 기곡
## 3 530 λ°©λ¬Έ λ
Έμ λ° ν΅μ νλ§€ κ΄λ ¨ μ’
μ¬μ
## 4 999 κΈ°ν μλΉμ€κ΄λ ¨ λ¨μ μ’
μ¬μ
## 5 312 κ²½μκ΄λ ¨ μ¬λ¬΄μ
## 6 254 문리 κΈ°μ λ° μλ₯ κ°μ¬
## 7 510 μμ
μ’
μ¬μ
## 8 530 λ°©λ¬Έ λ
Έμ λ° ν΅μ νλ§€ κ΄λ ¨ μ’
μ¬μ
## 9 286 μ€ν¬μΈ λ° λ ν¬λ μ΄μ
κ΄λ ¨ μ λ¬Έκ°
## 10 521 λ§€μ₯ νλ§€ μ’
μ¬μ
#μ§μ
λ³ μμ 10μ, νμ 10μμ μκΈ νκ· κ΅¬νκΈ°
job_income <- welfare %>% filter(!is.na(job) & !is.na(income)) %>% group_by(job) %>% summarise(mean_income = mean(income)) %>% arrange(desc(mean_income))
## `summarise()` ungrouping output (override with `.groups` argument)
# job_income is sorted by descending mean income, so the first ten rows are the
# best-paid occupations and the last ten the worst-paid.
top10 <- head(job_income, 10)
bottom10 <- tail(job_income, 10)
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()
ggplot(data = bottom10, aes(x = reorder(job, -mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 850)
μ±λ³ μ§μ λΉλ
male_job <- welfare %>% filter(!is.na(job) & sex == "male") %>% group_by(job) %>% summarise(n = n()) %>% arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
male_top10 <- male_job %>% head(10)
female_job <- welfare %>% filter(!is.na(job) & sex == "female") %>% group_by(job) %>% summarise(n = n()) %>% arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
female_top10 <- female_job %>% head(10)
ggplot(data = male_top10, aes(x = reorder(job, n), y = n)) + geom_col() + coord_flip()
ggplot(data = female_top10, aes(x = reorder(job, n), y = n)) + geom_col() + coord_flip()
μ’ κ΅ μ 무μ λ°λ₯Έ μ΄νΌμ¨
class(welfare$religion)
## [1] "numeric"
table(welfare$religion)
##
## 1 2
## 8047 8617
class(welfare$marriage)
## [1] "numeric"
table(welfare$marriage)
##
## 0 1 2 3 4 5 6
## 2861 8431 2117 712 84 2433 26
#λ°μ΄ν° μ μ²λ¦¬
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")
qplot(welfare$religion)
welfare$group_marriage <- ifelse(welfare$marriage == 1, "marriage",
ifelse(welfare$marriage == 3, "divorce", NA))
table(welfare$group_marriage)
##
## divorce marriage
## 712 8431
qplot(welfare$group_marriage)
religion_marriage <- welfare %>% filter(!is.na(group_marriage)) %>% group_by(religion, group_marriage) %>% summarise(n = n()) %>% mutate(tot_group = sum(n)) %>% mutate(pct = round(n/tot_group*100, 1))
## `summarise()` regrouping output by 'religion' (override with `.groups` argument)
divorce <- religion_marriage %>% filter(group_marriage == "divorce") %>% select(religion, pct)
#κ·Έλν 그리기
ggplot(data = divorce, aes(x = religion, y = pct)) + geom_col()
#μ°λ Ήλλ³ μ΄νΌμ¨ λΆμ
ageg_marriage <- welfare %>% filter(!is.na(group_marriage)) %>% group_by(ageg, group_marriage) %>% summarise(n = n()) %>% mutate(tot_group = sum(n)) %>% mutate(pct = round(n/tot_group*100, 1))
## `summarise()` regrouping output by 'ageg' (override with `.groups` argument)
ageg_divorce <- ageg_marriage %>% filter(group_marriage == "divorce") %>% select(ageg, pct)
ggplot(data = ageg_divorce, aes(x = ageg, y = pct)) + geom_col()
# Divorce rate by age group and religion
# (This heading was previously split over two lines, leaving bare text that is
# not valid R; it is now a single comment.)
# Count married/divorced respondents per age group x religion cell and convert
# each count to a percentage of that cell's total (one decimal).
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 1))
## `summarise()` regrouping output by 'ageg', 'religion' (override with `.groups` argument)
# Keep only the divorce percentage for each age group / religion combination.
ageg_religion_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
# Side-by-side bars: one bar per religion value within each age group.
ggplot(data = ageg_religion_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")
지역별 연령대 비율
# Inspect the raw region code: numeric, with codes 1 to 7.
class(welfare$code_region)
## [1] "numeric"
table(welfare$code_region)
##
## 1 2 3 4 5 6 7
## 2486 3711 2785 2036 1467 1257 2922
# Data preprocessing
# Lookup table mapping each region code to a region label.
# NOTE(review): the labels below look mis-encoded (Korean UTF-8 text decoded
# with a single-byte code page); they are kept as found - confirm against the
# original source before relying on them.
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "μμΈ",
    "μλκΆ(μΈμ²/κ²½κΈ°)",
    "λΆμ°/κ²½λ¨/μΈμ°",
    "λꡬ/κ²½λΆ",
    "λμ /μΆ©λ¨",
    "κ°μ/μΆ©λΆ",
    "κ΄μ£Ό/μ λ¨/μ λΆ/μ μ£Όλ"
  )
)
# Attach the region label to every respondent.
welfare <- welfare %>%
  left_join(list_region, by = "code_region")
# Share of each age group within each region, in percent (two decimals).
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 2)
  )
## `summarise()` regrouping output by 'region' (override with `.groups` argument)
# Draw the plot: one horizontal stacked bar per region.
ggplot(region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()
# Order the regions by the share of the "old" age group.
# (This heading was previously split over three lines, leaving bare text that
# is not valid R; it is now a single comment.)
# The rows are sorted by ascending `pct`, and the resulting region order is
# used as the axis limits of the plot.
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)
order <- list_order_old$region
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)
# Arrange the bar colours by age group: fix the factor level order explicitly.
# `levels` is spelled out in full instead of relying on partial matching of
# the abbreviated `level`.
region_ageg$ageg <- factor(region_ageg$ageg, levels = c("old", "middle", "young"))
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)
Vowel and coda duration as a function of speaking rate
# NOTE(review): setwd() with an absolute path ties the script to one machine,
# and the folder name looks mis-encoded; both are kept as found - confirm the
# intended location.
setwd("C:\\Users\\user\\Desktop\\R μ€μ΅μ©")
coda <- read.delim(file = "all_data.txt")
# Mean vowel duration for each voice x speed combination.
# `%in%` tests set membership. The previous `phoneme == c(...)` recycled the
# three labels along the rows, so a row was kept only when its phoneme matched
# the label that happened to line up with it, silently dropping valid rows.
V_speed <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(voice, speed) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'voice' (override with `.groups` argument)
# Mean coda duration for each voice x speed combination (same `%in%` fix).
coda_speed <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(voice, speed) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'voice' (override with `.groups` argument)
# Vowel duration
ggplot(data = V_speed, aes(x = speed, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
# Coda duration
ggplot(data = coda_speed, aes(x = speed, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
Vowel and coda duration as a function of sentence position
# Mean vowel duration for each position x voice combination.
# `%in%` tests set membership. The previous `phoneme == c(...)` recycled the
# labels along the rows and silently dropped rows whose phoneme did not line
# up with the recycled label.
V_pos <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(position, voice) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'position' (override with `.groups` argument)
# Mean coda duration for each position x voice combination (same `%in%` fix).
coda_pos <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(position, voice) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'position' (override with `.groups` argument)
# Vowel duration
ggplot(data = V_pos, aes(x = position, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
# Coda duration
ggplot(data = coda_pos, aes(x = position, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
Vowel and coda duration as a function of vowel height
# Mean vowel duration for each height x voice combination.
# `%in%` tests set membership. The previous `phoneme == c(...)` recycled the
# labels along the rows and silently dropped rows whose phoneme did not line
# up with the recycled label.
V_height <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(height, voice) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'height' (override with `.groups` argument)
# Mean coda duration for each height x voice combination (same `%in%` fix).
coda_height <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(height, voice) %>%
  summarise(mean_duration = mean(duration))
## `summarise()` regrouping output by 'height' (override with `.groups` argument)
# Vowel duration
ggplot(data = V_height, aes(x = height, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
# Coda duration
ggplot(data = coda_height, aes(x = height, y = mean_duration, fill = voice)) +
  geom_col(position = "dodge")
V-to-C ratio under varying speaking rate in CVC words ending in [-voice] and [+voice] stops
# Mean vowel duration for each voice x speed combination.
# `%in%` tests set membership. The previous `phoneme == c(...)` recycled the
# labels along the rows and silently dropped rows whose phoneme did not line
# up with the recycled label.
V_speed <- coda %>%
  filter(phoneme %in% c("AE1", "EH1", "IH1")) %>%
  group_by(voice, speed) %>%
  summarise(V_mean_duration = mean(duration))
## `summarise()` regrouping output by 'voice' (override with `.groups` argument)
# Mean coda duration for each voice x speed combination (same `%in%` fix).
coda_speed <- coda %>%
  filter(phoneme %in% c("K", "G")) %>%
  group_by(voice, speed) %>%
  summarise(coda_mean_duration = mean(duration))
## `summarise()` regrouping output by 'voice' (override with `.groups` argument)
# Put the vowel and coda means side by side, one row per voice x speed cell,
# then divide the vowel mean by the coda mean to get the V-to-C ratio.
VC_ratio <- left_join(V_speed, coda_speed, by = c("voice", "speed"))
VC_ratio <- VC_ratio %>%
  mutate(VC_ratio = V_mean_duration / coda_mean_duration)
ggplot(data = VC_ratio, aes(x = voice, y = VC_ratio, fill = speed)) +
  geom_col(position = "dodge")
## 12주차 과제
install.packages("stringr")
install.packages("wordcloud")
library(KoNLP)
## Checking user defined dictionary!
library(stringr)
library(dplyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
국정원 트윗 텍스트 마이닝
# Data cleaning
# NOTE(review): setwd() with an absolute path ties the script to one machine,
# and the folder name looks mis-encoded; both are kept as found - confirm the
# intended location.
setwd("C:\\Users\\user\\Desktop\\R μ€μ΅μ©")
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so they are not safe stand-ins for the logical constants.
twitter <- read.csv("twitter.csv", header = TRUE, stringsAsFactors = FALSE, fileEncoding = "UTF-8")
# Give the columns short ASCII names.
# NOTE(review): the original column names below look mis-encoded (Korean UTF-8
# text decoded with a single-byte code page); they are kept as found - confirm
# against the header row of twitter.csv.
twitter <- rename(twitter, no = λ²νΈ, id = κ³μ μ΄λ¦, date = μμ±μΌ, tw = λ΄μ©)
# Replace every non-word character in the tweet text with a space.
twitter$tw <- str_replace_all(twitter$tw, "\\W", " ")
# Find the most frequently used words
# Extract nouns from each tweet, tabulate them, and convert the table to a
# data frame with one row per word.
nouns <- extractNoun(twitter$tw)
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
df_word <- rename(df_word, word = Var1, freq = Freq)
# Keep words of at least two characters, then take the 20 most frequent.
df_word <- filter(df_word, nchar(word) >= 2)
top_20 <- df_word %>%
  arrange(desc(freq)) %>%
  head(20)
# Word-frequency bar chart
# Axis order: words sorted by ascending frequency. `limits` is spelled out in
# full instead of relying on partial matching of the abbreviated `limit`.
order <- arrange(top_20, freq)$word
ggplot(data = top_20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = freq), hjust = -0.3)
# Word clouds
# First cloud: eight colours from the "Dark2" palette. The seed is fixed
# before each call so both clouds start from the same random-number state.
# FALSE is spelled out: F is an ordinary variable that can be reassigned, so
# it is not a safe stand-in for the logical constant.
pal <- brewer.pal(8, "Dark2")
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)
# Second cloud: same settings, coloured with elements 5 to 9 of the
# nine-colour "Blues" palette.
pal <- brewer.pal(9, "Blues")[5:9]
set.seed(1234)
wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = .1,
  scale = c(6, 0.2),
  colors = pal
)