# R - EDA
이 문서는 Colony Studio의 데이터 정제 과정을 기록하기 위해 만든 문서입니다. 다루는 내용: dplyr, %>%, select, arrange, mutate, filter, group_by, summarise
정제 문서를 실행하기 위한 라이브러리 불러오기
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
예제를 위한 데이터셋 구성
# Example subset used throughout: rows of iris with Sepal.Length above 6.5.
iris_ex <- filter(iris, Sepal.Length > 6.5)
iris_ex 데이터셋에서 Sepal.Length, Species 열을 선택하여 출력
# select: keep only the Sepal.Length and Species columns of iris_ex
select(iris_ex, Sepal.Length, Species)
## Sepal.Length Species
## 1 7.0 versicolor
## 2 6.9 versicolor
## 3 6.6 versicolor
## 4 6.7 versicolor
## 5 6.6 versicolor
## 6 6.8 versicolor
## 7 6.7 versicolor
## 8 6.7 versicolor
## 9 7.1 virginica
## 10 7.6 virginica
## 11 7.3 virginica
## 12 6.7 virginica
## 13 7.2 virginica
## 14 6.8 virginica
## 15 7.7 virginica
## 16 7.7 virginica
## 17 6.9 virginica
## 18 7.7 virginica
## 19 6.7 virginica
## 20 7.2 virginica
## 21 7.2 virginica
## 22 7.4 virginica
## 23 7.9 virginica
## 24 7.7 virginica
## 25 6.9 virginica
## 26 6.7 virginica
## 27 6.9 virginica
## 28 6.8 virginica
## 29 6.7 virginica
## 30 6.7 virginica
#iris_ex %>%
# select(Sepal.Length, Species)
filter로 간단하게 필터 적용하기
# filter: keep only the rows of iris_ex where Sepal.Length is greater than 7
filter(iris_ex, Sepal.Length > 7)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.1 3.0 5.9 2.1 virginica
## 2 7.6 3.0 6.6 2.1 virginica
## 3 7.3 2.9 6.3 1.8 virginica
## 4 7.2 3.6 6.1 2.5 virginica
## 5 7.7 3.8 6.7 2.2 virginica
## 6 7.7 2.6 6.9 2.3 virginica
## 7 7.7 2.8 6.7 2.0 virginica
## 8 7.2 3.2 6.0 1.8 virginica
## 9 7.2 3.0 5.8 1.6 virginica
## 10 7.4 2.8 6.1 1.9 virginica
## 11 7.9 3.8 6.4 2.0 virginica
## 12 7.7 3.0 6.1 2.3 virginica
#iris_ex %>%
# filter(Sepal.Length > 7)
mutate 활용하기: iris_ex 데이터셋에 Sepal.Length와 Petal.Length를 더한 값을 나타내는 length라는 column을 추가한다.
# mutate: append a column `length` holding the row-wise sum
# Sepal.Length + Petal.Length
mutate(iris_ex, length = Sepal.Length + Petal.Length)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species length
## 1 7.0 3.2 4.7 1.4 versicolor 11.7
## 2 6.9 3.1 4.9 1.5 versicolor 11.8
## 3 6.6 2.9 4.6 1.3 versicolor 11.2
## 4 6.7 3.1 4.4 1.4 versicolor 11.1
## 5 6.6 3.0 4.4 1.4 versicolor 11.0
## 6 6.8 2.8 4.8 1.4 versicolor 11.6
## 7 6.7 3.0 5.0 1.7 versicolor 11.7
## 8 6.7 3.1 4.7 1.5 versicolor 11.4
## 9 7.1 3.0 5.9 2.1 virginica 13.0
## 10 7.6 3.0 6.6 2.1 virginica 14.2
## 11 7.3 2.9 6.3 1.8 virginica 13.6
## 12 6.7 2.5 5.8 1.8 virginica 12.5
## 13 7.2 3.6 6.1 2.5 virginica 13.3
## 14 6.8 3.0 5.5 2.1 virginica 12.3
## 15 7.7 3.8 6.7 2.2 virginica 14.4
## 16 7.7 2.6 6.9 2.3 virginica 14.6
## 17 6.9 3.2 5.7 2.3 virginica 12.6
## 18 7.7 2.8 6.7 2.0 virginica 14.4
## 19 6.7 3.3 5.7 2.1 virginica 12.4
## 20 7.2 3.2 6.0 1.8 virginica 13.2
## 21 7.2 3.0 5.8 1.6 virginica 13.0
## 22 7.4 2.8 6.1 1.9 virginica 13.5
## 23 7.9 3.8 6.4 2.0 virginica 14.3
## 24 7.7 3.0 6.1 2.3 virginica 13.8
## 25 6.9 3.1 5.4 2.1 virginica 12.3
## 26 6.7 3.1 5.6 2.4 virginica 12.3
## 27 6.9 3.1 5.1 2.3 virginica 12.0
## 28 6.8 3.2 5.9 2.3 virginica 12.7
## 29 6.7 3.3 5.7 2.5 virginica 12.4
## 30 6.7 3.0 5.2 2.3 virginica 11.9
#iris_ex %>%
# mutate(length = Sepal.Length + Petal.Length)
mutate로 mean 값을 구하고, 결과를 dataset의 column으로 추가하기 (전체 평균 하나가 모든 행에 동일하게 들어간다)
# mean() collapses the summed vector to a single number, so every row of the
# new `length` column carries the same value (the overall mean of the sums)
mutate(iris_ex, length = mean(Sepal.Length + Petal.Length))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species length
## 1 7.0 3.2 4.7 1.4 versicolor 12.67333
## 2 6.9 3.1 4.9 1.5 versicolor 12.67333
## 3 6.6 2.9 4.6 1.3 versicolor 12.67333
## 4 6.7 3.1 4.4 1.4 versicolor 12.67333
## 5 6.6 3.0 4.4 1.4 versicolor 12.67333
## 6 6.8 2.8 4.8 1.4 versicolor 12.67333
## 7 6.7 3.0 5.0 1.7 versicolor 12.67333
## 8 6.7 3.1 4.7 1.5 versicolor 12.67333
## 9 7.1 3.0 5.9 2.1 virginica 12.67333
## 10 7.6 3.0 6.6 2.1 virginica 12.67333
## 11 7.3 2.9 6.3 1.8 virginica 12.67333
## 12 6.7 2.5 5.8 1.8 virginica 12.67333
## 13 7.2 3.6 6.1 2.5 virginica 12.67333
## 14 6.8 3.0 5.5 2.1 virginica 12.67333
## 15 7.7 3.8 6.7 2.2 virginica 12.67333
## 16 7.7 2.6 6.9 2.3 virginica 12.67333
## 17 6.9 3.2 5.7 2.3 virginica 12.67333
## 18 7.7 2.8 6.7 2.0 virginica 12.67333
## 19 6.7 3.3 5.7 2.1 virginica 12.67333
## 20 7.2 3.2 6.0 1.8 virginica 12.67333
## 21 7.2 3.0 5.8 1.6 virginica 12.67333
## 22 7.4 2.8 6.1 1.9 virginica 12.67333
## 23 7.9 3.8 6.4 2.0 virginica 12.67333
## 24 7.7 3.0 6.1 2.3 virginica 12.67333
## 25 6.9 3.1 5.4 2.1 virginica 12.67333
## 26 6.7 3.1 5.6 2.4 virginica 12.67333
## 27 6.9 3.1 5.1 2.3 virginica 12.67333
## 28 6.8 3.2 5.9 2.3 virginica 12.67333
## 29 6.7 3.3 5.7 2.5 virginica 12.67333
## 30 6.7 3.0 5.2 2.3 virginica 12.67333
#iris_ex %>%
# mutate(length = mean(Sepal.Length + Petal.Length))
arrange로 정렬하기: 오름차순, 내림차순, 두 개의 기준으로 순서 주기
arrange(iris_ex, Sepal.Length) # ascending order is the default
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.6 2.9 4.6 1.3 versicolor
## 2 6.6 3.0 4.4 1.4 versicolor
## 3 6.7 3.1 4.4 1.4 versicolor
## 4 6.7 3.0 5.0 1.7 versicolor
## 5 6.7 3.1 4.7 1.5 versicolor
## 6 6.7 2.5 5.8 1.8 virginica
## 7 6.7 3.3 5.7 2.1 virginica
## 8 6.7 3.1 5.6 2.4 virginica
## 9 6.7 3.3 5.7 2.5 virginica
## 10 6.7 3.0 5.2 2.3 virginica
## 11 6.8 2.8 4.8 1.4 versicolor
## 12 6.8 3.0 5.5 2.1 virginica
## 13 6.8 3.2 5.9 2.3 virginica
## 14 6.9 3.1 4.9 1.5 versicolor
## 15 6.9 3.2 5.7 2.3 virginica
## 16 6.9 3.1 5.4 2.1 virginica
## 17 6.9 3.1 5.1 2.3 virginica
## 18 7.0 3.2 4.7 1.4 versicolor
## 19 7.1 3.0 5.9 2.1 virginica
## 20 7.2 3.6 6.1 2.5 virginica
## 21 7.2 3.2 6.0 1.8 virginica
## 22 7.2 3.0 5.8 1.6 virginica
## 23 7.3 2.9 6.3 1.8 virginica
## 24 7.4 2.8 6.1 1.9 virginica
## 25 7.6 3.0 6.6 2.1 virginica
## 26 7.7 3.8 6.7 2.2 virginica
## 27 7.7 2.6 6.9 2.3 virginica
## 28 7.7 2.8 6.7 2.0 virginica
## 29 7.7 3.0 6.1 2.3 virginica
## 30 7.9 3.8 6.4 2.0 virginica
arrange(iris_ex, desc(Sepal.Length)) # desc() switches to descending order
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.9 3.8 6.4 2.0 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2.0 virginica
## 5 7.7 3.0 6.1 2.3 virginica
## 6 7.6 3.0 6.6 2.1 virginica
## 7 7.4 2.8 6.1 1.9 virginica
## 8 7.3 2.9 6.3 1.8 virginica
## 9 7.2 3.6 6.1 2.5 virginica
## 10 7.2 3.2 6.0 1.8 virginica
## 11 7.2 3.0 5.8 1.6 virginica
## 12 7.1 3.0 5.9 2.1 virginica
## 13 7.0 3.2 4.7 1.4 versicolor
## 14 6.9 3.1 4.9 1.5 versicolor
## 15 6.9 3.2 5.7 2.3 virginica
## 16 6.9 3.1 5.4 2.1 virginica
## 17 6.9 3.1 5.1 2.3 virginica
## 18 6.8 2.8 4.8 1.4 versicolor
## 19 6.8 3.0 5.5 2.1 virginica
## 20 6.8 3.2 5.9 2.3 virginica
## 21 6.7 3.1 4.4 1.4 versicolor
## 22 6.7 3.0 5.0 1.7 versicolor
## 23 6.7 3.1 4.7 1.5 versicolor
## 24 6.7 2.5 5.8 1.8 virginica
## 25 6.7 3.3 5.7 2.1 virginica
## 26 6.7 3.1 5.6 2.4 virginica
## 27 6.7 3.3 5.7 2.5 virginica
## 28 6.7 3.0 5.2 2.3 virginica
## 29 6.6 2.9 4.6 1.3 versicolor
## 30 6.6 3.0 4.4 1.4 versicolor
# Sort by the first key (Sepal.Length, descending); rows that tie on it are
# then ordered by the second key (Petal.Length, descending)
arrange(iris_ex, desc(Sepal.Length), desc(Petal.Length))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.9 3.8 6.4 2.0 virginica
## 2 7.7 2.6 6.9 2.3 virginica
## 3 7.7 3.8 6.7 2.2 virginica
## 4 7.7 2.8 6.7 2.0 virginica
## 5 7.7 3.0 6.1 2.3 virginica
## 6 7.6 3.0 6.6 2.1 virginica
## 7 7.4 2.8 6.1 1.9 virginica
## 8 7.3 2.9 6.3 1.8 virginica
## 9 7.2 3.6 6.1 2.5 virginica
## 10 7.2 3.2 6.0 1.8 virginica
## 11 7.2 3.0 5.8 1.6 virginica
## 12 7.1 3.0 5.9 2.1 virginica
## 13 7.0 3.2 4.7 1.4 versicolor
## 14 6.9 3.2 5.7 2.3 virginica
## 15 6.9 3.1 5.4 2.1 virginica
## 16 6.9 3.1 5.1 2.3 virginica
## 17 6.9 3.1 4.9 1.5 versicolor
## 18 6.8 3.2 5.9 2.3 virginica
## 19 6.8 3.0 5.5 2.1 virginica
## 20 6.8 2.8 4.8 1.4 versicolor
## 21 6.7 2.5 5.8 1.8 virginica
## 22 6.7 3.3 5.7 2.1 virginica
## 23 6.7 3.3 5.7 2.5 virginica
## 24 6.7 3.1 5.6 2.4 virginica
## 25 6.7 3.0 5.2 2.3 virginica
## 26 6.7 3.0 5.0 1.7 versicolor
## 27 6.7 3.1 4.7 1.5 versicolor
## 28 6.7 3.1 4.4 1.4 versicolor
## 29 6.6 2.9 4.6 1.3 versicolor
## 30 6.6 3.0 4.4 1.4 versicolor
#iris_ex %>%
# arrange(desc(Sepal.Length), desc(Petal.Length))
그룹별로 요약해 보기. 데이터셋에서 어떤 그룹을 특정하고, 그 그룹의 mean, max 등을 각 column별로 구해 보고 싶을 때.
# group_by + summarise: split iris_ex by Species, then produce one row per
# group with the mean Sepal.Length and the mean Petal.Length. The summaries
# are unnamed, so the result columns are labelled with the expression text.
summarise(group_by(iris_ex, Species), mean(Sepal.Length), mean(Petal.Length))
## Source: local data frame [2 x 3]
##
## Species mean(Sepal.Length) mean(Petal.Length)
## 1 versicolor 6.750000 4.687500
## 2 virginica 7.159091 5.963636
#iris_ex %>%
# group_by(Species) %>%
# summarise(mean(Sepal.Length), mean(Petal.Length))