# R - EDA


이 문서는 Colony Studio의 데이터 정제 과정을 기록하기 위해 만든 문서입니다. 다루는 내용: dplyr, %>%, select, arrange, mutate, filter, group_by, summarise

import library

정제 문서를 실행하기 위한 라이브러리 불러오기

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union




setting

예제를 위한 데이터셋 구성 (iris에서 Sepal.Length가 6.5보다 큰 행만 추출)

# Build the example dataset used throughout this document: the rows of iris
# whose Sepal.Length is greater than 6.5 (filter() here is dplyr's, attached
# above by library(dplyr)).
iris_ex <- filter(iris, Sepal.Length > 6.5)




select

iris_ex라는 dataset에서 Sepal.Length, Species라는 열을 선택하여 출력

# select: keep only the Sepal.Length and Species columns of iris_ex.
# The result is not assigned, so it is printed.
select(iris_ex, Sepal.Length, Species)
##    Sepal.Length    Species
## 1           7.0 versicolor
## 2           6.9 versicolor
## 3           6.6 versicolor
## 4           6.7 versicolor
## 5           6.6 versicolor
## 6           6.8 versicolor
## 7           6.7 versicolor
## 8           6.7 versicolor
## 9           7.1  virginica
## 10          7.6  virginica
## 11          7.3  virginica
## 12          6.7  virginica
## 13          7.2  virginica
## 14          6.8  virginica
## 15          7.7  virginica
## 16          7.7  virginica
## 17          6.9  virginica
## 18          7.7  virginica
## 19          6.7  virginica
## 20          7.2  virginica
## 21          7.2  virginica
## 22          7.4  virginica
## 23          7.9  virginica
## 24          7.7  virginica
## 25          6.9  virginica
## 26          6.7  virginica
## 27          6.9  virginica
## 28          6.8  virginica
## 29          6.7  virginica
## 30          6.7  virginica
#iris_ex %>%
#  select(Sepal.Length, Species)




filter

filter로 간단하게 필터 적용하기 (Sepal.Length가 7보다 큰 행만 남긴다)

# filter: keep only the rows of iris_ex whose Sepal.Length is greater than 7.
filter(iris_ex, Sepal.Length > 7)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1           7.1         3.0          5.9         2.1 virginica
## 2           7.6         3.0          6.6         2.1 virginica
## 3           7.3         2.9          6.3         1.8 virginica
## 4           7.2         3.6          6.1         2.5 virginica
## 5           7.7         3.8          6.7         2.2 virginica
## 6           7.7         2.6          6.9         2.3 virginica
## 7           7.7         2.8          6.7         2.0 virginica
## 8           7.2         3.2          6.0         1.8 virginica
## 9           7.2         3.0          5.8         1.6 virginica
## 10          7.4         2.8          6.1         1.9 virginica
## 11          7.9         3.8          6.4         2.0 virginica
## 12          7.7         3.0          6.1         2.3 virginica
#iris_ex %>%
#  filter(Sepal.Length > 7)




mutate

mutate 활용하기: iris_ex 데이터셋에 Sepal.Length와 Petal.Length를 더한 값을 나타내는 length라는 column을 추가한다.

# mutate: append a column named length that holds, for each row, the sum of
# that row's Sepal.Length and Petal.Length. iris_ex itself is not modified
# because the result is not assigned back.
mutate(iris_ex, length = Sepal.Length + Petal.Length)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species length
## 1           7.0         3.2          4.7         1.4 versicolor   11.7
## 2           6.9         3.1          4.9         1.5 versicolor   11.8
## 3           6.6         2.9          4.6         1.3 versicolor   11.2
## 4           6.7         3.1          4.4         1.4 versicolor   11.1
## 5           6.6         3.0          4.4         1.4 versicolor   11.0
## 6           6.8         2.8          4.8         1.4 versicolor   11.6
## 7           6.7         3.0          5.0         1.7 versicolor   11.7
## 8           6.7         3.1          4.7         1.5 versicolor   11.4
## 9           7.1         3.0          5.9         2.1  virginica   13.0
## 10          7.6         3.0          6.6         2.1  virginica   14.2
## 11          7.3         2.9          6.3         1.8  virginica   13.6
## 12          6.7         2.5          5.8         1.8  virginica   12.5
## 13          7.2         3.6          6.1         2.5  virginica   13.3
## 14          6.8         3.0          5.5         2.1  virginica   12.3
## 15          7.7         3.8          6.7         2.2  virginica   14.4
## 16          7.7         2.6          6.9         2.3  virginica   14.6
## 17          6.9         3.2          5.7         2.3  virginica   12.6
## 18          7.7         2.8          6.7         2.0  virginica   14.4
## 19          6.7         3.3          5.7         2.1  virginica   12.4
## 20          7.2         3.2          6.0         1.8  virginica   13.2
## 21          7.2         3.0          5.8         1.6  virginica   13.0
## 22          7.4         2.8          6.1         1.9  virginica   13.5
## 23          7.9         3.8          6.4         2.0  virginica   14.3
## 24          7.7         3.0          6.1         2.3  virginica   13.8
## 25          6.9         3.1          5.4         2.1  virginica   12.3
## 26          6.7         3.1          5.6         2.4  virginica   12.3
## 27          6.9         3.1          5.1         2.3  virginica   12.0
## 28          6.8         3.2          5.9         2.3  virginica   12.7
## 29          6.7         3.3          5.7         2.5  virginica   12.4
## 30          6.7         3.0          5.2         2.3  virginica   11.9
#iris_ex %>%
#  mutate(length = Sepal.Length + Petal.Length)



mutate 응용

mutate로 mean 값을 구하고, 결과를 dataset의 column으로 추가하기. mean()은 값 하나만 반환하므로 모든 행에 같은 전체 평균이 들어간다.

# mutate with an aggregate: mean() reduces the per-row sums
# (Sepal.Length + Petal.Length) to a single number, and that one value is
# repeated on every row of the new length column, as the output below shows.
mutate(iris_ex, length = mean(Sepal.Length + Petal.Length))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species   length
## 1           7.0         3.2          4.7         1.4 versicolor 12.67333
## 2           6.9         3.1          4.9         1.5 versicolor 12.67333
## 3           6.6         2.9          4.6         1.3 versicolor 12.67333
## 4           6.7         3.1          4.4         1.4 versicolor 12.67333
## 5           6.6         3.0          4.4         1.4 versicolor 12.67333
## 6           6.8         2.8          4.8         1.4 versicolor 12.67333
## 7           6.7         3.0          5.0         1.7 versicolor 12.67333
## 8           6.7         3.1          4.7         1.5 versicolor 12.67333
## 9           7.1         3.0          5.9         2.1  virginica 12.67333
## 10          7.6         3.0          6.6         2.1  virginica 12.67333
## 11          7.3         2.9          6.3         1.8  virginica 12.67333
## 12          6.7         2.5          5.8         1.8  virginica 12.67333
## 13          7.2         3.6          6.1         2.5  virginica 12.67333
## 14          6.8         3.0          5.5         2.1  virginica 12.67333
## 15          7.7         3.8          6.7         2.2  virginica 12.67333
## 16          7.7         2.6          6.9         2.3  virginica 12.67333
## 17          6.9         3.2          5.7         2.3  virginica 12.67333
## 18          7.7         2.8          6.7         2.0  virginica 12.67333
## 19          6.7         3.3          5.7         2.1  virginica 12.67333
## 20          7.2         3.2          6.0         1.8  virginica 12.67333
## 21          7.2         3.0          5.8         1.6  virginica 12.67333
## 22          7.4         2.8          6.1         1.9  virginica 12.67333
## 23          7.9         3.8          6.4         2.0  virginica 12.67333
## 24          7.7         3.0          6.1         2.3  virginica 12.67333
## 25          6.9         3.1          5.4         2.1  virginica 12.67333
## 26          6.7         3.1          5.6         2.4  virginica 12.67333
## 27          6.9         3.1          5.1         2.3  virginica 12.67333
## 28          6.8         3.2          5.9         2.3  virginica 12.67333
## 29          6.7         3.3          5.7         2.5  virginica 12.67333
## 30          6.7         3.0          5.2         2.3  virginica 12.67333
#iris_ex %>%
#  mutate(length = mean(Sepal.Length + Petal.Length))




arrange

오름차순, 내림차순, 그리고 두 개의 기준으로 정렬하기

arrange(iris_ex, Sepal.Length) # ascending order is the default
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           6.6         2.9          4.6         1.3 versicolor
## 2           6.6         3.0          4.4         1.4 versicolor
## 3           6.7         3.1          4.4         1.4 versicolor
## 4           6.7         3.0          5.0         1.7 versicolor
## 5           6.7         3.1          4.7         1.5 versicolor
## 6           6.7         2.5          5.8         1.8  virginica
## 7           6.7         3.3          5.7         2.1  virginica
## 8           6.7         3.1          5.6         2.4  virginica
## 9           6.7         3.3          5.7         2.5  virginica
## 10          6.7         3.0          5.2         2.3  virginica
## 11          6.8         2.8          4.8         1.4 versicolor
## 12          6.8         3.0          5.5         2.1  virginica
## 13          6.8         3.2          5.9         2.3  virginica
## 14          6.9         3.1          4.9         1.5 versicolor
## 15          6.9         3.2          5.7         2.3  virginica
## 16          6.9         3.1          5.4         2.1  virginica
## 17          6.9         3.1          5.1         2.3  virginica
## 18          7.0         3.2          4.7         1.4 versicolor
## 19          7.1         3.0          5.9         2.1  virginica
## 20          7.2         3.6          6.1         2.5  virginica
## 21          7.2         3.2          6.0         1.8  virginica
## 22          7.2         3.0          5.8         1.6  virginica
## 23          7.3         2.9          6.3         1.8  virginica
## 24          7.4         2.8          6.1         1.9  virginica
## 25          7.6         3.0          6.6         2.1  virginica
## 26          7.7         3.8          6.7         2.2  virginica
## 27          7.7         2.6          6.9         2.3  virginica
## 28          7.7         2.8          6.7         2.0  virginica
## 29          7.7         3.0          6.1         2.3  virginica
## 30          7.9         3.8          6.4         2.0  virginica
arrange(iris_ex, desc(Sepal.Length)) # desc() switches the sort to descending order
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           7.9         3.8          6.4         2.0  virginica
## 2           7.7         3.8          6.7         2.2  virginica
## 3           7.7         2.6          6.9         2.3  virginica
## 4           7.7         2.8          6.7         2.0  virginica
## 5           7.7         3.0          6.1         2.3  virginica
## 6           7.6         3.0          6.6         2.1  virginica
## 7           7.4         2.8          6.1         1.9  virginica
## 8           7.3         2.9          6.3         1.8  virginica
## 9           7.2         3.6          6.1         2.5  virginica
## 10          7.2         3.2          6.0         1.8  virginica
## 11          7.2         3.0          5.8         1.6  virginica
## 12          7.1         3.0          5.9         2.1  virginica
## 13          7.0         3.2          4.7         1.4 versicolor
## 14          6.9         3.1          4.9         1.5 versicolor
## 15          6.9         3.2          5.7         2.3  virginica
## 16          6.9         3.1          5.4         2.1  virginica
## 17          6.9         3.1          5.1         2.3  virginica
## 18          6.8         2.8          4.8         1.4 versicolor
## 19          6.8         3.0          5.5         2.1  virginica
## 20          6.8         3.2          5.9         2.3  virginica
## 21          6.7         3.1          4.4         1.4 versicolor
## 22          6.7         3.0          5.0         1.7 versicolor
## 23          6.7         3.1          4.7         1.5 versicolor
## 24          6.7         2.5          5.8         1.8  virginica
## 25          6.7         3.3          5.7         2.1  virginica
## 26          6.7         3.1          5.6         2.4  virginica
## 27          6.7         3.3          5.7         2.5  virginica
## 28          6.7         3.0          5.2         2.3  virginica
## 29          6.6         2.9          4.6         1.3 versicolor
## 30          6.6         3.0          4.4         1.4 versicolor
arrange(iris_ex, desc(Sepal.Length), desc(Petal.Length)) # sort by the first key, then break ties among equal values using the second key
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           7.9         3.8          6.4         2.0  virginica
## 2           7.7         2.6          6.9         2.3  virginica
## 3           7.7         3.8          6.7         2.2  virginica
## 4           7.7         2.8          6.7         2.0  virginica
## 5           7.7         3.0          6.1         2.3  virginica
## 6           7.6         3.0          6.6         2.1  virginica
## 7           7.4         2.8          6.1         1.9  virginica
## 8           7.3         2.9          6.3         1.8  virginica
## 9           7.2         3.6          6.1         2.5  virginica
## 10          7.2         3.2          6.0         1.8  virginica
## 11          7.2         3.0          5.8         1.6  virginica
## 12          7.1         3.0          5.9         2.1  virginica
## 13          7.0         3.2          4.7         1.4 versicolor
## 14          6.9         3.2          5.7         2.3  virginica
## 15          6.9         3.1          5.4         2.1  virginica
## 16          6.9         3.1          5.1         2.3  virginica
## 17          6.9         3.1          4.9         1.5 versicolor
## 18          6.8         3.2          5.9         2.3  virginica
## 19          6.8         3.0          5.5         2.1  virginica
## 20          6.8         2.8          4.8         1.4 versicolor
## 21          6.7         2.5          5.8         1.8  virginica
## 22          6.7         3.3          5.7         2.1  virginica
## 23          6.7         3.3          5.7         2.5  virginica
## 24          6.7         3.1          5.6         2.4  virginica
## 25          6.7         3.0          5.2         2.3  virginica
## 26          6.7         3.0          5.0         1.7 versicolor
## 27          6.7         3.1          4.7         1.5 versicolor
## 28          6.7         3.1          4.4         1.4 versicolor
## 29          6.6         2.9          4.6         1.3 versicolor
## 30          6.6         3.0          4.4         1.4 versicolor
#iris_ex %>%
#  arrange(desc(Sepal.Length), desc(Petal.Length))




group_by

그룹별로 요약해보기. 데이터셋에서 어떤 그룹을 특정하고, 그 그룹의 mean, max 등을 각 column별로 구해보고 싶을 때 사용한다.

# group_by + summarise: split iris_ex by Species, then compute the mean
# Sepal.Length and mean Petal.Length within each group (one output row per
# Species). The summaries are not given names, so the result columns are
# labelled with the expression text, as seen in the output below.
summarise(group_by(iris_ex, Species), mean(Sepal.Length), mean(Petal.Length))
## Source: local data frame [2 x 3]
## 
##      Species mean(Sepal.Length) mean(Petal.Length)
## 1 versicolor           6.750000           4.687500
## 2  virginica           7.159091           5.963636
#iris_ex %>%
#  group_by(Species) %>%
#  summarise(mean(Sepal.Length), mean(Petal.Length))