#파이프연산자
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("diamonds")
#diamonds 데이터 프레임에서 head부분을 가져와 행과 열의 dimension을 알고싶다
diamonds %>% head %>% dim
## [1] 6 10
# data("diamonds")
#df<-head.diamonds
#dim.df??
#rename함수/컬럼이름변경?
diamonds1<-diamonds %>% rename(c=clarity,p=price)
head(diamonds1,3)
## # A tibble: 3 × 10
## carat cut color c depth table p x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
#count()함수
df<-diamonds %>% count()
class(df)
## [1] "tbl_df" "tbl" "data.frame"
count(diamonds,cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
#select()함수 데이터셋에서 열을 추출하는 함수
df1<-diamonds %>% select(1:5)
head(df1)
## # A tibble: 6 × 5
## carat cut color clarity depth
## <dbl> <ord> <ord> <ord> <dbl>
## 1 0.23 Ideal E SI2 61.5
## 2 0.21 Premium E SI1 59.8
## 3 0.23 Good E VS1 56.9
## 4 0.29 Premium I VS2 62.4
## 5 0.31 Good J SI2 63.3
## 6 0.24 Very Good J VVS2 62.8
#특정열 제거시 -부호를 붙이면 제거가 된다.
df1<-diamonds %>% select(-depth)
head(df1)
## # A tibble: 6 × 9
## carat cut color clarity table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 57 336 3.94 3.96 2.48
#slice() 행번호로 행을 필터링
diamonds %>% slice(1:5)
## # A tibble: 5 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
diamonds %>% slice(-1)
## # A tibble: 53,939 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 3 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 4 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 5 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 6 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 7 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 9 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## 10 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## # … with 53,929 more rows
#filter() 행에 조건을 부여해 특정 행을 추출할때 사용
diamonds %>% filter(cut=="Good")
## # A tibble: 4,906 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## 4 0.3 Good J SI1 63.4 54 351 4.23 4.29 2.7
## 5 0.3 Good J SI1 63.8 56 351 4.23 4.26 2.71
## 6 0.3 Good I SI2 63.3 56 351 4.26 4.3 2.71
## 7 0.23 Good F VS1 58.2 59 402 4.06 4.08 2.37
## 8 0.23 Good E VS1 64.1 59 402 3.83 3.85 2.46
## 9 0.31 Good H SI1 64 54 402 4.29 4.31 2.75
## 10 0.26 Good D VS2 65.2 56 403 3.99 4.02 2.61
## # … with 4,896 more rows
max(diamonds$price)
## [1] 18823
diamonds %>% filter(price==max(price))
## # A tibble: 1 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
diamonds %>% filter(price==18823)
## # A tibble: 1 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
diamonds %>% filter(cut!="Premium")
## # A tibble: 40,149 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 3 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 4 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 5 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 6 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 7 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 8 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## 9 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## 10 0.23 Ideal J VS1 62.8 56 340 3.93 3.9 2.46
## # … with 40,139 more rows
diamonds %>% filter(price>=1000)
## # A tibble: 39,441 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.7 Ideal E SI1 62.5 57 2757 5.7 5.72 3.57
## 2 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52
## 3 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5
## 4 0.71 Very Good E VS2 62.4 57 2759 5.68 5.73 3.56
## 5 0.78 Very Good G SI2 63.8 56 2759 5.81 5.85 3.72
## 6 0.7 Good E VS2 57.5 58 2759 5.85 5.9 3.38
## 7 0.7 Good F VS1 59.4 62 2759 5.71 5.76 3.4
## 8 0.96 Fair F SI2 66.3 62 2759 6.27 5.95 4.07
## 9 0.73 Very Good E SI1 61.6 59 2760 5.77 5.78 3.56
## 10 0.8 Premium H SI1 61.5 58 2760 5.97 5.93 3.66
## # … with 39,431 more rows
diamonds %>% filter(price!=1000)
## # A tibble: 53,915 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,905 more rows
diamonds %>% filter(price<=10000)
## # A tibble: 48,718 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 48,708 more rows
#and 와 or
diamonds %>% filter(price!=1000 & color=="E")
## # A tibble: 9,785 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 5 0.2 Premium E SI2 60.2 62 345 3.79 3.75 2.27
## 6 0.32 Premium E I1 60.9 58 345 4.38 4.42 2.68
## 7 0.23 Very Good E VS2 63.8 55 352 3.85 3.92 2.48
## 8 0.23 Very Good E VS1 60.7 59 402 3.97 4.01 2.42
## 9 0.23 Very Good E VS1 59.5 58 402 4.01 4.06 2.4
## 10 0.23 Good E VS1 64.1 59 402 3.83 3.85 2.46
## # … with 9,775 more rows
diamonds %>% filter(carat<1|carat>5)
## # A tibble: 34,881 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 34,871 more rows
#%in% 여러 값중에 하나라도 맞는 행들을 필터링
diamonds %>% filter(cut%in%c("ideal","Good"))
## # A tibble: 4,906 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 2 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## 4 0.3 Good J SI1 63.4 54 351 4.23 4.29 2.7
## 5 0.3 Good J SI1 63.8 56 351 4.23 4.26 2.71
## 6 0.3 Good I SI2 63.3 56 351 4.26 4.3 2.71
## 7 0.23 Good F VS1 58.2 59 402 4.06 4.08 2.37
## 8 0.23 Good E VS1 64.1 59 402 3.83 3.85 2.46
## 9 0.31 Good H SI1 64 54 402 4.29 4.31 2.75
## 10 0.26 Good D VS2 65.2 56 403 3.99 4.02 2.61
## # … with 4,896 more rows
#%in%c()는 c()안에 있는 것을 모두 포함한다는 의미
#1단계 select() 열 추출, 2단계 filter() 행 추출
diamonds %>% select("carat","depth","price") %>% filter(depth==max(depth)|price==min(price))
## # A tibble: 4 × 3
## carat depth price
## <dbl> <dbl> <int>
## 1 0.23 61.5 326
## 2 0.21 59.8 326
## 3 0.5 79 2579
## 4 0.5 79 2579
#mutate() 파생변수 만들기 (derive?)
df2<-diamonds %>% mutate(Ratio=price/carat,Double=Ratio*2)
glimpse(df2)
## Rows: 53,940
## Columns: 12
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
## $ Ratio <dbl> 1417.391, 1552.381, 1421.739, 1151.724, 1080.645, 1400.000, 14…
## $ Double <dbl> 2834.783, 3104.762, 2843.478, 2303.448, 2161.290, 2800.000, 28…
class(df2)
## [1] "tbl_df" "tbl" "data.frame"
#groupby()/summarize() 집단별 통계량 구하기.
diamonds %>% summarize(mean(price))
## # A tibble: 1 × 1
## `mean(price)`
## <dbl>
## 1 3933.
diamonds %>% summarize(AvgPrice=mean(price),
MedianPrice=median(price),
AvgCarat=mean(carat))
## # A tibble: 1 × 3
## AvgPrice MedianPrice AvgCarat
## <dbl> <dbl> <dbl>
## 1 3933. 2401 0.798
#groupby
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price),SumCarat=sum(carat))
## # A tibble: 5 × 3
## cut AvgPrice SumCarat
## <ord> <dbl> <dbl>
## 1 Fair 4359. 1684.
## 2 Good 3929. 4166.
## 3 Very Good 3982. 9743.
## 4 Premium 4584. 12301.
## 5 Ideal 3458. 15147.
diamonds %>% group_by(cut) %>%
summarize(n=n()) %>%
mutate(total=sum(n),pct=n/total*100)
## # A tibble: 5 × 4
## cut n total pct
## <ord> <int> <int> <dbl>
## 1 Fair 1610 53940 2.98
## 2 Good 4906 53940 9.10
## 3 Very Good 12082 53940 22.4
## 4 Premium 13791 53940 25.6
## 5 Ideal 21551 53940 40.0
#mutate() + ifelse() 연속형 데이터 범주별 변수 생성
quantile(diamonds$price)
## 0% 25% 50% 75% 100%
## 326.00 950.00 2401.00 5324.25 18823.00
diamonds1<-diamonds %>% mutate(price_class=ifelse(price>=5324.25,"best",
ifelse(price>=2401,"good",
ifelse(price>=950,"normal","bad"))))
table(diamonds1$price_class)
##
## bad best good normal
## 13483 13485 13496 13476
#arrange() 데이터 정렬하기
#내림차순 정렬
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price)) %>% arrange(desc(AvgPrice))
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Premium 4584.
## 2 Fair 4359.
## 3 Very Good 3982.
## 4 Good 3929.
## 5 Ideal 3458.
#오름차순 정렬
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price)) %>% arrange(AvgPrice)
## # A tibble: 5 × 2
## cut AvgPrice
## <ord> <dbl>
## 1 Ideal 3458.
## 2 Good 3929.
## 3 Very Good 3982.
## 4 Fair 4359.
## 5 Premium 4584.
#데이터 결합하기
ott1<-data.frame(id=c(1,2,3),car=c("bmw",'bmw',"bmw"),fe=c(20,22,24))
ott2<-data.frame(id=c(1,4,5),fe1=c(30,34,35))
ott1
## id car fe
## 1 1 bmw 20
## 2 2 bmw 22
## 3 3 bmw 24
ott2
## id fe1
## 1 1 30
## 2 4 34
## 3 5 35
#left_join= 지정한 변수와 데이터 셋1을 기준으로 데이터 셋2에 나머지 변수를 결합/ 못채우는 값은 NA처리
left_join(ott1,ott2,by='id')
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
#inner_join= 데이터셋 1과 데이터셋2에서 기준으로 지정한 변수값이 동일 할때만 결합
inner_join(ott1,ott2,by='id')
## id car fe fe1
## 1 1 bmw 20 30
#full_join= 데이터셋1과 데이터셋2에서 기준으로 지정한 변수값 전체를 결합
full_join(ott1,ott2,by='id')
## id car fe fe1
## 1 1 bmw 20 30
## 2 2 bmw 22 NA
## 3 3 bmw 24 NA
## 4 4 <NA> NA 34
## 5 5 <NA> NA 35