02.Data_preprocessing

#파이프연산자
library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
data("diamonds")
#diamonds 데이터 프레임에서 head부분을 가져와 행과 열의 dimension을 알고싶다
diamonds %>% head %>% dim

## [1]  6 10

# data("diamonds")
#df<-head.diamonds
#dim.df??

#rename함수/컬럼이름변경?
diamonds1<-diamonds %>% rename(c=clarity,p=price)
head(diamonds1,3)

## # A tibble: 3 × 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31

#count()함수 
df<-diamonds %>% count()
class(df)

## [1] "tbl_df"     "tbl"        "data.frame"

count(diamonds,cut)

## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551

#select()함수 데이터셋에서 열을 추출하는 함수
df1<-diamonds %>% select(1:5)
head(df1)

## # A tibble: 6 × 5
##   carat cut       color clarity depth
##   <dbl> <ord>     <ord> <ord>   <dbl>
## 1  0.23 Ideal     E     SI2      61.5
## 2  0.21 Premium   E     SI1      59.8
## 3  0.23 Good      E     VS1      56.9
## 4  0.29 Premium   I     VS2      62.4
## 5  0.31 Good      J     SI2      63.3
## 6  0.24 Very Good J     VVS2     62.8

#특정열 제거시 -부호를 붙이면 제거가 된다.
df1<-diamonds %>% select(-depth)
head(df1)

## # A tibble: 6 × 9
##   carat cut       color clarity table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2        55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1        61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1        65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2        58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2        58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2       57   336  3.94  3.96  2.48

#slice() 행번호로 행을 필터링
diamonds %>% slice(1:5)

## # A tibble: 5 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good    J     SI2      63.3    58   335  4.34  4.35  2.75

diamonds %>% slice(-1)

## # A tibble: 53,939 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  3  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  4  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  5  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  7  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  9  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## 10  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
## # … with 53,929 more rows

#filter() 행에 조건을 부여해 특정 행을 추출할때 사용
diamonds %>% filter(cut=="Good")

## # A tibble: 4,906 × 10
##    carat cut   color clarity depth table price     x     y     z
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
##  2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
##  3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
##  4  0.3  Good  J     SI1      63.4    54   351  4.23  4.29  2.7 
##  5  0.3  Good  J     SI1      63.8    56   351  4.23  4.26  2.71
##  6  0.3  Good  I     SI2      63.3    56   351  4.26  4.3   2.71
##  7  0.23 Good  F     VS1      58.2    59   402  4.06  4.08  2.37
##  8  0.23 Good  E     VS1      64.1    59   402  3.83  3.85  2.46
##  9  0.31 Good  H     SI1      64      54   402  4.29  4.31  2.75
## 10  0.26 Good  D     VS2      65.2    56   403  3.99  4.02  2.61
## # … with 4,896 more rows

max(diamonds$price)

## [1] 18823

diamonds %>% filter(price==max(price))

## # A tibble: 1 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16

diamonds %>% filter(price==18823)

## # A tibble: 1 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16

diamonds %>% filter(cut!="Premium")

## # A tibble: 40,149 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  3  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  4  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  5  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  6  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  7  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  8  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
##  9  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
## 10  0.23 Ideal     J     VS1      62.8    56   340  3.93  3.9   2.46
## # … with 40,139 more rows

diamonds %>% filter(price>=1000)

## # A tibble: 39,441 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.7  Ideal     E     SI1      62.5    57  2757  5.7   5.72  3.57
##  2  0.86 Fair      E     SI2      55.1    69  2757  6.45  6.33  3.52
##  3  0.7  Ideal     G     VS2      61.6    56  2757  5.7   5.67  3.5 
##  4  0.71 Very Good E     VS2      62.4    57  2759  5.68  5.73  3.56
##  5  0.78 Very Good G     SI2      63.8    56  2759  5.81  5.85  3.72
##  6  0.7  Good      E     VS2      57.5    58  2759  5.85  5.9   3.38
##  7  0.7  Good      F     VS1      59.4    62  2759  5.71  5.76  3.4 
##  8  0.96 Fair      F     SI2      66.3    62  2759  6.27  5.95  4.07
##  9  0.73 Very Good E     SI1      61.6    59  2760  5.77  5.78  3.56
## 10  0.8  Premium   H     SI1      61.5    58  2760  5.97  5.93  3.66
## # … with 39,431 more rows

diamonds %>% filter(price!=1000)

## # A tibble: 53,915 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,905 more rows

diamonds %>% filter(price<=10000)

## # A tibble: 48,718 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 48,708 more rows

#and 와 or
diamonds %>% filter(price!=1000 & color=="E")

## # A tibble: 9,785 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  5  0.2  Premium   E     SI2      60.2    62   345  3.79  3.75  2.27
##  6  0.32 Premium   E     I1       60.9    58   345  4.38  4.42  2.68
##  7  0.23 Very Good E     VS2      63.8    55   352  3.85  3.92  2.48
##  8  0.23 Very Good E     VS1      60.7    59   402  3.97  4.01  2.42
##  9  0.23 Very Good E     VS1      59.5    58   402  4.01  4.06  2.4 
## 10  0.23 Good      E     VS1      64.1    59   402  3.83  3.85  2.46
## # … with 9,775 more rows

diamonds %>% filter(carat<1|carat>5)

## # A tibble: 34,881 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 34,871 more rows

#%in% 여러 값중에 하나라도 맞는 행들을 필터링
diamonds %>% filter(cut%in%c("ideal","Good"))

## # A tibble: 4,906 × 10
##    carat cut   color clarity depth table price     x     y     z
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
##  2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
##  3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
##  4  0.3  Good  J     SI1      63.4    54   351  4.23  4.29  2.7 
##  5  0.3  Good  J     SI1      63.8    56   351  4.23  4.26  2.71
##  6  0.3  Good  I     SI2      63.3    56   351  4.26  4.3   2.71
##  7  0.23 Good  F     VS1      58.2    59   402  4.06  4.08  2.37
##  8  0.23 Good  E     VS1      64.1    59   402  3.83  3.85  2.46
##  9  0.31 Good  H     SI1      64      54   402  4.29  4.31  2.75
## 10  0.26 Good  D     VS2      65.2    56   403  3.99  4.02  2.61
## # … with 4,896 more rows

  #%in%c()는 c()안에 있는 것을 모두 포함한다는 의미
#1단계 select() 열 추출, 2단계 filter() 행 추출
diamonds %>% select("carat","depth","price") %>% filter(depth==max(depth)|price==min(price))

## # A tibble: 4 × 3
##   carat depth price
##   <dbl> <dbl> <int>
## 1  0.23  61.5   326
## 2  0.21  59.8   326
## 3  0.5   79    2579
## 4  0.5   79    2579

#mutate() 파생변수 만들기 (derive?)
df2<-diamonds %>% mutate(Ratio=price/carat,Double=Ratio*2)
glimpse(df2)

## Rows: 53,940
## Columns: 12
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
## $ Ratio   <dbl> 1417.391, 1552.381, 1421.739, 1151.724, 1080.645, 1400.000, 14…
## $ Double  <dbl> 2834.783, 3104.762, 2843.478, 2303.448, 2161.290, 2800.000, 28…

class(df2)

## [1] "tbl_df"     "tbl"        "data.frame"

#groupby()/summarize() 집단별 통계량 구하기. 
diamonds %>% summarize(mean(price))

## # A tibble: 1 × 1
##   `mean(price)`
##           <dbl>
## 1         3933.

diamonds %>% summarize(AvgPrice=mean(price),
                       MedianPrice=median(price),
                       AvgCarat=mean(carat))

## # A tibble: 1 × 3
##   AvgPrice MedianPrice AvgCarat
##      <dbl>       <dbl>    <dbl>
## 1    3933.        2401    0.798

  #groupby
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price),SumCarat=sum(carat))

## # A tibble: 5 × 3
##   cut       AvgPrice SumCarat
##   <ord>        <dbl>    <dbl>
## 1 Fair         4359.    1684.
## 2 Good         3929.    4166.
## 3 Very Good    3982.    9743.
## 4 Premium      4584.   12301.
## 5 Ideal        3458.   15147.

diamonds %>% group_by(cut) %>% 
  summarize(n=n()) %>% 
  mutate(total=sum(n),pct=n/total*100)

## # A tibble: 5 × 4
##   cut           n total   pct
##   <ord>     <int> <int> <dbl>
## 1 Fair       1610 53940  2.98
## 2 Good       4906 53940  9.10
## 3 Very Good 12082 53940 22.4 
## 4 Premium   13791 53940 25.6 
## 5 Ideal     21551 53940 40.0

#mutate() + ifelse() 연속형 데이터 범주별 변수 생성
quantile(diamonds$price)

##       0%      25%      50%      75%     100% 
##   326.00   950.00  2401.00  5324.25 18823.00

diamonds1<-diamonds %>% mutate(price_class=ifelse(price>=5324.25,"best",
                                                  ifelse(price>=2401,"good", 
                                                  ifelse(price>=950,"normal","bad"))))
table(diamonds1$price_class)

## 
##    bad   best   good normal 
##  13483  13485  13496  13476

#arrange() 데이터 정렬하기 
#내림차순 정렬
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price)) %>% arrange(desc(AvgPrice))

## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Premium      4584.
## 2 Fair         4359.
## 3 Very Good    3982.
## 4 Good         3929.
## 5 Ideal        3458.

#오름차순 정렬
diamonds %>% group_by(cut) %>% summarize(AvgPrice=mean(price)) %>% arrange(AvgPrice)

## # A tibble: 5 × 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Ideal        3458.
## 2 Good         3929.
## 3 Very Good    3982.
## 4 Fair         4359.
## 5 Premium      4584.

#데이터 결합하기
ott1<-data.frame(id=c(1,2,3),car=c("bmw",'bmw',"bmw"),fe=c(20,22,24))
ott2<-data.frame(id=c(1,4,5),fe1=c(30,34,35))
ott1

##   id car fe
## 1  1 bmw 20
## 2  2 bmw 22
## 3  3 bmw 24

ott2

##   id fe1
## 1  1  30
## 2  4  34
## 3  5  35

#left_join= 지정한 변수와 데이터 셋1을 기준으로 데이터 셋2에 나머지 변수를 결합/ 못채우는 값은 NA처리 
left_join(ott1,ott2,by='id')

##   id car fe fe1
## 1  1 bmw 20  30
## 2  2 bmw 22  NA
## 3  3 bmw 24  NA

#inner_join= 데이터셋 1과 데이터셋2에서 기준으로 지정한 변수값이 동일 할때만 결합
inner_join(ott1,ott2,by='id')

##   id car fe fe1
## 1  1 bmw 20  30

#full_join= 데이터셋1과 데이터셋2에서 기준으로 지정한 변수값 전체를 결합
full_join(ott1,ott2,by='id')

##   id  car fe fe1
## 1  1  bmw 20  30
## 2  2  bmw 22  NA
## 3  3  bmw 24  NA
## 4  4 <NA> NA  34
## 5  5 <NA> NA  35

02.Data_preprocessing

cha byeong hyun

2022-12-22