# List the objects currently bound in the global environment.
ls()
## character(0)
# NOTE: the former `rm(list = ls())` call was removed. It deletes every object
# in the caller's global environment when the script is sourced, yet does not
# detach packages or reset options, so it is not a real "fresh session".
# The listing above was already empty in the recorded run, so nothing is lost.
# Restart R instead when a clean session is required.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
# Load the `diamonds` example dataset into the workspace.
data(diamonds)
# Dimensions of the six-row preview returned by head() (not of the full data).
diamonds %>%
  head() %>%
  dim()
## [1]  6 10
# Per-column summary statistics of the full dataset.
diamonds %>% summary()
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 
# Renaming variables
# `clarity` becomes `c` and `price` becomes `p`; all other columns are kept.
diamonds1 <- rename(diamonds, c = clarity, p = price)
# Inspect the structure of the renamed tibble.
str(diamonds1)
## tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
##  $ carat: num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut  : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color: Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ c    : Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth: num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table: num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ p    : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x    : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y    : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z    : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Preview the first three rows of the renamed tibble.
diamonds1 %>% head(3)
## # A tibble: 3 x 10
##   carat cut     color c     depth table     p     x     y     z
##   <dbl> <ord>   <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2    61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1    59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1    56.9    65   327  4.05  4.07  2.31
# Frequency analysis
# Tally the rows per `cut` level, returned as a tibble.
diamonds %>% count(cut)
## # A tibble: 5 x 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551
# The same tally as a base R table.
table(diamonds[["cut"]])
## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Extracting columns
# Keep only `carat` and `price`.
df1 <- select(diamonds, carat, price)
df1 %>% head(3)
## # A tibble: 3 x 2
##   carat price
##   <dbl> <int>
## 1  0.23   326
## 2  0.21   326
## 3  0.23   327
# Removing unneeded columns
# Drop `carat` and `price`, keeping every other column.
df2 <- select(diamonds, -c(carat, price))
df2 %>% head(3)
## # A tibble: 3 x 8
##   cut     color clarity depth table     x     y     z
##   <ord>   <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ideal   E     SI2      61.5    55  3.95  3.98  2.43
## 2 Premium E     SI1      59.8    61  3.89  3.84  2.31
## 3 Good    E     VS1      56.9    65  4.05  4.07  2.31
# Extracting a subset of rows
# Rows 1 through 5, selected by position.
slice(diamonds, seq_len(5))
## # A tibble: 5 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good    J     SI2      63.3    58   335  4.34  4.35  2.75
# A negative position drops that row: everything except the first row.
slice(diamonds, -1)
## # A tibble: 53,939 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  2  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  3  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  4  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  5  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  6  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  7  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  9  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## 10  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73
## # ... with 53,929 more rows
# Assigning new coding numbers (placeholder: no code has been written for this section yet)

# Extracting rows by condition
# First three rows whose cut is exactly "Good".
diamonds %>%
  filter(cut == "Good") %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 2  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
## 3  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73
# Highest price in the dataset.
max(diamonds[["price"]])
## [1] 18823
# Rows whose price equals that maximum, computed inside filter().
diamonds %>%
  filter(price == max(price))
## # A tibble: 1 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
# The same lookup with the maximum written as a literal value.
diamonds %>%
  filter(price == 18823)
## # A tibble: 1 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  2.29 Premium I     VS2      60.8    60 18823   8.5  8.47  5.16
# First three rows whose cut is anything other than "Premium".
diamonds %>%
  filter(cut != "Premium") %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
# Price at or above 1000.
diamonds %>%
  filter(price >= 1000) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.7  Ideal E     SI1      62.5    57  2757  5.7   5.72  3.57
## 2  0.86 Fair  E     SI2      55.1    69  2757  6.45  6.33  3.52
## 3  0.7  Ideal G     VS2      61.6    56  2757  5.7   5.67  3.5
# Price different from 1000.
diamonds %>%
  filter(price != 1000) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
# Price exactly 1000.
diamonds %>%
  filter(price == 1000) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.38 Very Good E     VVS2     61.8    56  1000  4.66  4.68  2.88
## 2  0.39 Very Good F     VS1      57.1    61  1000  4.86  4.91  2.79
## 3  0.38 Very Good E     VS1      61.5    58  1000  4.64  4.69  2.87
# Price at or below 1000.
diamonds %>%
  filter(price <= 1000) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
# Several predicates passed to filter() must all hold (logical AND):
# price is not 1000 and cut is "Ideal".
diamonds %>%
  filter(price != 1000, cut == "Ideal") %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
## 3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
# The same two conditions, additionally requiring color "E".
diamonds %>%
  filter(price != 1000, cut == "Ideal", color == "E") %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.26 Ideal E     VVS2     62.9    58   554  4.02  4.06  2.54
## 3  0.7  Ideal E     SI1      62.5    57  2757  5.7   5.72  3.57
# Either condition may hold (logical OR): carat below 2 or above 5.
diamonds %>%
  filter(carat < 2 | carat > 5) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
# Use %in% to keep rows whose column value matches any one of several
# candidate values (here: cut is "Ideal" or "Good").
diamonds %>%
  filter(cut %in% c("Ideal", "Good")) %>%
  head(3)
## # A tibble: 3 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.23 Good  E     VS1      56.9    65   327  4.05  4.07  2.31
## 3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75
# Select columns first, then extract the rows that match a condition
# Keeps rows that have the maximum depth or the minimum price.
diamonds %>%
  select(carat, depth, price) %>%
  filter(depth == max(depth) | price == min(price))
## # A tibble: 4 x 3
##   carat depth price
##   <dbl> <dbl> <int>
## 1  0.23  61.5   326
## 2  0.21  59.8   326
## 3  0.5   79    2579
## 4  0.5   79    2579
# Creating derived variables
# `Ratio` is price per carat; `Double` reuses the just-created `Ratio`
# within the same mutate() call.
diamonds %>%
  mutate(
    Ratio = price / carat,
    Double = Ratio * 2
  ) %>%
  head(3)
## # A tibble: 3 x 12
##   carat cut     color clarity depth table price     x     y     z Ratio Double
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43 1417.  2835.
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31 1552.  3105.
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31 1422.  2843.
# Computing summary statistics by group
# Overall mean price; the expression is unnamed, so the result column is
# labelled `mean(price)`.
diamonds %>%
  summarise(mean(price))
## # A tibble: 1 x 1
##   `mean(price)`
##           <dbl>
## 1         3933.
# summarise() accepts aggregate functions such as mean, max, and median
diamonds %>%
  summarise(
    AvgPrice = mean(price),
    MedianPrice = median(price),
    AvgCarat = mean(carat)
  )
## # A tibble: 1 x 3
##   AvgPrice MedianPrice AvgCarat
##      <dbl>       <dbl>    <dbl>
## 1    3933.        2401    0.798
# Mean price and total carat for each cut level.
diamonds %>%
  group_by(cut) %>%
  summarise(
    AvgPrice = mean(price),
    SumCarat = sum(carat)
  )
## # A tibble: 5 x 3
##   cut       AvgPrice SumCarat
##   <ord>        <dbl>    <dbl>
## 1 Fair         4359.    1684.
## 2 Good         3929.    4166.
## 3 Very Good    3982.    9743.
## 4 Premium      4584.   12301.
## 5 Ideal        3458.   15147.
# Row count per cut level, the grand total, and each level's share in percent.
diamonds %>%
  group_by(cut) %>%
  summarise(n = n()) %>%
  mutate(
    total = sum(n),
    pct = n / total * 100
  )
## # A tibble: 5 x 4
##   cut           n total   pct
##   <ord>     <int> <int> <dbl>
## 1 Fair       1610 53940  2.98
## 2 Good       4906 53940  9.10
## 3 Very Good 12082 53940 22.4 
## 4 Premium   13791 53940 25.6 
## 5 Ideal     21551 53940 40.0
# Creating a categorical variable from continuous data
# Quartile break points of price. They are stored so that the classification
# below is derived from the data rather than from numbers copied by hand
# out of this printout.
price_breaks <- quantile(diamonds$price)
price_breaks
##       0%      25%      50%      75%     100% 
##   326.00   950.00  2401.00  5324.25 18823.00
# Classify each diamond by the price quartile it falls into. case_when()
# evaluates the conditions top to bottom and uses the first one that holds.
# The last condition is written out explicitly (instead of a catch-all) so
# that a missing price yields NA, matching the previous nested ifelse().
diamonds2 <- diamonds %>%
  mutate(
    price_class = case_when(
      price >= price_breaks[["75%"]] ~ "best",
      price >= price_breaks[["50%"]] ~ "good",
      price >= price_breaks[["25%"]] ~ "normal",
      price < price_breaks[["25%"]] ~ "bad"
    )
  )
# Number of diamonds in each price class.
table(diamonds2$price_class)
## 
##    bad   best   good normal 
##  13483  13485  13496  13476
# Sorting data
# Mean price per cut level, ordered from highest to lowest.
diamonds %>%
  group_by(cut) %>%
  summarise(AvgPrice = mean(price)) %>%
  arrange(desc(AvgPrice))
## # A tibble: 5 x 2
##   cut       AvgPrice
##   <ord>        <dbl>
## 1 Premium      4584.
## 2 Fair         4359.
## 3 Very Good    3982.
## 4 Good         3929.
## 5 Ideal        3458.
# Combining data