bigdata_01

# Ctrl+Alt+I: RStudio keyboard shortcut that inserts a new R code chunk
library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
# Load the diamonds dataset explicitly from ggplot2 so its source package is
# unambiguous even if another attached package ships a same-named dataset.
data("diamonds", package = "ggplot2")
# Print a compact, column-wise overview: row/column counts, column types and
# the first few values of every column.
glimpse(diamonds)

## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

# 01 Pipe (%>%) operator
# head() returns the first six rows of the data.
head(diamonds)

## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

# tail() returns the last six rows of the data.
tail(diamonds)

## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.72 Premium   D     SI1      62.7    59  2757  5.69  5.73  3.58
## 2  0.72 Ideal     D     SI1      60.8    57  2757  5.75  5.76  3.5 
## 3  0.72 Good      D     SI1      63.1    55  2757  5.69  5.75  3.61
## 4  0.7  Very Good D     SI1      62.8    60  2757  5.66  5.68  3.56
## 5  0.86 Premium   H     SI2      61      58  2757  6.15  6.12  3.74
## 6  0.75 Ideal     D     SI2      62.2    55  2757  5.83  5.87  3.64

# The pipe passes its left-hand side as the first argument of the call on its
# right-hand side, so this is equivalent to head(diamonds). The call is
# written with parentheses, head() rather than a bare head, as the tidyverse
# style guide recommends.
diamonds %>% head()

## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

# Pipes chain left to right: take the first six rows, then report the
# dimensions of that result (6 rows, 10 columns).
diamonds %>% head() %>% dim()

## [1]  6 10

# 02 rename(): change variable (column) names
# Each argument has the form new_name = old_name; every other column is kept
# unchanged and in its original position.
df <- diamonds %>%
  rename(
    c = clarity,
    p = price
  )
# Show the first ten rows to confirm the new column names.
head(df, n = 10)

## # A tibble: 10 × 10
##    carat cut       color c     depth table     p     x     y     z
##    <dbl> <ord>     <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2    61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1    59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1    56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2    62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2    63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2   62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1   62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1    61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2    65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1    59.4    61   338  4     4.05  2.39

# 03 count(): frequency analysis
# Tabulate how many rows fall into each level of cut; the tally is returned
# in a column named n.
count(diamonds, cut)

## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551

# The same frequency table, written with the pipe.
diamonds %>% count(cut)

## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1610
## 2 Good       4906
## 3 Very Good 12082
## 4 Premium   13791
## 5 Ideal     21551

# Frequency of each level of color.
diamonds %>% count(color)

## # A tibble: 7 × 2
##   color     n
##   <ord> <int>
## 1 D      6775
## 2 E      9797
## 3 F      9542
## 4 G     11292
## 5 H      8304
## 6 I      5422
## 7 J      2808

# Frequency of each level of clarity.
diamonds %>% count(clarity)

## # A tibble: 8 × 2
##   clarity     n
##   <ord>   <int>
## 1 I1        741
## 2 SI2      9194
## 3 SI1     13065
## 4 VS2     12258
## 5 VS1      8171
## 6 VVS2     5066
## 7 VVS1     3655
## 8 IF       1790

# Inspect the class of what count() returns: a tibble, which is also a
# data.frame.
diamonds %>% count(cut) %>% class()

## [1] "tbl_df"     "tbl"        "data.frame"

# A data frame is a two-dimensional structure made up of rows and columns.
# Store the frequency table in a variable and check its class again.
count_result <- diamonds %>% count(cut)
class(count_result)

## [1] "tbl_df"     "tbl"        "data.frame"

bigdata_01_dplyr

seung hee

2023-09-17