이번에도 MASS 패키지 내 Cars93 데이터셋을 사용해보자.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(MASS)
## 
## 다음의 패키지를 부착합니다: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Keep only the first 10 columns of Cars93 and clean up their names
# in a single step.
Cars93 <- janitor::clean_names(Cars93[, 1:10])

1. One categorical variable

1-1-1. janitor::tabyl()

참고: (https://www.rdocumentation.org/packages/janitor/versions/2.1.0/topics/tabyl)

library(janitor)
## 
## 다음의 패키지를 부착합니다: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# One-way frequency table of manufacturer (count and proportion).
# show_na / show_missing_levels are spelled out with TRUE rather than T:
# T is an ordinary variable that can be reassigned, TRUE is a reserved word.
Cars93 %>%
  tabyl(
    manufacturer,
    show_na = TRUE,
    show_missing_levels = TRUE
  )
##   manufacturer n    percent
##          Acura 2 0.02150538
##           Audi 2 0.02150538
##            BMW 1 0.01075269
##          Buick 4 0.04301075
##       Cadillac 2 0.02150538
##      Chevrolet 8 0.08602151
##       Chrylser 1 0.01075269
##       Chrysler 2 0.02150538
##          Dodge 6 0.06451613
##          Eagle 2 0.02150538
##           Ford 8 0.08602151
##            Geo 2 0.02150538
##          Honda 3 0.03225806
##        Hyundai 4 0.04301075
##       Infiniti 1 0.01075269
##          Lexus 2 0.02150538
##        Lincoln 2 0.02150538
##          Mazda 5 0.05376344
##  Mercedes-Benz 2 0.02150538
##        Mercury 2 0.02150538
##     Mitsubishi 2 0.02150538
##         Nissan 4 0.04301075
##     Oldsmobile 4 0.04301075
##       Plymouth 1 0.01075269
##        Pontiac 5 0.05376344
##           Saab 1 0.01075269
##         Saturn 1 0.01075269
##         Subaru 3 0.03225806
##         Suzuki 1 0.01075269
##         Toyota 4 0.04301075
##     Volkswagen 4 0.04301075
##          Volvo 2 0.02150538

1-1-2. tabyl() + adorn_pct_formatting()

# One-way frequency table of manufacturer with the proportion column
# rendered as a percentage string with two decimals.
# TRUE is used instead of the reassignable shorthand T.
Cars93 %>%
  tabyl(manufacturer, show_na = TRUE) %>%
  adorn_pct_formatting(digits = 2)
##   manufacturer n percent
##          Acura 2   2.15%
##           Audi 2   2.15%
##            BMW 1   1.08%
##          Buick 4   4.30%
##       Cadillac 2   2.15%
##      Chevrolet 8   8.60%
##       Chrylser 1   1.08%
##       Chrysler 2   2.15%
##          Dodge 6   6.45%
##          Eagle 2   2.15%
##           Ford 8   8.60%
##            Geo 2   2.15%
##          Honda 3   3.23%
##        Hyundai 4   4.30%
##       Infiniti 1   1.08%
##          Lexus 2   2.15%
##        Lincoln 2   2.15%
##          Mazda 5   5.38%
##  Mercedes-Benz 2   2.15%
##        Mercury 2   2.15%
##     Mitsubishi 2   2.15%
##         Nissan 4   4.30%
##     Oldsmobile 4   4.30%
##       Plymouth 1   1.08%
##        Pontiac 5   5.38%
##           Saab 1   1.08%
##         Saturn 1   1.08%
##         Subaru 3   3.23%
##         Suzuki 1   1.08%
##         Toyota 4   4.30%
##     Volkswagen 4   4.30%
##          Volvo 2   2.15%

1-2-1. basic function: table

# Base-R frequency table of manufacturer.
# useNA = "ifany": show an NA category only if missing values are present.
table(Cars93[["manufacturer"]], useNA = "ifany")
## 
##         Acura          Audi           BMW         Buick      Cadillac 
##             2             2             1             4             2 
##     Chevrolet      Chrylser      Chrysler         Dodge         Eagle 
##             8             1             2             6             2 
##          Ford           Geo         Honda       Hyundai      Infiniti 
##             8             2             3             4             1 
##         Lexus       Lincoln         Mazda Mercedes-Benz       Mercury 
##             2             2             5             2             2 
##    Mitsubishi        Nissan    Oldsmobile      Plymouth       Pontiac 
##             2             4             4             1             5 
##          Saab        Saturn        Subaru        Suzuki        Toyota 
##             1             1             3             1             4 
##    Volkswagen         Volvo 
##             4             2

1-2-2. basic function: table + sum 나오게 보기

# Frequency table of manufacturer with a trailing "Sum" entry (total count).
table(Cars93$manufacturer, useNA = "ifany") %>%
  addmargins()
## 
##         Acura          Audi           BMW         Buick      Cadillac 
##             2             2             1             4             2 
##     Chevrolet      Chrylser      Chrysler         Dodge         Eagle 
##             8             1             2             6             2 
##          Ford           Geo         Honda       Hyundai      Infiniti 
##             8             2             3             4             1 
##         Lexus       Lincoln         Mazda Mercedes-Benz       Mercury 
##             2             2             5             2             2 
##    Mitsubishi        Nissan    Oldsmobile      Plymouth       Pontiac 
##             2             4             4             1             5 
##          Saab        Saturn        Subaru        Suzuki        Toyota 
##             1             1             3             1             4 
##    Volkswagen         Volvo           Sum 
##             4             2            93

1-2-3. basic function: table + sum 나오게 보기 + 비율로 보기

# Counts -> proportions -> append the "Sum" entry (which is 1).
table(Cars93$manufacturer, useNA = "ifany") %>%
  prop.table() %>%
  addmargins()
## 
##         Acura          Audi           BMW         Buick      Cadillac 
##    0.02150538    0.02150538    0.01075269    0.04301075    0.02150538 
##     Chevrolet      Chrylser      Chrysler         Dodge         Eagle 
##    0.08602151    0.01075269    0.02150538    0.06451613    0.02150538 
##          Ford           Geo         Honda       Hyundai      Infiniti 
##    0.08602151    0.02150538    0.03225806    0.04301075    0.01075269 
##         Lexus       Lincoln         Mazda Mercedes-Benz       Mercury 
##    0.02150538    0.02150538    0.05376344    0.02150538    0.02150538 
##    Mitsubishi        Nissan    Oldsmobile      Plymouth       Pontiac 
##    0.02150538    0.04301075    0.04301075    0.01075269    0.05376344 
##          Saab        Saturn        Subaru        Suzuki        Toyota 
##    0.01075269    0.01075269    0.03225806    0.01075269    0.04301075 
##    Volkswagen         Volvo           Sum 
##    0.04301075    0.02150538    1.00000000

1-2-4. basic function: table + sum 나오게 보기 + 백분율로 보기

# Same proportions table with a "Sum" entry, scaled to percentages.
100 * addmargins(prop.table(table(Cars93$manufacturer, useNA = "ifany")))
## 
##         Acura          Audi           BMW         Buick      Cadillac 
##      2.150538      2.150538      1.075269      4.301075      2.150538 
##     Chevrolet      Chrylser      Chrysler         Dodge         Eagle 
##      8.602151      1.075269      2.150538      6.451613      2.150538 
##          Ford           Geo         Honda       Hyundai      Infiniti 
##      8.602151      2.150538      3.225806      4.301075      1.075269 
##         Lexus       Lincoln         Mazda Mercedes-Benz       Mercury 
##      2.150538      2.150538      5.376344      2.150538      2.150538 
##    Mitsubishi        Nissan    Oldsmobile      Plymouth       Pontiac 
##      2.150538      4.301075      4.301075      1.075269      5.376344 
##          Saab        Saturn        Subaru        Suzuki        Toyota 
##      1.075269      1.075269      3.225806      1.075269      4.301075 
##    Volkswagen         Volvo           Sum 
##      4.301075      2.150538    100.000000

1-3. dplyr::count()

# Number of rows per manufacturer, returned as a data frame with column n.
count(Cars93, manufacturer)
##     manufacturer n
## 1          Acura 2
## 2           Audi 2
## 3            BMW 1
## 4          Buick 4
## 5       Cadillac 2
## 6      Chevrolet 8
## 7       Chrylser 1
## 8       Chrysler 2
## 9          Dodge 6
## 10         Eagle 2
## 11          Ford 8
## 12           Geo 2
## 13         Honda 3
## 14       Hyundai 4
## 15      Infiniti 1
## 16         Lexus 2
## 17       Lincoln 2
## 18         Mazda 5
## 19 Mercedes-Benz 2
## 20       Mercury 2
## 21    Mitsubishi 2
## 22        Nissan 4
## 23    Oldsmobile 4
## 24      Plymouth 1
## 25       Pontiac 5
## 26          Saab 1
## 27        Saturn 1
## 28        Subaru 3
## 29        Suzuki 1
## 30        Toyota 4
## 31    Volkswagen 4
## 32         Volvo 2

\(~\) \(~\)

2. Two categorical variables

2-1-1. janitor::tabyl(v1, v2)

# Two-way table of counts: manufacturer in rows, air_bags in columns.
# TRUE is used instead of the reassignable shorthand T.
Cars93 %>%
  tabyl(
    manufacturer, air_bags,
    show_na = TRUE,
    show_missing_levels = TRUE
  )
##   manufacturer Driver & Passenger Driver only None
##          Acura                  1           0    1
##           Audi                  1           1    0
##            BMW                  0           1    0
##          Buick                  0           4    0
##       Cadillac                  1           1    0
##      Chevrolet                  1           3    4
##       Chrylser                  1           0    0
##       Chrysler                  1           1    0
##          Dodge                  0           5    1
##          Eagle                  1           0    1
##           Ford                  0           5    3
##            Geo                  0           1    1
##          Honda                  2           1    0
##        Hyundai                  0           0    4
##       Infiniti                  0           1    0
##          Lexus                  1           1    0
##        Lincoln                  2           0    0
##          Mazda                  0           2    3
##  Mercedes-Benz                  1           1    0
##        Mercury                  0           1    1
##     Mitsubishi                  0           1    1
##         Nissan                  0           3    1
##     Oldsmobile                  0           2    2
##       Plymouth                  0           0    1
##        Pontiac                  2           0    3
##           Saab                  0           1    0
##         Saturn                  0           1    0
##         Subaru                  0           1    2
##         Suzuki                  0           0    1
##         Toyota                  0           4    0
##     Volkswagen                  0           0    4
##          Volvo                  1           1    0

2-1-2. janitor::tabyl(v1, v2) %>% adorn_pct_formatting(digits = 2) %>% adorn_ns()

# Two-way table shown as "row percentage (count)".
# A two-way tabyl holds raw counts, so adorn_percentages() has to convert
# them to proportions BEFORE adorn_pct_formatting(); without that step the
# counts themselves get formatted as percentages (e.g. 4 -> "400.00%").
# adorn_ns() then appends the original counts in parentheses.
Cars93 %>%
  tabyl(
    manufacturer, air_bags,
    show_na = TRUE,
    show_missing_levels = TRUE
  ) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()
##   manufacturer Driver & Passenger Driver only        None
##          Acura         50.00% (1)   0.00% (0)  50.00% (1)
##           Audi         50.00% (1)  50.00% (1)   0.00% (0)
##            BMW          0.00% (0) 100.00% (1)   0.00% (0)
##          Buick          0.00% (0) 100.00% (4)   0.00% (0)
##       Cadillac         50.00% (1)  50.00% (1)   0.00% (0)
##      Chevrolet         12.50% (1)  37.50% (3)  50.00% (4)
##       Chrylser        100.00% (1)   0.00% (0)   0.00% (0)
##       Chrysler         50.00% (1)  50.00% (1)   0.00% (0)
##          Dodge          0.00% (0)  83.33% (5)  16.67% (1)
##          Eagle         50.00% (1)   0.00% (0)  50.00% (1)
##           Ford          0.00% (0)  62.50% (5)  37.50% (3)
##            Geo          0.00% (0)  50.00% (1)  50.00% (1)
##          Honda         66.67% (2)  33.33% (1)   0.00% (0)
##        Hyundai          0.00% (0)   0.00% (0) 100.00% (4)
##       Infiniti          0.00% (0) 100.00% (1)   0.00% (0)
##          Lexus         50.00% (1)  50.00% (1)   0.00% (0)
##        Lincoln        100.00% (2)   0.00% (0)   0.00% (0)
##          Mazda          0.00% (0)  40.00% (2)  60.00% (3)
##  Mercedes-Benz         50.00% (1)  50.00% (1)   0.00% (0)
##        Mercury          0.00% (0)  50.00% (1)  50.00% (1)
##     Mitsubishi          0.00% (0)  50.00% (1)  50.00% (1)
##         Nissan          0.00% (0)  75.00% (3)  25.00% (1)
##     Oldsmobile          0.00% (0)  50.00% (2)  50.00% (2)
##       Plymouth          0.00% (0)   0.00% (0) 100.00% (1)
##        Pontiac         40.00% (2)   0.00% (0)  60.00% (3)
##           Saab          0.00% (0) 100.00% (1)   0.00% (0)
##         Saturn          0.00% (0) 100.00% (1)   0.00% (0)
##         Subaru          0.00% (0)  33.33% (1)  66.67% (2)
##         Suzuki          0.00% (0)   0.00% (0) 100.00% (1)
##         Toyota          0.00% (0) 100.00% (4)   0.00% (0)
##     Volkswagen          0.00% (0)   0.00% (0) 100.00% (4)
##          Volvo         50.00% (1)  50.00% (1)   0.00% (0)

2-2-1. 사례수 보기 table + sum 나오게 보기 + 백분율로 보기

row 변수를 먼저, col 변수를 나중에 쓰기

# Joint distribution of manufacturer x air_bags as percentages of all rows,
# with "Sum" margins added for both rows and columns.
100 * addmargins(
  prop.table(
    table(Cars93$manufacturer, Cars93$air_bags, useNA = "ifany")
  )
)
##                
##                 Driver & Passenger Driver only       None        Sum
##   Acura                   1.075269    0.000000   1.075269   2.150538
##   Audi                    1.075269    1.075269   0.000000   2.150538
##   BMW                     0.000000    1.075269   0.000000   1.075269
##   Buick                   0.000000    4.301075   0.000000   4.301075
##   Cadillac                1.075269    1.075269   0.000000   2.150538
##   Chevrolet               1.075269    3.225806   4.301075   8.602151
##   Chrylser                1.075269    0.000000   0.000000   1.075269
##   Chrysler                1.075269    1.075269   0.000000   2.150538
##   Dodge                   0.000000    5.376344   1.075269   6.451613
##   Eagle                   1.075269    0.000000   1.075269   2.150538
##   Ford                    0.000000    5.376344   3.225806   8.602151
##   Geo                     0.000000    1.075269   1.075269   2.150538
##   Honda                   2.150538    1.075269   0.000000   3.225806
##   Hyundai                 0.000000    0.000000   4.301075   4.301075
##   Infiniti                0.000000    1.075269   0.000000   1.075269
##   Lexus                   1.075269    1.075269   0.000000   2.150538
##   Lincoln                 2.150538    0.000000   0.000000   2.150538
##   Mazda                   0.000000    2.150538   3.225806   5.376344
##   Mercedes-Benz           1.075269    1.075269   0.000000   2.150538
##   Mercury                 0.000000    1.075269   1.075269   2.150538
##   Mitsubishi              0.000000    1.075269   1.075269   2.150538
##   Nissan                  0.000000    3.225806   1.075269   4.301075
##   Oldsmobile              0.000000    2.150538   2.150538   4.301075
##   Plymouth                0.000000    0.000000   1.075269   1.075269
##   Pontiac                 2.150538    0.000000   3.225806   5.376344
##   Saab                    0.000000    1.075269   0.000000   1.075269
##   Saturn                  0.000000    1.075269   0.000000   1.075269
##   Subaru                  0.000000    1.075269   2.150538   3.225806
##   Suzuki                  0.000000    0.000000   1.075269   1.075269
##   Toyota                  0.000000    4.301075   0.000000   4.301075
##   Volkswagen              0.000000    0.000000   4.301075   4.301075
##   Volvo                   1.075269    1.075269   0.000000   2.150538
##   Sum                    17.204301   46.236559  36.559140 100.000000

2-2-2. margin = 1: 각 행의 합이 1이 되도록

# Row proportions: prop.table(margin = 1) makes every row sum to 1.
# Only the "Sum" column is added (addmargins margin = 2). Summing the row
# proportions down each column would give numbers with no interpretation,
# so the bottom "Sum" row is intentionally not produced.
addmargins(
  prop.table(
    table(Cars93$manufacturer, Cars93$air_bags, useNA = "ifany"),
    margin = 1
  ),
  margin = 2
)
##                
##                 Driver & Passenger Driver only      None       Sum
##   Acura                  0.5000000   0.0000000 0.5000000 1.0000000
##   Audi                   0.5000000   0.5000000 0.0000000 1.0000000
##   BMW                    0.0000000   1.0000000 0.0000000 1.0000000
##   Buick                  0.0000000   1.0000000 0.0000000 1.0000000
##   Cadillac               0.5000000   0.5000000 0.0000000 1.0000000
##   Chevrolet              0.1250000   0.3750000 0.5000000 1.0000000
##   Chrylser               1.0000000   0.0000000 0.0000000 1.0000000
##   Chrysler               0.5000000   0.5000000 0.0000000 1.0000000
##   Dodge                  0.0000000   0.8333333 0.1666667 1.0000000
##   Eagle                  0.5000000   0.0000000 0.5000000 1.0000000
##   Ford                   0.0000000   0.6250000 0.3750000 1.0000000
##   Geo                    0.0000000   0.5000000 0.5000000 1.0000000
##   Honda                  0.6666667   0.3333333 0.0000000 1.0000000
##   Hyundai                0.0000000   0.0000000 1.0000000 1.0000000
##   Infiniti               0.0000000   1.0000000 0.0000000 1.0000000
##   Lexus                  0.5000000   0.5000000 0.0000000 1.0000000
##   Lincoln                1.0000000   0.0000000 0.0000000 1.0000000
##   Mazda                  0.0000000   0.4000000 0.6000000 1.0000000
##   Mercedes-Benz          0.5000000   0.5000000 0.0000000 1.0000000
##   Mercury                0.0000000   0.5000000 0.5000000 1.0000000
##   Mitsubishi             0.0000000   0.5000000 0.5000000 1.0000000
##   Nissan                 0.0000000   0.7500000 0.2500000 1.0000000
##   Oldsmobile             0.0000000   0.5000000 0.5000000 1.0000000
##   Plymouth               0.0000000   0.0000000 1.0000000 1.0000000
##   Pontiac                0.4000000   0.0000000 0.6000000 1.0000000
##   Saab                   0.0000000   1.0000000 0.0000000 1.0000000
##   Saturn                 0.0000000   1.0000000 0.0000000 1.0000000
##   Subaru                 0.0000000   0.3333333 0.6666667 1.0000000
##   Suzuki                 0.0000000   0.0000000 1.0000000 1.0000000
##   Toyota                 0.0000000   1.0000000 0.0000000 1.0000000
##   Volkswagen             0.0000000   0.0000000 1.0000000 1.0000000
##   Volvo                  0.5000000   0.5000000 0.0000000 1.0000000

2-2-3. margin = 2: 각 열의 합이 1이 되도록

# Column proportions: prop.table(margin = 2) makes every column sum to 1.
# Only the "Sum" row is added (addmargins margin = 1). Summing the column
# proportions across a row would give numbers with no interpretation,
# so the right-hand "Sum" column is intentionally not produced.
addmargins(
  prop.table(
    table(Cars93$manufacturer, Cars93$air_bags, useNA = "ifany"),
    margin = 2
  ),
  margin = 1
)
##                
##                 Driver & Passenger Driver only       None
##   Acura                 0.06250000  0.00000000 0.02941176
##   Audi                  0.06250000  0.02325581 0.00000000
##   BMW                   0.00000000  0.02325581 0.00000000
##   Buick                 0.00000000  0.09302326 0.00000000
##   Cadillac              0.06250000  0.02325581 0.00000000
##   Chevrolet             0.06250000  0.06976744 0.11764706
##   Chrylser              0.06250000  0.00000000 0.00000000
##   Chrysler              0.06250000  0.02325581 0.00000000
##   Dodge                 0.00000000  0.11627907 0.02941176
##   Eagle                 0.06250000  0.00000000 0.02941176
##   Ford                  0.00000000  0.11627907 0.08823529
##   Geo                   0.00000000  0.02325581 0.02941176
##   Honda                 0.12500000  0.02325581 0.00000000
##   Hyundai               0.00000000  0.00000000 0.11764706
##   Infiniti              0.00000000  0.02325581 0.00000000
##   Lexus                 0.06250000  0.02325581 0.00000000
##   Lincoln               0.12500000  0.00000000 0.00000000
##   Mazda                 0.00000000  0.04651163 0.08823529
##   Mercedes-Benz         0.06250000  0.02325581 0.00000000
##   Mercury               0.00000000  0.02325581 0.02941176
##   Mitsubishi            0.00000000  0.02325581 0.02941176
##   Nissan                0.00000000  0.06976744 0.02941176
##   Oldsmobile            0.00000000  0.04651163 0.05882353
##   Plymouth              0.00000000  0.00000000 0.02941176
##   Pontiac               0.12500000  0.00000000 0.08823529
##   Saab                  0.00000000  0.02325581 0.00000000
##   Saturn                0.00000000  0.02325581 0.00000000
##   Subaru                0.00000000  0.02325581 0.05882353
##   Suzuki                0.00000000  0.00000000 0.02941176
##   Toyota                0.00000000  0.09302326 0.00000000
##   Volkswagen            0.00000000  0.00000000 0.11764706
##   Volvo                 0.06250000  0.02325581 0.00000000
##   Sum                   1.00000000  1.00000000 1.00000000

\(~\)

3. One categorical & One continuous variable

3-1. tapply

3-1-1. mean(평균)

# Mean price within each air_bags group.
# Extra arguments to tapply() are forwarded to FUN, and useNA is an argument
# of table(), not of tapply(), so the stray useNA = "ifany" has been removed.
# (To keep an NA group, wrap the grouping factor in addNA() instead.)
tapply(
  Cars93$price,
  Cars93$air_bags,
  mean,
  na.rm = TRUE # drop missing prices before computing
)
## Driver & Passenger        Driver only               None 
##           28.36875           21.22326           13.17353

3-1-2. median(중위값)

# Median price within each air_bags group.
# Extra arguments to tapply() are forwarded to FUN, and useNA is an argument
# of table(), not of tapply(), so the stray useNA = "ifany" has been removed.
tapply(
  Cars93$price,
  Cars93$air_bags,
  median,
  na.rm = TRUE # drop missing prices before computing
)
## Driver & Passenger        Driver only               None 
##              25.55              19.90              11.90

3-1-3. range(범위)

# Minimum and maximum price within each air_bags group.
# tapply() forwards extra arguments to FUN, and range() treats everything it
# receives through `...` as data. A stray useNA = "ifany" was therefore
# mixed into the prices, turning the result into character strings
# (e.g. "11.1" "ifany"). Only na.rm is passed now, so the result is numeric.
tapply(
  Cars93$price,
  Cars93$air_bags,
  range,
  na.rm = TRUE # drop missing prices before computing
)
## $`Driver & Passenger`
## [1] 15.1 61.9
## 
## $`Driver only`
## [1]  9.8 47.9
## 
## $None
## [1]  7.4 23.3

3-1-4. quantile(사분위)

# Quantiles (0%, 25%, 50%, 75%, 100%) of price within each air_bags group.
# Extra arguments to tapply() are forwarded to FUN, and useNA is an argument
# of table(), not of tapply(), so the stray useNA = "ifany" has been removed.
tapply(
  Cars93$price,
  Cars93$air_bags,
  quantile,
  na.rm = TRUE # drop missing prices before computing
)
## $`Driver & Passenger`
##     0%    25%    50%    75%   100% 
## 15.100 18.225 25.550 35.425 61.900 
## 
## $`Driver only`
##    0%   25%   50%   75%  100% 
##  9.80 15.65 19.90 26.20 47.90 
## 
## $None
##   0%  25%  50%  75% 100% 
##  7.4  9.4 11.9 16.2 23.3

3-2. aggregate

양적변수 ~ 질적변수, 데이터, 보고싶은 값

# Mean price per air_bags level (formula: numeric ~ grouping variable).
aggregate(price ~ air_bags, data = Cars93, FUN = mean)
##             air_bags    price
## 1 Driver & Passenger 28.36875
## 2        Driver only 21.22326
## 3               None 13.17353
# Median price per air_bags level (formula: numeric ~ grouping variable).
aggregate(price ~ air_bags, data = Cars93, FUN = median)
##             air_bags price
## 1 Driver & Passenger 25.55
## 2        Driver only 19.90
## 3               None 11.90

3-3. group_by() + summarize()

# Per-air_bags summary of price: mean, standard deviation, total, max, min,
# group size, and the number of missing prices.
# TRUE is used instead of the reassignable shorthand T.
Cars93 %>%
  group_by(air_bags) %>%
  summarize(
    MEAN = mean(price, na.rm = TRUE),
    SD = sd(price, na.rm = TRUE),
    SUM = sum(price, na.rm = TRUE),
    MAX = max(price, na.rm = TRUE),
    MIN = min(price, na.rm = TRUE),
    N = n(),
    "NA" = sum(is.na(price)) # count of missing prices in the group
  )
## # A tibble: 3 x 8
##   air_bags            MEAN    SD   SUM   MAX   MIN     N  `NA`
##   <fct>              <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 Driver & Passenger  28.4 12.5   454.  61.9  15.1    16     0
## 2 Driver only         21.2  8.24  913.  47.9   9.8    43     0
## 3 None                13.2  4.36  448.  23.3   7.4    34     0