library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
여러 개의 변수 값들을 하나의 column에 넣으면 안 된다.
하나의 테이블 안에 여러개의 관측단위를 집어넣으면 안된다.
##Option #1
## name treatmenta treatmentb
## 1 John Smith NA 18
## 2 Jane Doe 4 1
## 3 Mary Johnson 6 7
##Option #2
## treatment John.Smith Jane.Doe Mary.Johnson
## 1 a NA 4 6
## 2 b 18 1 7
## data 1
#week city_A city_B city_C
# 1 14 18 23
# 2 15 21 24
# 3 12 25 23
# 4 13 17 25
# the variable temperature appears in three columns
## data 2
#week 1 2 3 4 ... <-열에 이름이 없음.
#city_A 14 15 12 13
#city_B 18 21 25 17
#city_C 23 24 23 25
#multiple variables appear in each column and multiple
#observation
#week city temperature
# 1 A 14
# 1 B 18
# 1 C 23
# 2 A 15
# 2 B 21
# 2 C 24
# ...
library(dslabs)
data("murders")
head(murders)
## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
각각의 행이 하나의 관측치를 말하고, 각각의 열이 다른 변수들을 나타낼때, murders라는 데이터셋은 tidy의 가장 좋은 한 종류라 할 수 있다.
# Add the murder rate per 100,000 residents as a new column.
murders <- mutate(murders, rate = total / population * 1e5)
head(murders)
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.824424
## 2 Alaska AK West 710231 19 2.675186
## 3 Arizona AZ West 6392017 232 3.629527
## 4 Arkansas AR South 2915918 93 3.189390
## 5 California CA West 37253956 1257 3.374138
## 6 Colorado CO West 5029196 65 1.292453
filter(murders,rate<=0.71)
## state abb region population total rate
## 1 Hawaii HI West 1360301 7 0.5145920
## 2 Iowa IA North Central 3046355 21 0.6893484
## 3 New Hampshire NH Northeast 1316470 5 0.3798036
## 4 North Dakota ND North Central 672591 4 0.5947151
## 5 Vermont VT Northeast 625741 2 0.3196211
rate가 0.71이하인 것을 보여줘~
filter(murders, population>12830632)
## state abb region population total rate
## 1 California CA West 37253956 1257 3.374138
## 2 Florida FL South 19687653 669 3.398069
## 3 New York NY Northeast 19378102 517 2.667960
## 4 Texas TX South 25145561 805 3.201360
population이 12830632를 초과하는 값만 보여줘!
filter(murders, state=="California")
## state abb region population total rate
## 1 California CA West 37253956 1257 3.374138
캘리포니아만 보여줘!
filter(murders, state!="California")
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.8244238
## 2 Alaska AK West 710231 19 2.6751860
## 3 Arizona AZ West 6392017 232 3.6295273
## 4 Arkansas AR South 2915918 93 3.1893901
## 5 Colorado CO West 5029196 65 1.2924531
## 6 Connecticut CT Northeast 3574097 97 2.7139722
## 7 Delaware DE South 897934 38 4.2319369
## 8 District of Columbia DC South 601723 99 16.4527532
## 9 Florida FL South 19687653 669 3.3980688
## 10 Georgia GA South 9920000 376 3.7903226
## 11 Hawaii HI West 1360301 7 0.5145920
## 12 Idaho ID West 1567582 12 0.7655102
## 13 Illinois IL North Central 12830632 364 2.8369608
## 14 Indiana IN North Central 6483802 142 2.1900730
## 15 Iowa IA North Central 3046355 21 0.6893484
## 16 Kansas KS North Central 2853118 63 2.2081106
## 17 Kentucky KY South 4339367 116 2.6732010
## 18 Louisiana LA South 4533372 351 7.7425810
## 19 Maine ME Northeast 1328361 11 0.8280881
## 20 Maryland MD South 5773552 293 5.0748655
## 21 Massachusetts MA Northeast 6547629 118 1.8021791
## 22 Michigan MI North Central 9883640 413 4.1786225
## 23 Minnesota MN North Central 5303925 53 0.9992600
## 24 Mississippi MS South 2967297 120 4.0440846
## 25 Missouri MO North Central 5988927 321 5.3598917
## 26 Montana MT West 989415 12 1.2128379
## 27 Nebraska NE North Central 1826341 32 1.7521372
## 28 Nevada NV West 2700551 84 3.1104763
## 29 New Hampshire NH Northeast 1316470 5 0.3798036
## 30 New Jersey NJ Northeast 8791894 246 2.7980319
## 31 New Mexico NM West 2059179 67 3.2537239
## 32 New York NY Northeast 19378102 517 2.6679599
## 33 North Carolina NC South 9535483 286 2.9993237
## 34 North Dakota ND North Central 672591 4 0.5947151
## 35 Ohio OH North Central 11536504 310 2.6871225
## 36 Oklahoma OK South 3751351 111 2.9589340
## 37 Oregon OR West 3831074 36 0.9396843
## 38 Pennsylvania PA Northeast 12702379 457 3.5977513
## 39 Rhode Island RI Northeast 1052567 16 1.5200933
## 40 South Carolina SC South 4625364 207 4.4753235
## 41 South Dakota SD North Central 814180 8 0.9825837
## 42 Tennessee TN South 6346105 219 3.4509357
## 43 Texas TX South 25145561 805 3.2013603
## 44 Utah UT West 2763885 22 0.7959810
## 45 Vermont VT Northeast 625741 2 0.3196211
## 46 Virginia VA South 8001024 250 3.1246001
## 47 Washington WA West 6724540 93 1.3829942
## 48 West Virginia WV South 1852994 27 1.4571013
## 49 Wisconsin WI North Central 5686986 97 1.7056487
## 50 Wyoming WY West 563626 5 0.8871131
캘리포니아만 빼고 보여줘!
filter(murders, state %in% c("Alabama", "Arkansas", "Delaware"))
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.824424
## 2 Arkansas AR South 2915918 93 3.189390
## 3 Delaware DE South 897934 38 4.231937
state에서 “Alabama”, “Arkansas”, “Delaware”를 보여줘!
filter(murders, region %in% c("South"))
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.824424
## 2 Arkansas AR South 2915918 93 3.189390
## 3 Delaware DE South 897934 38 4.231937
## 4 District of Columbia DC South 601723 99 16.452753
## 5 Florida FL South 19687653 669 3.398069
## 6 Georgia GA South 9920000 376 3.790323
## 7 Kentucky KY South 4339367 116 2.673201
## 8 Louisiana LA South 4533372 351 7.742581
## 9 Maryland MD South 5773552 293 5.074866
## 10 Mississippi MS South 2967297 120 4.044085
## 11 North Carolina NC South 9535483 286 2.999324
## 12 Oklahoma OK South 3751351 111 2.958934
## 13 South Carolina SC South 4625364 207 4.475323
## 14 Tennessee TN South 6346105 219 3.450936
## 15 Texas TX South 25145561 805 3.201360
## 16 Virginia VA South 8001024 250 3.124600
## 17 West Virginia WV South 1852994 27 1.457101
filter(murders, population<5000000 & region =="Northeast" & total>80)
## state abb region population total rate
## 1 Connecticut CT Northeast 3574097 97 2.713972
filter(murders, population<5000000 | region =="Northeast")
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.8244238
## 2 Alaska AK West 710231 19 2.6751860
## 3 Arkansas AR South 2915918 93 3.1893901
## 4 Connecticut CT Northeast 3574097 97 2.7139722
## 5 Delaware DE South 897934 38 4.2319369
## 6 District of Columbia DC South 601723 99 16.4527532
## 7 Hawaii HI West 1360301 7 0.5145920
## 8 Idaho ID West 1567582 12 0.7655102
## 9 Iowa IA North Central 3046355 21 0.6893484
## 10 Kansas KS North Central 2853118 63 2.2081106
## 11 Kentucky KY South 4339367 116 2.6732010
## 12 Louisiana LA South 4533372 351 7.7425810
## 13 Maine ME Northeast 1328361 11 0.8280881
## 14 Massachusetts MA Northeast 6547629 118 1.8021791
## 15 Mississippi MS South 2967297 120 4.0440846
## 16 Montana MT West 989415 12 1.2128379
## 17 Nebraska NE North Central 1826341 32 1.7521372
## 18 Nevada NV West 2700551 84 3.1104763
## 19 New Hampshire NH Northeast 1316470 5 0.3798036
## 20 New Jersey NJ Northeast 8791894 246 2.7980319
## 21 New Mexico NM West 2059179 67 3.2537239
## 22 New York NY Northeast 19378102 517 2.6679599
## 23 North Dakota ND North Central 672591 4 0.5947151
## 24 Oklahoma OK South 3751351 111 2.9589340
## 25 Oregon OR West 3831074 36 0.9396843
## 26 Pennsylvania PA Northeast 12702379 457 3.5977513
## 27 Rhode Island RI Northeast 1052567 16 1.5200933
## 28 South Carolina SC South 4625364 207 4.4753235
## 29 South Dakota SD North Central 814180 8 0.9825837
## 30 Utah UT West 2763885 22 0.7959810
## 31 Vermont VT Northeast 625741 2 0.3196211
## 32 West Virginia WV South 1852994 27 1.4571013
## 33 Wyoming WY West 563626 5 0.8871131
&(and)나 |(or)로 조건을 이어붙일 수도 있다.
new_table<-select(murders, state, region, rate)
new_table
## state region rate
## 1 Alabama South 2.8244238
## 2 Alaska West 2.6751860
## 3 Arizona West 3.6295273
## 4 Arkansas South 3.1893901
## 5 California West 3.3741383
## 6 Colorado West 1.2924531
## 7 Connecticut Northeast 2.7139722
## 8 Delaware South 4.2319369
## 9 District of Columbia South 16.4527532
## 10 Florida South 3.3980688
## 11 Georgia South 3.7903226
## 12 Hawaii West 0.5145920
## 13 Idaho West 0.7655102
## 14 Illinois North Central 2.8369608
## 15 Indiana North Central 2.1900730
## 16 Iowa North Central 0.6893484
## 17 Kansas North Central 2.2081106
## 18 Kentucky South 2.6732010
## 19 Louisiana South 7.7425810
## 20 Maine Northeast 0.8280881
## 21 Maryland South 5.0748655
## 22 Massachusetts Northeast 1.8021791
## 23 Michigan North Central 4.1786225
## 24 Minnesota North Central 0.9992600
## 25 Mississippi South 4.0440846
## 26 Missouri North Central 5.3598917
## 27 Montana West 1.2128379
## 28 Nebraska North Central 1.7521372
## 29 Nevada West 3.1104763
## 30 New Hampshire Northeast 0.3798036
## 31 New Jersey Northeast 2.7980319
## 32 New Mexico West 3.2537239
## 33 New York Northeast 2.6679599
## 34 North Carolina South 2.9993237
## 35 North Dakota North Central 0.5947151
## 36 Ohio North Central 2.6871225
## 37 Oklahoma South 2.9589340
## 38 Oregon West 0.9396843
## 39 Pennsylvania Northeast 3.5977513
## 40 Rhode Island Northeast 1.5200933
## 41 South Carolina South 4.4753235
## 42 South Dakota North Central 0.9825837
## 43 Tennessee South 3.4509357
## 44 Texas South 3.2013603
## 45 Utah West 0.7959810
## 46 Vermont Northeast 0.3196211
## 47 Virginia South 3.1246001
## 48 Washington West 1.3829942
## 49 West Virginia South 1.4571013
## 50 Wisconsin North Central 1.7056487
## 51 Wyoming West 0.8871131
파이프 연산자 %>% 는 RStudio에서 Ctrl+Shift+M 단축키로 입력할 수 있다.
연산이 여러개가 등장할때 굉장히 유용함
연산을 하고싶은 순서대로 코드를 짤 수 있음
# operation3(operation2(operation1(x)))
이런걸 하는 시간을 줄이자!
x<-c(1,2,3,4)
sqrt(x)
## [1] 1.000000 1.414214 1.732051 2.000000
sum(sqrt(x))
## [1] 6.146264
sqrt(sum(sqrt(x)))
## [1] 2.479166
마지막 생각이 코드의 첫 부분에 있음.생각의 흐름대로 코드가 짜여지지 않음.
x %>% sqrt() %>% sum() %>% sqrt()
## [1] 2.479166
x %>%
sqrt() %>%
sum() %>%
sqrt()
## [1] 2.479166
생각의 흐름대로 코드가 짜여지고 있음.
murders %>%
select(state, region, rate) %>%
filter(rate<=0.71) %>%
filter(region == "West")
## state region rate
## 1 Hawaii West 0.514592
murders 데이터에서,
변수 state, region, rate를 선택한 다음,
rate가 0.71 이하인 값을 추출하고,
region이 West인 값을 추출하라.
16 %>% sqrt()
## [1] 4
16을
제곱근 연산 하라.
sqrt(16)
## [1] 4
제곱근 연산하라
16을
생각의 흐름대로 가지 않는다
16 %>% sqrt() %>% log2()
## [1] 2
16을
제곱근 연산하고
log2연산하라
log2(sqrt(16))
## [1] 2
log2연산하라
제곱근 연산하라
16을
library(dplyr)
library(dslabs)
data(heights)
head(heights,10)
## sex height
## 1 Male 75
## 2 Male 70
## 3 Male 68
## 4 Male 74
## 5 Male 61
## 6 Female 65
## 7 Female 66
## 8 Female 62
## 9 Female 66
## 10 Male 67
str(heights)
## 'data.frame': 1050 obs. of 2 variables:
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 1 1 2 ...
## $ height: num 75 70 68 74 61 65 66 62 66 67 ...
# Mean and standard deviation of height for the rows where sex is "Female".
f <- heights %>%
  filter(sex == "Female") %>%
  summarize(average = mean(height), standard_deviation = sd(height))
f
## average standard_deviation
## 1 64.93942 3.760656
# Mean and standard deviation of height for the rows where sex is "Male".
m <- heights %>%
  filter(sex == "Male") %>%
  summarize(average = mean(height), standard_deviation = sd(height))
m
## average standard_deviation
## 1 69.31475 3.611024
murders<-murders %>% mutate(rate=total/population*100000)
head(murders)
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.824424
## 2 Alaska AK West 710231 19 2.675186
## 3 Arizona AZ West 6392017 232 3.629527
## 4 Arkansas AR South 2915918 93 3.189390
## 5 California CA West 37253956 1257 3.374138
## 6 Colorado CO West 5029196 65 1.292453
murders data에 rate라는 변수를 mutate함수를 통해 추가한다.
summarize(murders, mean(rate))
## mean(rate)
## 1 2.779125
이건 rate 열의 평균이지, (즉, 각각의 state별 rate를 구하고 그 후, 평균을 낸 것.),US의 전체 평균이 아니다. 즉, 모든 total 값을 다 더한다음에, 모든 population 값으로 나눈 것이 아니다.
us_murders_rate<-murders %>%
summarize(rate=sum(total)/ sum(population)*100000)
us_murders_rate
## rate
## 1 3.034555
이렇게 total에 대해서 먼저 sum을 하고, population에 대해서 sum을 한 후에 연산을 해줘야 us의 전체의 평균이라고 할 수 있다.
같은 변수에 대해서 median, minimum, maximum을 모두 알고 싶을때, 어떻게 해야 할까?
heights %>%
filter(sex=="Female") %>%
summarize(median_min_max=quantile(height, c(0.5, 0, 1)))
## median_min_max
## 1 64.98031
## 2 51.00000
## 3 79.00000
?quantile
## starting httpd help server ... done
이렇게 quantile함수를 써서 한번에 확인할 수 있다. 이 때, quantile함수에서 0.5는 중앙값(median), 0은 최소값, 1은 최대값을 의미한다.
#' Summarise a numeric vector by its median, minimum and maximum
#'
#' Wraps `quantile()` so that the three statistics are returned as separate
#' columns of a one-row data frame instead of a length-3 vector.
#'
#' @param x A numeric vector.
#' @param na.rm Forwarded to `quantile()`. Defaults to `FALSE`, which is also
#'   the default of `quantile()`, so existing calls behave as before.
#' @return A one-row data.frame with columns `median`, `minimum`, `maximum`.
median_min_max <- function(x, na.rm = FALSE) {
  # Probabilities 0.5, 0 and 1 correspond to the median, minimum and maximum.
  # unname() drops the "50%"/"0%"/"100%" labels; otherwise the first label
  # would become the row name of the returned data frame.
  qs <- unname(quantile(x, probs = c(0.5, 0, 1), na.rm = na.rm))
  data.frame(median = qs[1], minimum = qs[2], maximum = qs[3])
}
하지만, 이 값들을 다른 column에서 각각 나오게 하는 방법은 없을까?
반복되는 연산이 지속될 경우, Function을 지정해 사용한다.
heights %>%
filter(sex == "Female") %>%
summarize(median_min_max((height)))
## median minimum maximum
## 1 64.98031 51 79
그 후, summarize를 사용하여 구하면, 한번에 값이 나오는 것을 확인할 수 있다.
heights %>% group_by(sex)
## # A tibble: 1,050 x 2
## # Groups: sex [2]
## sex height
## <fct> <dbl>
## 1 Male 75
## 2 Male 70
## 3 Male 68
## 4 Male 74
## 5 Male 61
## 6 Female 65
## 7 Female 66
## 8 Female 62
## 9 Female 66
## 10 Male 67
## # ... with 1,040 more rows
보기엔 똑같은데, Groups : sex[2]에서, 성별을 기준으로 구분을 지어놓았다는 것을 알 수 있다.
즉, 데이터의 모양은 변하지 않았으나, 그룹이 구분이 지어졌다는 것이다.
heights %>%
group_by(sex) %>% # 그룹에 따라서
summarize ( average = mean(height),
standard_deviation = sd(height),
max = max(height),
min = min(height),
median = median(height))
## # A tibble: 2 x 6
## sex average standard_deviation max min median
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 64.9 3.76 79 51 65.0
## 2 Male 69.3 3.61 82.7 50 69
heights데이터에 대해서
sex로 구분을 지어라
그리고 요약하라
average에 평균을 저장하고,
standard_deviation에 표준편차를 저장하고,
max에 최대값을 저장하고,
min에 최소값을 저장하고,
median에 중앙값을 저장하라.
murders %>%
group_by(region, state) %>%
summarize(median_min_max(rate))
## `summarise()` has grouped output by 'region'. You can override using the `.groups` argument.
## # A tibble: 51 x 5
## # Groups: region [4]
## region state median minimum maximum
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 Northeast Connecticut 2.71 2.71 2.71
## 2 Northeast Maine 0.828 0.828 0.828
## 3 Northeast Massachusetts 1.80 1.80 1.80
## 4 Northeast New Hampshire 0.380 0.380 0.380
## 5 Northeast New Jersey 2.80 2.80 2.80
## 6 Northeast New York 2.67 2.67 2.67
## 7 Northeast Pennsylvania 3.60 3.60 3.60
## 8 Northeast Rhode Island 1.52 1.52 1.52
## 9 Northeast Vermont 0.320 0.320 0.320
## 10 South Alabama 2.82 2.82 2.82
## # ... with 41 more rows
group_by안에 여러개의 변수 또한 지정 가능하다. 첫번째 변수로 분류해준 후, 두번째 변수로 다시 분류해준다. (위 예시에서는 state마다 행이 하나뿐이므로 median, minimum, maximum이 모두 같은 값으로 나온다.)
# Add a synthetic `school` label: the first 500 rows get "A" and the
# remaining 550 rows get "B" (500 + 550 = the 1,050 rows of `heights`).
height2 <- heights %>%
  mutate(school = rep(c("A", "B"), times = c(500, 550)))
head(height2)
## sex height school
## 1 Male 75 A
## 2 Male 70 A
## 3 Male 68 A
## 4 Male 74 A
## 5 Male 61 A
## 6 Female 65 A
height2 %>%
group_by(sex, school) %>% # 그룹에 따라서
summarize ( average = mean(height),
standard_deviation = sd(height),
max = max(height),
min = min(height),
median = median(height))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
## # A tibble: 4 x 7
## # Groups: sex [2]
## sex school average standard_deviation max min median
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female A 64.9 3.62 78.7 51 65.0
## 2 Female B 64.9 3.92 79 52 65
## 3 Male A 69.5 3.45 80 53.8 70
## 4 Male B 69.1 3.74 82.7 50 69
학교 변수를 추가하고, 값을 구했더니, 4개의 sector로 구분되었다는 것을 알 수 있다.
class(us_murders_rate)
## [1] "data.frame"
숫자가 하나 저장된 data.frame의 형태로 되어있다. 이 데이터 프레임을 없애고, 하나의 값으로 추출하고 싶다.
us_murders_rate[1,1]
## [1] 3.034555
이런 방식으로 행과 열을 지정해서 값을 뽑아도 되지만,
us_murders_rate %>% pull(rate)
## [1] 3.034555
직관적으로 us_murders_rate에서 rate열에 있는 값을 뽑아낼때는 이렇게 사용하면 된다.
us_murders_rate<-murders %>%
summarize(rate=sum(total)/ sum(population)*100000) %>%
pull(rate)
us_murders_rate
## [1] 3.034555
class(us_murders_rate)
## [1] "numeric"
이 때, pull()을 거치면 결과가 data.frame이 아니라 numeric 벡터(값 하나)로 나온다는 것을 알 수 있다.
arrange()는 변수를 작은 수부터 큰 수까지(오름차순), 또는 desc()와 함께 써서 큰 수부터 작은 수까지(내림차순) 정렬하는 함수이다.
murders %>%
arrange(population) %>%
head()
## state abb region population total rate
## 1 Wyoming WY West 563626 5 0.8871131
## 2 District of Columbia DC South 601723 99 16.4527532
## 3 Vermont VT Northeast 625741 2 0.3196211
## 4 North Dakota ND North Central 672591 4 0.5947151
## 5 Alaska AK West 710231 19 2.6751860
## 6 South Dakota SD North Central 814180 8 0.9825837
murders %>%
arrange(rate) %>%
head()
## state abb region population total rate
## 1 Vermont VT Northeast 625741 2 0.3196211
## 2 New Hampshire NH Northeast 1316470 5 0.3798036
## 3 Hawaii HI West 1360301 7 0.5145920
## 4 North Dakota ND North Central 672591 4 0.5947151
## 5 Iowa IA North Central 3046355 21 0.6893484
## 6 Idaho ID West 1567582 12 0.7655102
작은수 부터 큰수까지 정렬이 되는 것을 확인할 수 있다.
murders %>%
arrange(desc(population)) %>%
head()
## state abb region population total rate
## 1 California CA West 37253956 1257 3.374138
## 2 Texas TX South 25145561 805 3.201360
## 3 Florida FL South 19687653 669 3.398069
## 4 New York NY Northeast 19378102 517 2.667960
## 5 Illinois IL North Central 12830632 364 2.836961
## 6 Pennsylvania PA Northeast 12702379 457 3.597751
murders %>%
arrange(desc(rate)) %>%
head()
## state abb region population total rate
## 1 District of Columbia DC South 601723 99 16.452753
## 2 Louisiana LA South 4533372 351 7.742581
## 3 Missouri MO North Central 5988927 321 5.359892
## 4 Maryland MD South 5773552 293 5.074866
## 5 South Carolina SC South 4625364 207 4.475323
## 6 Delaware DE South 897934 38 4.231937
큰 수부터 작은수로 정렬되는 것을 확인할 수 있다.
murders %>%
arrange(region,rate) %>%
head()
## state abb region population total rate
## 1 Vermont VT Northeast 625741 2 0.3196211
## 2 New Hampshire NH Northeast 1316470 5 0.3798036
## 3 Maine ME Northeast 1328361 11 0.8280881
## 4 Rhode Island RI Northeast 1052567 16 1.5200933
## 5 Massachusetts MA Northeast 6547629 118 1.8021791
## 6 New York NY Northeast 19378102 517 2.6679599
murders %>%
arrange(region,desc(rate)) %>%
head()
## state abb region population total rate
## 1 Pennsylvania PA Northeast 12702379 457 3.597751
## 2 New Jersey NJ Northeast 8791894 246 2.798032
## 3 Connecticut CT Northeast 3574097 97 2.713972
## 4 New York NY Northeast 19378102 517 2.667960
## 5 Massachusetts MA Northeast 6547629 118 1.802179
## 6 Rhode Island RI Northeast 1052567 16 1.520093
2개의 변수에 대해서 정렬할 필요가 있을 때는 다음과 같이 arrange함수에 두개의 열값을 넣어주면 된다.
murders %>% top_n(10, rate)
## state abb region population total rate
## 1 Arizona AZ West 6392017 232 3.629527
## 2 Delaware DE South 897934 38 4.231937
## 3 District of Columbia DC South 601723 99 16.452753
## 4 Georgia GA South 9920000 376 3.790323
## 5 Louisiana LA South 4533372 351 7.742581
## 6 Maryland MD South 5773552 293 5.074866
## 7 Michigan MI North Central 9883640 413 4.178622
## 8 Mississippi MS South 2967297 120 4.044085
## 9 Missouri MO North Central 5988927 321 5.359892
## 10 South Carolina SC South 4625364 207 4.475323
murders %>%
top_n(10, rate) %>%
arrange(rate)
## state abb region population total rate
## 1 Arizona AZ West 6392017 232 3.629527
## 2 Georgia GA South 9920000 376 3.790323
## 3 Mississippi MS South 2967297 120 4.044085
## 4 Michigan MI North Central 9883640 413 4.178622
## 5 Delaware DE South 897934 38 4.231937
## 6 South Carolina SC South 4625364 207 4.475323
## 7 Maryland MD South 5773552 293 5.074866
## 8 Missouri MO North Central 5988927 321 5.359892
## 9 Louisiana LA South 4533372 351 7.742581
## 10 District of Columbia DC South 601723 99 16.452753
rate 기준 상위 10개를 뽑고 rate 순으로 arrange를 시킨다.
1.데이터 프레임의 다른 형태
2.data.frame의 최신판
3.tibbles가 data.frame보다 더 보기 좋다.
4.테이블 안에 함수 삽입 가능
5.데이터 프레임에는 항상 numbers, strings, logical values만 들어가야되는데, tibbles에서는 list나 functions가 들어갈 수 있다.
6.데이터 프레임보다 큰 범위가 되고, 최신 트렌드에 맞게 진화함
murders %>% group_by(region) %>% class()
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
head(murders)
## state abb region population total rate
## 1 Alabama AL South 4779736 135 2.824424
## 2 Alaska AK West 710231 19 2.675186
## 3 Arizona AZ West 6392017 232 3.629527
## 4 Arkansas AR South 2915918 93 3.189390
## 5 California CA West 37253956 1257 3.374138
## 6 Colorado CO West 5029196 65 1.292453
데이터 프레임은 무조건 모든 관측치를 다 보여줌
as_tibble(murders)
## # A tibble: 51 x 6
## state abb region population total rate
## <chr> <chr> <fct> <dbl> <dbl> <dbl>
## 1 Alabama AL South 4779736 135 2.82
## 2 Alaska AK West 710231 19 2.68
## 3 Arizona AZ West 6392017 232 3.63
## 4 Arkansas AR South 2915918 93 3.19
## 5 California CA West 37253956 1257 3.37
## 6 Colorado CO West 5029196 65 1.29
## 7 Connecticut CT Northeast 3574097 97 2.71
## 8 Delaware DE South 897934 38 4.23
## 9 District of Columbia DC South 601723 99 16.5
## 10 Florida FL South 19687653 669 3.40
## # ... with 41 more rows
10개만 보여주고,
열의 형태도 나오며
메모리 절약 가능하고 보기가 좋다
class(murders[,4])
## [1] "numeric"
data.frame에서 열 하나를 [ , 4]로 뽑으면 numeric 벡터가 되어 열과 행의 구조가 남아 있지 않음
class(as_tibble(murders)[, 4]) # subsetting a tibble returns a tibble, so rows and columns are kept
## [1] "tbl_df" "tbl" "data.frame"
결과가 tibble이므로 열과 행이 존재함. class가 "tbl_df", "tbl", "data.frame" 3가지로 나옴.
# A tibble column can be a list, so it can hold functions.
tibble(
  id = c(1, 2, 3),
  func = list(mean, median, sd)
)
## # A tibble: 3 x 2
## id func
## <dbl> <list>
## 1 1 <fn>
## 2 2 <fn>
## 3 3 <fn>
테이블 안에 함수 삽입 가능하고, 데이터 프레임에는 항상 numbers, strings, logical values만 들어가야되는데, tibbles에서는 list나 functions가 들어갈 수 있다.
grades<-tibble(names = c("John", "Juan", "Jean", "Yao"),
exam_1 = c(95,80,90,85),
exam_2 = c(90,85,85,90))
grades
## # A tibble: 4 x 3
## names exam_1 exam_2
## <chr> <dbl> <dbl>
## 1 John 95 90
## 2 Juan 80 85
## 3 Jean 90 85
## 4 Yao 85 90
tibble()함수는 짧은 tibble을 만들때, 빠르고 쉽게 만들 수 있다.
grades<-data.frame(names = c("John", "Juan", "Jean", "Yao"),
exam_1 = c(95,80,90,85),
exam_2 = c(90,85,85,90))
grades
## names exam_1 exam_2
## 1 John 95 90
## 2 Juan 80 85
## 3 Jean 90 85
## 4 Yao 85 90
데이터 프레임을 티블로 만드려면, 우선 데이터 프레임을 만든후,
as_tibble(grades)
## # A tibble: 4 x 3
## names exam_1 exam_2
## <chr> <dbl> <dbl>
## 1 John 95 90
## 2 Juan 80 85
## 3 Jean 90 85
## 4 Yao 85 90
as_tibble(grades) %>% class()
## [1] "tbl_df" "tbl" "data.frame"
이렇게 as_tibble함수를 사용하면 된다.