library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1 √ purrr 0.2.4
## √ tibble 1.3.4 √ dplyr 0.7.4
## √ tidyr 0.7.2 √ stringr 1.2.0
## √ readr 1.1.1 √ forcats 0.2.0
## Warning: package 'stringr' was built under R version 3.4.3
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
table1 %>%
mutate(rate = cases / population * 10000)
## # A tibble: 6 x 5
## country year cases population rate
## <chr> <int> <int> <int> <dbl>
## 1 Afghanistan 1999 745 19987071 0.372741
## 2 Afghanistan 2000 2666 20595360 1.294466
## 3 Brazil 1999 37737 172006362 2.193930
## 4 Brazil 2000 80488 174504898 4.612363
## 5 China 1999 212258 1272915272 1.667495
## 6 China 2000 213766 1280428583 1.669488
table1 %>%
group_by(year) %>%
summarise(mean_cases = mean(cases), sum_case = sum(cases))
## # A tibble: 2 x 3
## year mean_cases sum_case
## <int> <dbl> <int>
## 1 1999 83580.00 250740
## 2 2000 98973.33 296920
table1 %>%
count(year, wt=cases)
## # A tibble: 2 x 2
## year n
## <int> <int>
## 1 1999 250740
## 2 2000 296920
library(ggplot2)
ggplot(data=table1, aes(x=year, y=cases)) +
geom_line(aes(group=country), color="grey50") +
geom_point(aes(color = country))

table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
table2 %>%
filter(type == "cases") %>%
group_by(year,country) %>%
summarise(sum_cnt = sum(count))
## # A tibble: 6 x 3
## # Groups: year [?]
## year country sum_cnt
## <int> <chr> <int>
## 1 1999 Afghanistan 745
## 2 1999 Brazil 37737
## 3 1999 China 212258
## 4 2000 Afghanistan 2666
## 5 2000 Brazil 80488
## 6 2000 China 213766
tbl2_pop <- table2 %>%
filter(type == "population") %>%
group_by(year,country) %>%
summarise(population = sum(count))
tbl2_case <- table2 %>%
filter(type == "cases") %>%
group_by(year,country) %>%
summarise(cases = sum(count))
tbl2_pop
## # A tibble: 6 x 3
## # Groups: year [?]
## year country population
## <int> <chr> <int>
## 1 1999 Afghanistan 19987071
## 2 1999 Brazil 172006362
## 3 1999 China 1272915272
## 4 2000 Afghanistan 20595360
## 5 2000 Brazil 174504898
## 6 2000 China 1280428583
tbl2_case
## # A tibble: 6 x 3
## # Groups: year [?]
## year country cases
## <int> <chr> <int>
## 1 1999 Afghanistan 745
## 2 1999 Brazil 37737
## 3 1999 China 212258
## 4 2000 Afghanistan 2666
## 5 2000 Brazil 80488
## 6 2000 China 213766
tbl2_join <- left_join(tbl2_pop, tbl2_case)
## Joining, by = c("year", "country")
tbl2_join
## # A tibble: 6 x 4
## # Groups: year [?]
## year country population cases
## <int> <chr> <int> <int>
## 1 1999 Afghanistan 19987071 745
## 2 1999 Brazil 172006362 37737
## 3 1999 China 1272915272 212258
## 4 2000 Afghanistan 20595360 2666
## 5 2000 Brazil 174504898 80488
## 6 2000 China 1280428583 213766
tbl2_join %>%
mutate(rate = cases/population * 10000)
## # A tibble: 6 x 5
## # Groups: year [2]
## year country population cases rate
## <int> <chr> <int> <int> <dbl>
## 1 1999 Afghanistan 19987071 745 0.372741
## 2 1999 Brazil 172006362 37737 2.193930
## 3 1999 China 1272915272 212258 1.667495
## 4 2000 Afghanistan 20595360 2666 1.294466
## 5 2000 Brazil 174504898 80488 4.612363
## 6 2000 China 1280428583 213766 1.669488
ggplot(data=tbl2_join, aes(year,cases,group=country)) +
geom_line() +
geom_point(aes(color = country))

tidy4a <- table4a %>%
gather(`1999`,`2000`, key="year", value="cases")
tidy4b <- table4b %>%
gather(`1999`,`2000`, key="year", value="population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Brazil 1999 37737 172006362
## 3 China 1999 212258 1272915272
## 4 Afghanistan 2000 2666 20595360
## 5 Brazil 2000 80488 174504898
## 6 China 2000 213766 1280428583
spread(table2, key=type, value=count)
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
stocks <- tibble(
year = c(2015,2015,2016,2016),
half = c(1,2,1,2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks
## # A tibble: 4 x 3
## year half return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2016 1 0.92
## 4 2016 2 0.17
stocks %>%
spread(year, return)
## # A tibble: 2 x 3
## half `2015` `2016`
## * <dbl> <dbl> <dbl>
## 1 1 1.88 0.92
## 2 2 0.59 0.17
stocks %>%
spread(year, return) %>%
gather("year", "return", `2015`:`2016`)
## # A tibble: 4 x 3
## half year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 1 2016 0.92
## 4 2 2016 0.17
table4a %>%
gather(`1999`,`2000`, key="year", value = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
people <- tribble(
~name, ~keyc, ~valuec,
#-----------------|-----------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
#"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
spread(people, key=keyc, value=valuec)
## # A tibble: 2 x 3
## name age height
## * <chr> <dbl> <dbl>
## 1 Jessica Cordero 37 156
## 2 Phillip Woods 45 186
people <- tribble(
~name, ~sn, ~keyc, ~valuec,
#-----------------|---|------------|------
"Phillip Woods", "jr.", "age", 45,
"Phillip Woods", "", "height", 186,
"Phillip Woods", "" , "age", 50,
"Jessica Cordero","" , "age", 37,
"Jessica Cordero","", "height", 156
)
spread(people, key=keyc, value=valuec)
## # A tibble: 3 x 4
## name sn age height
## * <chr> <chr> <dbl> <dbl>
## 1 Jessica Cordero 37 156
## 2 Phillip Woods 50 186
## 3 Phillip Woods jr. 45 NA
preg <- tribble(
~pregnant, ~male, ~female,
"yes", NA, 10,
"no", 20, 12
)
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table3 %>%
separate(
rate,
into = c("cases", "population"),
convert = T)
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3 %>%
separate(
year,
into = c("century", "year"),
convert = T,
sep = 2
)
## # A tibble: 6 x 4
## country century year rate
## * <chr> <int> <int> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 0 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 0 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 0 213766/1280428583
table5 %>%
unite(new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## * <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
?separate
## starting httpd help server ...
## done
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
separate(x, c("one","two","three"), extra='merge')
## # A tibble: 3 x 3
## one two three
## * <chr> <chr> <chr>
## 1 a b c
## 2 d e f,g
## 3 h i j
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one","two","three"), fill="left")
## # A tibble: 3 x 3
## one two three
## * <chr> <chr> <chr>
## 1 a b c
## 2 <NA> d e
## 3 f g i
?unite
table5 %>%
unite(new, century, year, sep = "", remove = F)
## # A tibble: 6 x 5
## country new century year rate
## * <chr> <chr> <chr> <chr> <chr>
## 1 Afghanistan 1999 19 99 745/19987071
## 2 Afghanistan 2000 20 00 2666/20595360
## 3 Brazil 1999 19 99 37737/172006362
## 4 Brazil 2000 20 00 80488/174504898
## 5 China 1999 19 99 212258/1272915272
## 6 China 2000 20 00 213766/1280428583
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one","two","three"), remove = F)
## Warning: Too few values at 1 locations: 2
## # A tibble: 3 x 4
## x one two three
## * <chr> <chr> <chr> <chr>
## 1 a,b,c a b c
## 2 d,e d e <NA>
## 3 f,g,i f g i
table3 %>%
separate(
year,
into = c("century", "year"),
convert = T,
sep = 2,
remove = F
)
## # A tibble: 6 x 5
## country year century year rate
## * <chr> <int> <int> <int> <chr>
## 1 Afghanistan 1999 19 99 745/19987071
## 2 Afghanistan 2000 20 0 2666/20595360
## 3 Brazil 1999 19 99 37737/172006362
## 4 Brazil 2000 20 0 80488/174504898
## 5 China 1999 19 99 212258/1272915272
## 6 China 2000 20 0 213766/1280428583
table3 %>%
extract(
rate,
"ext_col"
)
## # A tibble: 6 x 3
## country year ext_col
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
library(dplyr)
df <- data.frame(x = c(NA, "a-b", "a-d", "b-c", "d-e"))
df
## x
## 1 <NA>
## 2 a-b
## 3 a-d
## 4 b-c
## 5 d-e
df %>% extract(x, "A")
## A
## 1 <NA>
## 2 a
## 3 a
## 4 b
## 5 d
df %>% extract(x, c("A", "B"), "([[:alnum:]]+)-([[:alnum:]]+)")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
## 5 d e
# If no match, NA:
df %>% extract(x, c("A", "B"), "([a-d]+)-([a-d]+)")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
## 5 <NA> <NA>
stocks <- tibble(
year = c(2015,2015,2015,2015,2016,2016,2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88,0.59,0.35, NA,0.92,0.17,2.66)
)
stocks %>%
spread(year, return)
## # A tibble: 4 x 3
## qtr `2015` `2016`
## * <dbl> <dbl> <dbl>
## 1 1 1.88 NA
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = T)
## # A tibble: 6 x 3
## qtr year return
## * <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 3 2015 0.35
## 4 2 2016 0.92
## 5 3 2016 0.17
## 6 4 2016 2.66
stocks %>%
complete(year, qtr)
## # A tibble: 8 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 1 NA
## 6 2016 2 0.92
## 7 2016 3 0.17
## 8 2016 4 2.66
who
## # A tibble: 7,240 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Afghanistan AF AFG 1980 NA NA NA
## 2 Afghanistan AF AFG 1981 NA NA NA
## 3 Afghanistan AF AFG 1982 NA NA NA
## 4 Afghanistan AF AFG 1983 NA NA NA
## 5 Afghanistan AF AFG 1984 NA NA NA
## 6 Afghanistan AF AFG 1985 NA NA NA
## 7 Afghanistan AF AFG 1986 NA NA NA
## 8 Afghanistan AF AFG 1987 NA NA NA
## 9 Afghanistan AF AFG 1988 NA NA NA
## 10 Afghanistan AF AFG 1989 NA NA NA
## # ... with 7,230 more rows, and 53 more variables: new_sp_m3544 <int>,
## # new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
## # new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
## # new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
## # new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
## # new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
## # new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
## # new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
## # new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## # new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
## # new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
## # new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
## # new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
## # new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
## # newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
## # newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## # newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
## # newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
## # newrel_f65 <int>