knitr:: opts_chunk$set(echo=TRUE, results = "asis", cache = TRUE)
library(tidyverse)
library(kableExtra)
12.2.1
From 3 tables below:
kable(table2) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
country
|
year
|
type
|
count
|
|
Afghanistan
|
1999
|
cases
|
745
|
|
Afghanistan
|
1999
|
population
|
19987071
|
|
Afghanistan
|
2000
|
cases
|
2666
|
|
Afghanistan
|
2000
|
population
|
20595360
|
|
Brazil
|
1999
|
cases
|
37737
|
|
Brazil
|
1999
|
population
|
172006362
|
|
Brazil
|
2000
|
cases
|
80488
|
|
Brazil
|
2000
|
population
|
174504898
|
|
China
|
1999
|
cases
|
212258
|
|
China
|
1999
|
population
|
1272915272
|
|
China
|
2000
|
cases
|
213766
|
|
China
|
2000
|
population
|
1280428583
|
kable(table4a) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
country
|
1999
|
2000
|
|
Afghanistan
|
745
|
2666
|
|
Brazil
|
37737
|
80488
|
|
China
|
212258
|
213766
|
kable(table4b) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
country
|
1999
|
2000
|
|
Afghanistan
|
19987071
|
20595360
|
|
Brazil
|
172006362
|
174504898
|
|
China
|
1272915272
|
1280428583
|
Q3. Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first?
12.2.3
Q1. Why are gather() and spread() not perfectly symmetrical?
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
kable(stocks) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
year
|
half
|
return
|
|
2015
|
1
|
1.88
|
|
2015
|
2
|
0.59
|
|
2016
|
1
|
0.92
|
|
2016
|
2
|
0.17
|
temp4 <- stocks %>%
spread(year, return)
kable(temp4) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
half
|
2015
|
2016
|
|
1
|
1.88
|
0.92
|
|
2
|
0.59
|
0.17
|
temp5 <- stocks %>%
spread(year, return) %>%
gather("year", "return", `2015`:`2016`)
kable(temp5) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
half
|
year
|
return
|
|
1
|
2015
|
1.88
|
|
2
|
2015
|
0.59
|
|
1
|
2016
|
0.92
|
|
2
|
2016
|
0.17
|
When we use spread, R pushs other columns to the left, when we use gather, R keep the other columns where they are and put new columns to the right. Thats why column’s position are not the same after we use spread then gather.
Both spread() and gather() have a convert argument. What does it do?
If convert = TRUE will automatically convert values to logical, integer, numeric, complex or factor as appropriate.
Q2. Why does this code fail?
table4a %>%
gather(1999, 2000, key = "year", value = "cases")
## Error in inds_combine(.vars, ind_list): Position must be between 0 and n
The code fail because column’s name is supposed to be in ’ ’
temp7 <- table4a %>%
gather('1999', '2000', key = "year", value = "cases")
kable(temp7) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
country
|
year
|
cases
|
|
Afghanistan
|
1999
|
745
|
|
Brazil
|
1999
|
37737
|
|
China
|
1999
|
212258
|
|
Afghanistan
|
2000
|
2666
|
|
Brazil
|
2000
|
80488
|
|
China
|
2000
|
213766
|
Q3. Why does spreading this tibble fail? How could you add a new column to fix the problem?
people <- tribble(
~name, ~key, ~value,
#-----------------|--------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
people %>% spread(key, value)
## Each row of output must be identified by a unique combination of keys.
## Keys are shared for 2 rows:
## * 1, 3
## Do you need to create unique ID with tibble::rowid_to_column()?
There is an error because name Phillip Woods has 2 different ages 45 and 50. To fix this, we can add one more column “ID” and assume that there is 2 people with the same name “Phillips Woods”.
temp14 <- people %>%
mutate(ID = c("1","1","2","3","3")) %>%
spread(key,value) %>%
arrange(ID)
kable(temp14) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
name
|
ID
|
age
|
height
|
|
Phillip Woods
|
1
|
45
|
186
|
|
Phillip Woods
|
2
|
50
|
NA
|
|
Jessica Cordero
|
3
|
37
|
156
|
Q4. Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?
preg <- tribble(
~pregnant, ~male, ~female,
"yes", NA, 10,
"no", 20, 12
)
kable(preg) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
pregnant
|
male
|
female
|
|
yes
|
NA
|
10
|
|
no
|
20
|
12
|
temp8 <- preg %>%
gather("male", "female",key = "Gender", value = "count") %>%
filter(!is.na(count))
kable(temp8) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
pregnant
|
Gender
|
count
|
|
no
|
male
|
20
|
|
yes
|
female
|
10
|
|
no
|
female
|
12
|
12.4.3
Q1. What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
temp9 <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
temp10 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
kable(temp9) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
one
|
two
|
three
|
|
a
|
b
|
c
|
|
d
|
e
|
f
|
|
h
|
i
|
j
|
kable(temp10) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
one
|
two
|
three
|
|
a
|
b
|
c
|
|
d
|
e
|
NA
|
|
f
|
g
|
i
|
Separate remove extra g or ignore missing letters in “d,e”
Q2. Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?
Set remove = FALSE to keep orginal data in the first column.
temp11 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"), remove = FALSE)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
kable(temp11) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
x
|
one
|
two
|
three
|
|
a,b,c
|
a
|
b
|
c
|
|
d,e
|
d
|
e
|
NA
|
|
f,g,i
|
f
|
g
|
i
|
Q3. Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?
temp12 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"), remove = FALSE)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
temp13 <- temp12 %>%
unite(new,one, two, three, sep =" ")
kable(temp12) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
x
|
one
|
two
|
three
|
|
a,b,c
|
a
|
b
|
c
|
|
d,e
|
d
|
e
|
NA
|
|
f,g,i
|
f
|
g
|
i
|
kable(temp13) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
x
|
new
|
|
a,b,c
|
a b c
|
|
d,e
|
d e NA
|
|
f,g,i
|
f g i
|
12.5-1
Q1. Compare and contrast the fill arguments to spread() and complete().
Fill arguments in spread only take 1 value
Fill arguments in complete take many values for many variable.
See example below:
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
address <- c(NA,NA,NA,NA)
treatment <- cbind(treatment,address)
temp19<- treatment %>%
spread(treatment, response, fill = "replace with this")
kable(temp19) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
person
|
address
|
1
|
2
|
3
|
|
Derrick Whitmore
|
NA
|
7
|
replace with this
|
replace with this
|
|
Katherine Burke
|
NA
|
4
|
replace with this
|
replace with this
|
|
NA
|
NA
|
replace with this
|
10
|
9
|
temp20 <- treatment %>%
complete(person, fill = list(person = 'name1', address = 'to be entered later'))
kable(temp20) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
person
|
treatment
|
response
|
address
|
|
Derrick Whitmore
|
1
|
7
|
to be entered later
|
|
Katherine Burke
|
1
|
4
|
to be entered later
|
|
name1
|
2
|
10
|
to be entered later
|
|
name1
|
3
|
9
|
to be entered later
|
Q2. What does the direction argument to fill() do?
Direction “up”, fill missing value by the next non-missing value
Direction “up”, fill missing value by the previous non-missing value
Example
df <- data.frame(Month = 1:12, Year = c(NA, NA,2000, rep(NA,9)))
kable(df) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
Month
|
Year
|
|
1
|
NA
|
|
2
|
NA
|
|
3
|
2000
|
|
4
|
NA
|
|
5
|
NA
|
|
6
|
NA
|
|
7
|
NA
|
|
8
|
NA
|
|
9
|
NA
|
|
10
|
NA
|
|
11
|
NA
|
|
12
|
NA
|
temp14 <- df %>%
fill(Year, .direction = "up")
kable(temp14) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
Month
|
Year
|
|
1
|
2000
|
|
2
|
2000
|
|
3
|
2000
|
|
4
|
NA
|
|
5
|
NA
|
|
6
|
NA
|
|
7
|
NA
|
|
8
|
NA
|
|
9
|
NA
|
|
10
|
NA
|
|
11
|
NA
|
|
12
|
NA
|
temp15 <- df %>%
fill(Year, .direction = "down")
kable(temp15) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
Month
|
Year
|
|
1
|
NA
|
|
2
|
NA
|
|
3
|
2000
|
|
4
|
2000
|
|
5
|
2000
|
|
6
|
2000
|
|
7
|
2000
|
|
8
|
2000
|
|
9
|
2000
|
|
10
|
2000
|
|
11
|
2000
|
|
12
|
2000
|
12.6 Case study
who <- tidyr::who
kable(head(who)) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
country
|
iso2
|
iso3
|
year
|
new_sp_m014
|
new_sp_m1524
|
new_sp_m2534
|
new_sp_m3544
|
new_sp_m4554
|
new_sp_m5564
|
new_sp_m65
|
new_sp_f014
|
new_sp_f1524
|
new_sp_f2534
|
new_sp_f3544
|
new_sp_f4554
|
new_sp_f5564
|
new_sp_f65
|
new_sn_m014
|
new_sn_m1524
|
new_sn_m2534
|
new_sn_m3544
|
new_sn_m4554
|
new_sn_m5564
|
new_sn_m65
|
new_sn_f014
|
new_sn_f1524
|
new_sn_f2534
|
new_sn_f3544
|
new_sn_f4554
|
new_sn_f5564
|
new_sn_f65
|
new_ep_m014
|
new_ep_m1524
|
new_ep_m2534
|
new_ep_m3544
|
new_ep_m4554
|
new_ep_m5564
|
new_ep_m65
|
new_ep_f014
|
new_ep_f1524
|
new_ep_f2534
|
new_ep_f3544
|
new_ep_f4554
|
new_ep_f5564
|
new_ep_f65
|
newrel_m014
|
newrel_m1524
|
newrel_m2534
|
newrel_m3544
|
newrel_m4554
|
newrel_m5564
|
newrel_m65
|
newrel_f014
|
newrel_f1524
|
newrel_f2534
|
newrel_f3544
|
newrel_f4554
|
newrel_f5564
|
newrel_f65
|
|
Afghanistan
|
AF
|
AFG
|
1980
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Afghanistan
|
AF
|
AFG
|
1981
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Afghanistan
|
AF
|
AFG
|
1982
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Afghanistan
|
AF
|
AFG
|
1983
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Afghanistan
|
AF
|
AFG
|
1984
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Afghanistan
|
AF
|
AFG
|
1985
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
who1 <- who %>%
gather(key, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
mutate(key = stringr::str_replace(key, "newrel", "new_rel")) %>%
separate(key, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
who1 %>%
filter(value == 0) %>%
nrow()
[1] 11080 The data has 0 values which mean NA values in this data frame is explictly.
NA means data is missing and 0 mean there is 0 cases.
There is no implicit missing values.
Q2. What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, “newrel”, “new_rel”)))
If you neglect the mutate() step you will have 2 different type of values: “newrel” and “new_rel” even though they are one.
Q3. I claimed that iso2 and iso3 were redundant with country. Confirm this claim.
length(unique(who$country))
[1] 219
[1] 219
[1] 219
All three columns have the same number of unique values 219 countries. Prove that country, iso2, iso3 are the same.