knitr:: opts_chunk$set(echo=TRUE, results = "asis", cache = TRUE)
library(tidyverse)
library(kableExtra)

12.2.1

From 3 tables below:

  • Table2
kable(table2) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country year type count
Afghanistan 1999 cases 745
Afghanistan 1999 population 19987071
Afghanistan 2000 cases 2666
Afghanistan 2000 population 20595360
Brazil 1999 cases 37737
Brazil 1999 population 172006362
Brazil 2000 cases 80488
Brazil 2000 population 174504898
China 1999 cases 212258
China 1999 population 1272915272
China 2000 cases 213766
China 2000 population 1280428583
  • Table4a
kable(table4a) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country 1999 2000
Afghanistan 745 2666
Brazil 37737 80488
China 212258 213766
  • Table4b
kable(table4b) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country 1999 2000
Afghanistan 19987071 20595360
Brazil 172006362 174504898
China 1272915272 1280428583

Q2. Compute the rate for table2, and table4a + table4b. You will need to perform four operations:

Step1: Convert Table 2 to calculate rate ( rate = cases/population *10000)
cases <- table2 %>%
    filter(type == "cases")
population <- table2 %>%
    filter(type == "population")
join_table <- cases %>%
    inner_join(population, by = c("country", "year"))
names(join_table)[4] <- "cases"
names(join_table)[6] <- "population"
join_table <- join_table %>%
    select(country, year,cases,population) %>%
    mutate(rate = cases/population*10000)
kable(join_table) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country year cases population rate
Afghanistan 1999 745 19987071 0.372741
Afghanistan 2000 2666 20595360 1.294466
Brazil 1999 37737 172006362 2.193931
Brazil 2000 80488 174504898 4.612363
China 1999 212258 1272915272 1.667495
China 2000 213766 1280428583 1.669488
Step2: Convert Table 4 (a&b) to calculate rate ( rate = cases/population *10000)
temp1 <- gather(table4a, year, cases, 2:3) %>%
    inner_join(gather(table4b, year, population, 2:3), by = c("country", "year")) %>%
    mutate(rate = cases/ population * 10000)
kable(temp1) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country year cases population rate
Afghanistan 1999 745 19987071 0.372741
Brazil 1999 37737 172006362 2.193931
China 1999 212258 1272915272 1.667495
Afghanistan 2000 2666 20595360 1.294466
Brazil 2000 80488 174504898 4.612363
China 2000 213766 1280428583 1.669488
  • Table 4 seems easier to work with because we could use function gather.

Q3. Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first?

First, the cases need to be extract . Then graph can be made:
temp3 <- table2 %>%
  filter(type == "cases")
ggplot(temp3, mapping = aes(x = year, y = count, color = country)) +
  scale_x_continuous(breaks = seq(1999,2000, by = 1)) +
  geom_line()

12.2.3

Q1. Why are gather() and spread() not perfectly symmetrical?

stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
kable(stocks) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
year half return
2015 1 1.88
2015 2 0.59
2016 1 0.92
2016 2 0.17
temp4 <- stocks %>% 
  spread(year, return) 
kable(temp4) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
half 2015 2016
1 1.88 0.92
2 0.59 0.17
temp5 <- stocks %>% 
  spread(year, return) %>% 
  gather("year", "return", `2015`:`2016`)
kable(temp5) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
half year return
1 2015 1.88
2 2015 0.59
1 2016 0.92
2 2016 0.17

  When we use spread, R pushs other columns to the left, when we use gather, R keep the other columns where they are and put new columns to the right. Thats why column’s position are not the same after we use spread then gather.

Both spread() and gather() have a convert argument. What does it do?

  If convert = TRUE will automatically convert values to logical, integer, numeric, complex or factor as appropriate.

Q2. Why does this code fail?

table4a %>% 
  gather(1999, 2000, key = "year", value = "cases")
## Error in inds_combine(.vars, ind_list): Position must be between 0 and n

  The code fail because column’s name is supposed to be in ’ ’

temp7 <- table4a %>% 
  gather('1999', '2000', key = "year", value = "cases")
kable(temp7) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country year cases
Afghanistan 1999 745
Brazil 1999 37737
China 1999 212258
Afghanistan 2000 2666
Brazil 2000 80488
China 2000 213766

Q3. Why does spreading this tibble fail? How could you add a new column to fix the problem?

people <- tribble(
  ~name,             ~key,    ~value,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)
people %>% spread(key, value)
## Each row of output must be identified by a unique combination of keys.
## Keys are shared for 2 rows:
## * 1, 3
## Do you need to create unique ID with tibble::rowid_to_column()?

  There is an error because name Phillip Woods has 2 different ages 45 and 50. To fix this, we can add one more column “ID” and assume that there is 2 people with the same name “Phillips Woods”.

temp14 <- people %>%
  mutate(ID = c("1","1","2","3","3")) %>%
  spread(key,value) %>%
  arrange(ID)
kable(temp14) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
name ID age height
Phillip Woods 1 45 186
Phillip Woods 2 50 NA
Jessica Cordero 3 37 156

Q4. Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?

preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)
kable(preg) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
pregnant male female
yes NA 10
no 20 12
temp8 <- preg %>%
  gather("male", "female",key = "Gender", value = "count") %>%
  filter(!is.na(count))
kable(temp8) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
pregnant Gender count
no male 20
yes female 10
no female 12

12.4.3

Q1. What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.

temp9 <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
  separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
temp10 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
kable(temp9) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
one two three
a b c
d e f
h i j
kable(temp10) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
one two three
a b c
d e NA
f g i

  Separate remove extra g or ignore missing letters in “d,e”

Q2. Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?

  Set remove = FALSE to keep orginal data in the first column.

temp11 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"), remove = FALSE)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
kable(temp11) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
x one two three
a,b,c a b c
d,e d e NA
f,g,i f g i

Q3. Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?

temp12 <- tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"), remove = FALSE)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
temp13 <- temp12 %>% 
  unite(new,one, two, three, sep =" ")
kable(temp12) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
x one two three
a,b,c a b c
d,e d e NA
f,g,i f g i
kable(temp13) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
x new
a,b,c a b c
d,e d e NA
f,g,i f g i

12.5-1

Q1. Compare and contrast the fill arguments to spread() and complete().

  Fill arguments in spread only take 1 value

  Fill arguments in complete take many values for many variable.

  See example below:

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)
address <- c(NA,NA,NA,NA)
treatment <- cbind(treatment,address)
temp19<- treatment %>% 
  spread(treatment, response, fill = "replace with this")
kable(temp19) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
person address 1 2 3
Derrick Whitmore NA 7 replace with this replace with this
Katherine Burke NA 4 replace with this replace with this
NA NA replace with this 10 9
temp20 <- treatment %>%
  complete(person, fill = list(person = 'name1', address = 'to be entered later'))
kable(temp20) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
person treatment response address
Derrick Whitmore 1 7 to be entered later
Katherine Burke 1 4 to be entered later
name1 2 10 to be entered later
name1 3 9 to be entered later

Q2. What does the direction argument to fill() do?

  Direction “up”, fill missing value by the next non-missing value

  Direction “up”, fill missing value by the previous non-missing value

  Example

df <- data.frame(Month = 1:12, Year = c(NA, NA,2000, rep(NA,9)))
kable(df) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Month Year
1 NA
2 NA
3 2000
4 NA
5 NA
6 NA
7 NA
8 NA
9 NA
10 NA
11 NA
12 NA
temp14 <- df %>%
  fill(Year, .direction = "up")
kable(temp14) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Month Year
1 2000
2 2000
3 2000
4 NA
5 NA
6 NA
7 NA
8 NA
9 NA
10 NA
11 NA
12 NA
temp15 <- df %>%
  fill(Year, .direction = "down")
kable(temp15) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Month Year
1 NA
2 NA
3 2000
4 2000
5 2000
6 2000
7 2000
8 2000
9 2000
10 2000
11 2000
12 2000

12.6 Case study

who <- tidyr::who
kable(head(who)) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554 new_sp_m5564 new_sp_m65 new_sp_f014 new_sp_f1524 new_sp_f2534 new_sp_f3544 new_sp_f4554 new_sp_f5564 new_sp_f65 new_sn_m014 new_sn_m1524 new_sn_m2534 new_sn_m3544 new_sn_m4554 new_sn_m5564 new_sn_m65 new_sn_f014 new_sn_f1524 new_sn_f2534 new_sn_f3544 new_sn_f4554 new_sn_f5564 new_sn_f65 new_ep_m014 new_ep_m1524 new_ep_m2534 new_ep_m3544 new_ep_m4554 new_ep_m5564 new_ep_m65 new_ep_f014 new_ep_f1524 new_ep_f2534 new_ep_f3544 new_ep_f4554 new_ep_f5564 new_ep_f65 newrel_m014 newrel_m1524 newrel_m2534 newrel_m3544 newrel_m4554 newrel_m5564 newrel_m65 newrel_f014 newrel_f1524 newrel_f2534 newrel_f3544 newrel_f4554 newrel_f5564 newrel_f65
Afghanistan AF AFG 1980 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Afghanistan AF AFG 1981 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Afghanistan AF AFG 1982 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Afghanistan AF AFG 1983 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Afghanistan AF AFG 1984 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Afghanistan AF AFG 1985 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
who1 <- who %>%
  gather(key, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>% 
  mutate(key = stringr::str_replace(key, "newrel", "new_rel")) %>%
  separate(key, c("new", "var", "sexage")) %>% 
  select(-new, -iso2, -iso3) %>% 
  separate(sexage, c("sex", "age"), sep = 1)
who1 %>%
  filter(value == 0) %>%
  nrow()

[1] 11080   The data has 0 values which mean NA values in this data frame is explictly.

  NA means data is missing and 0 mean there is 0 cases.

  There is no implicit missing values.

Q2. What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, “newrel”, “new_rel”)))

If you neglect the mutate() step you will have 2 different type of values: “newrel” and “new_rel” even though they are one.

Q3. I claimed that iso2 and iso3 were redundant with country. Confirm this claim.

length(unique(who$country))

[1] 219

length(unique(who$iso2))

[1] 219

length(unique(who$iso3))

[1] 219

All three columns have the same number of unique values 219 countries. Prove that country, iso2, iso3 are the same.

Q4. For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data.

temp16 <- who1 %>%
  group_by(year) %>%
  summarise(value = sum(value)) %>%
  ggplot() +
  geom_line(mapping = aes(x = year, y =value)) +
  labs(x = 'Year', y = 'Cases', title = 'Cases of tuberculosis for all countries over year')
temp16

temp17 <- who1 %>%
  group_by(country) %>%
  summarise(value = sum(value)) %>%
  arrange(desc(value)) %>% 
  mutate(value = format(value, scientific = F,big.mark = ",")) %>%
  top_n(4)
## Selecting by value
list <- temp17$country
kable(temp17, caption = 'Top 4 country with highest caeses of Tuberculosis') %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Top 4 country with highest caeses of Tuberculosis
country value
China 8,389,839
India 7,098,552
South Africa 3,010,272
Indonesia 2,909,925
sex.labs <- c('Female','Male')
names(sex.labs) <- c('f','m')
temp18 <- who1 %>%
  mutate(country = ifelse(country %in% list, country,'other')) %>%
  group_by(country,year,sex) %>%
  summarise(value = sum(value)) %>%
  filter(year > 1990) %>%
  ggplot() +
  geom_line(mapping = aes(x = year, y = value, color = country)) +
  facet_wrap(~sex,
  labeller = labeller(sex = sex.labs)) +
  labs(x = 'Year', y = 'Cases', title = 'Cases of Tuberculosis over the year')
temp18