library(tidyverse)
## -- Attaching packages ------ tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts --------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1.How can you tell if an object is a tibble?

tibble::as_tibble(mtcars);
## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ... with 22 more rows
print(mtcars)
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

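An object is a tibble if printing it shows a header such as "# A tibble: 32 x 11", the type of each column, and only the first 10 rows. You can also check directly with is_tibble(x), or with class(x), which includes "tbl_df" for a tibble.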

2.Compare and contrast the following operations on a data.frame and equivalent tibble. What is different? Why might the default data frame behaviours cause you frustration?

#df
df <- data.frame(abc = 1, xyz = "a")
df$x
## [1] a
## Levels: a
df[, "xyz"]
## [1] a
## Levels: a
df[, c("abc", "xyz")]
##   abc xyz
## 1   1   a
#tibble
df<-tibble(
  abc=1,
  xyz="a"
)


df$x
## Warning: Unknown or uninitialised column: 'x'.
## NULL
df[, "xyz"]
## # A tibble: 1 x 1
##   xyz  
##   <chr>
## 1 a
df[, c("abc","xyz")]
## # A tibble: 1 x 2
##     abc xyz  
##   <dbl> <chr>
## 1     1 a

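With a data frame, df$x partially matches the column xyz, so a typo can silently return the wrong column, and df[, "xyz"] returns a vector (here a factor) instead of a data frame. A tibble never does partial matching (df$x gives a warning and NULL), and [ always returns another tibble.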

3.If you have the name of a variable stored in an object, e.g. var <- “mpg”, how can you extract the reference variable from a tibble?

# Convert mtcars to a tibble with as_tibble(); as.tibble() is deprecated.
tb_cars <- as_tibble(mtcars)
var <- "mpg"
# `[[` extracts the column named by the string held in `var` as a vector.
# `tb_cars$var` would look for a column literally called "var", and
# `tb_cars[var]` would return a one-column tibble rather than the variable.
tb_cars[[var]]
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4

4.Practice referring to non-syntactic names in the following data frame by:

annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

1.Extracting the variable called 1

  annoying[["1"]]  # `[[` returns the column itself; `[` would return a one-column tibble
##  [1]  1  2  3  4  5  6  7  8  9 10

2.Plotting a scatterplot of 1 vs 2

ggplot(data=annoying)+
  geom_point(aes(x=`1`, y=`2`));

3.Creating a new column 3 which is 2 divided by 1

mutate(annoying, `3`=`2`/`1`);
## # A tibble: 10 x 3
##      `1`   `2`   `3`
##    <int> <dbl> <dbl>
##  1     1  2.98  2.98
##  2     2  3.33  1.67
##  3     3  5.62  1.87
##  4     4 10.4   2.60
##  5     5 10.7   2.13
##  6     6 12.0   1.99
##  7     7 14.5   2.07
##  8     8 17.3   2.16
##  9     9 15.9   1.77
## 10    10 19.9   1.99

4.Renaming the columns to one, two and three.

# The mutate() result above was never assigned, so `annoying` has no column `3`
# yet; create it in the same pipeline and then rename all three columns.
annoying %>%
  mutate(`3` = `2` / `1`) %>%
  rename(one = `1`, two = `2`, three = `3`)

5.What does tibble::enframe() do? When might you use it?

#enframe() converts named atomic vectors or lists to one- or two-column data frames
#?enframe;

6.What option controls how many additional column names are printed at the footer of a tibble?

# tibble.max_extra_cols

1.What function would you use to read a file where fields were separated with “|”?

#read_delim()

2.Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?

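#Both functions just call 'read_delim()', so they share all of their remaining arguments, such as col_names, col_types, locale, na, quoted_na, quote, trim_ws, n_max, guess_max and progress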

3.What are the most important arguments to read_fwf()?

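#col_positions - where each field starts and ends, built with fwf_widths(), fwf_positions() or fwf_empty()
#col_names - supplied through those helper functions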

4.Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like " or ’. By convention, read_csv() assumes that the quoting character will be ", and if you want to change it you’ll need to use read_delim() instead. What arguments do you need to specify to read the following text into a data frame?

# The field 'a,b' is wrapped in single quotes, so tell the parser that the
# quoting character is ' rather than the default ".
read_delim("x,y\n1,'a,b'", delim = ",", quote = "'")
## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b

5.Identify what is wrong with each of the following inline CSV files. What happens when you run the code?

No arguments have been passed. (i).

read_csv("a,b\n1,2,3\n4,5,6", na=
         ".",col_names=FALSE)
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   2  -- 2 columns 3 columns literal data
##   3  -- 2 columns 3 columns literal data
## # A tibble: 3 x 2
##   X1    X2   
##   <chr> <chr>
## 1 a     b    
## 2 1     2    
## 3 4     5

(ii).

read_csv("a,b,c\n1,2\n1,2,3,4", na=".", col_names=FALSE);
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   2  -- 3 columns 2 columns literal data
##   3  -- 3 columns 4 columns literal data
## # A tibble: 3 x 3
##   X1    X2    X3   
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 1     2     <NA> 
## 3 1     2     3

(iii).

read_csv("a,b\n\"1", col_names=FALSE);
## Warning: 2 parsing failures.
## row col                     expected    actual         file
##   2  X1 closing quote at end of file           literal data
##   2  -- 2 columns                    1 columns literal data
## # A tibble: 2 x 2
##   X1    X2   
##   <chr> <chr>
## 1 a     b    
## 2 1     <NA>

(iv).

read_csv("a,b\n1,2\na,b",na=".", col_names=FALSE)
## # A tibble: 3 x 2
##   X1    X2   
##   <chr> <chr>
## 1 a     b    
## 2 1     2    
## 3 a     b

(v).

read_csv("a;b\n1;3",col_names=FALSE)
## # A tibble: 2 x 1
##   X1   
##   <chr>
## 1 a;b  
## 2 1;3

11.3 PARSING A VECTOR

1.What are the most important arguments to locale()?

#encoding=""

2.What happens if you try and set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to “,”? What happens to the default value of decimal_mark when you set the grouping_mark to “.”?

-If you set decimal_mark and grouping_mark to the same character,the execution will be halted,they are supposed to be different.

#decimal_mark=","
locale("es", decimal_mark = ",")
## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   domingo (dom.), lunes (lun.), martes (mar.), miércoles (mié.),
##         jueves (jue.), viernes (vie.), sábado (sáb.)
## Months: enero (ene.), febrero (feb.), marzo (mar.), abril (abr.), mayo
##         (may.), junio (jun.), julio (jul.), agosto (ago.),
##         septiembre (sept.), octubre (oct.), noviembre (nov.),
##         diciembre (dic.)
## AM/PM:  a. m./p. m.
#grouping_mark="."
# Only grouping_mark is supplied here, so the printed "Numbers" line shows
# which decimal mark locale() picks by default to go with it.
locale("es", grouping_mark = ".")
## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   domingo (dom.), lunes (lun.), martes (mar.), miércoles (mié.),
##         jueves (jue.), viernes (vie.), sábado (sáb.)
## Months: enero (ene.), febrero (feb.), marzo (mar.), abril (abr.), mayo
##         (may.), junio (jun.), julio (jul.), agosto (ago.),
##         septiembre (sept.), octubre (oct.), noviembre (nov.),
##         diciembre (dic.)
## AM/PM:  a. m./p. m.

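When you set the grouping_mark to “.”, the default decimal_mark changes from “.” to “,”. Likewise, when you set the decimal_mark to “,”, the default grouping_mark changes to “.”.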

3.I didn’t discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful.

parse_date("01/02/15", locale = locale(date_format = "%d/%m/%y"))
## [1] "2015-02-01"

4.If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.

5.What’s the difference between read_csv() and read_csv2()?

#read_csv
read_csv("a;b\n1;3",col_names=FALSE)
## # A tibble: 2 x 1
##   X1   
##   <chr>
## 1 a;b  
## 2 1;3
#read_csv2
read_csv2("a;b\n1;3",col_names=FALSE)
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## # A tibble: 2 x 2
##   X1    X2   
##   <chr> <chr>
## 1 a     b    
## 2 1     3

read_csv() is comma delimited while read_csv2() is semi-colon delimited.

6.What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out.

#ISO-8859 - an 8 bit encoding used for most european languages.


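# Asia: regional encodings such as GB2312/GBK and Big5 (Chinese), Shift-JIS and EUC-JP (Japanese) and EUC-KR (Korean); UTF-8 is now commonly used everywhere.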

7.Generate the correct format string to parse each of the following dates and times:

#(i)
d1 <- "January 1, 2010"
parse_date(d1, "%B %d, %Y")
## [1] "2010-01-01"
#(ii)

d2 <- "2015-Mar-07"
parse_date(d2, "%Y-%b-%d")
## [1] "2015-03-07"
#(iii)
d3 <- "06-Jun-2017"
parse_date(d3, "%d-%b-%Y")
## [1] "2017-06-06"
#(iv)
d4 <- c("August 19 (2015)", "July 1 (2015)")
parse_date(d4, "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
#(v)
d5 <- "12/30/14" # Dec 30, 2014
parse_date(d5, "%m/%d/%y")
## [1] "2014-12-30"
#(vi)
t1 <- "1705"
parse_time(t1, "%H%M")
## 17:05:00
#(vii)
t2 <- "11:15:10.12 PM"
parse_time(t2, "%I:%M:%OS %p")
## 23:15:10.12

12.2 TIDY DATA

library(tidyverse);

12.2.1 Exercises

  1. Using prose, describe how the variables and observations are organised in each of the sample tables.

In table1, each variable has its own column and each observation (a country in a given year) has its own row.

table1
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

In table2 , each row represents the country, year, and the variable type of either case or population.

table2
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

In table3, cases and population are combined into a single character column, rate, written as cases/population.

 table3
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

In table4, cases and population are represented in different tables where each row represents country, and the years 1999 and 2000

 table4a
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
 table4b
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

2.Compute the rate for table2, and table4a + table4b. You will need to perform four operations: 1.Extract the number of TB cases per country per year. 2.Extract the matching population per country per year. 3.Divide cases by population, and multiply by 10000. 4.Store back in the appropriate place.

# for table 2:
# Put cases and population side by side so that each rate is computed from the
# values of the same country and year (matched by key, not by row position).
table2_rate <- table2 %>%
  spread(key = type, value = count) %>%
  transmute(country, year, rate = cases / population * 10000)

table2_rate
## # A tibble: 6 x 3
##   country      year  rate
##   <chr>       <int> <dbl>
## 1 Afghanistan  1999 0.373
## 2 Afghanistan  2000 1.29 
## 3 Brazil       1999 2.19 
## 4 Brazil       2000 4.61 
## 5 China        1999 1.67 
## 6 China        2000 1.67
# for table 4a + 4b
# Reshape both tables to one row per country and year, then join on those keys
# so cases and population are paired explicitly instead of by row position.
table4_cases <- table4a %>%
  gather(`1999`, `2000`, key = "year", value = "cases")
table4_population <- table4b %>%
  gather(`1999`, `2000`, key = "year", value = "population")

table4_rate <- table4_cases %>%
  inner_join(table4_population, by = c("country", "year")) %>%
  transmute(
    country,
    year = as.numeric(year),  # gather() stores the former column names as text
    rate = cases / population * 10000
  ) %>%
  arrange(country, year)

table4_rate
## # A tibble: 6 x 3
##   country      year  rate
##   <chr>       <dbl> <dbl>
## 1 Afghanistan  1999 0.373
## 2 Afghanistan  2000 1.29 
## 3 Brazil       1999 2.19 
## 4 Brazil       2000 4.61 
## 5 China        1999 1.67 
## 6 China        2000 1.67

3.Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first?

We need to first filter table2 to include only the rows for cases.

# Keep only the "cases" rows of table2, then draw one grey line per country
# with coloured points on top.
table2 %>%
  filter(type == 'cases') %>%
  ggplot(aes(x = year, y = count)) +
  geom_line(aes(group = country), colour = 'grey50') +
  geom_point(aes(colour = country)) +
  scale_x_continuous(breaks = c(1999, 2000)) +
  labs(y = 'cases')

12.3.3 Exercises

1.Why are gather() and spread() not perfectly symmetrical? Carefully consider the following example:

# initially:
stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half   = c(   1,    2,    1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)

# Print the tibble directly; wrapping it in tibble() again would nest the whole
# data frame inside a single column.
stocks
## # A tibble: 4 x 3
##    year  half return
##   <dbl> <dbl>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2016     1   0.92
## 4  2016     2   0.17
stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>% 
  spread(year, return) %>% 
  gather("year", "return", `2015`:`2016`)
## # A tibble: 4 x 3
##    half year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     1 2016    0.92
## 4     2 2016    0.17

First, the column positions have been shuffled. gather() places the new key and value columns at the end, so year now comes after half.

Also,the data type for year was converted from dbl to chr. In the intermediate step, spread(), 2015 and 2016 became the names of the variables. So when using gather(), 2015 and 2016 were naturally treated as strings, and the variable type for year became chr.

To override this behaviour, we can pass convert = TRUE to gather(), which converts year from a string to an integer. Even then the round trip is not perfectly symmetrical, because year started out as a dbl.

stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>% 
  spread(year, return) %>% 
  gather("year", "return", `2015`:`2016`, convert= TRUE)
## # A tibble: 4 x 3
##    half  year return
##   <dbl> <int>  <dbl>
## 1     1  2015   1.88
## 2     2  2015   0.59
## 3     1  2016   0.92
## 4     2  2016   0.17

2.Why does this code fail?

#table4a %>% 
 # gather(1999, 2000, key = "year", value = #"cases")
#> Error in inds_combine(.vars, ind_list): Position must be between 0 and n

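The column names 1999 and 2000 are non-syntactic, so they must be surrounded by backticks (or quotes). Unquoted, gather() treats them as column positions 1999 and 2000, which do not exist. The columns can then either be listed with a comma or selected as a range with : .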

# `1999` and `2000` are non-syntactic column names, so they are quoted with
# backticks; bare numbers would be read as column positions.
table4a %>%
  gather(`1999`, `2000`, key = "year", value = "cases")
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

3.Why does spreading this tibble fail? How could you add a new column to fix the problem?

There is a duplication of rows with different values hence the table cannot be spread.

people <- tribble(
  ~name,             ~key,    ~value,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)
people$id <- c(1, 1, 2, 1, 1)
people
## # A tibble: 5 x 4
##   name            key    value    id
##   <chr>           <chr>  <dbl> <dbl>
## 1 Phillip Woods   age       45     1
## 2 Phillip Woods   height   186     1
## 3 Phillip Woods   age       50     2
## 4 Jessica Cordero age       37     1
## 5 Jessica Cordero height   156     1

we can then use spread()

people <- tribble(
  ~name,             ~key,    ~value,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)
people$id <- c(1, 1, 2, 1, 1)

people %>% spread(key="key",value="value")
## # A tibble: 3 x 4
##   name               id   age height
##   <chr>           <dbl> <dbl>  <dbl>
## 1 Jessica Cordero     1    37    156
## 2 Phillip Woods       1    45    186
## 3 Phillip Woods       2    50     NA

4.Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?

We can use gather(), and add na.rm = TRUE to remove the NA observation.

preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)
preg%>% gather(key="gender",value="value", 2:3, na.rm=TRUE);
## # A tibble: 3 x 3
##   pregnant gender value
##   <chr>    <chr>  <dbl>
## 1 no       male      20
## 2 yes      female    10
## 3 no       female    12

12.4.3 Separating and Uniting: Exercises

1.What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.

# extra  controls what happens when there are too many pieces
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
  separate(x, c("one", "two", "three"), extra='merge')
## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     f,g  
## 3 h     i     j
#fill controls what happens when there are not enough

tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"), fill='right')
## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     <NA> 
## 3 f     g     i

2.Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?

If set to FALSE, the original separate column, or the united columns, are retained in the output.

#If TRUE, remove input column from output data frame.

tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"), remove=FALSE);
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
## # A tibble: 3 x 4
##   x     one   two   three
##   <chr> <chr> <chr> <chr>
## 1 a,b,c a     b     c    
## 2 d,e   d     e     <NA> 
## 3 f,g,i f     g     i

3.Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?

extract() uses a regular expression with capture groups and turns each group into a new column, whereas separate() splits a column at a separator or at fixed positions.

There are many ways to separate a column into multiple columns. In contrast, there is only one way to put together multiple columns into a single column.

12.5.1 Missing Values

1.Compare and contrast the fill arguments to spread() and complete().

In spread(), all NAs will be replaced by the fill value. The fill argument only takes in one value.

In complete(), NAs under different variables can be replaced by different values. The fill argument takes in a list that specifies the values to replace NA for different variables.

2.What does the direction argument to fill() do?

The default value is down. Any NAs will be replaced by the previous non-missing value. The filling direction can be reversed if .direction is set to up.

12.6.1 Case Study

1.In this case study I set na.rm = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero?

who %>%
  group_by(country) %>%
  summarize(year_min = min(year), year_max = max(year)) %>%
  ggplot() +
  geom_point(mapping = aes(x = country, y = year_min), color = 'red') +
  geom_point(mapping = aes(x = country , y= year_max), color = 'blue') +
  coord_flip()

Some countries have missing values. Most countries have 34 years of recorded data, but some have fewer.

To check the number of missing values(NA) in each column:

# Count the NA entries in every measurement column; the first four columns
# (country, iso2, iso3, year) are identifiers and are dropped first.
# The result is a named vector with one count per remaining column.
who %>%
  select(-(1:4)) %>%
  is.na() %>%
  colSums()

2.What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, “newrel”, “new_rel”)))

We’ll get problems in the subsequent steps because the variable names are not consistent: the newrel codes lack the underscore, so separate() cannot split them into three pieces.

3.I claimed that iso2 and iso3 were redundant with country. Confirm this claim.

who
## # A tibble: 7,240 x 60
##    country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534
##    <chr>   <chr> <chr> <int>       <int>        <int>        <int>
##  1 Afghan~ AF    AFG    1980          NA           NA           NA
##  2 Afghan~ AF    AFG    1981          NA           NA           NA
##  3 Afghan~ AF    AFG    1982          NA           NA           NA
##  4 Afghan~ AF    AFG    1983          NA           NA           NA
##  5 Afghan~ AF    AFG    1984          NA           NA           NA
##  6 Afghan~ AF    AFG    1985          NA           NA           NA
##  7 Afghan~ AF    AFG    1986          NA           NA           NA
##  8 Afghan~ AF    AFG    1987          NA           NA           NA
##  9 Afghan~ AF    AFG    1988          NA           NA           NA
## 10 Afghan~ AF    AFG    1989          NA           NA           NA
## # ... with 7,230 more rows, and 53 more variables: new_sp_m3544 <int>,
## #   new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
## #   new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
## #   new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
## #   new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
## #   new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
## #   new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
## #   new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
## #   new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## #   new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
## #   new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
## #   new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
## #   new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
## #   new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
## #   newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
## #   newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## #   newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
## #   newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
## #   newrel_f65 <int>
# to check if two columns are identical
# Compare the column contents; "iso2" == "iso3" would only compare the two
# literal strings and is always FALSE.
if (identical(who$iso2, who$iso3)) {
  print("iso2 and iso3 are identical")
} else {
  print("not identical")
}
## [1] "not identical"

The two columns are not identical but redundant.

# to check if the columns are redundant
# Each country must map to exactly one (iso2, iso3) pair, so no country may
# occur more than once among the distinct combinations.
who %>%
  distinct(country, iso2, iso3) %>%
  count(country) %>%
  filter(n > 1) %>%
  nrow()
## [1] 0

4.For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data.

# Total TB cases per country, year and sex, drawn as one line per
# country/sex combination.
who %>%
  gather(code, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
  # make the "newrel" codes follow the same underscore-separated pattern
  mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
  separate(code, c("new", "var", "sexage")) %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1) %>%
  group_by(country, year, sex) %>%
  summarize(total_case = sum(value)) %>%
  ungroup() %>%  # drop the leftover grouping before further reshaping
  unite(country_sex, country, sex, remove = FALSE) %>%
  ggplot() +
  geom_line(mapping = aes(x = year, y = total_case, color = sex, group = country_sex))