library(tidyverse)
## -- Attaching packages ------ tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts --------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
1.How can you tell if an object is a tibble?
tibble::as_tibble(mtcars);
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
print(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
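An object is a tibble if printing it shows the header "# A tibble: ...", only the first 10 rows, and the type of each column; an ordinary data frame such as mtcars prints every row. You can also check directly with is_tibble(mtcars) or class(mtcars).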
2.Compare and contrast the following operations on a data.frame and equivalent tibble. What is different? Why might the default data frame behaviours cause you frustration?
#df
# Base data.frame version of the three extraction operations.
df <- data.frame(abc = 1, xyz = "a")
# There is no column `x`: `$` partially matches names, so this silently returns `xyz`.
df$x
## [1] a
## Levels: a
# Selecting a single column with `[` returns a vector (a factor here), not a data frame.
df[, "xyz"]
## [1] a
## Levels: a
# Selecting two columns with `[` returns a data frame.
df[, c("abc", "xyz")]
## abc xyz
## 1 1 a
#tibble
df<-tibble(
abc=1,
xyz="a"
)
df$x
## Warning: Unknown or uninitialised column: 'x'.
## NULL
df[, "xyz"]
## # A tibble: 1 x 1
## xyz
## <chr>
## 1 a
df[, c("abc","xyz")]
## # A tibble: 1 x 2
## abc xyz
## <dbl> <chr>
## 1 1 a
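The differences: with a data.frame, `$` does partial matching, so df$x silently returns the xyz column, whereas a tibble returns NULL and warns about the unknown column. With a data.frame, df[, "xyz"] returns a vector (a factor here) while df[, c("abc", "xyz")] returns a data frame; with a tibble, `[` always returns a tibble. The data.frame defaults are frustrating because a typo can silently select the wrong column, and the type returned by `[` depends on how many columns are selected.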
3.If you have the name of a variable stored in an object, e.g. var <- “mpg”, how can you extract the reference variable from a tibble?
# as_tibble() replaces the deprecated as.tibble().
tb_cars <- as_tibble(mtcars)
var <- "mpg"
# `[[` extracts the column itself as a vector; `[` would return a one-column
# tibble, and tb_cars$var would look for a column literally named "var".
tb_cars[[var]]
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
4.Practice referring to non-syntactic names in the following data frame by:
annoying <- tibble(
`1` = 1:10,
`2` = `1` * 2 + rnorm(length(`1`))
)
1.Extracting the variable called 1
# `[[` (or annoying$`1`) extracts the column as a vector; `[` would only
# return a one-column tibble.
annoying[["1"]]
## [1] 1 2 3 4 5 6 7 8 9 10
2.Plotting a scatterplot of 1 vs 2
# Non-syntactic column names must be wrapped in backticks inside aes().
ggplot(annoying, aes(x = `1`, y = `2`)) +
  geom_point()
3.Creating a new column 3 which is 2 divided by 1
mutate(annoying, `3`=`2`/`1`);
## # A tibble: 10 x 3
## `1` `2` `3`
## <int> <dbl> <dbl>
## 1 1 2.98 2.98
## 2 2 3.33 1.67
## 3 3 5.62 1.87
## 4 4 10.4 2.60
## 5 5 10.7 2.13
## 6 6 12.0 1.99
## 7 7 14.5 2.07
## 8 8 17.3 2.16
## 9 9 15.9 1.77
## 10 10 19.9 1.99
4.Renaming the columns to one, two and three.
# `3` must be created in the same pipeline, because the earlier mutate() result
# was only printed and never stored back into `annoying`.
annoying %>%
  mutate(`3` = `2` / `1`) %>%
  rename(one = `1`, two = `2`, three = `3`)
5.What does tibble::enframe() do? When might you use it?
#enframe() converts named atomic vectors or lists to one- or two-column data frames
#?enframe;
6.What option controls how many additional column names are printed at the footer of a tibble?
# tibble.max_extra_cols
1.What function would you use to read a file where fields were separated with “|”?
#read_delim()
2.Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?
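#read_csv() and read_tsv() are special cases of read_delim() and differ only in the delimiter, so they share all their other arguments: col_names, col_types, locale, na, quoted_na, quote, trim_ws, n_max, guess_max, progress and skip_empty_rows.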
3.What are the most important arguments to read_fwf()?
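#col_positions - tells read_fwf() where each column starts and ends; built with fwf_widths(), fwf_positions() or fwf_empty()
#col_names - the column names, supplied through those helper functions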
4.Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like " or ’. By convention, read_csv() assumes that the quoting character will be ", and if you want to change it you’ll need to use read_delim() instead. What arguments do you need to specify to read the following text into a data frame?
# The field 'a,b' is wrapped in single quotes, so tell the parser that the
# quoting character is ' instead of the default ".
read_csv("x,y\n1,'a,b'", quote = "'");
## # A tibble: 1 x 2
## x y
## <dbl> <chr>
## 1 1 a,b
5.Identify what is wrong with each of the following inline CSV files. What happens when you run the code?
No arguments have been passed. (i).
read_csv("a,b\n1,2,3\n4,5,6", na=
".",col_names=FALSE)
## Warning: 2 parsing failures.
## row col expected actual file
## 2 -- 2 columns 3 columns literal data
## 3 -- 2 columns 3 columns literal data
## # A tibble: 3 x 2
## X1 X2
## <chr> <chr>
## 1 a b
## 2 1 2
## 3 4 5
(ii).
read_csv("a,b,c\n1,2\n1,2,3,4", na=".", col_names=FALSE);
## Warning: 2 parsing failures.
## row col expected actual file
## 2 -- 3 columns 2 columns literal data
## 3 -- 3 columns 4 columns literal data
## # A tibble: 3 x 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 a b c
## 2 1 2 <NA>
## 3 1 2 3
(iii).
read_csv("a,b\n\"1", col_names=FALSE);
## Warning: 2 parsing failures.
## row col expected actual file
## 2 X1 closing quote at end of file literal data
## 2 -- 2 columns 1 columns literal data
## # A tibble: 2 x 2
## X1 X2
## <chr> <chr>
## 1 a b
## 2 1 <NA>
(iv).
read_csv("a,b\n1,2\na,b",na=".", col_names=FALSE)
## # A tibble: 3 x 2
## X1 X2
## <chr> <chr>
## 1 a b
## 2 1 2
## 3 a b
(v).
read_csv("a;b\n1;3",col_names=FALSE)
## # A tibble: 2 x 1
## X1
## <chr>
## 1 a;b
## 2 1;3
11.3 PARSING A VECTOR
1.What are the most important arguments to locale()?
#encoding=""
2.What happens if you try and set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to “,”? What happens to the default value of decimal_mark when you set the grouping_mark to “.”?
-If you set decimal_mark and grouping_mark to the same character,the execution will be halted,they are supposed to be different.
#decimal_mark=","
locale("es", decimal_mark = ",")
## <locale>
## Numbers: 123.456,78
## Formats: %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days: domingo (dom.), lunes (lun.), martes (mar.), miércoles (mié.),
## jueves (jue.), viernes (vie.), sábado (sáb.)
## Months: enero (ene.), febrero (feb.), marzo (mar.), abril (abr.), mayo
## (may.), junio (jun.), julio (jul.), agosto (ago.),
## septiembre (sept.), octubre (oct.), noviembre (nov.),
## diciembre (dic.)
## AM/PM: a. m./p. m.
#grouping_mark="."
# Set only grouping_mark so the printed locale shows which decimal_mark
# readr picks by default.
locale("es", grouping_mark = ".")
## <locale>
## Numbers: 123.456,78
## Formats: %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days: domingo (dom.), lunes (lun.), martes (mar.), miércoles (mié.),
## jueves (jue.), viernes (vie.), sábado (sáb.)
## Months: enero (ene.), febrero (feb.), marzo (mar.), abril (abr.), mayo
## (may.), junio (jun.), julio (jul.), agosto (ago.),
## septiembre (sept.), octubre (oct.), noviembre (nov.),
## diciembre (dic.)
## AM/PM: a. m./p. m.
When you set decimal_mark to “,”, the default grouping_mark becomes “.”; when you set grouping_mark to “.”, the default decimal_mark becomes “,”.
3.I didn’t discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful.
parse_date("01/02/15", locale = locale(date_format = "%d/%m/%y"))
## [1] "2015-02-01"
4.If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.
5.What’s the difference between read_csv() and read_csv2()?
#read_csv
read_csv("a;b\n1;3",col_names=FALSE)
## # A tibble: 2 x 1
## X1
## <chr>
## 1 a;b
## 2 1;3
#read_csv2
read_csv2("a;b\n1;3",col_names=FALSE)
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## # A tibble: 2 x 2
## X1 X2
## <chr> <chr>
## 1 a b
## 2 1 3
read_csv() is comma delimited while read_csv2() is semi-colon delimited.
6.What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out.
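#Europe: ISO-8859-1 (Latin-1) and Windows-1252 for Western European languages, ISO-8859-2 for Central and Eastern European languages.
#Asia: Shift-JIS and EUC-JP (Japanese), GB2312/GBK (Chinese), Big5 (traditional Chinese), EUC-KR (Korean). UTF-8 is increasingly used everywhere.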
7.Generate the correct format string to parse each of the following dates and times:
#(i)
d1 <- "January 1, 2010"
parse_date(d1, "%B %d, %Y")
## [1] "2010-01-01"
#(ii)
d2 <- "2015-Mar-07"
parse_date(d2, "%Y-%b-%d")
## [1] "2015-03-07"
#(iii)
d3 <- "06-Jun-2017"
parse_date(d3, "%d-%b-%Y")
## [1] "2017-06-06"
#(iv)
d4 <- c("August 19 (2015)", "July 1 (2015)")
parse_date(d4, "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
#(v)
d5 <- "12/30/14" # Dec 30, 2014
parse_date(d5, "%m/%d/%y")
## [1] "2014-12-30"
#(vi)
t1 <- "1705"
parse_time(t1, "%H%M")
## 17:05:00
#(vii)
t2 <- "11:15:10.12 PM"
parse_time(t2, "%I:%M:%OS %p")
## 23:15:10.12
12.2 TIDY DATA
library(tidyverse);
12.2.1 Exercises
In table1, each row is one observation (a country in a year) and each variable (country, year, cases, population) has its own column.
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
In table2 , each row represents the country, year, and the variable type of either case or population.
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
In table3, cases and population are combined into a single character column, rate, written as cases/population.
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
In table4, cases and population are represented in different tables where each row represents country, and the years 1999 and 2000
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
2.Compute the rate for table2, and table4a + table4b. You will need to perform four operations: 1.Extract the number of TB cases per country per year. 2.Extract the matching population per country per year. 3.Divide cases by population, and multiply by 10000. 4.Store back in the appropriate place.
#for table 2:
# Split table2 into its "cases" and "population" rows once; the two subsets are
# matched purely by row position when the rate is computed.
table2_cases <- filter(table2, type == "cases")
table2_population <- filter(table2, type == "population")
countries <- table2_cases$country
years <- table2_cases$year
cases <- table2_cases$count
populations <- table2_population$count
# Rate = cases per 10,000 people.
table2_rate <- tibble(
  country = countries,
  year = years,
  rate = cases / populations * 10000
)
table2_rate
## # A tibble: 6 x 3
## country year rate
## <chr> <int> <dbl>
## 1 Afghanistan 1999 0.373
## 2 Afghanistan 2000 1.29
## 3 Brazil 1999 2.19
## 4 Brazil 2000 4.61
## 5 China 1999 1.67
## 6 China 2000 1.67
#for table 4a+ 4b
# table4a holds the cases and table4b the populations, one column per year.
countries <- table4a$country
cases_1999 <- table4a$`1999`
populations_1999 <- table4b$`1999`
cases_2000 <- table4a$`2000`
populations_2000 <- table4b$`2000`
# One rate table per year (cases per 10,000 people) ...
table_1999_rate <- tibble(
  country = countries,
  year = 1999,
  rate = cases_1999 / populations_1999 * 10000
)
table_2000_rate <- tibble(
  country = countries,
  year = 2000,
  rate = cases_2000 / populations_2000 * 10000
)
# ... stacked and sorted by country.
table4_rate <- arrange(bind_rows(table_1999_rate, table_2000_rate), country)
table4_rate
## # A tibble: 6 x 3
## country year rate
## <chr> <dbl> <dbl>
## 1 Afghanistan 1999 0.373
## 2 Afghanistan 2000 1.29
## 3 Brazil 1999 2.19
## 4 Brazil 2000 4.61
## 5 China 1999 1.67
## 6 China 2000 1.67
3.Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first?
We need to first filter table2 to include only the rows for cases.
# Keep only the "cases" rows of table2, then draw one grey line per country
# with points coloured by country.
table2 %>%
  filter(type == "cases") %>%
  ggplot(aes(x = year, y = count)) +
  geom_line(aes(group = country), color = "grey50") +
  geom_point(aes(color = country)) +
  scale_x_continuous(breaks = c(1999, 2000)) +
  labs(y = "cases")
12.3.3 Exercises
1.Why are gather() and spread() not perfectly symmetrical? Carefully consider the following example:
#initially:
stocks <- tibble(
  year = c(2015, 2015, 2016, 2016),
  half = c(1, 2, 1, 2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
# Print the tibble directly; wrapping it in tibble() again would nest it
# inside a single data-frame column.
stocks
## # A tibble: 4 x 3
## year half return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2016 1 0.92
## 4 2016 2 0.17
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
spread(year, return) %>%
gather("year", "return", `2015`:`2016`)
## # A tibble: 4 x 3
## half year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 1 2016 0.92
## 4 2 2016 0.17
First, the column order has changed: gather() appends the new key and value columns (year and return) after the remaining column (half).
Also,the data type for year was converted from dbl to chr. In the intermediate step, spread(), 2015 and 2016 became the names of the variables. So when using gather(), 2015 and 2016 were naturally treated as strings, and the variable type for year became chr.
To override this behaviour, we can pass convert = TRUE to gather(), which converts the year column from a string to an integer.
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
spread(year, return) %>%
gather("year", "return", `2015`:`2016`, convert= TRUE)
## # A tibble: 4 x 3
## half year return
## <dbl> <int> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 1 2016 0.92
## 4 2 2016 0.17
2.Why does this code fail?
#table4a %>%
# gather(1999, 2000, key = "year", value = "cases")
#> Error in inds_combine(.vars, ind_list): Position must be between 0 and n
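The code fails because 1999 and 2000 are non-syntactic column names. Written as bare numbers, gather() interprets them as column positions 1999 and 2000, which do not exist in table4a. The names must be quoted, with backticks or quotation marks. The quoted columns can then be listed separated by commas or selected as a range with : .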
# Backticks mark `1999` and `2000` as column names rather than positions.
table4a %>%
  gather(`1999`:`2000`, key = "year", value = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
3.Why does spreading this tibble fail? How could you add a new column to fix the problem?
There is a duplication of rows with different values hence the table cannot be spread.
people <- tribble(
~name, ~key, ~value,
#-----------------|--------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
people$id <- c(1, 1, 2, 1, 1)
people
## # A tibble: 5 x 4
## name key value id
## <chr> <chr> <dbl> <dbl>
## 1 Phillip Woods age 45 1
## 2 Phillip Woods height 186 1
## 3 Phillip Woods age 50 2
## 4 Jessica Cordero age 37 1
## 5 Jessica Cordero height 156 1
we can then use spread()
people <- tribble(
~name, ~key, ~value,
#-----------------|--------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
people$id <- c(1, 1, 2, 1, 1)
people %>% spread(key="key",value="value")
## # A tibble: 3 x 4
## name id age height
## <chr> <dbl> <dbl> <dbl>
## 1 Jessica Cordero 1 37 156
## 2 Phillip Woods 1 45 186
## 3 Phillip Woods 2 50 NA
4.Tidy the simple tibble below. Do you need to spread or gather it? What are the variables?
We can use gather(), and add na.rm = TRUE to remove the NA observation.
preg <- tribble(
~pregnant, ~male, ~female,
"yes", NA, 10,
"no", 20, 12
)
preg%>% gather(key="gender",value="value", 2:3, na.rm=TRUE);
## # A tibble: 3 x 3
## pregnant gender value
## <chr> <chr> <dbl>
## 1 no male 20
## 2 yes female 10
## 3 no female 12
12.4.3 Separating and Unity: Exercises
1.What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
# extra controls what happens when there are too many pieces
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
separate(x, c("one", "two", "three"), extra='merge')
## # A tibble: 3 x 3
## one two three
## <chr> <chr> <chr>
## 1 a b c
## 2 d e f,g
## 3 h i j
#fill controls what happens when there are not enough
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"), fill='right')
## # A tibble: 3 x 3
## one two three
## <chr> <chr> <chr>
## 1 a b c
## 2 d e <NA>
## 3 f g i
2.Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?
If set to FALSE, the original separate column, or the united columns, are retained in the output.
#If TRUE, remove input column from output data frame.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"), remove=FALSE);
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
## # A tibble: 3 x 4
## x one two three
## <chr> <chr> <chr> <chr>
## 1 a,b,c a b c
## 2 d,e d e <NA>
## 3 f,g,i f g i
3.Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?
extract() uses a regular expression with capture groups and turns each group into a new column, whereas separate() splits a column at a separator or at fixed positions.
There are many ways to separate a column into multiple columns. In contrast, there is only one way to put together multiple columns into a single column.
12.5.1 Missing Values
1.Compare and contrast the fill arguments to spread() and complete().
In spread(), all NAs will be replaced by the fill value. The fill argument only takes in one value.
In complete(), NAs under different variables can be replaced by different values. The fill argument takes in a list that specifies the values to replace NA for different variables.
2.What does the direction argument to fill() do?
The default value is down. Any NAs will be replaced by the previous non-missing value. The filling direction can be reversed if .direction is set to up.
12.6.1 Case Study
1.In this case study I set na.rm = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero?
who %>%
group_by(country) %>%
summarize(year_min = min(year), year_max = max(year)) %>%
ggplot() +
geom_point(mapping = aes(x = country, y = year_min), color = 'red') +
geom_point(mapping = aes(x = country , y= year_max), color = 'blue') +
coord_flip()
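Some countries have missing years. Most countries have 34 years of recorded data, but some have fewer, so there are implicit missing values (country-year rows that are absent altogether) in addition to the explicit NAs.
To look at the case-count columns, which contain many explicit missing values (NA):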
who %>% select(-c(1:4))
## # A tibble: 7,240 x 56
## new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554
## <int> <int> <int> <int> <int>
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## 7 NA NA NA NA NA
## 8 NA NA NA NA NA
## 9 NA NA NA NA NA
## 10 NA NA NA NA NA
## # ... with 7,230 more rows, and 51 more variables: new_sp_m5564 <int>,
## # new_sp_m65 <int>, new_sp_f014 <int>, new_sp_f1524 <int>,
## # new_sp_f2534 <int>, new_sp_f3544 <int>, new_sp_f4554 <int>,
## # new_sp_f5564 <int>, new_sp_f65 <int>, new_sn_m014 <int>,
## # new_sn_m1524 <int>, new_sn_m2534 <int>, new_sn_m3544 <int>,
## # new_sn_m4554 <int>, new_sn_m5564 <int>, new_sn_m65 <int>,
## # new_sn_f014 <int>, new_sn_f1524 <int>, new_sn_f2534 <int>,
## # new_sn_f3544 <int>, new_sn_f4554 <int>, new_sn_f5564 <int>,
## # new_sn_f65 <int>, new_ep_m014 <int>, new_ep_m1524 <int>,
## # new_ep_m2534 <int>, new_ep_m3544 <int>, new_ep_m4554 <int>,
## # new_ep_m5564 <int>, new_ep_m65 <int>, new_ep_f014 <int>,
## # new_ep_f1524 <int>, new_ep_f2534 <int>, new_ep_f3544 <int>,
## # new_ep_f4554 <int>, new_ep_f5564 <int>, new_ep_f65 <int>,
## # newrel_m014 <int>, newrel_m1524 <int>, newrel_m2534 <int>,
## # newrel_m3544 <int>, newrel_m4554 <int>, newrel_m5564 <int>,
## # newrel_m65 <int>, newrel_f014 <int>, newrel_f1524 <int>,
## # newrel_f2534 <int>, newrel_f3544 <int>, newrel_f4554 <int>,
## # newrel_f5564 <int>, newrel_f65 <int>
2.What happens if you neglect the mutate() step? (mutate(key = stringr::str_replace(key, “newrel”, “new_rel”)))
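We’ll get problems in the subsequent steps because the variable names are not consistent: the codes starting with newrel lack the underscore after new, so the later separate() step cannot split them into the expected three pieces.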
3.I claimed that iso2 and iso3 were redundant with country. Confirm this claim.
who
## # A tibble: 7,240 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Afghan~ AF AFG 1980 NA NA NA
## 2 Afghan~ AF AFG 1981 NA NA NA
## 3 Afghan~ AF AFG 1982 NA NA NA
## 4 Afghan~ AF AFG 1983 NA NA NA
## 5 Afghan~ AF AFG 1984 NA NA NA
## 6 Afghan~ AF AFG 1985 NA NA NA
## 7 Afghan~ AF AFG 1986 NA NA NA
## 8 Afghan~ AF AFG 1987 NA NA NA
## 9 Afghan~ AF AFG 1988 NA NA NA
## 10 Afghan~ AF AFG 1989 NA NA NA
## # ... with 7,230 more rows, and 53 more variables: new_sp_m3544 <int>,
## # new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
## # new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
## # new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
## # new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
## # new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
## # new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
## # new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
## # new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## # new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
## # new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
## # new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
## # new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
## # new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
## # newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
## # newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
## # newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
## # newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
## # newrel_f65 <int>
# Check whether the two columns are identical.
# Compare the actual column values: "iso2" == "iso3" would only compare two
# different string literals and is always FALSE, whatever the data holds.
if (identical(who$iso2, who$iso3)) {
  print("iso2 and iso3 are identical")
} else {
  print("not identical")
}
## [1] "not identical"
The two columns are not identical but redundant.
# To check that iso2 and iso3 are redundant with country: every country must
# map to exactly one (iso2, iso3) pair, so no country may appear more than once
# among the distinct combinations. An empty result confirms the claim.
who %>%
  distinct(country, iso2, iso3) %>%
  group_by(country) %>%
  filter(n() > 1)
## # A tibble: 0 x 3
## # Groups: country [0]
## # ... with 3 variables: country <chr>, iso2 <chr>, iso3 <chr>
4.For each country, year, and sex compute the total number of cases of TB. Make an informative visualisation of the data.
# Total TB cases per country, year and sex, drawn as one line per
# country/sex combination and coloured by sex.
who %>%
  gather(code, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
  mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
  separate(code, into = c("new", "var", "sexage")) %>%
  # sep = 1 splits after the first character: sex letter, then age group.
  separate(sexage, into = c("sex", "age"), sep = 1) %>%
  select(-new, -iso2, -iso3) %>%
  group_by(country, year, sex) %>%
  summarise(total_case = sum(value)) %>%
  # country_sex gives each line its own group; the originals are kept.
  unite(country_sex, country, sex, remove = FALSE) %>%
  ggplot(aes(x = year, y = total_case, colour = sex, group = country_sex)) +
  geom_line()