Introduction to Tibble

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

*Convert dataframe to tibble

library(tibble)

df<-as_tibble(iris)

head(df)
## # A tibble: 6 x 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl>  <fctr>
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

*Creating a tibble

# Build a 10-row tibble column by column: standard-normal draws, uniform
# draws, and capital letters sampled with replacement.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
df1 <- tibble(
  x = rnorm(10),
  y = runif(10),
  Ab = sample(LETTERS, size = 10, replace = TRUE)
)

head(df1)
## # A tibble: 6 x 3
##            x          y    Ab
##        <dbl>      <dbl> <chr>
## 1  0.6783612 0.85004568     G
## 2  0.2780036 0.08800381     X
## 3  0.9491178 0.03994298     C
## 4  1.7202164 0.94860915     W
## 5  1.2657255 0.72267166     C
## 6 -1.6533803 0.33113699     O

*Tibble with date time

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Build a 20-row tibble with a date-time column, a date column, and a
# character column.
#   Time     - the current time plus a random offset of up to 23 seconds
#   Date     - today's date plus a random offset of up to 10 days
#   myletter - lower-case letters sampled with replacement
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
df2 <- tibble(
  Time = lubridate::now() + runif(20) * 23,
  Date = lubridate::today() + runif(20) * 10,
  myletter = sample(letters, size = 20, replace = TRUE)
)

head(df2)
## # A tibble: 6 x 3
##                  Time       Date myletter
##                <dttm>     <date>    <chr>
## 1 2017-09-22 20:52:33 2017-09-24        z
## 2 2017-09-22 20:52:15 2017-09-26        p
## 3 2017-09-22 20:52:18 2017-09-30        o
## 4 2017-09-22 20:52:14 2017-10-01        m
## 5 2017-09-22 20:52:32 2017-09-26        p
## 6 2017-09-22 20:52:17 2017-09-25        p

*Converting character to numeric

library(readr)

a<-parse_double("1234")

class(a)
## [1] "numeric"
# parse_number() ignores the non-numeric characters around the number

a<-parse_number("This is cost $234")

a
## [1] 234
# Use a locale to set the decimal mark or the grouping mark
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789

Characters

charToRaw("Tuyen")
## [1] 54 75 79 65 6e

UTF-8 is the most common encoding used

# Two strings built from raw byte escapes (\x..). How they display depends on
# the encoding in which those bytes are interpreted.
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
x1 # byte \xf1 is shown as "ñ" here (it is the ISO-8859-1 code for that letter)
## [1] "El Niño was particularly bad this year"
x2 # prints garbled: presumably these bytes use another encoding (Shift-JIS?) — TODO confirm
## [1] "<U+0082>±<U+0082>ñ<U+0082>É<U+0082>¿<U+0082>Í"

*Guessing the encoding of a data file

df<-read.csv("https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/LungCapData.csv",sep=";")

head(df)
##   LungCap Age Height Smoke Gender Caesarean
## 1   6.475   6   62.1    no   male        no
## 2  10.125  18   74.7   yes female        no
## 3   9.550  16   69.7    no female       yes
## 4  11.125  14   71.0    no   male        no
## 5   4.800   5   56.9    no   male        no
## 6   6.225  11   58.7    no female        no
# guess_encoding() inspects raw bytes, so it must be given the file itself
# (a path/URL) or a raw vector. Passing the already-parsed data frame makes
# its internal `if` test a length > 1 condition, which is a warning in old R
# and an error in R >= 4.2.
guess_encoding("https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/LungCapData.csv") # This dataset is ASCII-encoded
## # A tibble: 1 x 2
##   encoding confidence
##      <chr>      <dbl>
## 1    ASCII          1
guess_encoding(charToRaw(x1)) # Two candidate encodings are returned; the first has the higher confidence
## # A tibble: 2 x 2
##     encoding confidence
##        <chr>      <dbl>
## 1 ISO-8859-1       0.46
## 2 ISO-8859-9       0.23
parse_character(x1, locale = locale(encoding = "ISO-8859-1"))
## [1] "El Niño was particularly bad this year"

*parse_datetime() expects the date-time components ordered from largest to smallest: year, month, day, hour, minute, second

parse_datetime("20100928") # date only
## [1] "2010-09-28 UTC"
parse_datetime("20100928T2059") # date followed by a time (hours and minutes after the "T")
## [1] "2010-09-28 20:59:00 UTC"
# Parse dates with an explicit format: month, day, and year

parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y") # day, month and year
## [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d") # year, month and day
## [1] "2001-02-15"
challenge <- read_csv(readr_example("challenge.csv"))
## Parsed with column specification:
## cols(
##   x = col_integer(),
##   y = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1000 parsing failures.
## row # A tibble: 5 x 5 col     row   col               expected             actual expected   <int> <chr>                  <chr>              <chr> actual 1  1001     x no trailing characters .23837975086644292 file 2  1002     x no trailing characters .41167997173033655 row 3  1003     x no trailing characters  .7460716762579978 col 4  1004     x no trailing characters   .723450553836301 expected 5  1005     x no trailing characters   .614524137461558 actual # ... with 1 more variables: file <chr>
## ... ................. ... ....................................................... ........ ....................................................... ...... ....................................................... .... ....................................................... ... ....................................................... ... ....................................................... ........ ....................................................... ...... .......................................
## See problems(...) for more details.
# Inspecting the parsing failures to find a solution

problems(challenge)
## # A tibble: 1,000 x 5
##      row   col               expected             actual
##    <int> <chr>                  <chr>              <chr>
##  1  1001     x no trailing characters .23837975086644292
##  2  1002     x no trailing characters .41167997173033655
##  3  1003     x no trailing characters  .7460716762579978
##  4  1004     x no trailing characters   .723450553836301
##  5  1005     x no trailing characters   .614524137461558
##  6  1006     x no trailing characters   .473980569280684
##  7  1007     x no trailing characters  .5784610391128808
##  8  1008     x no trailing characters  .2415937229525298
##  9  1009     x no trailing characters .11437866208143532
## 10  1010     x no trailing characters  .2983446326106787
## # ... with 990 more rows, and 1 more variables: file <chr>
# Fixing the problem
# First attempt: copy the guessed column specification into an explicit
# `col_types`. Because x is still declared as integer, this reproduces the
# same parsing failures (see the warnings printed below).
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(
    x = col_integer(),
    y = col_character()
  )
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1000 parsing failures.
## row # A tibble: 5 x 5 col     row   col               expected             actual expected   <int> <chr>                  <chr>              <chr> actual 1  1001     x no trailing characters .23837975086644292 file 2  1002     x no trailing characters .41167997173033655 row 3  1003     x no trailing characters  .7460716762579978 col 4  1004     x no trailing characters   .723450553836301 expected 5  1005     x no trailing characters   .614524137461558 actual # ... with 1 more variables: file <chr>
## ... ................. ... ....................................................... ........ ....................................................... ...... ....................................................... .... ....................................................... ... ....................................................... ... ....................................................... ........ ....................................................... ...... .......................................
## See problems(...) for more details.
#

# Second attempt: declare x as double so that the fractional values from
# row 1001 onwards (listed by problems() above) can be parsed.
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(
    x = col_double(),
    y = col_character()
  )
)

head(challenge)
## # A tibble: 6 x 2
##       x     y
##   <dbl> <chr>
## 1   404  <NA>
## 2  4172  <NA>
## 3  3004  <NA>
## 4   787  <NA>
## 5    37  <NA>
## 6  2332  <NA>

*Data Cleaning

The data used in this demo come from the WHO (the `who` dataset loaded with tidyr)

library(tidyr)

head(who)
## # A tibble: 6 x 60
##       country  iso2  iso3  year new_sp_m014 new_sp_m1524 new_sp_m2534
##         <chr> <chr> <chr> <int>       <int>        <int>        <int>
## 1 Afghanistan    AF   AFG  1980          NA           NA           NA
## 2 Afghanistan    AF   AFG  1981          NA           NA           NA
## 3 Afghanistan    AF   AFG  1982          NA           NA           NA
## 4 Afghanistan    AF   AFG  1983          NA           NA           NA
## 5 Afghanistan    AF   AFG  1984          NA           NA           NA
## 6 Afghanistan    AF   AFG  1985          NA           NA           NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
tail(who)
## # A tibble: 6 x 60
##    country  iso2  iso3  year new_sp_m014 new_sp_m1524 new_sp_m2534
##      <chr> <chr> <chr> <int>       <int>        <int>        <int>
## 1 Zimbabwe    ZW   ZWE  2008         127          614            0
## 2 Zimbabwe    ZW   ZWE  2009         125          578           NA
## 3 Zimbabwe    ZW   ZWE  2010         150          710         2208
## 4 Zimbabwe    ZW   ZWE  2011         152          784         2467
## 5 Zimbabwe    ZW   ZWE  2012         120          783         2421
## 6 Zimbabwe    ZW   ZWE  2013          NA           NA           NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Checking the number of rows and columns

dim(who)
## [1] 7240   60
# It is good practice to give columns meaningful names

?who
## starting httpd help server ...
##  done
# Work on a copy of `who` with its third column (iso3) removed, so that only
# one country code (iso2) remains; then give that code column a clearer name.
df <- who[, -3]
names(df)[2] <- "Country_ID"

head(df)
## # A tibble: 6 x 59
##       country Country_ID  year new_sp_m014 new_sp_m1524 new_sp_m2534
##         <chr>      <chr> <int>       <int>        <int>        <int>
## 1 Afghanistan         AF  1980          NA           NA           NA
## 2 Afghanistan         AF  1981          NA           NA           NA
## 3 Afghanistan         AF  1982          NA           NA           NA
## 4 Afghanistan         AF  1983          NA           NA           NA
## 5 Afghanistan         AF  1984          NA           NA           NA
## 6 Afghanistan         AF  1985          NA           NA           NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Transform columns to rows as below

# Reshape wide -> long: stack every case-count column (new_sp_m014 through
# newrel_f65) into an `Age_group` key column and a `Cases` value column.
df1 <- gather(df, key = Age_group, value = Cases, new_sp_m014:newrel_f65)

head(df1)
## # A tibble: 6 x 5
##       country Country_ID  year   Age_group Cases
##         <chr>      <chr> <int>       <chr> <int>
## 1 Afghanistan         AF  1980 new_sp_m014    NA
## 2 Afghanistan         AF  1981 new_sp_m014    NA
## 3 Afghanistan         AF  1982 new_sp_m014    NA
## 4 Afghanistan         AF  1983 new_sp_m014    NA
## 5 Afghanistan         AF  1984 new_sp_m014    NA
## 6 Afghanistan         AF  1985 new_sp_m014    NA
# Making age group names consistent
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Tidy `who` in a single pipeline: reshape it to long form, make the codes
# uniform, and split each code into its components.
df<-who %>%
# Wide -> long: one row per country/year/code, dropping rows whose value is NA
  gather(code, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>% 
# Rewrite "newrel" as "new_rel" so every code has the same underscore-separated layout
  mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
# Split the code into three pieces (the pieces are separated by underscores)
  separate(code, c("new", "var", "sexage")) %>% 
# Drop the `new` piece and both country-code columns
  select(-new, -iso2, -iso3) %>% 
# Split `sexage` into `sex` and `age` after the first character
  separate(sexage, c("sex", "age"), sep = 1)

head(df)
## # A tibble: 6 x 6
##       country  year   var   sex   age value
##         <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan  1997    sp     m   014     0
## 2 Afghanistan  1998    sp     m   014    30
## 3 Afghanistan  1999    sp     m   014     8
## 4 Afghanistan  2000    sp     m   014    52
## 5 Afghanistan  2001    sp     m   014   129
## 6 Afghanistan  2002    sp     m   014    90
library(tidyverse)
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## as.difftime(): lubridate, base
## date():        lubridate, base
## filter():      dplyr, stats
## intersect():   lubridate, base
## lag():         dplyr, stats
## lift():        purrr, caret
## setdiff():     lubridate, base
## union():       lubridate, base
library(nycflights13)

head(airlines)
## # A tibble: 6 x 2
##   carrier                     name
##     <chr>                    <chr>
## 1      9E        Endeavor Air Inc.
## 2      AA   American Airlines Inc.
## 3      AS     Alaska Airlines Inc.
## 4      B6          JetBlue Airways
## 5      DL     Delta Air Lines Inc.
## 6      EV ExpressJet Airlines Inc.
head(airports)
## # A tibble: 6 x 8
##     faa                           name      lat       lon   alt    tz
##   <chr>                          <chr>    <dbl>     <dbl> <int> <dbl>
## 1   04G              Lansdowne Airport 41.13047 -80.61958  1044    -5
## 2   06A  Moton Field Municipal Airport 32.46057 -85.68003   264    -6
## 3   06C            Schaumburg Regional 41.98934 -88.10124   801    -6
## 4   06N                Randall Airport 41.43191 -74.39156   523    -5
## 5   09J          Jekyll Island Airport 31.07447 -81.42778    11    -5
## 6   0A9 Elizabethton Municipal Airport 36.37122 -82.17342  1593    -5
## # ... with 2 more variables: dst <chr>, tzone <chr>
head(planes)
## # A tibble: 6 x 9
##   tailnum  year                    type     manufacturer     model engines
##     <chr> <int>                   <chr>            <chr>     <chr>   <int>
## 1  N10156  2004 Fixed wing multi engine          EMBRAER EMB-145XR       2
## 2  N102UW  1998 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 3  N103US  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 4  N104UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 5  N10575  2002 Fixed wing multi engine          EMBRAER EMB-145LR       2
## 6  N105UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
head(weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
## 1    EWR  2013     1     1     0 37.04 21.92 53.97      230   10.35702
## 2    EWR  2013     1     1     1 37.04 21.92 53.97      230   13.80936
## 3    EWR  2013     1     1     2 37.94 21.92 52.09      230   12.65858
## 4    EWR  2013     1     1     3 37.94 23.00 54.51      230   13.80936
## 5    EWR  2013     1     1     4 37.94 24.08 57.04      240   14.96014
## 6    EWR  2013     1     1     6 39.02 26.06 59.37      270   10.35702
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>