library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Preview the first six rows of the built-in iris data set.
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
*Convert dataframe to tibble
library(tibble)
# Convert the iris data frame to a tibble and preview its first rows.
df <- tibble::as_tibble(iris)
head(df, n = 6L)
## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fctr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
*Creating a tibble
# Build a 10-row tibble from scratch: standard-normal draws (x), uniform
# draws (y) and upper-case letters sampled with replacement (Ab).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
df1 <- tibble(
  x = rnorm(10),
  y = runif(10),
  Ab = sample(LETTERS, size = 10, replace = TRUE)
)
head(df1)
## # A tibble: 6 x 3
## x y Ab
## <dbl> <dbl> <chr>
## 1 0.6783612 0.85004568 G
## 2 0.2780036 0.08800381 X
## 3 0.9491178 0.03994298 C
## 4 1.7202164 0.94860915 W
## 5 1.2657255 0.72267166 C
## 6 -1.6533803 0.33113699 O
*Tibble with date time
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Build a 20-row tibble with a date-time, a date and a character column.
# Adding a number to the date-time from now() shifts it by that many seconds
# (here up to 23); adding a number to the Date from today() shifts it by that
# many days (here up to 10).
# `TRUE` is spelled out because `T` can be reassigned, `TRUE` cannot.
df2 <- tibble(
  Time = lubridate::now() + runif(20) * 23,
  Date = lubridate::today() + runif(20) * 10,
  myletter = sample(letters, size = 20, replace = TRUE)
)
head(df2)
## # A tibble: 6 x 3
## Time Date myletter
## <dttm> <date> <chr>
## 1 2017-09-22 20:52:33 2017-09-24 z
## 2 2017-09-22 20:52:15 2017-09-26 p
## 3 2017-09-22 20:52:18 2017-09-30 o
## 4 2017-09-22 20:52:14 2017-10-01 m
## 5 2017-09-22 20:52:32 2017-09-26 p
## 6 2017-09-22 20:52:17 2017-09-25 p
*Converting character to numeric
library(readr)
# Parse a string of digits into a double.
a <- readr::parse_double("1234")
class(a)
## [1] "numeric"
# parse_number() extracts the number and drops the surrounding text.
a <- readr::parse_number("This is cost $234")
a
## [1] 234
# Parse with a locale whose decimal mark is a comma.
readr::parse_double("1,23", locale = readr::locale(decimal_mark = ","))
## [1] 1.23
# Parse with a locale whose grouping mark is a period.
readr::parse_number("123.456.789", locale = readr::locale(grouping_mark = "."))
## [1] 123456789
# Show the bytes (in hexadecimal) that make up a string.
charToRaw("Tuyen")
## [1] 54 75 79 65 6e
UTF-8 is the most commonly used encoding.
# Two strings written with raw byte escapes instead of UTF-8 text.
# NOTE(review): x2 is presumably Shift-JIS Japanese text (as in the
# "R for Data Science" example) - confirm.
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
# How x1 prints depends on the session's native encoding.
print(x1)
## [1] "El Niño was particularly bad this year"
# x2 does not print as readable text here.
print(x2)
## [1] "<U+0082>±<U+0082>ñ<U+0082>É<U+0082>¿<U+0082>Í"
*Guessing the encoding of a data file
# Read the semicolon-separated lung-capacity data set straight from GitHub.
lung_cap_url <- "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/LungCapData.csv"
df <- read.csv(lung_cap_url, sep = ";")
head(df)
## LungCap Age Height Smoke Gender Caesarean
## 1 6.475 6 62.1 no male no
## 2 10.125 18 74.7 yes female no
## 3 9.550 16 69.7 no female yes
## 4 11.125 14 71.0 no male no
## 5 4.800 5 56.9 no male no
## 6 6.225 11 58.7 no female no
# guess_encoding() inspects the bytes of a file (a path/URL or a raw vector),
# so it is given the file location rather than the parsed data frame.
# Passing `df` made readr evaluate an `if` condition of length > 1, which
# only warned on older R and is an error since R 4.2.
guess_encoding(lung_cap_url) # expected to report ASCII for this data set
## # A tibble: 1 x 2
## encoding confidence
## <chr> <dbl>
## 1 ASCII 1
# Guess the encoding from the raw bytes of x1. Candidates are listed with a
# confidence score, the most likely one first.
readr::guess_encoding(charToRaw(x1))
## # A tibble: 2 x 2
## encoding confidence
## <chr> <dbl>
## 1 ISO-8859-1 0.46
## 2 ISO-8859-9 0.23
# Re-read x1 using the top-ranked guess.
readr::parse_character(x1, locale = readr::locale(encoding = "ISO-8859-1"))
## [1] "El Niño was particularly bad this year"
*parse_datetime() expects an ISO 8601 date-time, with the components ordered from largest to smallest: year, month, day, hour, minute, second
# A date only: the time defaults to midnight UTC.
readr::parse_datetime("20100928")
## [1] "2010-09-28 UTC"
# A date followed by "T" and the time as hours and minutes.
readr::parse_datetime("20100928T2059")
## [1] "2010-09-28 20:59:00 UTC"
# The same text parsed three ways, depending on the format string.
# Month / day / two-digit year:
readr::parse_date("01/02/15", format = "%m/%d/%y")
## [1] "2015-01-02"
# Day / month / two-digit year:
readr::parse_date("01/02/15", format = "%d/%m/%y")
## [1] "2015-02-01"
# Two-digit year / month / day:
readr::parse_date("01/02/15", format = "%y/%m/%d")
## [1] "2001-02-15"
# Read readr's bundled example file and let read_csv() guess the column types.
challenge <- readr::read_csv(
  file = readr::readr_example("challenge.csv")
)
## Parsed with column specification:
## cols(
## x = col_integer(),
## y = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1000 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1001 x no trailing characters .23837975086644292 file 2 1002 x no trailing characters .41167997173033655 row 3 1003 x no trailing characters .7460716762579978 col 4 1004 x no trailing characters .723450553836301 expected 5 1005 x no trailing characters .614524137461558 actual # ... with 1 more variables: file <chr>
## ... ................. ... ....................................................... ........ ....................................................... ...... ....................................................... .... ....................................................... ... ....................................................... ... ....................................................... ........ ....................................................... ...... .......................................
## See problems(...) for more details.
# List every parsing failure that was recorded while reading challenge.csv.
readr::problems(challenge)
## # A tibble: 1,000 x 5
## row col expected actual
## <int> <chr> <chr> <chr>
## 1 1001 x no trailing characters .23837975086644292
## 2 1002 x no trailing characters .41167997173033655
## 3 1003 x no trailing characters .7460716762579978
## 4 1004 x no trailing characters .723450553836301
## 5 1005 x no trailing characters .614524137461558
## 6 1006 x no trailing characters .473980569280684
## 7 1007 x no trailing characters .5784610391128808
## 8 1008 x no trailing characters .2415937229525298
## 9 1009 x no trailing characters .11437866208143532
## 10 1010 x no trailing characters .2983446326106787
## # ... with 990 more rows, and 1 more variables: file <chr>
# First step towards a fix: spell out the column types that read_csv()
# guessed. With x still read as an integer this reproduces the same
# 1000 parsing failures, which gives a starting point to adjust.
# The result is stored in `challenge` (`hallenge` was a typo).
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(
    x = col_integer(),
    y = col_character()
  )
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1000 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1001 x no trailing characters .23837975086644292 file 2 1002 x no trailing characters .41167997173033655 row 3 1003 x no trailing characters .7460716762579978 col 4 1004 x no trailing characters .723450553836301 expected 5 1005 x no trailing characters .614524137461558 actual # ... with 1 more variables: file <chr>
## ... ................. ... ....................................................... ........ ....................................................... ...... ....................................................... .... ....................................................... ... ....................................................... ... ....................................................... ........ ....................................................... ...... .......................................
## See problems(...) for more details.
# Actual fix: read x as a double, so the decimal values that start at
# row 1001 parse instead of failing the integer check.
# The result is stored in `challenge` (`hallenge` was a typo).
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(
    x = col_double(),
    y = col_character()
  )
)
head(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <chr>
## 1 404 <NA>
## 2 4172 <NA>
## 3 3004 <NA>
## 4 787 <NA>
## 5 37 <NA>
## 6 2332 <NA>
*Data Cleaning
The data used in this demo is the `who` data set (from the World Health Organization), loaded below with tidyr.
library(tidyr)
# Preview the first six rows of the `who` data set.
head(who, n = 6L)
## # A tibble: 6 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Afghanistan AF AFG 1980 NA NA NA
## 2 Afghanistan AF AFG 1981 NA NA NA
## 3 Afghanistan AF AFG 1982 NA NA NA
## 4 Afghanistan AF AFG 1983 NA NA NA
## 5 Afghanistan AF AFG 1984 NA NA NA
## 6 Afghanistan AF AFG 1985 NA NA NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## # new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## # new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## # new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## # new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## # new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## # new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## # newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## # newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## # newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## # newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Preview the last six rows of the `who` data set.
tail(who, n = 6L)
## # A tibble: 6 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 Zimbabwe ZW ZWE 2008 127 614 0
## 2 Zimbabwe ZW ZWE 2009 125 578 NA
## 3 Zimbabwe ZW ZWE 2010 150 710 2208
## 4 Zimbabwe ZW ZWE 2011 152 784 2467
## 5 Zimbabwe ZW ZWE 2012 120 783 2421
## 6 Zimbabwe ZW ZWE 2013 NA NA NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## # new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## # new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## # new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## # new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## # new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## # new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## # newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## # newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## # newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## # newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Number of rows and columns in `who`.
dim(who)
## [1] 7240 60
# Open the help page for `who` to see what each column means, so that the
# columns can be given meaningful names.
help("who")
## starting httpd help server ...
## done
# Work on a copy of `who` without its third column (iso3), so that only one
# country code (iso2) remains, then give that code column a clearer name.
df <- who[, -3]
names(df)[2] <- "Country_ID"
head(df, n = 6L)
## # A tibble: 6 x 59
## country Country_ID year new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <chr> <int> <int> <int> <int>
## 1 Afghanistan AF 1980 NA NA NA
## 2 Afghanistan AF 1981 NA NA NA
## 3 Afghanistan AF 1982 NA NA NA
## 4 Afghanistan AF 1983 NA NA NA
## 5 Afghanistan AF 1984 NA NA NA
## 6 Afghanistan AF 1985 NA NA NA
## # ... with 53 more variables: new_sp_m3544 <int>, new_sp_m4554 <int>,
## # new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## # new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## # new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## # new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## # new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## # new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## # newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## # newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## # newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## # newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Reshape from wide to long: every column from new_sp_m014 through
# newrel_f65 is stacked into rows, with the former column name stored in
# Age_group and its value in Cases.
df1 <- gather(
  df,
  key = Age_group,
  value = Cases,
  new_sp_m014:newrel_f65
)
head(df1)
## # A tibble: 6 x 5
## country Country_ID year Age_group Cases
## <chr> <chr> <int> <chr> <int>
## 1 Afghanistan AF 1980 new_sp_m014 NA
## 2 Afghanistan AF 1981 new_sp_m014 NA
## 3 Afghanistan AF 1982 new_sp_m014 NA
## 4 Afghanistan AF 1983 new_sp_m014 NA
## 5 Afghanistan AF 1984 new_sp_m014 NA
## 6 Afghanistan AF 1985 new_sp_m014 NA
# Making age group names consistent
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Tidy `who` in a single pipeline:
#   1. gather the case-count columns into code/value pairs, dropping NAs;
#   2. rewrite "newrel" as "new_rel" so every code has the same three parts;
#   3. split code into new / var / sexage using separate()'s default separator;
#   4. drop the new, iso2 and iso3 columns;
#   5. split sexage after its first character into sex and age.
df <- who %>%
  gather(key = code, value = value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
  mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
  separate(col = code, into = c("new", "var", "sexage")) %>%
  select(-new, -iso2, -iso3) %>%
  separate(col = sexage, into = c("sex", "age"), sep = 1)
head(df)
## # A tibble: 6 x 6
## country year var sex age value
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1998 sp m 014 30
## 3 Afghanistan 1999 sp m 014 8
## 4 Afghanistan 2000 sp m 014 52
## 5 Afghanistan 2001 sp m 014 129
## 6 Afghanistan 2002 sp m 014 90
library(tidyverse)
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## as.difftime(): lubridate, base
## date(): lubridate, base
## filter(): dplyr, stats
## intersect(): lubridate, base
## lag(): dplyr, stats
## lift(): purrr, caret
## setdiff(): lubridate, base
## union(): lubridate, base
library(nycflights13)
# Preview the airlines table: carrier codes and airline names.
head(airlines, n = 6L)
## # A tibble: 6 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
# Preview the airports table: FAA code, name, location and time-zone columns.
head(airports, n = 6L)
## # A tibble: 6 x 8
## faa name lat lon alt tz
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -6
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6
## 4 06N Randall Airport 41.43191 -74.39156 523 -5
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -5
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -5
## # ... with 2 more variables: dst <chr>, tzone <chr>
# Preview the planes table: one row per tail number with aircraft details.
head(planes, n = 6L)
## # A tibble: 6 x 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
# Preview the weather table: readings by origin airport, date and hour.
head(weather, n = 6L)
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## # ... with 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>