readr package
R Programming for Data Science by Roger D. Peng, May 31, 2022
Textbook
# install.packages(c("readr", "tictoc", "dplyr"))
library(readr)
library(dplyr)
library(tictoc)
# First read: no column types are supplied, so readr guesses them and
# reports the guessed specification in a message.
teams <- read_csv(file = "team_standings.csv")
## Rows: 32 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Team
## dbl (1): Standing
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Re-read the standings, this time silencing the column-specification message.
teams <- read_csv(
  file = "team_standings.csv",
  show_col_types = FALSE
)
# Auto-print the tibble.
teams
## # A tibble: 32 × 2
## Standing Team
## <dbl> <chr>
## 1 1 Spain
## 2 2 Netherlands
## 3 3 Germany
## 4 4 Uruguay
## 5 5 Argentina
## 6 6 Brazil
## 7 7 Ghana
## 8 8 Paraguay
## 9 9 Japan
## 10 10 Chile
## # … with 22 more rows
# glimpse() is made available by dplyr, which is already attached near the
# top of this script; the two commented lines are only needed in a fresh
# session where dplyr has not been installed or loaded yet.
# install.packages("dplyr")
# library(dplyr)
glimpse(x = teams)
## Rows: 32
## Columns: 2
## $ Standing <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Team <chr> "Spain", "Netherlands", "Germany", "Uruguay", "Argentina", "B…
# Supply the column types as a compact string, one letter per column in
# file order: "i" = integer (Standing), "c" = character (Team).
teams <- read_csv(
  file = "team_standings.csv",
  col_types = "ic"
)
# Show the first six rows.
head(teams, n = 6L)
## # A tibble: 6 × 2
## Standing Team
## <int> <chr>
## 1 1 Spain
## 2 2 Netherlands
## 3 3 Germany
## 4 4 Uruguay
## 5 5 Argentina
## 6 6 Brazil
# Preview the compressed log file: read only the first 10 rows and let
# readr guess the column types.
logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  n_max = 10
)
## Rows: 10 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): r_version, r_arch, r_os, package, version, country
## dbl (2): size, ip_id
## date (1): date
## time (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the column names so the col_types string can be written in file order.
colnames(logs)
## [1] "date" "time" "size" "r_version" "r_arch" "r_os"
## [7] "package" "version" "country" "ip_id"
# Time the same 10-row read with every column type given up front.
# col_types letters, in column order: D = date, t = time, i = integer (size),
# six times c = character (r_version ... country), i = integer (ip_id).
tic()
logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = "Dticccccci",
  n_max = 10
)
toc()
## 0.03 sec elapsed
logs
## # A tibble: 10 × 10
## date time size r_ver…¹ r_arch r_os package version country ip_id
## <date> <time> <int> <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 2016-07-19 22:00:00 1.89e6 3.3.0 x86_64 ming… data.t… 1.9.6 US 1
## 2 2016-07-19 22:00:05 4.54e4 3.3.1 x86_64 ming… assert… 0.1 US 2
## 3 2016-07-19 22:00:03 1.43e7 3.3.1 x86_64 ming… stringi 1.1.1 DE 3
## 4 2016-07-19 22:00:05 1.89e6 3.3.1 x86_64 ming… data.t… 1.9.6 US 4
## 5 2016-07-19 22:00:06 3.90e5 3.3.1 x86_64 ming… foreach 1.4.3 US 4
## 6 2016-07-19 22:00:08 4.88e4 3.3.1 x86_64 linu… tree 1.0-37 CO 5
## 7 2016-07-19 22:00:12 5.25e2 3.3.1 x86_64 darw… surviv… 2.39-5 US 6
## 8 2016-07-19 22:00:08 3.23e6 3.3.1 x86_64 ming… Rcpp 0.12.5 US 2
## 9 2016-07-19 22:00:09 5.56e5 3.3.1 x86_64 ming… tibble 1.1 US 2
## 10 2016-07-19 22:00:10 1.52e5 3.3.1 x86_64 ming… magrit… 1.5 US 2
## # … with abbreviated variable name ¹r_version
# Read only the `date` column (the first column of the file): cols_only()
# keeps just the columns that are listed in it.
logdates <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = cols_only(date = col_date()),
  n_max = 10
)
logdates
## # A tibble: 10 × 1
## date
## <date>
## 1 2016-07-19
## 2 2016-07-19
## 3 2016-07-19
## 4 2016-07-19
## 5 2016-07-19
## 6 2016-07-19
## 7 2016-07-19
## 8 2016-07-19
## 9 2016-07-19
## 10 2016-07-19
# Time a read of the whole file (no n_max) with explicit column types.
tic()
full_logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = "Dticccccci"
)
toc()
## 3.39 sec elapsed
glimpse(full_logs)
## Rows: 701,878
## Columns: 10
## $ date <date> 2016-07-19, 2016-07-19, 2016-07-19, 2016-07-19, 2016-07-19,…
## $ time <time> 22:00:00, 22:00:05, 22:00:03, 22:00:05, 22:00:06, 22:00:08,…
## $ size <int> 1887881, 45436, 14259016, 1887881, 389615, 48842, 525, 32259…
## $ r_version <chr> "3.3.0", "3.3.1", "3.3.1", "3.3.1", "3.3.1", "3.3.1", "3.3.1…
## $ r_arch <chr> "x86_64", "x86_64", "x86_64", "x86_64", "x86_64", "x86_64", …
## $ r_os <chr> "mingw32", "mingw32", "mingw32", "mingw32", "mingw32", "linu…
## $ package <chr> "data.table", "assertthat", "stringi", "data.table", "foreac…
## $ version <chr> "1.9.6", "0.1", "1.1.1", "1.9.6", "1.4.3", "1.0-37", "2.39-5…
## $ country <chr> "US", "US", "DE", "US", "US", "CO", "US", "US", "US", "US", …
## $ ip_id <int> 1, 2, 3, 4, 4, 5, 6, 2, 2, 2, 2, 2, 7, 8, 8, 9, 9, 9, 9, 10,…
# Distinct architecture values recorded in the log (NA is one of them).
unique(full_logs[["r_arch"]])
## [1] "x86_64" NA "i386" "i686" "armv7l"
## [6] "powerpc64le" "i586" "s390x"
# summary() on a character vector reports only its length, class and mode;
# it gives no per-value counts.
summary(full_logs[["r_arch"]])
## Length Class Mode
## 701878 character character
# Count the rows per architecture; by default table() leaves NA out.
table(full_logs[["r_arch"]])
##
## armv7l i386 i586 i686 powerpc64le s390x
## 2 46690 42 2159 2 7
## x86_64
## 562903
# Converting to a factor first makes summary() count every level and
# report the missing values separately as NA's.
summary(factor(full_logs[["r_arch"]]))
## armv7l i386 i586 i686 powerpc64le s390x
## 2 46690 42 2159 2 7
## x86_64 NA's
## 562903 90073
# Same factor-based count for the R version column, NA's included.
summary(factor(full_logs[["r_version"]]))
## 2.11.0 2.11.1 2.12.1 2.12.2 2.13.0 2.13.1 2.13.2 2.14.0 2.14.1 2.14.2 2.15.0
## 4 57 7 23 6 28 7 18 16 11 63
## 2.15.1 2.15.2 2.15.3 3.0.0 3.0.1 3.0.2 3.0.3 3.1.0 3.1.1 3.1.2 3.1.3
## 177 93 293 415 651 4086 1204 2536 5620 10678 6205
## 3.2.0 3.2.1 3.2.2 3.2.3 3.2.4 3.2.5 3.3.0 3.3.1 3.4.0 NA's
## 8071 10534 41786 38162 17197 21032 96957 343237 2631 90073
# Record the R version, platform, locale and package versions used for the
# results shown in this document.
print(sessionInfo())
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Philippines.utf8 LC_CTYPE=English_Philippines.utf8
## [3] LC_MONETARY=English_Philippines.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Philippines.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tictoc_1.1 dplyr_1.0.10 readr_2.1.2
##
## loaded via a namespace (and not attached):
## [1] bslib_0.4.0 compiler_4.2.1 pillar_1.8.1 jquerylib_0.1.4
## [5] tools_4.2.1 bit_4.0.4 digest_0.6.29 jsonlite_1.8.0
## [9] evaluate_0.16 lifecycle_1.0.1 tibble_3.1.8 pkgconfig_2.0.3
## [13] rlang_1.0.5 cli_3.3.0 DBI_1.1.3 rstudioapi_0.14
## [17] parallel_4.2.1 yaml_2.3.5 xfun_0.33 fastmap_1.1.0
## [21] stringr_1.4.1 knitr_1.40 generics_0.1.3 vctrs_0.4.1
## [25] sass_0.4.2 hms_1.1.2 bit64_4.0.5 tidyselect_1.1.2
## [29] glue_1.6.2 R6_2.5.1 fansi_1.0.3 vroom_1.5.7
## [33] rmarkdown_2.16 tzdb_0.3.0 purrr_0.3.4 magrittr_2.0.3
## [37] ellipsis_0.3.2 htmltools_0.5.3 assertthat_0.2.1 utf8_1.2.2
## [41] stringi_1.7.8 cachem_1.0.6 crayon_1.5.1
# Start a timer, then preview the contests file: read only the first 10 rows
# and let readr guess the column types.
tic()
nle2022 <- read_csv(
  file = "AllContests2022_05131447.csv",
  n_max = 10
)
## Rows: 10 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): PRECINCT_CODE, CONTEST_CODE, RECEPTION_DATE, CONTEST_NAME, CANDIDA...
## dbl (3): VOTES_AMOUNT, ACTUALVOTERS, REGISTEREDVOTERS
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
toc() # elapsed time varies between runs; an earlier run took 1.83 sec
## 0.08 sec elapsed
# Inspect the guessed column types on the 10-row preview.
glimpse(x = nle2022)
## Rows: 10
## Columns: 13
## $ PRECINCT_CODE <chr> "01010017", "01010017", "01010017", "01010017", "0101…
## $ CONTEST_CODE <chr> "01199000", "01199000", "01199000", "01199000", "0119…
## $ VOTES_AMOUNT <dbl> 0, 1, 0, 2, 5, 1, 0, 1, 0, 2
## $ ACTUALVOTERS <dbl> 697, 697, 697, 697, 697, 697, 697, 697, 697, 697
## $ RECEPTION_DATE <chr> "05/09/2022 - 08:07:08 PM", "05/09/2022 - 08:07:08 PM…
## $ CONTEST_NAME <chr> "PARTY LIST PHILIPPINES", "PARTY LIST PHILIPPINES", "…
## $ CANDIDATE_NAME <chr> "01 KAMALAYAN", "02 KM NGAYON NA", "03 PSIS", "04 AGA…
## $ REGION <chr> "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR…
## $ PROVINCE <chr> "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA…
## $ MUNICIPALITY <chr> "BANGUED", "BANGUED", "BANGUED", "BANGUED", "BANGUED"…
## $ BARANGAY <chr> "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTA…
## $ REGISTEREDVOTERS <dbl> 783, 783, 783, 783, 783, 783, 783, 783, 783, 783
## $ DISTRICT <chr> "ABRA - LONE DISTRICT", "ABRA - LONE DISTRICT", "ABRA…
# Full read of the csv file, which is about 5 GB: run this only on a machine
# with 16 GB of RAM or more.
# NOTE(review): the glimpse() output recorded after this chunk still reports
# only 10 rows, so the full read was presumably not evaluated when this
# document was rendered — confirm before relying on that output.
tic()
nle2022 <- read_csv(
  file = "AllContests2022_05131447.csv",
  show_col_types = FALSE
)
toc() # 72.86 sec elapsed (timing noted by the author)
glimpse(nle2022)
## Rows: 10
## Columns: 13
## $ PRECINCT_CODE <chr> "01010017", "01010017", "01010017", "01010017", "0101…
## $ CONTEST_CODE <chr> "01199000", "01199000", "01199000", "01199000", "0119…
## $ VOTES_AMOUNT <dbl> 0, 1, 0, 2, 5, 1, 0, 1, 0, 2
## $ ACTUALVOTERS <dbl> 697, 697, 697, 697, 697, 697, 697, 697, 697, 697
## $ RECEPTION_DATE <chr> "05/09/2022 - 08:07:08 PM", "05/09/2022 - 08:07:08 PM…
## $ CONTEST_NAME <chr> "PARTY LIST PHILIPPINES", "PARTY LIST PHILIPPINES", "…
## $ CANDIDATE_NAME <chr> "01 KAMALAYAN", "02 KM NGAYON NA", "03 PSIS", "04 AGA…
## $ REGION <chr> "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR…
## $ PROVINCE <chr> "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA…
## $ MUNICIPALITY <chr> "BANGUED", "BANGUED", "BANGUED", "BANGUED", "BANGUED"…
## $ BARANGAY <chr> "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTA…
## $ REGISTEREDVOTERS <dbl> 783, 783, 783, 783, 783, 783, 783, 783, 783, 783
## $ DISTRICT <chr> "ABRA - LONE DISTRICT", "ABRA - LONE DISTRICT", "ABRA…
# Rough in-memory size, in GB, of the full data set, assuming 8 bytes per cell.
# NOTE(review): 32057651 is presumably the row count of the full csv file and
# 13 its column count (13 columns are reported for this file) — confirm.
# Keep these as doubles: written as integer literals (L suffix) the product
# would exceed .Machine$integer.max and overflow to NA.
n_rows <- 32057651
n_cols <- 13
bytes_per_value <- 8
n_rows * n_cols * bytes_per_value / 10^9
## [1] 3.333996
# Attach hcandersenr and inspect its English data set, which has one `text`
# column and one `book` column.
library(hcandersenr)
glimpse(x = hcandersen_en)
## Rows: 31,380
## Columns: 2
## $ text <chr> "A soldier came marching along the high road: \"Left, right - lef…
## $ book <chr> "The tinder-box", "The tinder-box", "The tinder-box", "The tinder…
# List the distinct book titles in the French data set.
unique(hcandersen_fr[["book"]])
## [1] "Le briquet"
## [2] "Grand Claus et petit Claus"
## [3] "La princesse au petit pois"
## [4] "Les fleurs de la petite Ida"
## [5] "La Petite Poucette"
## [6] "Le compagnon de route"
## [7] "La petite sirène"
## [8] "Les habits neufs de l'empereur"
## [9] "La pâquerette"
## [10] "Le stoïque soldat de plomb"
## [11] "Les cygnes sauvages"
## [12] "Le jardin du paradis"
## [13] "La malle volante"
## [14] "Une semaine du petit elfe Ferme-l'œil"
## [15] "La princesse et le porcher"
## [16] "L'ange"
## [17] "Le Rossignol et l'Empereur de Chine"
## [18] "Le vilain petit canard"
## [19] "Le sapin"
## [20] "La reine des neiges"
## [21] "La fée du sureau"
## [22] "Le concours de saut"
## [23] "La bergère et le ramoneur"
## [24] "La cloche"
## [25] "L'aiguille à repriser"
## [26] "La petite fille aux allumettes"
## [27] "Le vieux réverbère"
## [28] "Les voisins"
## [29] "L'ombre"
## [30] "La vieille maison"
## [31] "L'heureuse famille"
## [32] "Les amours d'un faux-col"
## [33] "Le chanvre"
## [34] "Bonne humeur"
## [35] "Chacun et chaque chose à sa place"
## [36] "Cinq dans une cosse de pois"
## [37] "La tirelire"
## [38] "Hans le balourd"
## [39] "Le goulot de la bouteille"
## [40] "La soupe à la brochette"
## [41] "Quelque chose"
## [42] "Le dernier rêve du chêne"
## [43] "Les coureurs"
## [44] "Papotages d'enfants"
## [45] "La plume et l'encrier"
## [46] "Le coq de poulailler et le coq de girouette"
## [47] "Le papillon"
## [48] "Ce que le Père fait est bien fait"
## [49] "Le bonhomme de neige"
## [50] "L'escargot et le rosier"
## [51] "Le schilling d'argent"
## [52] "Une rose de la tombe d'Homère"
## [53] "Le crapaud"
## [54] "Le montreur de marionnettes"
## [55] "Le soleil raconte"
## [56] "Les aventures du chardon"
## [57] "Le bisaïeul"
## [58] "Le jardinier et ses maîtres"