Reference

R Programming for Data Science by Roger D. Peng, May 31, 2022

Textbook

# install.packages(c("readr", "tictoc", "dplyr"))
library(readr)
library(dplyr)
library(tictoc)
# First read: no column types are given, so readr guesses them and prints
# the column specification it chose as a message.
teams <- read_csv(file = "team_standings.csv")
## Rows: 32 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Team
## dbl (1): Standing
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Same read, with the column-specification message switched off.
teams <- read_csv(
  file = "team_standings.csv",
  show_col_types = FALSE
)
# Auto-print the resulting tibble.
teams
## # A tibble: 32 × 2
##    Standing Team       
##       <dbl> <chr>      
##  1        1 Spain      
##  2        2 Netherlands
##  3        3 Germany    
##  4        4 Uruguay    
##  5        5 Argentina  
##  6        6 Brazil     
##  7        7 Ghana      
##  8        8 Paraguay   
##  9        9 Japan      
## 10       10 Chile      
## # … with 22 more rows
# glimpse() comes from dplyr, which is already attached at the top of this
# script; install it first if needed:
# install.packages("dplyr")
dplyr::glimpse(teams)
## Rows: 32
## Columns: 2
## $ Standing <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Team     <chr> "Spain", "Netherlands", "Germany", "Uruguay", "Argentina", "B…
# Explicit column types via a compact string, one letter per column:
# "i" = integer (Standing), "c" = character (Team).
teams <- read_csv(
  file = "team_standings.csv",
  col_types = "ic"
)
head(x = teams)
## # A tibble: 6 × 2
##   Standing Team       
##      <int> <chr>      
## 1        1 Spain      
## 2        2 Netherlands
## 3        3 Germany    
## 4        4 Uruguay    
## 5        5 Argentina  
## 6        6 Brazil

1 Reading a Compressed File

# Read only the first 10 rows of the compressed (.bz2) log file so the
# column names and guessed types can be inspected cheaply.
logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  n_max = 10
)
## Rows: 10 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): r_version, r_arch, r_os, package, version, country
## dbl  (2): size, ip_id
## date (1): date
## time (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column names of the 10-row sample.
names(logs)
##  [1] "date"      "time"      "size"      "r_version" "r_arch"    "r_os"     
##  [7] "package"   "version"   "country"   "ip_id"
# Time a re-read of the same 10 rows with explicit column types, one letter
# per column: D = date, t = time, i = integer, c = character.
tictoc::tic()
logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = "Dticccccci",
  n_max = 10
)
tictoc::toc()
## 0.03 sec elapsed
logs
## # A tibble: 10 × 10
##    date       time       size r_ver…¹ r_arch r_os  package version country ip_id
##    <date>     <time>    <int> <chr>   <chr>  <chr> <chr>   <chr>   <chr>   <int>
##  1 2016-07-19 22:00:00 1.89e6 3.3.0   x86_64 ming… data.t… 1.9.6   US          1
##  2 2016-07-19 22:00:05 4.54e4 3.3.1   x86_64 ming… assert… 0.1     US          2
##  3 2016-07-19 22:00:03 1.43e7 3.3.1   x86_64 ming… stringi 1.1.1   DE          3
##  4 2016-07-19 22:00:05 1.89e6 3.3.1   x86_64 ming… data.t… 1.9.6   US          4
##  5 2016-07-19 22:00:06 3.90e5 3.3.1   x86_64 ming… foreach 1.4.3   US          4
##  6 2016-07-19 22:00:08 4.88e4 3.3.1   x86_64 linu… tree    1.0-37  CO          5
##  7 2016-07-19 22:00:12 5.25e2 3.3.1   x86_64 darw… surviv… 2.39-5  US          6
##  8 2016-07-19 22:00:08 3.23e6 3.3.1   x86_64 ming… Rcpp    0.12.5  US          2
##  9 2016-07-19 22:00:09 5.56e5 3.3.1   x86_64 ming… tibble  1.1     US          2
## 10 2016-07-19 22:00:10 1.52e5 3.3.1   x86_64 ming… magrit… 1.5     US          2
## # … with abbreviated variable name ¹​r_version
# Read only the first column (`date`): cols_only() keeps just the columns
# named in it.
logdates <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = cols_only(date = col_date()),
  n_max = 10
)
logdates
## # A tibble: 10 × 1
##    date      
##    <date>    
##  1 2016-07-19
##  2 2016-07-19
##  3 2016-07-19
##  4 2016-07-19
##  5 2016-07-19
##  6 2016-07-19
##  7 2016-07-19
##  8 2016-07-19
##  9 2016-07-19
## 10 2016-07-19
# Time a read of the whole file (no n_max), reusing the explicit column types.
tictoc::tic()
full_logs <- read_csv(
  file = "2016-07-19.csv.bz2",
  col_types = "Dticccccci"
)
tictoc::toc()
## 3.39 sec elapsed
dplyr::glimpse(full_logs)
## Rows: 701,878
## Columns: 10
## $ date      <date> 2016-07-19, 2016-07-19, 2016-07-19, 2016-07-19, 2016-07-19,…
## $ time      <time> 22:00:00, 22:00:05, 22:00:03, 22:00:05, 22:00:06, 22:00:08,…
## $ size      <int> 1887881, 45436, 14259016, 1887881, 389615, 48842, 525, 32259…
## $ r_version <chr> "3.3.0", "3.3.1", "3.3.1", "3.3.1", "3.3.1", "3.3.1", "3.3.1…
## $ r_arch    <chr> "x86_64", "x86_64", "x86_64", "x86_64", "x86_64", "x86_64", …
## $ r_os      <chr> "mingw32", "mingw32", "mingw32", "mingw32", "mingw32", "linu…
## $ package   <chr> "data.table", "assertthat", "stringi", "data.table", "foreac…
## $ version   <chr> "1.9.6", "0.1", "1.1.1", "1.9.6", "1.4.3", "1.0-37", "2.39-5…
## $ country   <chr> "US", "US", "DE", "US", "US", "CO", "US", "US", "US", "US", …
## $ ip_id     <int> 1, 2, 3, 4, 4, 5, 6, 2, 2, 2, 2, 2, 7, 8, 8, 9, 9, 9, 9, 10,…
# Distinct architecture values, including NA.
unique(full_logs[["r_arch"]])
## [1] "x86_64"      NA            "i386"        "i686"        "armv7l"     
## [6] "powerpc64le" "i586"        "s390x"
# summary() on a character vector reports only length, class and mode.
summary(full_logs[["r_arch"]])
##    Length     Class      Mode 
##    701878 character character
# table() counts each value; the NA entries are not shown in its output.
table(full_logs[["r_arch"]])
## 
##      armv7l        i386        i586        i686 powerpc64le       s390x 
##           2       46690          42        2159           2           7 
##      x86_64 
##      562903
# Converting to a factor first makes summary() count every level plus NA's.
summary(factor(full_logs[["r_arch"]]))
##      armv7l        i386        i586        i686 powerpc64le       s390x 
##           2       46690          42        2159           2           7 
##      x86_64        NA's 
##      562903       90073
# Same per-level count for the R version column.
summary(factor(full_logs[["r_version"]]))
## 2.11.0 2.11.1 2.12.1 2.12.2 2.13.0 2.13.1 2.13.2 2.14.0 2.14.1 2.14.2 2.15.0 
##      4     57      7     23      6     28      7     18     16     11     63 
## 2.15.1 2.15.2 2.15.3  3.0.0  3.0.1  3.0.2  3.0.3  3.1.0  3.1.1  3.1.2  3.1.3 
##    177     93    293    415    651   4086   1204   2536   5620  10678   6205 
##  3.2.0  3.2.1  3.2.2  3.2.3  3.2.4  3.2.5  3.3.0  3.3.1  3.4.0   NA's 
##   8071  10534  41786  38162  17197  21032  96957 343237   2631  90073
# Record the R version, platform, locale and package versions used for
# this run.
utils::sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Philippines.utf8  LC_CTYPE=English_Philippines.utf8   
## [3] LC_MONETARY=English_Philippines.utf8 LC_NUMERIC=C                        
## [5] LC_TIME=English_Philippines.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tictoc_1.1   dplyr_1.0.10 readr_2.1.2 
## 
## loaded via a namespace (and not attached):
##  [1] bslib_0.4.0      compiler_4.2.1   pillar_1.8.1     jquerylib_0.1.4 
##  [5] tools_4.2.1      bit_4.0.4        digest_0.6.29    jsonlite_1.8.0  
##  [9] evaluate_0.16    lifecycle_1.0.1  tibble_3.1.8     pkgconfig_2.0.3 
## [13] rlang_1.0.5      cli_3.3.0        DBI_1.1.3        rstudioapi_0.14 
## [17] parallel_4.2.1   yaml_2.3.5       xfun_0.33        fastmap_1.1.0   
## [21] stringr_1.4.1    knitr_1.40       generics_0.1.3   vctrs_0.4.1     
## [25] sass_0.4.2       hms_1.1.2        bit64_4.0.5      tidyselect_1.1.2
## [29] glue_1.6.2       R6_2.5.1         fansi_1.0.3      vroom_1.5.7     
## [33] rmarkdown_2.16   tzdb_0.3.0       purrr_0.3.4      magrittr_2.0.3  
## [37] ellipsis_0.3.2   htmltools_0.5.3  assertthat_0.2.1 utf8_1.2.2      
## [41] stringi_1.7.8    cachem_1.0.6     crayon_1.5.1
# Start a timer, then preview the first 10 rows of the contest results file.
tictoc::tic()
nle2022 <- read_csv(
  file = "AllContests2022_05131447.csv",
  n_max = 10
)
## Rows: 10 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): PRECINCT_CODE, CONTEST_CODE, RECEPTION_DATE, CONTEST_NAME, CANDIDA...
## dbl  (3): VOTES_AMOUNT, ACTUALVOTERS, REGISTEREDVOTERS
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stop the timer started before the 10-row preview read.
# NOTE(review): a figure of 1.83 sec was previously noted on this line, but
# the recorded output below shows 0.08 sec; timings vary between runs.
tictoc::toc()
## 0.08 sec elapsed
dplyr::glimpse(nle2022)
## Rows: 10
## Columns: 13
## $ PRECINCT_CODE    <chr> "01010017", "01010017", "01010017", "01010017", "0101…
## $ CONTEST_CODE     <chr> "01199000", "01199000", "01199000", "01199000", "0119…
## $ VOTES_AMOUNT     <dbl> 0, 1, 0, 2, 5, 1, 0, 1, 0, 2
## $ ACTUALVOTERS     <dbl> 697, 697, 697, 697, 697, 697, 697, 697, 697, 697
## $ RECEPTION_DATE   <chr> "05/09/2022 - 08:07:08 PM", "05/09/2022 - 08:07:08 PM…
## $ CONTEST_NAME     <chr> "PARTY LIST PHILIPPINES", "PARTY LIST PHILIPPINES", "…
## $ CANDIDATE_NAME   <chr> "01 KAMALAYAN", "02 KM NGAYON NA", "03 PSIS", "04 AGA…
## $ REGION           <chr> "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR…
## $ PROVINCE         <chr> "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA…
## $ MUNICIPALITY     <chr> "BANGUED", "BANGUED", "BANGUED", "BANGUED", "BANGUED"…
## $ BARANGAY         <chr> "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTA…
## $ REGISTEREDVOTERS <dbl> 783, 783, 783, 783, 783, 783, 783, 783, 783, 783
## $ DISTRICT         <chr> "ABRA - LONE DISTRICT", "ABRA - LONE DISTRICT", "ABRA…
# Run this code only if your machine has 16 GB of RAM or more:
# the CSV file is about 5 GB.
# NOTE(review): the output recorded after glimpse() reports 10 rows, which
# matches the n_max = 10 preview rather than a full read; confirm whether
# this chunk was evaluated when the document was rendered.
tictoc::tic()
nle2022 <- read_csv(
  file = "AllContests2022_05131447.csv",
  show_col_types = FALSE
)
tictoc::toc() # 72.86 sec elapsed
dplyr::glimpse(nle2022)
## Rows: 10
## Columns: 13
## $ PRECINCT_CODE    <chr> "01010017", "01010017", "01010017", "01010017", "0101…
## $ CONTEST_CODE     <chr> "01199000", "01199000", "01199000", "01199000", "0119…
## $ VOTES_AMOUNT     <dbl> 0, 1, 0, 2, 5, 1, 0, 1, 0, 2
## $ ACTUALVOTERS     <dbl> 697, 697, 697, 697, 697, 697, 697, 697, 697, 697
## $ RECEPTION_DATE   <chr> "05/09/2022 - 08:07:08 PM", "05/09/2022 - 08:07:08 PM…
## $ CONTEST_NAME     <chr> "PARTY LIST PHILIPPINES", "PARTY LIST PHILIPPINES", "…
## $ CANDIDATE_NAME   <chr> "01 KAMALAYAN", "02 KM NGAYON NA", "03 PSIS", "04 AGA…
## $ REGION           <chr> "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR", "CAR…
## $ PROVINCE         <chr> "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA", "ABRA…
## $ MUNICIPALITY     <chr> "BANGUED", "BANGUED", "BANGUED", "BANGUED", "BANGUED"…
## $ BARANGAY         <chr> "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTANGAO", "AGTA…
## $ REGISTEREDVOTERS <dbl> 783, 783, 783, 783, 783, 783, 783, 783, 783, 783
## $ DISTRICT         <chr> "ABRA - LONE DISTRICT", "ABRA - LONE DISTRICT", "ABRA…
# Rough size estimate: value count (32057651 x 13) times 8 bytes each,
# divided by 1e9.
# NOTE(review): 32057651 is presumably the row count of the full file and 13
# its column count; 8 bytes per value presumably assumes every column is a
# double, and the result is presumably meant as gigabytes. Confirm.
32057651 * 13 * 8 / 1e9
## [1] 3.333996
# Attach hcandersenr and inspect its English data set, which has `text` and
# `book` columns.
library(hcandersenr)
dplyr::glimpse(hcandersen_en)
## Rows: 31,380
## Columns: 2
## $ text <chr> "A soldier came marching along the high road: \"Left, right - lef…
## $ book <chr> "The tinder-box", "The tinder-box", "The tinder-box", "The tinder…
# Distinct book titles in the French data set.
unique(hcandersen_fr[["book"]])
##  [1] "Le briquet"                                 
##  [2] "Grand Claus et petit Claus"                 
##  [3] "La princesse au petit pois"                 
##  [4] "Les fleurs de la petite Ida"                
##  [5] "La Petite Poucette"                         
##  [6] "Le compagnon de route"                      
##  [7] "La petite sirène"                           
##  [8] "Les habits neufs de l'empereur"             
##  [9] "La pâquerette"                              
## [10] "Le stoïque soldat de plomb"                 
## [11] "Les cygnes sauvages"                        
## [12] "Le jardin du paradis"                       
## [13] "La malle volante"                           
## [14] "Une semaine du petit elfe Ferme-l'œil"      
## [15] "La princesse et le porcher"                 
## [16] "L'ange"                                     
## [17] "Le Rossignol et l'Empereur de Chine"        
## [18] "Le vilain petit canard"                     
## [19] "Le sapin"                                   
## [20] "La reine des neiges"                        
## [21] "La fée du sureau"                           
## [22] "Le concours de saut"                        
## [23] "La bergère et le ramoneur"                  
## [24] "La cloche"                                  
## [25] "L'aiguille à repriser"                      
## [26] "La petite fille aux allumettes"             
## [27] "Le vieux réverbère"                         
## [28] "Les voisins"                                
## [29] "L'ombre"                                    
## [30] "La vieille maison"                          
## [31] "L'heureuse famille"                         
## [32] "Les amours d'un faux-col"                   
## [33] "Le chanvre"                                 
## [34] "Bonne humeur"                               
## [35] "Chacun et chaque chose à sa place"          
## [36] "Cinq dans une cosse de pois"                
## [37] "La tirelire"                                
## [38] "Hans le balourd"                            
## [39] "Le goulot de la bouteille"                  
## [40] "La soupe à la brochette"                    
## [41] "Quelque chose"                              
## [42] "Le dernier rêve du chêne"                   
## [43] "Les coureurs"                               
## [44] "Papotages d'enfants"                        
## [45] "La plume et l'encrier"                      
## [46] "Le coq de poulailler et le coq de girouette"
## [47] "Le papillon"                                
## [48] "Ce que le Père fait est bien fait"          
## [49] "Le bonhomme de neige"                       
## [50] "L'escargot et le rosier"                    
## [51] "Le schilling d'argent"                      
## [52] "Une rose de la tombe d'Homère"              
## [53] "Le crapaud"                                 
## [54] "Le montreur de marionnettes"                
## [55] "Le soleil raconte"                          
## [56] "Les aventures du chardon"                   
## [57] "Le bisaïeul"                                
## [58] "Le jardinier et ses maîtres"