# install.packages('tinytex')
# tinytex::install_tinytex()  # install TinyTeX

\[\int{\dfrac{5x^2}{3x+2}}dx\]

Reference

R Programming for Data Science by Roger D. Peng, May 31, 2022

Textbook

1 Reading and Writing Data

1.1 Functions for reading data in R

  • read.table, read.csv, for reading tabular data

  • readLines, for reading lines of a text file

  • source, for reading in R code files (inverse of dump)

  • dget, for reading in R code files (inverse of dput)

  • load, for reading in saved workspaces

  • unserialize, for reading single R objects in binary form

# Import the standings table from its CSV file, then summarise its columns.
team_standing <- read.csv(file = "team_standings.csv")
str(object = team_standing)
'data.frame':   32 obs. of  2 variables:
 $ Standing: int  1 2 3 4 5 6 7 8 9 10 ...
 $ Team    : chr  "Spain" "Netherlands" "Germany" "Uruguay" ...
# Print the whole table, then only its first ten and its last five rows.
team_standing
head(team_standing, n = 10)
tail(team_standing, n = 5)

# Read the text file with read.table(), supplying the column name explicitly.
little_mermaid <- read.table(
  file = "little_mermaid.txt",
  col.names = "The_Little_Mermaid"
)
little_mermaid

1.2 Functions for writing data in R

  • write.table, for writing tabular data to text files (e.g. CSV) or connections

  • writeLines, for writing character data line-by-line to a file or connection

  • dump, for dumping a textual representation of multiple R objects

  • dput, for outputting a textual representation of an R object

  • save, for saving an arbitrary number of R objects in binary format (possibly compressed) to a file.

  • serialize, for converting an R object into a binary format for outputting to a connection (or file).

# Summarise the columns of the built-in iris data set.
str(object = iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Display the full iris data set.
iris

# Save iris to "iris.csv" without a row-name column.
write.csv(iris, file = "iris.csv", row.names = FALSE)

# Summarise the columns of the built-in swiss data set.
str(swiss)
'data.frame':   47 obs. of  6 variables:
 $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
 $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
 $ Examination     : int  15 6 5 12 17 9 16 14 12 16 ...
 $ Education       : int  12 9 5 7 15 7 7 8 7 13 ...
 $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
 $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
# Write swiss as semicolon-separated text (row names omitted), then read the
# file back in with the matching separator and header setting.
write.table(swiss, file = "swiss.txt", sep = ";", row.names = FALSE)
swiss_v2 <- read.table("swiss.txt", header = TRUE, sep = ";")
swiss_v2

1.3 Reading in Larger Datasets

# Read only the first 100 data rows of the CSV as a quick sample, then
# preview the rows and inspect the column types that were inferred.
initial <- read.table(
  file = "president2022_05131447.csv",
  header = TRUE,
  sep = ",",
  nrows = 100
)
head(initial)
str(initial)
'data.frame':   100 obs. of  23 variables:
 $ PRECINCT_CODE      : int  1010017 1010017 1010017 1010017 1010017 1010017 1010017 1010017 1010017 1010017 ...
 $ CONTEST_CODE       : int  199000 199000 199000 199000 199000 199000 199000 199000 199000 199000 ...
 $ CANDIDATE_CODE     : num  9.9e+09 9.9e+09 9.9e+09 9.9e+09 9.9e+09 ...
 $ PARTY_CODE         : int  35 180 36 3 38 22 182 16 1 180 ...
 $ VOTES_AMOUNT       : int  1 0 0 7 5 17 616 0 3 31 ...
 $ TOTALIZATION_ORDER : int  4 1 2 3 5 6 7 8 9 10 ...
 $ NUMBER_VOTERS      : int  697 697 697 697 697 697 697 697 697 697 ...
 $ UNDERVOTE          : int  2 2 2 2 2 2 2 2 2 2 ...
 $ OVERVOTE           : int  15 15 15 15 15 15 15 15 15 15 ...
 $ RECEPTION_DATE     : chr  "05/09/2022 - 08:07:08 PM" "05/09/2022 - 08:07:08 PM" "05/09/2022 - 08:07:08 PM" "05/09/2022 - 08:07:08 PM" ...
 $ CONTEST_NAME       : chr  "PRESIDENT PHILIPPINES" "PRESIDENT PHILIPPINES" "PRESIDENT PHILIPPINES" "PRESIDENT PHILIPPINES" ...
 $ CANDIDATE_NAME     : chr  "GONZALES, NORBERTO (PDSP)" "ABELLA, ERNIE (IND)" "DE GUZMAN, LEODY (PLM)" "DOMAGOSO, ISKO MORENO (AKSYON)" ...
 $ PARTIES_NAME       : chr  "PARTIDO DEMOKRATIKO SOSYALISTA NG PILIPINAS" "INDEPENDENT" "PARTIDO LAKAS NG MASA" "AKSYON DEMOKRATIKO" ...
 $ PARTIES_ALIAS      : chr  "PDSP" "IND" "PLM" "AKSYON" ...
 $ REGION             : chr  "CAR" "CAR" "CAR" "CAR" ...
 $ PROVINCE           : chr  "ABRA" "ABRA" "ABRA" "ABRA" ...
 $ MUNICIPALITY       : chr  "BANGUED" "BANGUED" "BANGUED" "BANGUED" ...
 $ BARANGAY           : chr  "AGTANGAO" "AGTANGAO" "AGTANGAO" "AGTANGAO" ...
 $ CLUSTER            : int  17 17 17 17 17 17 17 17 17 17 ...
 $ CLUSTERTOTAL       : int  783 783 783 783 783 783 783 783 783 783 ...
 $ CLUSTERED_PRECINCTS: chr  "0045A, 0045B, 0050A, 0050B" "0045A, 0045B, 0050A, 0050B" "0045A, 0045B, 0050A, 0050B" "0045A, 0045B, 0050A, 0050B" ...
 $ POLLINGCENTER      : chr  "AGTANGAO ELEMENTARY SCHOOL, AGTANGAO, BANGUED, ABRA" "AGTANGAO ELEMENTARY SCHOOL, AGTANGAO, BANGUED, ABRA" "AGTANGAO ELEMENTARY SCHOOL, AGTANGAO, BANGUED, ABRA" "AGTANGAO ELEMENTARY SCHOOL, AGTANGAO, BANGUED, ABRA" ...
 $ DISTRICT           : chr  "ABRA - LONE DISTRICT" "ABRA - LONE DISTRICT" "ABRA - LONE DISTRICT" "ABRA - LONE DISTRICT" ...
# Record the class inferred for each column of the 100-row sample.
# vapply() always yields a named character vector (one class per column) and
# fails loudly if a column ever reports more than one class, whereas sapply()
# would silently return a list in that case.
classes <- vapply(initial, class, character(1))
classes
      PRECINCT_CODE        CONTEST_CODE      CANDIDATE_CODE          PARTY_CODE        VOTES_AMOUNT 
          "integer"           "integer"           "numeric"           "integer"           "integer" 
 TOTALIZATION_ORDER       NUMBER_VOTERS           UNDERVOTE            OVERVOTE      RECEPTION_DATE 
          "integer"           "integer"           "integer"           "integer"         "character" 
       CONTEST_NAME      CANDIDATE_NAME        PARTIES_NAME       PARTIES_ALIAS              REGION 
        "character"         "character"         "character"         "character"         "character" 
           PROVINCE        MUNICIPALITY            BARANGAY             CLUSTER        CLUSTERTOTAL 
        "character"         "character"         "character"           "integer"           "integer" 
CLUSTERED_PRECINCTS       POLLINGCENTER            DISTRICT 
        "character"         "character"         "character" 
# Time a full read of the CSV: tic() starts the timer, toc() reports the
# elapsed time once read.csv() has finished.
library(tictoc)
tic()
pres2022 <- read.csv(file = "president2022_05131447.csv")
toc()
25.92 sec elapsed
# Number of rows and columns in the full data set.
dim(x = pres2022)
[1] 1060080      23
# Column names of the full data set.
names(x = pres2022)
 [1] "PRECINCT_CODE"       "CONTEST_CODE"        "CANDIDATE_CODE"      "PARTY_CODE"         
 [5] "VOTES_AMOUNT"        "TOTALIZATION_ORDER"  "NUMBER_VOTERS"       "UNDERVOTE"          
 [9] "OVERVOTE"            "RECEPTION_DATE"      "CONTEST_NAME"        "CANDIDATE_NAME"     
[13] "PARTIES_NAME"        "PARTIES_ALIAS"       "REGION"              "PROVINCE"           
[17] "MUNICIPALITY"        "BARANGAY"            "CLUSTER"             "CLUSTERTOTAL"       
[21] "CLUSTERED_PRECINCTS" "POLLINGCENTER"       "DISTRICT"           
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
# Total votes per candidate within the province of Negros Oriental,
# listed from the highest total to the lowest.
# Verbs are namespace-qualified because dplyr::filter() masks stats::filter().
pres2022 %>%
  dplyr::filter(PROVINCE == "NEGROS ORIENTAL") %>%
  dplyr::group_by(CANDIDATE_NAME) %>%
  dplyr::summarise(VOTES_AMOUNT = sum(VOTES_AMOUNT)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(dplyr::desc(VOTES_AMOUNT))

1.4 Calculating Memory Requirements

# Rough in-memory size estimate: rows x columns x 8 bytes, i.e. treating
# every cell as an 8-byte numeric value.
paste(prod(1060080, 23, 8), "bytes")
[1] "195054720 bytes"
# Convert the byte count to megabytes (1 MB taken as 2^20 bytes).
paste(195054720 / 2^20, "MB")
[1] "186.018676757812 MB"
LS0tDQp0aXRsZTogIkdldHRpbmcgRGF0YSBJbiBhbmQgT3V0IG9mIFIiDQpzdWJ0aXRsZTogIlIgUHJvZ3JhbW1pbmcgZm9yIERhdGEgU2NpZW5jZSINCmF1dGhvcjogIlJvZ2VyIEQuIFBlbmciDQpkYXRlOiAiU2VwdGVtYmVyIDE5LCAyMDIyIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50OiANCiAgICB0b2M6IHllcw0KICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQ0KICBwZGZfZG9jdW1lbnQ6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZGVwdGg6IDMNCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUNCiAgaHRtbF9ub3RlYm9vazogDQogICAgdG9jOiB5ZXMNCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUNCiAgd29yZF9kb2N1bWVudDoNCiAgICB0b2M6IHllcw0KICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQ0KLS0tDQoNCmBgYHtyIGV2YWw9RkFMU0V9DQojIGluc3RhbGwucGFja2FnZXMoJ3Rpbnl0ZXgnKQ0KIyB0aW55dGV4OjppbnN0YWxsX3Rpbnl0ZXgoKSAgIyBpbnN0YWxsIFRpbnlUZVgNCmBgYA0KDQokJFxpbnR7XGRmcmFjezV4XjJ9ezN4KzJ9fWR4JCQNCg0KDQojIFJlZmVyZW5jZSB7LX0NCg0KW1IgUHJvZ3JhbW1pbmcgZm9yIERhdGEgU2NpZW5jZV0oaHR0cHM6Ly9ib29rZG93bi5vcmcvcmRwZW5nL3Jwcm9nZGF0YXNjaWVuY2UvKQ0KYnkgUm9nZXIgRC4gUGVuZywgTWF5IDMxLCAyMDIyDQoNCiFbVGV4dGJvb2tdKGJvb2tjb3Zlci5wbmcpDQoNCiMgUmVhZGluZyBhbmQgV3JpdGluZyBEYXRhDQoNCiMjICAgIEZ1bmN0aW9ucyBmb3IgcmVhZGluZyBkYXRhIGluIFINCg0KKiAgIGByZWFkLnRhYmxlYCwgYHJlYWQuY3N2YCwgZm9yIHJlYWRpbmcgdGFidWxhciBkYXRhDQoJDQoqICAgYHJlYWRMaW5lc2AsIGZvciByZWFkaW5nIGxpbmVzIG9mIGEgdGV4dCBmaWxlDQoJDQoqICAgYHNvdXJjZWAsIGZvciByZWFkaW5nIGluIFIgY29kZSBmaWxlcyAoaW52ZXJzZSBvZiBkdW1wKQ0KCQ0KKiAgIGBkZ2V0YCwgZm9yIHJlYWRpbmcgaW4gUiBjb2RlIGZpbGVzIChpbnZlcnNlIG9mIGRwdXQpDQoJDQoqICAgYGxvYWRgLCBmb3IgcmVhZGluZyBpbiBzYXZlZCB3b3Jrc3BhY2VzDQoJDQoqICAgYHVuc2VyaWFsaXplYCwgZm9yIHJlYWRpbmcgc2luZ2xlIFIgb2JqZWN0cyBpbiBiaW5hcnkgZm9ybQ0KDQpgYGB7cn0NCnRlYW1fc3RhbmRpbmcgPC0gcmVhZC5jc3YoInRlYW1fc3RhbmRpbmdzLmNzdiIpDQpzdHIodGVhbV9zdGFuZGluZykNCmBgYA0KDQpgYGB7cn0NCnRlYW1fc3RhbmRpbmcNCmBgYA0KDQpgYGB7cn0NCmhlYWQodGVhbV9zdGFuZGluZywgMTApDQpgYGANCg0KYGBge3J9DQp0YWlsKHRlYW1fc3RhbmRpbmcsNSkNCmBgYA0KDQoNCmBgYHtyfQ0KbGl0dGxlX21lcm1haWQgPC0gcmVhZC50YWJsZSgibGl0dGxlX21lcm1haWQudHh0IiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29sLm5hbWVzID0gIlRoZV9MaXR0bGVfTWVybWFpZCIpDQpsaXR0bGVfbWVybWFpZA0KYGBgDQoNCiMjICAgIEZ1bmN0aW9ucyBm
b3Igd3JpdGluZyBkYXRhIGluIFINCg0KKiAgIGB3cml0ZS50YWJsZWAsIGZvciB3cml0aW5nIHRhYnVsYXIgZGF0YSB0byB0ZXh0IGZpbGVzIChpLmUuIENTVikgb3IgY29ubmVjdGlvbnMNCgkNCiogICBgd3JpdGVMaW5lc2AsIGZvciB3cml0aW5nIGNoYXJhY3RlciBkYXRhIGxpbmUtYnktbGluZSB0byBhIGZpbGUgb3IgY29ubmVjdGlvbg0KCQ0KKiAgIGBkdW1wYCwgZm9yIGR1bXBpbmcgYSB0ZXh0dWFsIHJlcHJlc2VudGF0aW9uIG9mIG11bHRpcGxlIFIgb2JqZWN0cw0KCQ0KKiAgIGBkcHV0YCwgZm9yIG91dHB1dHRpbmcgYSB0ZXh0dWFsIHJlcHJlc2VudGF0aW9uIG9mIGFuIFIgb2JqZWN0DQoJDQoqICAgYHNhdmVgLCBmb3Igc2F2aW5nIGFuIGFyYml0cmFyeSBudW1iZXIgb2YgUiBvYmplY3RzIGluIGJpbmFyeSBmb3JtYXQgKHBvc3NpYmx5IGNvbXByZXNzZWQpIHRvIGEgZmlsZS4NCgkNCiogICBgc2VyaWFsaXplYCwgZm9yIGNvbnZlcnRpbmcgYW4gUiBvYmplY3QgaW50byBhIGJpbmFyeSBmb3JtYXQgZm9yIG91dHB1dHRpbmcgdG8gYSBjb25uZWN0aW9uIChvciBmaWxlKS4NCg0KYGBge3J9DQpzdHIoaXJpcykNCmBgYA0KYGBge3J9DQppcmlzDQpgYGANCg0KDQoNCg0KYGBge3J9DQp3cml0ZS5jc3YoeCA9IGlyaXMsDQogICAgICAgICAgZmlsZSA9ICJpcmlzLmNzdiIsDQogICAgICAgICAgcm93Lm5hbWVzID0gRkFMU0UpDQpgYGANCg0KDQpgYGB7cn0NCnN0cihzd2lzcykNCmBgYA0KDQpgYGB7cn0NCndyaXRlLnRhYmxlKHggPSBzd2lzcywNCiAgICAgICAgICAgIGZpbGUgPSAic3dpc3MudHh0IiwNCiAgICAgICAgICAgIHNlcCA9ICI7IiwNCiAgICAgICAgICAgIHJvdy5uYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCmBgYHtyfQ0Kc3dpc3NfdjIgPC0gcmVhZC50YWJsZShmaWxlID0gInN3aXNzLnR4dCIsDQogICAgICAgICAgICAgICAgICAgICAgIGhlYWRlciA9IFRSVUUsDQogICAgICAgICAgICAgICAgICAgICAgIHNlcCA9ICI7IikNCnN3aXNzX3YyDQpgYGANCg0KDQojIyBSZWFkaW5nIGluIExhcmdlciBEYXRhc2V0cyANCg0KYGBge3J9DQppbml0aWFsIDwtIHJlYWQudGFibGUoInByZXNpZGVudDIwMjJfMDUxMzE0NDcuY3N2IiwNCiAgICAgICAgICAgICAgICAgICAgICBzZXAgPSAiLCIsDQogICAgICAgICAgICAgICAgICAgICAgbnJvd3MgPSAxMDAsDQogICAgICAgICAgICAgICAgICAgICAgaGVhZGVyID0gVFJVRSkNCmBgYA0KDQpgYGB7cn0NCmhlYWQoaW5pdGlhbCkNCmBgYA0KDQpgYGB7cn0NCnN0cihpbml0aWFsKQ0KYGBgDQpgYGB7cn0NCmNsYXNzZXMgPC0gc2FwcGx5KGluaXRpYWwsIGNsYXNzKQ0KY2xhc3Nlcw0KYGBgDQpgYGB7cn0NCmxpYnJhcnkodGljdG9jKQ0KYGBgDQoNCmBgYHtyfQ0KdGljKCkNCnByZXMyMDIyIDwtIHJlYWQuY3N2KCJwcmVzaWRlbnQyMDIyXzA1MTMxNDQ3LmNzdiIpDQp0b2MoKQ0KYGBgDQpgYGB7cn0NCmRpbShwcmVzMjAyMikNCmBgYA0KDQoNCmBgYHtyfQ0KbmFtZXMocHJlczIw
MjIpDQpgYGANCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpwcmVzMjAyMiAlPiUNCiAgZmlsdGVyKFBST1ZJTkNFID09ICJORUdST1MgT1JJRU5UQUwiKSAlPiUNCiAgZ3JvdXBfYnkoQ0FORElEQVRFX05BTUUpICU+JQ0KICBzdW1tYXJpc2UoVk9URVNfQU1PVU5UID0gc3VtKFZPVEVTX0FNT1VOVCkpICU+JQ0KICB1bmdyb3VwKCkgJT4lDQogIGFycmFuZ2UoZGVzYyhWT1RFU19BTU9VTlQpKQ0KYGBgDQoNCg0KDQojIyBDYWxjdWxhdGluZyBNZW1vcnkgUmVxdWlyZW1lbnRzDQoNCmBgYHtyfQ0KcGFzdGUoMTA2MDA4MCAqIDIzICogOCwgImJ5dGVzIikNCmBgYA0KYGBge3J9DQpwYXN0ZSgxOTUwNTQ3MjAgLyAyXnsyMH0sICJNQiIpDQpgYGANCg0K