if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(tidyverse, DBI, RPostgres)
con <- dbConnect(
RPostgres::Postgres(),
dbname = 'sms',
host = 'localhost',
port = 5432,
user = 'Shaloom',
password = 'Shaloom_50095'
)
dbListTables(con)
## [1] "posts" "users"
dbListFields(con, "users")
## [1] "id" "username" "email" "password" "created_at"
my_tb <- dbReadTable(con, "users")
my_query <- dbGetQuery(con, "SELECT * FROM users WHERE username = 'Shaloom' ")
summary(my_tb)
## id username email password
## Length : 4 Length :4 Length : 4 Length : 4
## N.unique : 4 N.unique :4 N.unique : 4 N.unique : 4
## N.blank : 0 N.blank :0 N.blank : 0 N.blank : 0
## Min.nchar:36 Min.nchar:5 Min.nchar:15 Min.nchar:162
## Max.nchar:36 Max.nchar:7 Max.nchar:17 Max.nchar:162
##
## created_at
## Min. :2026-05-08 10:59:25
## 1st Qu.:2026-05-08 11:01:39
## Median :2026-05-08 11:02:32
## Mean :2026-05-08 11:01:56
## 3rd Qu.:2026-05-08 11:02:49
## Max. :2026-05-08 11:03:14
View(my_query)
library(readr)
billboard <- read_csv("../data/charts.csv")
## Rows: 330087 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): song, artist
## dbl (4): rank, last-week, peak-rank, weeks-on-board
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(billboard) # getting number of row(observation) in dataset
## [1] 330087
ncol(billboard) # getting number of column(variable) in dataset
## [1] 7
library(haven)
survey <- read_sav("../data/survey.sav")
summary(survey)
## id sex age marital
## Min. : 1.0 Min. :1.000 Min. :18.00 Min. :1.000
## 1st Qu.:131.5 1st Qu.:1.000 1st Qu.:26.00 1st Qu.:2.000
## Median :280.0 Median :2.000 Median :36.00 Median :4.000
## Mean :276.4 Mean :1.579 Mean :37.44 Mean :3.371
## 3rd Qu.:422.5 3rd Qu.:2.000 3rd Qu.:47.00 3rd Qu.:4.000
## Max. :702.0 Max. :2.000 Max. :82.00 Max. :8.000
##
## child educ source smoke
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:2.000
## Median :2.000 Median :4.000 Median :1.00 Median :2.000
## Mean :1.578 Mean :4.087 Mean :3.41 Mean :1.805
## 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:6.00 3rd Qu.:2.000
## Max. :2.000 Max. :6.000 Max. :9.00 Max. :2.000
## NAs :1 NAs :17 NAs :3
## smokenum op1 op2 op3 op4
## Min. : 0.00 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000
## 1st Qu.: 0.00 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.0 1st Qu.:1.000
## Median : 0.00 Median :3.000 Median :2.000 Median :4.0 Median :2.000
## Mean : 16.59 Mean :3.264 Mean :2.284 Mean :3.7 Mean :2.259
## 3rd Qu.: 0.00 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.0 3rd Qu.:3.000
## Max. :280.00 Max. :5.000 Max. :5.000 Max. :5.0 Max. :5.000
## NAs :79 NAs :4 NAs :3 NAs :3 NAs :3
## op5 op6 mast1 mast2 mast3
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000
## Median :4.00 Median :2.000 Median :2.000 Median :3.000 Median :2.000
## Mean :3.94 Mean :2.266 Mean :2.037 Mean :3.323 Mean :1.959
## 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.00 Max. :5.000 Max. :4.000 Max. :5.000 Max. :4.000
## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3
## mast4 mast5 mast6 mast7
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :4.000 Median :2.000 Median :2.000
## Mean :1.924 Mean :3.411 Mean :1.956 Mean :2.094
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## NAs :3 NAs :3 NAs :3 NAs :3
## pn1 pn2 pn3 pn4
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :4.000 Median :2.000 Median :1.000 Median :3.000
## Mean :3.789 Mean :2.562 Mean :1.734 Mean :3.163
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :3 NAs :3 NAs :3 NAs :3
## pn5 pn6 pn7 pn8
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000
## Median :1.000 Median :4.000 Median :4.000 Median :2.000
## Mean :1.411 Mean :3.727 Mean :3.596 Mean :2.202
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :3 NAs :3 NAs :3 NAs :4
## pn9 pn10 pn11 pn12 pn13
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :2.00 Median :2.000 Median :3.000 Median :3.000
## Mean :3.294 Mean :1.86 Mean :2.422 Mean :2.872 Mean :3.261
## 3rd Qu.:4.000 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3
## pn14 pn15 pn16 pn17
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :1.000 Median :3.000 Median :1.000 Median :3.000
## Mean :1.661 Mean :3.188 Mean :1.674 Mean :3.376
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :3 NAs :3 NAs :3 NAs :3
## pn18 pn19 pn20 lifsat1
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000
## Median :3.500 Median :2.000 Median :1.000 Median :5.000
## Mean :3.424 Mean :2.186 Mean :1.722 Mean :4.374
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :7.000
## NAs :3 NAs :3 NAs :3 NAs :3
## lifsat2 lifsat3 lifsat4 lifsat5 pss1
## Min. :1.000 Min. :1.00 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.00 1st Qu.:4.00 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :5.00 Median :5.00 Median :4.000 Median :3.000
## Mean :4.571 Mean :4.69 Mean :4.75 Mean :3.993 Mean :2.836
## 3rd Qu.:6.000 3rd Qu.:6.00 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:3.000
## Max. :7.000 Max. :7.00 Max. :7.00 Max. :7.000 Max. :5.000
## NAs :3 NAs :3 NAs :3 NAs :3 NAs :6
## pss2 pss3 pss4 pss5
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :3.000 Median :4.000 Median :4.000
## Mean :2.739 Mean :3.162 Mean :3.801 Mean :3.467
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :6 NAs :6 NAs :6 NAs :6
## pss6 pss7 pss8 pss9
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000
## Median :3.000 Median :4.000 Median :4.000 Median :3.000
## Mean :2.774 Mean :3.538 Mean :3.536 Mean :3.046
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :6 NAs :6 NAs :6 NAs :6
## pss10 sest1 sest2 sest3
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:1.000
## Median :2.000 Median :4.000 Median :4.000 Median :1.000
## Mean :2.513 Mean :3.672 Mean :3.745 Mean :1.376
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :4.000 Max. :4.000 Max. :4.000
## NAs :6 NAs :3 NAs :3 NAs :3
## sest4 sest5 sest6 sest7 sest8
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.00
## Median :4.000 Median :1.000 Median :3.000 Median :2.000 Median :3.00
## Mean :3.408 Mean :1.596 Mean :3.278 Mean :2.044 Mean :3.17
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.00
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.00
## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3
## sest9 sest10 m1 m2
## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :2.000 Median :1.000 Median :1.0000 Median :1.0000
## Mean :2.039 Mean :1.713 Mean :0.7294 Mean :0.9014
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :1.0000 Max. :1.0000
## NAs :3 NAs :3 NAs :3 NAs :3
## m3 m4 m5 m6
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.8624 Mean :0.4299 Mean :0.5069 Mean :0.2248
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NAs :3 NAs :4 NAs :3 NAs :3
## m7 m8 m9 m10
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.4862 Mean :0.5793 Mean :0.2248 Mean :0.3609
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NAs :3 NAs :4 NAs :3 NAs :4
## pc1 pc2 pc3 pc4
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :2.000 Median :3.000 Median :4.000 Median :4.000
## Mean :2.545 Mean :2.758 Mean :3.575 Mean :3.464
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :6 NAs :6 NAs :6 NAs :6
## pc5 pc6 pc7 pc8
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :4.000 Median :3.000 Median :3.000 Median :3.000
## Mean :3.714 Mean :3.282 Mean :2.905 Mean :3.072
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :6 NAs :7 NAs :6 NAs :6
## pc9 pc10 pc11 pc12 pc13
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.0 1st Qu.:3.000 1st Qu.:2.000
## Median :4.000 Median :4.000 Median :2.0 Median :3.000 Median :3.000
## Mean :3.575 Mean :3.596 Mean :2.7 Mean :3.346 Mean :3.254
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.0 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.0 Max. :5.000 Max. :5.000
## NAs :6 NAs :6 NAs :6 NAs :6 NAs :6
## pc14 pc15 pc16 pc17 pc18
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.00
## Median :3.00 Median :2.000 Median :2.000 Median :3.000 Median :4.00
## Mean :2.94 Mean :2.619 Mean :2.443 Mean :3.171 Mean :3.61
## 3rd Qu.:4.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.00
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
## NAs :6 NAs :6 NAs :6 NAs :6 NAs :6
## Rop2 Rop4 Rop6 Rmast1
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000
## Median :4.000 Median :4.000 Median :4.000 Median :3.000
## Mean :3.716 Mean :3.741 Mean :3.734 Mean :2.963
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :4.000
## NAs :3 NAs :3 NAs :3 NAs :3
## Rmast3 Rmast4 Rmast6 Rmast7
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000 Median :3.000
## Mean :3.041 Mean :3.076 Mean :3.044 Mean :2.906
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## NAs :3 NAs :3 NAs :3 NAs :3
## Rpss4 Rpss5 Rpss7 Rpss8
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :2.199 Mean :2.533 Mean :2.462 Mean :2.464
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NAs :6 NAs :6 NAs :6 NAs :6
## Rsest3 Rsest5 Rsest7 Rsest9
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000
## Median :4.000 Median :4.000 Median :3.000 Median :3.000
## Mean :3.624 Mean :3.404 Mean :2.956 Mean :2.961
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
## NAs :3 NAs :3 NAs :3 NAs :3
## Rsest10 Rpc1 Rpc2 Rpc7 Rpc11
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.0
## Median :4.000 Median :4.000 Median :3.000 Median :3.000 Median :4.0
## Mean :3.287 Mean :3.455 Mean :3.242 Mean :3.095 Mean :3.3
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.0
## Max. :4.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.0
## NAs :3 NAs :6 NAs :6 NAs :6 NAs :6
## Rpc15 Rpc16 toptim tmast
## Min. :1.000 Min. :1.000 Min. : 7.00 Min. : 8.00
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:19.00
## Median :4.000 Median :4.000 Median :22.00 Median :22.00
## Mean :3.381 Mean :3.557 Mean :22.12 Mean :21.76
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:26.00 3rd Qu.:25.00
## Max. :5.000 Max. :5.000 Max. :30.00 Max. :28.00
## NAs :6 NAs :6 NAs :4 NAs :3
## tposaff tnegaff tlifesat tpstress tslfest
## Min. :11.00 Min. :10.0 Min. : 5.00 Min. :12.00 Min. :18.00
## 1st Qu.:29.00 1st Qu.:14.0 1st Qu.:18.00 1st Qu.:23.00 1st Qu.:30.00
## Median :34.00 Median :18.0 Median :23.00 Median :26.00 Median :35.00
## Mean :33.69 Mean :19.4 Mean :22.38 Mean :26.73 Mean :33.53
## 3rd Qu.:39.00 3rd Qu.:23.0 3rd Qu.:27.00 3rd Qu.:31.00 3rd Qu.:38.00
## Max. :49.00 Max. :39.0 Max. :35.00 Max. :46.00 Max. :40.00
## NAs :3 NAs :4 NAs :3 NAs :6 NAs :3
## tmarlow tpcoiss agegp3 agegp5
## Min. : 0.000 Min. :20.00 Min. :1.000 Min. :1.000
## 1st Qu.: 4.000 1st Qu.:53.00 1st Qu.:1.000 1st Qu.:2.000
## Median : 5.000 Median :62.00 Median :2.000 Median :3.000
## Mean : 5.305 Mean :60.63 Mean :1.973 Mean :2.954
## 3rd Qu.: 7.000 3rd Qu.:69.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :10.000 Max. :88.00 Max. :3.000 Max. :5.000
## NAs :6 NAs :9
## educrec LG10negaff MAH_1 COO_1
## Min. :1.000 Min. :1.000 Min. : 0.003542 Min. :1.000e-09
## 1st Qu.:2.000 1st Qu.:1.146 1st Qu.: 0.534432 1st Qu.:1.293e-04
## Median :3.000 Median :1.255 Median : 1.322372 Median :5.578e-04
## Mean :3.091 Mean :1.261 Mean : 1.993438 Mean :3.031e-03
## 3rd Qu.:4.000 3rd Qu.:1.362 3rd Qu.: 2.538417 3rd Qu.:2.413e-03
## Max. :5.000 Max. :1.663 Max. :13.896642 Max. :9.449e-02
## NAs :4 NAs :10 NAs :13
## MAH_2
## Min. : 0.09048
## 1st Qu.: 1.11653
## Median : 1.99169
## Mean : 2.99306
## 3rd Qu.: 3.86184
## Max. :18.10064
## NAs :7
sum(is.na(survey$source))
## [1] 17
nrow(billboard) # getting number of row(observation) in dataset
## [1] 330087
ncol(billboard) # getting number of column(variable) in dataset
## [1] 7
In R, piping is a coding technique used to chain operations together in a sequence, making code much easier to read and maintain. Instead of nesting functions inside each other, which forces you to read from the inside out. piping allows you to read from left to right or top to bottom.
The native pipe |> requires parentheses after the function name (e.g., x |> mean()), whereas %>% can accept the bare function name (e.g., x %>% mean).
If you need to pass the data to a position other than the first argument:
The native pipe |> is slightly faster because it is handled directly by the R parser rather than as a function call.
billboard %>% head(10) # getting 10 first record in data set using %>%
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
Common Selection Methods
Advanced Helper functions for select
You can use selection helpers to find columns based on patterns:
names(billboard)
## [1] "date" "rank" "song" "artist"
## [5] "last-week" "peak-rank" "weeks-on-board"
billboard |> select(date, rank, song, artist, "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
billboard |> select(date:artist, "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
billboard |> select(1:4, "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
billboard |> select(-"last-week", -"peak-rank")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
billboard |> select(date:artist, week_popular = "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist week_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & Young T… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
Advanced Helper functions for select
You can combine mutate() with other dplyr functions for complex data transformations:
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
mutate(is_collabo = grepl("featuring", artist, ignore.case = T)) |>
select(artist, is_collabo, everything())
## # A tibble: 330,087 × 6
## artist is_collabo date rank song week_popular
## <chr> <lgl> <date> <dbl> <chr> <dbl>
## 1 Adele FALSE 2021-11-06 1 Easy… 3
## 2 The Kid LAROI & Justin Bieber FALSE 2021-11-06 2 Stay 16
## 3 Lil Nas X & Jack Harlow FALSE 2021-11-06 3 Indu… 14
## 4 Walker Hayes FALSE 2021-11-06 4 Fanc… 19
## 5 Ed Sheeran FALSE 2021-11-06 5 Bad … 18
## 6 Drake Featuring Future & Youn… TRUE 2021-11-06 6 Way … 8
## 7 Ed Sheeran FALSE 2021-11-06 7 Shiv… 7
## 8 Olivia Rodrigo FALSE 2021-11-06 8 Good… 24
## 9 Doja Cat FALSE 2021-11-06 9 Need… 20
## 10 Dua Lipa FALSE 2021-11-06 10 Levi… 56
## # ℹ 330,077 more rows
The basic syntax is filter(.data, condition).
Common Filtering Methods
Advanced Helpers
You can use specialized functions inside filter() to handle missing data, strings, or complex rows:
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(week_popular >= 20, artist == "Drake" | artist == "The Weeknd")
## # A tibble: 236 × 5
## date rank song artist week_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-09-04 20 Blinding Lights The Weeknd 90
## 2 2021-08-28 21 Blinding Lights The Weeknd 89
## 3 2021-08-21 18 Blinding Lights The Weeknd 88
## 4 2021-08-14 16 Blinding Lights The Weeknd 87
## 5 2021-08-07 17 Blinding Lights The Weeknd 86
## 6 2021-07-31 17 Blinding Lights The Weeknd 85
## 7 2021-07-24 17 Blinding Lights The Weeknd 84
## 8 2021-07-17 15 Blinding Lights The Weeknd 83
## 9 2021-07-10 18 Blinding Lights The Weeknd 82
## 10 2021-07-03 15 Blinding Lights The Weeknd 81
## # ℹ 226 more rows
The basic syntax is distinct(.data, …, .keep_all = FALSE).
Common Deduplication Methods
Advanced Helpers & Tricks
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(artist == "Drake")
## # A tibble: 787 × 5
## date rank song artist week_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 91 No Friends In The Industry Drake 8
## 2 2021-10-30 87 No Friends In The Industry Drake 7
## 3 2021-10-30 90 Champagne Poetry Drake 7
## 4 2021-10-23 74 No Friends In The Industry Drake 6
## 5 2021-10-23 77 Champagne Poetry Drake 6
## 6 2021-10-16 64 No Friends In The Industry Drake 5
## 7 2021-10-16 65 Champagne Poetry Drake 5
## 8 2021-10-16 98 TSU Drake 5
## 9 2021-10-09 54 Champagne Poetry Drake 4
## 10 2021-10-09 60 No Friends In The Industry Drake 4
## # ℹ 777 more rows
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(artist == "Drake") |>
distinct(song)
## # A tibble: 108 × 1
## song
## <chr>
## 1 No Friends In The Industry
## 2 Champagne Poetry
## 3 TSU
## 4 Pipe Down
## 5 Papi's Home
## 6 Race My Mind
## 7 7am On Bridle Path
## 8 Fucking Fans
## 9 The Remorse
## 10 What's Next
## # ℹ 98 more rows
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(artist == "Drake") |>
distinct(song, .keep_all = T)
## # A tibble: 108 × 5
## date rank song artist week_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 91 No Friends In The Industry Drake 8
## 2 2021-10-30 90 Champagne Poetry Drake 7
## 3 2021-10-16 98 TSU Drake 5
## 4 2021-10-09 96 Pipe Down Drake 4
## 5 2021-10-09 97 Papi's Home Drake 4
## 6 2021-10-02 89 Race My Mind Drake 3
## 7 2021-10-02 92 7am On Bridle Path Drake 3
## 8 2021-09-18 32 Fucking Fans Drake 1
## 9 2021-09-18 35 The Remorse Drake 1
## 10 2021-07-17 97 What's Next Drake 16
## # ℹ 98 more rows
The basic syntax is group_by(.data, …).
Common Grouping Methods
group_by() is almost always paired with one of these downstream functions:
df %>% group_by(category) %>% summarise(avg_price = mean(price), total_sales = n())
ex: Calculates each employee’s salary relative to their department’s average
df %>% group_by(department) %>% mutate(pct_of_dept_avg = salary / mean(salary))
With filter() (Per-Group Filtering): Filters rows based on a group-level condition:
ex: Keeps only the highest-selling transaction for each store
df %>% group_by(store_id) %>% filter(sales == max(sales))
NB: Always ungroup(): Grouping sticks to the data frame. If you don’t remove it, future operations will accidentally run by group and cause strange bugs. Always end your pipeline with ungroup(): df %>% group_by(region) %>% summarise(total = sum(sales)) %>% ungroup()
The basic syntax is summarise(.data, new_column = function(existing_column)).
Common Summary Functions * Center / Average: Use mean() or median(): summarise(df, avg_age = mean(age)). * Spread: Use sd() or IQR(): summarise(df, spread = sd(score)). * Range / Extremes: Use min() or max(): summarise(df, highest = max(sales)). * Counts: Use n() for total rows, or n_distinct() for unique values: summarise(df, total_rows = n(), unique_users = n_distinct(user_id)).
Advanced Helpers
df %>% group_by(region, year) %>% summarise(total = sum(sales), .groups = “drop”)
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(artist == "Drake") |>
group_by(song) |>
summarise(total_week_popular = max(week_popular))
## # A tibble: 108 × 2
## song total_week_popular
## <chr> <dbl>
## 1 0 To 100 / The Catch Up 20
## 2 10 Bands 13
## 3 30 For 30 Freestyle 2
## 4 6 God 1
## 5 6 Man 1
## 6 7am On Bridle Path 3
## 7 8 Out Of 10 3
## 8 9 5
## 9 9 AM In Dallas 1
## 10 Back To Back 20
## # ℹ 98 more rows
library(dplyr)
billboard |>
group_by(artist) |>
summarise(count = n())
## # A tibble: 10,205 × 2
## artist count
## <chr> <int>
## 1 "\"Groove\" Holmes" 14
## 2 "\"Little\" Jimmy Dickens" 10
## 3 "\"Pookie\" Hudson" 1
## 4 "\"Weird Al\" Yankovic" 91
## 5 "'N Sync" 172
## 6 "'N Sync & Gloria Estefan" 20
## 7 "'N Sync Featuring Nelly" 20
## 8 "'Til Tuesday" 53
## 9 "(+44)" 1
## 10 "(The Preacher) Bobby Womack" 9
## # ℹ 10,195 more rows
The basic syntax is arrange(.data, column1, column2).
Common Sorting Methods
Advanced Helpers & Arguments
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
filter(artist == "Drake") |>
group_by(song) |>
summarise(total_week_popular = max(week_popular)) |>
arrange(desc(total_week_popular), song) |>
head(10)
## # A tibble: 10 × 2
## song total_week_popular
## <chr> <dbl>
## 1 God's Plan 36
## 2 Hotline Bling 36
## 3 Controlla 26
## 4 Fake Love 25
## 5 Headlines 25
## 6 Nice For What 25
## 7 Best I Ever Had 24
## 8 In My Feelings 22
## 9 Nonstop 22
## 10 Started From The Bottom 22
The basic syntax is count(.data, …, wt = NULL, sort = FALSE, name = NULL).
Common Counting Methods
Single Variable: Count how many times each unique value appears: count(df, country).
Multiple Variables: Count unique combinations across columns: count(df, country, status).
Auto-Sorting: Set sort = TRUE to immediately sort the results from highest to lowest frequency: count(df, country, sort = TRUE).
Custom Column Name: By default, the counts are saved in a column named n. Use the name argument to change it: count(df, country, name = “total_users”).
Advanced Helpers
billboard |>
select(date:artist, week_popular = "weeks-on-board") |>
count(artist) |>
arrange(desc(n))
## # A tibble: 10,205 × 2
## artist n
## <chr> <int>
## 1 Taylor Swift 1023
## 2 Elton John 889
## 3 Madonna 857
## 4 Drake 787
## 5 Kenny Chesney 769
## 6 Tim McGraw 731
## 7 Keith Urban 673
## 8 Stevie Wonder 659
## 9 Rod Stewart 657
## 10 Mariah Carey 621
## # ℹ 10,195 more rows
trace() FunctionThe trace() function allows you to temporarily insert pieces of code (tracers) into any existing function without manually modifying its source code.
# A function that occasionally receives invalid negative input
check <- function(x) {
log_val <- log(x)
return(log_val * 2)
} # Insert a browser breakpoint ONLY if x is negative
trace(check, tracer = quote(if(x < 0) browser()), at = 1)
## [1] "check"
check(5)
## Tracing check(5) step 1
## [1] 3.218876
check(-5)
## Tracing check(-5) step 1
## Called from: eval(expr, p)
## debug: `{`
## debug: log_val <- log(x)
## Warning in log(x): NaNs produced
## debug: return(log_val * 2)
## [1] NaN
untrace(check)
recover() FunctionThe recover() function is a specialized
error handler designed for navigating the active function call stack
right at the exact moment a script fails.
# Setup nested functions
func_alpha <- function(a) func_beta(a)
func_beta <- function(b) log(b) # Will fail if 'b' is text
# Enable recover globally
options(error = recover) # Execute the error-prone flow
#func_alpha("hello")
top_artists <- billboard %>%
count(artist, sort = TRUE) %>%
head(10)
# Bar chart
ggplot(top_artists, aes(x = reorder(artist, n), y = n)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(
title = "Top 10 Artists by Number of Songs",
x = "Artist",
y = "Count"
) +
theme_minimal()
ggplot(billboard, aes(x = "", y = rank)) +
geom_boxplot(fill = "orange") +
labs(
title = "Distribution of Song Rankings",
y = "Rank"
) +
theme_minimal()
ggplot(billboard, aes(x = "weeks-on-board", y = rank)) +
geom_point(alpha = 0.3) +
geom_smooth(color = "blue") +
labs(
title = "Relationship Between Weeks on Board and Rank",
x = "Weeks on Board",
y = "Rank"
) +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(billboard, aes(x = rank)) +
geom_histogram(binwidth = 2, color = "white") +
labs(
title = "Distribution of Song Rankings",
x = "Rank",
y = "Frequency"
) +
theme_minimal()
adele_data <- billboard %>%
filter(artist == "Adele")
ggplot(adele_data, aes(x = date, y = rank)) +
geom_line(color = "blue") +
geom_point() +
scale_y_reverse() +
labs(
title = "Adele Song Ranking Trend",
x = "Date",
y = "Rank"
) +
theme_minimal()
top5 <- billboard %>%
filter(artist %in% c("Adele", "Ed Sheeran", "Drake"))
ggplot(top5, aes(x = "weeks-on-board", y = rank)) +
geom_point() +
facet_wrap(~artist) +
labs(
title = "Artist Comparison",
x = "Weeks on Board",
y = "Rank"
) +
theme_minimal()
# Function to calculate summary statistics
stats_func <- function(data) {
data <- na.omit(data) # Remove missing values
# Calculate statistics
min_value <- min(data)
max_value <- max(data)
mean_value <- mean(data)
median_value <- median(data)
q1 <- quantile(data, 0.25)
q2 <- quantile(data, 0.50)
q3 <- quantile(data, 0.75)
result <- list(
Minimum = min_value,
Maximum = max_value,
Mean = mean_value,
Median = median_value,
First_Quartile_Q1 = q1,
Second_Quartile_Q2 = q2,
Third_Quartile_Q3 = q3
)
return(result)
}
billboard$`weeks-on-board` |> stats_func()
## $Minimum
## [1] 1
##
## $Maximum
## [1] 90
##
## $Mean
## [1] 9.161785
##
## $Median
## [1] 7
##
## $First_Quartile_Q1
## 25%
## 4
##
## $Second_Quartile_Q2
## 50%
## 7
##
## $Third_Quartile_Q3
## 75%
## 13
Apply Functions like sapply, vapply, lapply, tapply
Using lapply() to get class (datatype) of each
column
lapply(billboard, class)
## $date
## [1] "Date"
##
## $rank
## [1] "numeric"
##
## $song
## [1] "character"
##
## $artist
## [1] "character"
##
## $`last-week`
## [1] "numeric"
##
## $`peak-rank`
## [1] "numeric"
##
## $`weeks-on-board`
## [1] "numeric"
Using lapply() to calculate mean for numeric columns
numeric_cols <- billboard[sapply(billboard, is.numeric)]
numeric_cols |> lapply(mean, na.rm = TRUE)
## $rank
## [1] 50.50093
##
## $`last-week`
## [1] 47.59163
##
## $`peak-rank`
## [1] 40.97063
##
## $`weeks-on-board`
## [1] 9.161785
Using sapply() to count missing values in each
column
billboard |> sapply(function(x) sum(is.na(x)))
## date rank song artist last-week
## 0 0 0 0 32312
## peak-rank weeks-on-board
## 0 0
Using sapply() Find mean of numeric columns
sapply(numeric_cols, mean, na.rm = TRUE)
## rank last-week peak-rank weeks-on-board
## 50.500929 47.591631 40.970629 9.161785
Using vapply() to find mean of numeric columns
vapply(
numeric_cols,
mean,
FUN.VALUE = numeric(1),
na.rm = TRUE
)
## rank last-week peak-rank weeks-on-board
## 50.500929 47.591631 40.970629 9.161785
Using tapply() to calculate average rank by artist
#tapply(
# billboard$rank,
#billboard$artist,
#mean,
#na.rm = TRUE
#)
getting Number of songs per artist using tapply()
#tapply(
# billboard$song,
#billboard$artist,
#length
#)