Importing required library using pacman package manager

if (!require("pacman")) install.packages("pacman") 
## Loading required package: pacman
pacman::p_load(tidyverse, DBI, RPostgres)

Q1. Import Data from different sources

Q1.1. Import data from database

con <- dbConnect(
  RPostgres::Postgres(),
  dbname = 'sms',
  host   = 'localhost',      
  port   = 5432,            
  user   = 'Shaloom',
  password = 'Shaloom_50095'
)

dbListTables(con)
## [1] "posts" "users"
dbListFields(con, "users")
## [1] "id"         "username"   "email"      "password"   "created_at"
my_tb <- dbReadTable(con, "users")
my_query <- dbGetQuery(con, "SELECT * FROM users WHERE username = 'Shaloom' ") 

summary(my_tb)
##          id          username       email         password  
##  Length   : 4   Length   :4   Length   : 4   Length   :  4  
##  N.unique : 4   N.unique :4   N.unique : 4   N.unique :  4  
##  N.blank  : 0   N.blank  :0   N.blank  : 0   N.blank  :  0  
##  Min.nchar:36   Min.nchar:5   Min.nchar:15   Min.nchar:162  
##  Max.nchar:36   Max.nchar:7   Max.nchar:17   Max.nchar:162  
##                                                             
##    created_at                 
##  Min.   :2026-05-08 10:59:25  
##  1st Qu.:2026-05-08 11:01:39  
##  Median :2026-05-08 11:02:32  
##  Mean   :2026-05-08 11:01:56  
##  3rd Qu.:2026-05-08 11:02:49  
##  Max.   :2026-05-08 11:03:14
View(my_query)

Q1.2. Import from external source like .csv file

library(readr)

billboard <- read_csv("../data/charts.csv") 
## Rows: 330087 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): song, artist
## dbl  (4): rank, last-week, peak-rank, weeks-on-board
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(billboard) # getting number of row(observation) in dataset
## [1] 330087
ncol(billboard) # getting number of column(variable) in dataset
## [1] 7

Q1.3. Import data from statistical data set like .sav file

library(haven)
survey <- read_sav("../data/survey.sav") 
summary(survey)
##        id             sex             age           marital     
##  Min.   :  1.0   Min.   :1.000   Min.   :18.00   Min.   :1.000  
##  1st Qu.:131.5   1st Qu.:1.000   1st Qu.:26.00   1st Qu.:2.000  
##  Median :280.0   Median :2.000   Median :36.00   Median :4.000  
##  Mean   :276.4   Mean   :1.579   Mean   :37.44   Mean   :3.371  
##  3rd Qu.:422.5   3rd Qu.:2.000   3rd Qu.:47.00   3rd Qu.:4.000  
##  Max.   :702.0   Max.   :2.000   Max.   :82.00   Max.   :8.000  
##                                                                 
##      child            educ           source         smoke      
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:1.00   1st Qu.:2.000  
##  Median :2.000   Median :4.000   Median :1.00   Median :2.000  
##  Mean   :1.578   Mean   :4.087   Mean   :3.41   Mean   :1.805  
##  3rd Qu.:2.000   3rd Qu.:5.000   3rd Qu.:6.00   3rd Qu.:2.000  
##  Max.   :2.000   Max.   :6.000   Max.   :9.00   Max.   :2.000  
##  NAs    :1                       NAs    :17     NAs    :3      
##     smokenum           op1             op2             op3           op4       
##  Min.   :  0.00   Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000  
##  1st Qu.:  0.00   1st Qu.:3.000   1st Qu.:1.000   1st Qu.:3.0   1st Qu.:1.000  
##  Median :  0.00   Median :3.000   Median :2.000   Median :4.0   Median :2.000  
##  Mean   : 16.59   Mean   :3.264   Mean   :2.284   Mean   :3.7   Mean   :2.259  
##  3rd Qu.:  0.00   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:4.0   3rd Qu.:3.000  
##  Max.   :280.00   Max.   :5.000   Max.   :5.000   Max.   :5.0   Max.   :5.000  
##  NAs    :79       NAs    :4       NAs    :3       NAs    :3     NAs    :3      
##       op5            op6            mast1           mast2           mast3      
##  Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.000   1st Qu.:1.000  
##  Median :4.00   Median :2.000   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :3.94   Mean   :2.266   Mean   :2.037   Mean   :3.323   Mean   :1.959  
##  3rd Qu.:5.00   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :5.00   Max.   :5.000   Max.   :4.000   Max.   :5.000   Max.   :4.000  
##  NAs    :3      NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##      mast4           mast5           mast6           mast7      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.000   Median :4.000   Median :2.000   Median :2.000  
##  Mean   :1.924   Mean   :3.411   Mean   :1.956   Mean   :2.094  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##       pn1             pn2             pn3             pn4       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :4.000   Median :2.000   Median :1.000   Median :3.000  
##  Mean   :3.789   Mean   :2.562   Mean   :1.734   Mean   :3.163  
##  3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##       pn5             pn6             pn7             pn8       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:1.000  
##  Median :1.000   Median :4.000   Median :4.000   Median :2.000  
##  Mean   :1.411   Mean   :3.727   Mean   :3.596   Mean   :2.202  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :4      
##       pn9             pn10           pn11            pn12            pn13      
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:1.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :2.00   Median :2.000   Median :3.000   Median :3.000  
##  Mean   :3.294   Mean   :1.86   Mean   :2.422   Mean   :2.872   Mean   :3.261  
##  3rd Qu.:4.000   3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.00   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :3       NAs    :3      NAs    :3       NAs    :3       NAs    :3      
##       pn14            pn15            pn16            pn17      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:3.000  
##  Median :1.000   Median :3.000   Median :1.000   Median :3.000  
##  Mean   :1.661   Mean   :3.188   Mean   :1.674   Mean   :3.376  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##       pn18            pn19            pn20          lifsat1     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.000  
##  Median :3.500   Median :2.000   Median :1.000   Median :5.000  
##  Mean   :3.424   Mean   :2.186   Mean   :1.722   Mean   :4.374  
##  3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :7.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##     lifsat2         lifsat3        lifsat4        lifsat5           pss1      
##  Min.   :1.000   Min.   :1.00   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.00   1st Qu.:4.00   1st Qu.:2.000   1st Qu.:2.000  
##  Median :5.000   Median :5.00   Median :5.00   Median :4.000   Median :3.000  
##  Mean   :4.571   Mean   :4.69   Mean   :4.75   Mean   :3.993   Mean   :2.836  
##  3rd Qu.:6.000   3rd Qu.:6.00   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:3.000  
##  Max.   :7.000   Max.   :7.00   Max.   :7.00   Max.   :7.000   Max.   :5.000  
##  NAs    :3       NAs    :3      NAs    :3      NAs    :3       NAs    :6      
##       pss2            pss3            pss4            pss5      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :3.000   Median :3.000   Median :4.000   Median :4.000  
##  Mean   :2.739   Mean   :3.162   Mean   :3.801   Mean   :3.467  
##  3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :6       NAs    :6       NAs    :6      
##       pss6            pss7            pss8            pss9      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :3.000   Median :4.000   Median :4.000   Median :3.000  
##  Mean   :2.774   Mean   :3.538   Mean   :3.536   Mean   :3.046  
##  3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :6       NAs    :6       NAs    :6      
##      pss10           sest1           sest2           sest3      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:1.000  
##  Median :2.000   Median :4.000   Median :4.000   Median :1.000  
##  Mean   :2.513   Mean   :3.672   Mean   :3.745   Mean   :1.376  
##  3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  NAs    :6       NAs    :3       NAs    :3       NAs    :3      
##      sest4           sest5           sest6           sest7           sest8     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:3.000   1st Qu.:1.000   1st Qu.:3.000   1st Qu.:1.000   1st Qu.:3.00  
##  Median :4.000   Median :1.000   Median :3.000   Median :2.000   Median :3.00  
##  Mean   :3.408   Mean   :1.596   Mean   :3.278   Mean   :2.044   Mean   :3.17  
##  3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:4.00  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.00  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3       NAs    :3     
##      sest9           sest10            m1               m2        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :2.000   Median :1.000   Median :1.0000   Median :1.0000  
##  Mean   :2.039   Mean   :1.713   Mean   :0.7294   Mean   :0.9014  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :4.000   Max.   :4.000   Max.   :1.0000   Max.   :1.0000  
##  NAs    :3       NAs    :3       NAs    :3        NAs    :3       
##        m3               m4               m5               m6        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.8624   Mean   :0.4299   Mean   :0.5069   Mean   :0.2248  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NAs    :3        NAs    :4        NAs    :3        NAs    :3       
##        m7               m8               m9              m10        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4862   Mean   :0.5793   Mean   :0.2248   Mean   :0.3609  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NAs    :3        NAs    :4        NAs    :3        NAs    :4       
##       pc1             pc2             pc3             pc4       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :2.000   Median :3.000   Median :4.000   Median :4.000  
##  Mean   :2.545   Mean   :2.758   Mean   :3.575   Mean   :3.464  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :6       NAs    :6       NAs    :6      
##       pc5             pc6             pc7             pc8       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :4.000   Median :3.000   Median :3.000   Median :3.000  
##  Mean   :3.714   Mean   :3.282   Mean   :2.905   Mean   :3.072  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :7       NAs    :6       NAs    :6      
##       pc9             pc10            pc11          pc12            pc13      
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.0   1st Qu.:3.000   1st Qu.:2.000  
##  Median :4.000   Median :4.000   Median :2.0   Median :3.000   Median :3.000  
##  Mean   :3.575   Mean   :3.596   Mean   :2.7   Mean   :3.346   Mean   :3.254  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.0   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.0   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :6       NAs    :6     NAs    :6       NAs    :6      
##       pc14           pc15            pc16            pc17            pc18     
##  Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:2.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.00  
##  Median :3.00   Median :2.000   Median :2.000   Median :3.000   Median :4.00  
##  Mean   :2.94   Mean   :2.619   Mean   :2.443   Mean   :3.171   Mean   :3.61  
##  3rd Qu.:4.00   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.00  
##  Max.   :5.00   Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00  
##  NAs    :6      NAs    :6       NAs    :6       NAs    :6       NAs    :6     
##       Rop2            Rop4            Rop6           Rmast1     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :3.000  
##  Mean   :3.716   Mean   :3.741   Mean   :3.734   Mean   :2.963  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :4.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##      Rmast3          Rmast4          Rmast6          Rmast7     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :3.000   Median :3.000   Median :3.000   Median :3.000  
##  Mean   :3.041   Mean   :3.076   Mean   :3.044   Mean   :2.906  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##      Rpss4           Rpss5           Rpss7           Rpss8      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.000   Median :2.000   Median :2.000  
##  Mean   :2.199   Mean   :2.533   Mean   :2.462   Mean   :2.464  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NAs    :6       NAs    :6       NAs    :6       NAs    :6      
##      Rsest3          Rsest5          Rsest7          Rsest9     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :4.000   Median :4.000   Median :3.000   Median :3.000  
##  Mean   :3.624   Mean   :3.404   Mean   :2.956   Mean   :2.961  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  NAs    :3       NAs    :3       NAs    :3       NAs    :3      
##     Rsest10           Rpc1            Rpc2            Rpc7           Rpc11    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.0  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.0  
##  Median :4.000   Median :4.000   Median :3.000   Median :3.000   Median :4.0  
##  Mean   :3.287   Mean   :3.455   Mean   :3.242   Mean   :3.095   Mean   :3.3  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.0  
##  Max.   :4.000   Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.0  
##  NAs    :3       NAs    :6       NAs    :6       NAs    :6       NAs    :6    
##      Rpc15           Rpc16           toptim          tmast      
##  Min.   :1.000   Min.   :1.000   Min.   : 7.00   Min.   : 8.00  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:19.00   1st Qu.:19.00  
##  Median :4.000   Median :4.000   Median :22.00   Median :22.00  
##  Mean   :3.381   Mean   :3.557   Mean   :22.12   Mean   :21.76  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:26.00   3rd Qu.:25.00  
##  Max.   :5.000   Max.   :5.000   Max.   :30.00   Max.   :28.00  
##  NAs    :6       NAs    :6       NAs    :4       NAs    :3      
##     tposaff         tnegaff        tlifesat        tpstress        tslfest     
##  Min.   :11.00   Min.   :10.0   Min.   : 5.00   Min.   :12.00   Min.   :18.00  
##  1st Qu.:29.00   1st Qu.:14.0   1st Qu.:18.00   1st Qu.:23.00   1st Qu.:30.00  
##  Median :34.00   Median :18.0   Median :23.00   Median :26.00   Median :35.00  
##  Mean   :33.69   Mean   :19.4   Mean   :22.38   Mean   :26.73   Mean   :33.53  
##  3rd Qu.:39.00   3rd Qu.:23.0   3rd Qu.:27.00   3rd Qu.:31.00   3rd Qu.:38.00  
##  Max.   :49.00   Max.   :39.0   Max.   :35.00   Max.   :46.00   Max.   :40.00  
##  NAs    :3       NAs    :4      NAs    :3       NAs    :6       NAs    :3      
##     tmarlow          tpcoiss          agegp3          agegp5     
##  Min.   : 0.000   Min.   :20.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 4.000   1st Qu.:53.00   1st Qu.:1.000   1st Qu.:2.000  
##  Median : 5.000   Median :62.00   Median :2.000   Median :3.000  
##  Mean   : 5.305   Mean   :60.63   Mean   :1.973   Mean   :2.954  
##  3rd Qu.: 7.000   3rd Qu.:69.00   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :10.000   Max.   :88.00   Max.   :3.000   Max.   :5.000  
##  NAs    :6        NAs    :9                                      
##     educrec        LG10negaff        MAH_1               COO_1          
##  Min.   :1.000   Min.   :1.000   Min.   : 0.003542   Min.   :1.000e-09  
##  1st Qu.:2.000   1st Qu.:1.146   1st Qu.: 0.534432   1st Qu.:1.293e-04  
##  Median :3.000   Median :1.255   Median : 1.322372   Median :5.578e-04  
##  Mean   :3.091   Mean   :1.261   Mean   : 1.993438   Mean   :3.031e-03  
##  3rd Qu.:4.000   3rd Qu.:1.362   3rd Qu.: 2.538417   3rd Qu.:2.413e-03  
##  Max.   :5.000   Max.   :1.663   Max.   :13.896642   Max.   :9.449e-02  
##                  NAs    :4       NAs    :10          NAs    :13         
##      MAH_2         
##  Min.   : 0.09048  
##  1st Qu.: 1.11653  
##  Median : 1.99169  
##  Mean   : 2.99306  
##  3rd Qu.: 3.86184  
##  Max.   :18.10064  
##  NAs    :7
sum(is.na(survey$source))
## [1] 17
nrow(billboard) # getting number of row(observation) in dataset
## [1] 330087
ncol(billboard) # getting number of column(variable) in dataset
## [1] 7

Q2. piping technique (%>% or |>)

Q2.1.piping

In R, piping is a coding technique used to chain operations together in a sequence, making code much easier to read and maintain. Instead of nesting functions inside each other, which forces you to read from the inside out. piping allows you to read from left to right or top to bottom. 

Key Differences of piping

  • The native pipe |> requires parentheses after the function name (e.g., x |> mean()), whereas %>% can accept the bare function name (e.g., x %>% mean).

  • If you need to pass the data to a position other than the first argument:

    • Use a dot . with %>% (e.g., df %>% lm(y ~ x, data = .))
    • Use an underscore _ with |> in newer R versions, but only for named arguments.
  • The native pipe |> is slightly faster because it is handled directly by the R parser rather than as a function call.

billboard %>% head(10) # getting 10 first record in data set using %>% 
## # A tibble: 10 × 7
##    date        rank song         artist `last-week` `peak-rank` `weeks-on-board`
##    <date>     <dbl> <chr>        <chr>        <dbl>       <dbl>            <dbl>
##  1 2021-11-06     1 Easy On Me   Adele            1           1                3
##  2 2021-11-06     2 Stay         The K…           2           1               16
##  3 2021-11-06     3 Industry Ba… Lil N…           3           1               14
##  4 2021-11-06     4 Fancy Like   Walke…           4           3               19
##  5 2021-11-06     5 Bad Habits   Ed Sh…           5           2               18
##  6 2021-11-06     6 Way 2 Sexy   Drake…           6           1                8
##  7 2021-11-06     7 Shivers      Ed Sh…           9           7                7
##  8 2021-11-06     8 Good 4 U     Olivi…           7           1               24
##  9 2021-11-06     9 Need To Know Doja …          11           9               20
## 10 2021-11-06    10 Levitating   Dua L…           8           2               56

Q3. Dplyr Functions

-> COLUMN FUCTIONS

  1. select(): is primarily used to pick or exclude specific columns (variables) from a data frame.
  2. mutate(): is used to create new columns or modify existing columns in a data frame

-> ROW FUCTIONS

  1. filter(): is used to pick specific rows from a data frame based on one or more conditions.
  2. distinct(): is used to keep unique rows from a data frame and remove duplicates.
  3. arrange(): is used to reorder (sort) the rows of a data frame based on the values of one or more columns.

-> GROUP FUCTIONS

  1. group_by(): is used to split your data frame into separate groups based on one or more variables. it doesn’t change how the data looks by itself; instead, it changes how other functions (like summarise() or mutate()) operate on the data by making them run group-by-group.
  2. summarise(): it summarize a data frame into a single row of statistical value.
  3. count(): it count the number of rows with each unique value of variable

Q3.1. SELECT() FUNCTION

Common Selection Methods

  • By Name: List column names without quotes: select(df, column1, column2). 
  • By Position: Use numerical indices: select(df, 1:3) to get the first three columns. 
  • Exclusion: Use a minus sign (-) to drop columns: select(df, -unwanted_column). 
  • Range: Select all columns between two specified names: select(df, start_col:end_col). 

Advanced Helper functions for select
You can use selection helpers to find columns based on patterns: 

  • starts_with(“prefix”): Select columns whose name start with a character string.
  • ends_with(“suffix”): Select columns whose name end with a character string.
  • contains(“string”): Select columns whose name containing a character string.
  • matches(“regex”): Select columns whose name matching a regular expression.
  • everything(): select every column. Useful for reordering columns (e.g., select(df, important_col, everything())).
  • where(is.numeric): Selects columns based on their data type.
names(billboard)
## [1] "date"           "rank"           "song"           "artist"        
## [5] "last-week"      "peak-rank"      "weeks-on-board"
billboard |> select(date, rank, song, artist, "weeks-on-board")
## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows
billboard |> select(date:artist, "weeks-on-board")
## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows
billboard |> select(1:4, "weeks-on-board")
## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows
billboard |> select(-"last-week", -"peak-rank")
## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows
billboard |> select(date:artist, week_popular = "weeks-on-board")
## # A tibble: 330,087 × 5
##    date        rank song          artist                            week_popular
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & Young T…            8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows

Q3.2. MUTATE() FUNCTION

Advanced Helper functions for select

You can combine mutate() with other dplyr functions for complex data transformations:

  • across(): Apply the same transformation to multiple columns at once: mutate(df, across(c(col1, col2), as.character)).
  • if_else() / case_when(): Create conditional variables: mutate(df, age_group = case_when(age < 18 ~ “Minor”, age >= 18 ~ “Adult”)). row_number() / rank(): Add ranking or sequential IDs: mutate(df, rank_id = row_number()).
  • .before / .after: Control where the new column appears: mutate(df, new_col = x * 2, .before = x).
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  mutate(is_collabo = grepl("featuring", artist, ignore.case = T)) |> 
  select(artist, is_collabo, everything())
## # A tibble: 330,087 × 6
##    artist                         is_collabo date        rank song  week_popular
##    <chr>                          <lgl>      <date>     <dbl> <chr>        <dbl>
##  1 Adele                          FALSE      2021-11-06     1 Easy…            3
##  2 The Kid LAROI & Justin Bieber  FALSE      2021-11-06     2 Stay            16
##  3 Lil Nas X & Jack Harlow        FALSE      2021-11-06     3 Indu…           14
##  4 Walker Hayes                   FALSE      2021-11-06     4 Fanc…           19
##  5 Ed Sheeran                     FALSE      2021-11-06     5 Bad …           18
##  6 Drake Featuring Future & Youn… TRUE       2021-11-06     6 Way …            8
##  7 Ed Sheeran                     FALSE      2021-11-06     7 Shiv…            7
##  8 Olivia Rodrigo                 FALSE      2021-11-06     8 Good…           24
##  9 Doja Cat                       FALSE      2021-11-06     9 Need…           20
## 10 Dua Lipa                       FALSE      2021-11-06    10 Levi…           56
## # ℹ 330,077 more rows

Q3.3. FILTER() FUNCTION

The basic syntax is filter(.data, condition).

Common Filtering Methods

  • Logical Comparisons: Match specific values using ==, >, <, >=, <=, or !=: filter(df, age > 21).
  • Multiple Conditions (AND): Separate conditions with commas or &: filter(df, age > 21, status == “Active”).
  • Multiple Conditions (OR): Use the vertical bar | for alternative conditions: filter(df, status == “Active” | points > 100).
  • Value in a List: Use the %in% operator to check against multiple values: filter(df, country %in% c(“USA”, “Canada”, “UK”)).
  • Negation: Use the exclamation mark ! to invert a condition: filter(df, !country %in% c(“USA”, “Canada”)).

Advanced Helpers

You can use specialized functions inside filter() to handle missing data, strings, or complex rows:

  • is.na(): Keep or drop missing values: filter(df, !is.na(email)) (removes missing emails).
  • str_detect(): Filter based on partial string matches: filter(df, str_detect(name, “John”)).
  • between(): Filter rows within a numeric range: filter(df, between(score, 50, 100)).
  • if_any() / if_all(): Filter rows across multiple columns at once: filter(df, if_any(c(score1, score2), ~ .x > 90)).
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(week_popular >= 20, artist == "Drake" | artist == "The Weeknd")
## # A tibble: 236 × 5
##    date        rank song            artist     week_popular
##    <date>     <dbl> <chr>           <chr>             <dbl>
##  1 2021-09-04    20 Blinding Lights The Weeknd           90
##  2 2021-08-28    21 Blinding Lights The Weeknd           89
##  3 2021-08-21    18 Blinding Lights The Weeknd           88
##  4 2021-08-14    16 Blinding Lights The Weeknd           87
##  5 2021-08-07    17 Blinding Lights The Weeknd           86
##  6 2021-07-31    17 Blinding Lights The Weeknd           85
##  7 2021-07-24    17 Blinding Lights The Weeknd           84
##  8 2021-07-17    15 Blinding Lights The Weeknd           83
##  9 2021-07-10    18 Blinding Lights The Weeknd           82
## 10 2021-07-03    15 Blinding Lights The Weeknd           81
## # ℹ 226 more rows

Q3.4. DISTINCT() FUNCTION

The basic syntax is distinct(.data, …, .keep_all = FALSE).

Common Deduplication Methods

  • Entire Data Frame: Remove rows where every single column is identical: distinct(df).
  • Specific Columns: Find unique combinations of specific variables: distinct(df, country, city).
  • Keep All Columns: By default, specifying columns drops the rest. Use .keep_all = TRUE to keep all other columns for the first unique occurrence found: distinct(df, customer_id, .keep_all = TRUE).

Advanced Helpers & Tricks

  • On-the-Fly Mutation: You can create a new variable directly inside the function: distinct(df, lower_email = tolower(email)).
  • Count Unique Values: To just count unique rows rather than extracting them, use n_distinct() inside a summarise() or mutate() function: df %>% summarise(unique_users = n_distinct(user_id)).
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(artist == "Drake")
## # A tibble: 787 × 5
##    date        rank song                       artist week_popular
##    <date>     <dbl> <chr>                      <chr>         <dbl>
##  1 2021-11-06    91 No Friends In The Industry Drake             8
##  2 2021-10-30    87 No Friends In The Industry Drake             7
##  3 2021-10-30    90 Champagne Poetry           Drake             7
##  4 2021-10-23    74 No Friends In The Industry Drake             6
##  5 2021-10-23    77 Champagne Poetry           Drake             6
##  6 2021-10-16    64 No Friends In The Industry Drake             5
##  7 2021-10-16    65 Champagne Poetry           Drake             5
##  8 2021-10-16    98 TSU                        Drake             5
##  9 2021-10-09    54 Champagne Poetry           Drake             4
## 10 2021-10-09    60 No Friends In The Industry Drake             4
## # ℹ 777 more rows
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(artist == "Drake") |> 
  distinct(song)
## # A tibble: 108 × 1
##    song                      
##    <chr>                     
##  1 No Friends In The Industry
##  2 Champagne Poetry          
##  3 TSU                       
##  4 Pipe Down                 
##  5 Papi's Home               
##  6 Race My Mind              
##  7 7am On Bridle Path        
##  8 Fucking Fans              
##  9 The Remorse               
## 10 What's Next               
## # ℹ 98 more rows
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(artist == "Drake") |> 
  distinct(song, .keep_all = T)
## # A tibble: 108 × 5
##    date        rank song                       artist week_popular
##    <date>     <dbl> <chr>                      <chr>         <dbl>
##  1 2021-11-06    91 No Friends In The Industry Drake             8
##  2 2021-10-30    90 Champagne Poetry           Drake             7
##  3 2021-10-16    98 TSU                        Drake             5
##  4 2021-10-09    96 Pipe Down                  Drake             4
##  5 2021-10-09    97 Papi's Home                Drake             4
##  6 2021-10-02    89 Race My Mind               Drake             3
##  7 2021-10-02    92 7am On Bridle Path         Drake             3
##  8 2021-09-18    32 Fucking Fans               Drake             1
##  9 2021-09-18    35 The Remorse                Drake             1
## 10 2021-07-17    97 What's Next                Drake            16
## # ℹ 98 more rows

Q3.5. GROUP_BY() FUNCTION

The basic syntax is group_by(.data, …).

Common Grouping Methods

  • Single Variable: Group data by one column: group_by(df, country).
  • Multiple Variables: Create subgroups by listing multiple columns: group_by(df, year, month).
  • On-the-Fly Grouping: Create a new grouping variable directly inside the function: group_by(df, age_group = age > 18).

group_by() is almost always paired with one of these downstream functions:

  • With summarise() (Collapse Data): Calculates summary statistics for each group (reduces rows):

df %>% group_by(category) %>% summarise(avg_price = mean(price), total_sales = n())

  • With mutate() (Window Functions): Adds a new column calculated within each group (keeps original rows):

ex: Calculates each employee’s salary relative to their department’s average

df %>% group_by(department) %>% mutate(pct_of_dept_avg = salary / mean(salary))

With filter() (Per-Group Filtering): Filters rows based on a group-level condition:

ex: Keeps only the highest-selling transaction for each store

df %>% group_by(store_id) %>% filter(sales == max(sales))

NB: Always ungroup(): Grouping sticks to the data frame. If you don’t remove it, future operations will accidentally run by group and cause strange bugs. Always end your pipeline with ungroup(): df %>% group_by(region) %>% summarise(total = sum(sales)) %>% ungroup()

Q3.6. SUMMARISE() FUNCTION

The basic syntax is summarise(.data, new_column = function(existing_column)).

Common Summary Functions * Center / Average: Use mean() or median(): summarise(df, avg_age = mean(age)). * Spread: Use sd() or IQR(): summarise(df, spread = sd(score)). * Range / Extremes: Use min() or max(): summarise(df, highest = max(sales)). * Counts: Use n() for total rows, or n_distinct() for unique values: summarise(df, total_rows = n(), unique_users = n_distinct(user_id)).

Advanced Helpers

  • across(): Summarize multiple columns simultaneously: summarise(df, across(c(sales, profit), mean)).
  • .groups Argument: Controls the grouping structure of the output. Use .groups = “drop” to completely ungroup the final result:

df %>% group_by(region, year) %>% summarise(total = sum(sales), .groups = “drop”)

  • n(): is a helper function from the dplyr package used to count the number of observations (rows) in the current group. It is a zero-argument function that only works within specific dplyr verbs like summarise(), mutate(), and filter().
    • You cannot use n() on its own in the console; it will throw an error stating it “must only be used inside data-masking verbs”.
    • n() vs. nrow(): While nrow(df) returns the total number of rows in an entire data frame, n() is “group-aware” and returns counts based on defined groups.
    • n() vs. n_distinct(): n() counts every row regardless of content, whereas n_distinct(column) counts only unique values within a specific column.
    • n() vs. count(): count() is a shortcut that performs both group_by() and summarise(n = n()) in one step.
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(artist == "Drake") |> 
  group_by(song) |> 
  summarise(total_week_popular = max(week_popular))
## # A tibble: 108 × 2
##    song                    total_week_popular
##    <chr>                                <dbl>
##  1 0 To 100 / The Catch Up                 20
##  2 10 Bands                                13
##  3 30 For 30 Freestyle                      2
##  4 6 God                                    1
##  5 6 Man                                    1
##  6 7am On Bridle Path                       3
##  7 8 Out Of 10                              3
##  8 9                                        5
##  9 9 AM In Dallas                           1
## 10 Back To Back                            20
## # ℹ 98 more rows
library(dplyr)
billboard |>  
  group_by(artist) |> 
  summarise(count = n())
## # A tibble: 10,205 × 2
##    artist                        count
##    <chr>                         <int>
##  1 "\"Groove\" Holmes"              14
##  2 "\"Little\" Jimmy Dickens"       10
##  3 "\"Pookie\" Hudson"               1
##  4 "\"Weird Al\" Yankovic"          91
##  5 "'N Sync"                       172
##  6 "'N Sync & Gloria Estefan"       20
##  7 "'N Sync Featuring Nelly"        20
##  8 "'Til Tuesday"                   53
##  9 "(+44)"                           1
## 10 "(The Preacher) Bobby Womack"     9
## # ℹ 10,195 more rows

Q3.7. ARRANGE() FUNCTION

The basic syntax is arrange(.data, column1, column2).

Common Sorting Methods

  • Ascending Order (A-Z, Low-High): This is the default behavior. Just list the column name: arrange(df, age).
  • Descending Order (Z-A, High-Low): Wrap the column name in the desc() helper: arrange(df, desc(age)).
  • Multiple Columns (Tie-Breakers): List multiple columns in order of sorting priority. If there is a tie in the first column, R uses the second: arrange(df, department, desc(salary)).
  • Missing Values (NA): No matter if you sort ascending or descending, R always puts NA rows at the very bottom of the data frame.

Advanced Helpers & Arguments

  • .by_group: If your data frame is grouped using group_by(), arrange() normally ignores the groups and sorts the entire dataset globally. Set .by_group = TRUE to sort within each group individually
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  filter(artist == "Drake") |> 
  group_by(song) |> 
  summarise(total_week_popular = max(week_popular)) |> 
  arrange(desc(total_week_popular), song) |> 
  head(10)
## # A tibble: 10 × 2
##    song                    total_week_popular
##    <chr>                                <dbl>
##  1 God's Plan                              36
##  2 Hotline Bling                           36
##  3 Controlla                               26
##  4 Fake Love                               25
##  5 Headlines                               25
##  6 Nice For What                           25
##  7 Best I Ever Had                         24
##  8 In My Feelings                          22
##  9 Nonstop                                 22
## 10 Started From The Bottom                 22

Q3.8. ARRANGE() FUNCTION

The basic syntax is count(.data, …, wt = NULL, sort = FALSE, name = NULL).

  • Common Counting Methods

  • Single Variable: Count how many times each unique value appears: count(df, country).

  • Multiple Variables: Count unique combinations across columns: count(df, country, status).

  • Auto-Sorting: Set sort = TRUE to immediately sort the results from highest to lowest frequency: count(df, country, sort = TRUE).

  • Custom Column Name: By default, the counts are saved in a column named n. Use the name argument to change it: count(df, country, name = “total_users”).

Advanced Helpers

  • wt (Weighted Count): Instead of counting rows, sum up the values of another column for each group: count(df, country, wt = sales). This behaves exactly like sum(sales).
  • Count Expressions: You can calculate a logical test inside the function to see how many rows pass a condition: count(df, age > 18).
billboard |> 
  select(date:artist, week_popular = "weeks-on-board") |> 
  count(artist) |> 
  arrange(desc(n))
## # A tibble: 10,205 × 2
##    artist            n
##    <chr>         <int>
##  1 Taylor Swift   1023
##  2 Elton John      889
##  3 Madonna         857
##  4 Drake           787
##  5 Kenny Chesney   769
##  6 Tim McGraw      731
##  7 Keith Urban     673
##  8 Stevie Wonder   659
##  9 Rod Stewart     657
## 10 Mariah Carey    621
## # ℹ 10,195 more rows

Q4. Use of trace() and recover()

Q4.1. The trace() Function

The trace() function allows you to temporarily insert pieces of code (tracers) into any existing function without manually modifying its source code.

# A function that occasionally receives invalid negative input 
check <- function(x) {   
  log_val <- log(x)   
  return(log_val * 2) 
}  # Insert a browser breakpoint ONLY if x is negative 
trace(check, tracer = quote(if(x < 0) browser()), at = 1)   
## [1] "check"
check(5)    
## Tracing check(5) step 1
## [1] 3.218876
check(-5)    
## Tracing check(-5) step 1 
## Called from: eval(expr, p)
## debug: `{`
## debug: log_val <- log(x)
## Warning in log(x): NaNs produced
## debug: return(log_val * 2)
## [1] NaN
untrace(check) 

Q4.2. The recover() Function

The recover() function is a specialized error handler designed for navigating the active function call stack right at the exact moment a script fails.

# Setup nested functions 
func_alpha <- function(a) func_beta(a) 
func_beta  <- function(b) log(b) # Will fail if 'b' is text  
# Enable recover globally 
options(error = recover)  # Execute the error-prone flow 
#func_alpha("hello")

Q5. Data Visualization using ggplot2

top_artists <- billboard %>%
  count(artist, sort = TRUE) %>%
  head(10)

# Bar chart
ggplot(top_artists, aes(x = reorder(artist, n), y = n)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Top 10 Artists by Number of Songs",
    x = "Artist",
    y = "Count"
  ) +
  theme_minimal()

ggplot(billboard, aes(x = "", y = rank)) +
  geom_boxplot(fill = "orange") +
  labs(
    title = "Distribution of Song Rankings",
    y = "Rank"
  ) +
  theme_minimal()

ggplot(billboard, aes(x = "weeks-on-board", y = rank)) +
  geom_point(alpha = 0.3) +
  geom_smooth(color = "blue") +
  labs(
    title = "Relationship Between Weeks on Board and Rank",
    x = "Weeks on Board",
    y = "Rank"
  ) +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(billboard, aes(x = rank)) +
  geom_histogram(binwidth = 2, color = "white") +
  labs(
    title = "Distribution of Song Rankings",
    x = "Rank",
    y = "Frequency"
  ) +
  theme_minimal()

adele_data <- billboard %>%
  filter(artist == "Adele")

ggplot(adele_data, aes(x = date, y = rank)) +
  geom_line(color = "blue") +
  geom_point() +
  scale_y_reverse() +
  labs(
    title = "Adele Song Ranking Trend",
    x = "Date",
    y = "Rank"
  ) +
  theme_minimal()

top5 <- billboard %>%
  filter(artist %in% c("Adele", "Ed Sheeran", "Drake"))

ggplot(top5, aes(x = "weeks-on-board", y = rank)) +
  geom_point() +
  facet_wrap(~artist) +
  labs(
    title = "Artist Comparison",
    x = "Weeks on Board",
    y = "Rank"
  ) +
  theme_minimal()

Q6. Application of function

# Function to calculate summary statistics
stats_func <- function(data) {

  
  data <- na.omit(data) # Remove missing values

  # Calculate statistics
  min_value <- min(data)
  max_value <- max(data)
  mean_value <- mean(data)
  median_value <- median(data)

  q1 <- quantile(data, 0.25)
  q2 <- quantile(data, 0.50)
  q3 <- quantile(data, 0.75)


  result <- list(
    Minimum = min_value,
    Maximum = max_value,
    Mean = mean_value,
    Median = median_value,
    First_Quartile_Q1 = q1,
    Second_Quartile_Q2 = q2,
    Third_Quartile_Q3 = q3
  )

  return(result)
}

billboard$`weeks-on-board` |> stats_func()
## $Minimum
## [1] 1
## 
## $Maximum
## [1] 90
## 
## $Mean
## [1] 9.161785
## 
## $Median
## [1] 7
## 
## $First_Quartile_Q1
## 25% 
##   4 
## 
## $Second_Quartile_Q2
## 50% 
##   7 
## 
## $Third_Quartile_Q3
## 75% 
##  13

Apply Functions like sapply, vapply, lapply, tapply

Using lapply() to get class (datatype) of each column

lapply(billboard, class)
## $date
## [1] "Date"
## 
## $rank
## [1] "numeric"
## 
## $song
## [1] "character"
## 
## $artist
## [1] "character"
## 
## $`last-week`
## [1] "numeric"
## 
## $`peak-rank`
## [1] "numeric"
## 
## $`weeks-on-board`
## [1] "numeric"

Using lapply() to calculate mean for numeric columns

numeric_cols <- billboard[sapply(billboard, is.numeric)] 
numeric_cols |> lapply(mean, na.rm = TRUE)
## $rank
## [1] 50.50093
## 
## $`last-week`
## [1] 47.59163
## 
## $`peak-rank`
## [1] 40.97063
## 
## $`weeks-on-board`
## [1] 9.161785

Using sapply() to count missing values in each column

billboard |> sapply(function(x) sum(is.na(x)))
##           date           rank           song         artist      last-week 
##              0              0              0              0          32312 
##      peak-rank weeks-on-board 
##              0              0

Using sapply() Find mean of numeric columns

sapply(numeric_cols, mean, na.rm = TRUE)
##           rank      last-week      peak-rank weeks-on-board 
##      50.500929      47.591631      40.970629       9.161785

Using vapply() to find mean of numeric columns

vapply(
  numeric_cols,
  mean,
  FUN.VALUE = numeric(1),
  na.rm = TRUE
)
##           rank      last-week      peak-rank weeks-on-board 
##      50.500929      47.591631      40.970629       9.161785

Using tapply() to calculate average rank by artist

#tapply(
 # billboard$rank,
  #billboard$artist,
  #mean,
  #na.rm = TRUE
#) 

getting Number of songs per artist using tapply()

#tapply(
 # billboard$song,
  #billboard$artist,
  #length
#)