20251MBI043 - HOME WORK

Importing required library using pacman package manager

if (!require("pacman")) install.packages("pacman")

## Loading required package: pacman

pacman::p_load(tidyverse, DBI, RPostgres)

Q1. Import Data from different sources

Q1.1. Import data from database

con <- dbConnect( RPostgres::Postgres(), dbname = 'sms', host = 'localhost', port = 5432, user = 'Shaloom', password = 'Shaloom_50095' ) dbListTables(con)

## [1] "posts" "users"

dbListFields(con, "users")

## [1] "id" "username" "email" "password" "created_at"

my_tb <- dbReadTable(con, "users") my_query <- dbGetQuery(con, "SELECT * FROM users WHERE username = 'Shaloom' ") summary(my_tb)

## id username email password ## Length : 4 Length :4 Length : 4 Length : 4 ## N.unique : 4 N.unique :4 N.unique : 4 N.unique : 4 ## N.blank : 0 N.blank :0 N.blank : 0 N.blank : 0 ## Min.nchar:36 Min.nchar:5 Min.nchar:15 Min.nchar:162 ## Max.nchar:36 Max.nchar:7 Max.nchar:17 Max.nchar:162 ## ## created_at ## Min. :2026-05-08 10:59:25 ## 1st Qu.:2026-05-08 11:01:39 ## Median :2026-05-08 11:02:32 ## Mean :2026-05-08 11:01:56 ## 3rd Qu.:2026-05-08 11:02:49 ## Max. :2026-05-08 11:03:14

View(my_query)

Q1.2. Import from external source like .csv file

library(readr) billboard <- read_csv("../data/charts.csv")

## Rows: 330087 Columns: 7 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "," ## chr (2): song, artist ## dbl (4): rank, last-week, peak-rank, weeks-on-board ## date (1): date ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

nrow(billboard) # getting number of row(observation) in dataset

## [1] 330087

ncol(billboard) # getting number of column(variable) in dataset

## [1] 7

Q1.3. Import data from statistical data set like .sav file

library(haven) survey <- read_sav("../data/survey.sav") summary(survey)

## id sex age marital ## Min. : 1.0 Min. :1.000 Min. :18.00 Min. :1.000 ## 1st Qu.:131.5 1st Qu.:1.000 1st Qu.:26.00 1st Qu.:2.000 ## Median :280.0 Median :2.000 Median :36.00 Median :4.000 ## Mean :276.4 Mean :1.579 Mean :37.44 Mean :3.371 ## 3rd Qu.:422.5 3rd Qu.:2.000 3rd Qu.:47.00 3rd Qu.:4.000 ## Max. :702.0 Max. :2.000 Max. :82.00 Max. :8.000 ## ## child educ source smoke ## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 ## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:2.000 ## Median :2.000 Median :4.000 Median :1.00 Median :2.000 ## Mean :1.578 Mean :4.087 Mean :3.41 Mean :1.805 ## 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:6.00 3rd Qu.:2.000 ## Max. :2.000 Max. :6.000 Max. :9.00 Max. :2.000 ## NAs :1 NAs :17 NAs :3 ## smokenum op1 op2 op3 op4 ## Min. : 0.00 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 ## 1st Qu.: 0.00 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.0 1st Qu.:1.000 ## Median : 0.00 Median :3.000 Median :2.000 Median :4.0 Median :2.000 ## Mean : 16.59 Mean :3.264 Mean :2.284 Mean :3.7 Mean :2.259 ## 3rd Qu.: 0.00 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.0 3rd Qu.:3.000 ## Max. :280.00 Max. :5.000 Max. :5.000 Max. :5.0 Max. :5.000 ## NAs :79 NAs :4 NAs :3 NAs :3 NAs :3 ## op5 op6 mast1 mast2 mast3 ## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 ## Median :4.00 Median :2.000 Median :2.000 Median :3.000 Median :2.000 ## Mean :3.94 Mean :2.266 Mean :2.037 Mean :3.323 Mean :1.959 ## 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:3.000 ## Max. :5.00 Max. :5.000 Max. :4.000 Max. :5.000 Max. :4.000 ## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3 ## mast4 mast5 mast6 mast7 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 ## Median :2.000 Median :4.000 Median :2.000 Median :2.000 ## Mean :1.924 Mean :3.411 Mean :1.956 Mean :2.094 ## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000 ## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## pn1 pn2 pn3 pn4 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 ## Median :4.000 Median :2.000 Median :1.000 Median :3.000 ## Mean :3.789 Mean :2.562 Mean :1.734 Mean :3.163 ## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## pn5 pn6 pn7 pn8 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:1.000 ## Median :1.000 Median :4.000 Median :4.000 Median :2.000 ## Mean :1.411 Mean :3.727 Mean :3.596 Mean :2.202 ## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :3 NAs :3 NAs :3 NAs :4 ## pn9 pn10 pn11 pn12 pn13 ## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 ## Median :3.000 Median :2.00 Median :2.000 Median :3.000 Median :3.000 ## Mean :3.294 Mean :1.86 Mean :2.422 Mean :2.872 Mean :3.261 ## 3rd Qu.:4.000 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3 ## pn14 pn15 pn16 pn17 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000 ## Median :1.000 Median :3.000 Median :1.000 Median :3.000 ## Mean :1.661 Mean :3.188 Mean :1.674 Mean :3.376 ## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## pn18 pn19 pn20 lifsat1 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000 ## Median :3.500 Median :2.000 Median :1.000 Median :5.000 ## Mean :3.424 Mean :2.186 Mean :1.722 Mean :4.374 ## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:5.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :7.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## lifsat2 lifsat3 lifsat4 lifsat5 pss1 ## Min. :1.000 Min. :1.00 Min. :1.00 Min. :1.000 Min. :1.000 ## 1st Qu.:4.000 1st Qu.:4.00 1st Qu.:4.00 1st Qu.:2.000 1st Qu.:2.000 ## Median :5.000 Median :5.00 Median :5.00 Median :4.000 Median :3.000 ## Mean :4.571 Mean :4.69 Mean :4.75 Mean :3.993 Mean :2.836 ## 3rd Qu.:6.000 3rd Qu.:6.00 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:3.000 ## Max. :7.000 Max. :7.00 Max. :7.00 Max. :7.000 Max. :5.000 ## NAs :3 NAs :3 NAs :3 NAs :3 NAs :6 ## pss2 pss3 pss4 pss5 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 ## Median :3.000 Median :3.000 Median :4.000 Median :4.000 ## Mean :2.739 Mean :3.162 Mean :3.801 Mean :3.467 ## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :6 NAs :6 NAs :6 NAs :6 ## pss6 pss7 pss8 pss9 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 ## Median :3.000 Median :4.000 Median :4.000 Median :3.000 ## Mean :2.774 Mean :3.538 Mean :3.536 Mean :3.046 ## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :6 NAs :6 NAs :6 NAs :6 ## pss10 sest1 sest2 sest3 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:1.000 ## Median :2.000 Median :4.000 Median :4.000 Median :1.000 ## Mean :2.513 Mean :3.672 Mean :3.745 Mean :1.376 ## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 ## Max. :5.000 Max. :4.000 Max. :4.000 Max. :4.000 ## NAs :6 NAs :3 NAs :3 NAs :3 ## sest4 sest5 sest6 sest7 sest8 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 ## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:3.00 ## Median :4.000 Median :1.000 Median :3.000 Median :2.000 Median :3.00 ## Mean :3.408 Mean :1.596 Mean :3.278 Mean :2.044 Mean :3.17 ## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.00 ## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.00 ## NAs :3 NAs :3 NAs :3 NAs :3 NAs :3 ## sest9 sest10 m1 m2 ## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :0.0000 ## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:1.0000 ## Median :2.000 Median :1.000 Median :1.0000 Median :1.0000 ## Mean :2.039 Mean :1.713 Mean :0.7294 Mean :0.9014 ## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000 ## Max. :4.000 Max. :4.000 Max. :1.0000 Max. :1.0000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## m3 m4 m5 m6 ## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 ## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 ## Median :1.0000 Median :0.0000 Median :1.0000 Median :0.0000 ## Mean :0.8624 Mean :0.4299 Mean :0.5069 Mean :0.2248 ## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 ## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 ## NAs :3 NAs :4 NAs :3 NAs :3 ## m7 m8 m9 m10 ## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 ## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 ## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000 ## Mean :0.4862 Mean :0.5793 Mean :0.2248 Mean :0.3609 ## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 ## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 ## NAs :3 NAs :4 NAs :3 NAs :4 ## pc1 pc2 pc3 pc4 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000 ## Median :2.000 Median :3.000 Median :4.000 Median :4.000 ## Mean :2.545 Mean :2.758 Mean :3.575 Mean :3.464 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :6 NAs :6 NAs :6 NAs :6 ## pc5 pc6 pc7 pc8 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 ## Median :4.000 Median :3.000 Median :3.000 Median :3.000 ## Mean :3.714 Mean :3.282 Mean :2.905 Mean :3.072 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :6 NAs :7 NAs :6 NAs :6 ## pc9 pc10 pc11 pc12 pc13 ## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.0 1st Qu.:3.000 1st Qu.:2.000 ## Median :4.000 Median :4.000 Median :2.0 Median :3.000 Median :3.000 ## Mean :3.575 Mean :3.596 Mean :2.7 Mean :3.346 Mean :3.254 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.0 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.0 Max. :5.000 Max. :5.000 ## NAs :6 NAs :6 NAs :6 NAs :6 NAs :6 ## pc14 pc15 pc16 pc17 pc18 ## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 ## 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.00 ## Median :3.00 Median :2.000 Median :2.000 Median :3.000 Median :4.00 ## Mean :2.94 Mean :2.619 Mean :2.443 Mean :3.171 Mean :3.61 ## 3rd Qu.:4.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.00 ## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00 ## NAs :6 NAs :6 NAs :6 NAs :6 NAs :6 ## Rop2 Rop4 Rop6 Rmast1 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 ## Median :4.000 Median :4.000 Median :4.000 Median :3.000 ## Mean :3.716 Mean :3.741 Mean :3.734 Mean :2.963 ## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :4.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## Rmast3 Rmast4 Rmast6 Rmast7 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 ## Median :3.000 Median :3.000 Median :3.000 Median :3.000 ## Mean :3.041 Mean :3.076 Mean :3.044 Mean :2.906 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## Rpss4 Rpss5 Rpss7 Rpss8 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 ## Median :2.000 Median :2.000 Median :2.000 Median :2.000 ## Mean :2.199 Mean :2.533 Mean :2.462 Mean :2.464 ## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 ## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 ## NAs :6 NAs :6 NAs :6 NAs :6 ## Rsest3 Rsest5 Rsest7 Rsest9 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 ## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 ## Median :4.000 Median :4.000 Median :3.000 Median :3.000 ## Mean :3.624 Mean :3.404 Mean :2.956 Mean :2.961 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000 ## NAs :3 NAs :3 NAs :3 NAs :3 ## Rsest10 Rpc1 Rpc2 Rpc7 Rpc11 ## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0 ## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.0 ## Median :4.000 Median :4.000 Median :3.000 Median :3.000 Median :4.0 ## Mean :3.287 Mean :3.455 Mean :3.242 Mean :3.095 Mean :3.3 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.0 ## Max. :4.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.0 ## NAs :3 NAs :6 NAs :6 NAs :6 NAs :6 ## Rpc15 Rpc16 toptim tmast ## Min. :1.000 Min. :1.000 Min. : 7.00 Min. : 8.00 ## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:19.00 ## Median :4.000 Median :4.000 Median :22.00 Median :22.00 ## Mean :3.381 Mean :3.557 Mean :22.12 Mean :21.76 ## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:26.00 3rd Qu.:25.00 ## Max. :5.000 Max. :5.000 Max. :30.00 Max. :28.00 ## NAs :6 NAs :6 NAs :4 NAs :3 ## tposaff tnegaff tlifesat tpstress tslfest ## Min. :11.00 Min. :10.0 Min. : 5.00 Min. :12.00 Min. :18.00 ## 1st Qu.:29.00 1st Qu.:14.0 1st Qu.:18.00 1st Qu.:23.00 1st Qu.:30.00 ## Median :34.00 Median :18.0 Median :23.00 Median :26.00 Median :35.00 ## Mean :33.69 Mean :19.4 Mean :22.38 Mean :26.73 Mean :33.53 ## 3rd Qu.:39.00 3rd Qu.:23.0 3rd Qu.:27.00 3rd Qu.:31.00 3rd Qu.:38.00 ## Max. :49.00 Max. :39.0 Max. :35.00 Max. :46.00 Max. :40.00 ## NAs :3 NAs :4 NAs :3 NAs :6 NAs :3 ## tmarlow tpcoiss agegp3 agegp5 ## Min. : 0.000 Min. :20.00 Min. :1.000 Min. :1.000 ## 1st Qu.: 4.000 1st Qu.:53.00 1st Qu.:1.000 1st Qu.:2.000 ## Median : 5.000 Median :62.00 Median :2.000 Median :3.000 ## Mean : 5.305 Mean :60.63 Mean :1.973 Mean :2.954 ## 3rd Qu.: 7.000 3rd Qu.:69.00 3rd Qu.:3.000 3rd Qu.:4.000 ## Max. :10.000 Max. :88.00 Max. :3.000 Max. :5.000 ## NAs :6 NAs :9 ## educrec LG10negaff MAH_1 COO_1 ## Min. :1.000 Min. :1.000 Min. : 0.003542 Min. :1.000e-09 ## 1st Qu.:2.000 1st Qu.:1.146 1st Qu.: 0.534432 1st Qu.:1.293e-04 ## Median :3.000 Median :1.255 Median : 1.322372 Median :5.578e-04 ## Mean :3.091 Mean :1.261 Mean : 1.993438 Mean :3.031e-03 ## 3rd Qu.:4.000 3rd Qu.:1.362 3rd Qu.: 2.538417 3rd Qu.:2.413e-03 ## Max. :5.000 Max. :1.663 Max. :13.896642 Max. :9.449e-02 ## NAs :4 NAs :10 NAs :13 ## MAH_2 ## Min. : 0.09048 ## 1st Qu.: 1.11653 ## Median : 1.99169 ## Mean : 2.99306 ## 3rd Qu.: 3.86184 ## Max. :18.10064 ## NAs :7

sum(is.na(survey$source))

## [1] 17

nrow(billboard) # getting number of row(observation) in dataset

## [1] 330087

ncol(billboard) # getting number of column(variable) in dataset

## [1] 7

Q2. piping technique (%>% or |>)

Q2.1.piping

In R, piping is a coding technique used to chain operations together in a sequence, making code much easier to read and maintain. Instead of nesting functions inside each other, which forces you to read from the inside out. piping allows you to read from left to right or top to bottom.

Key Differences of piping

The native pipe |> requires parentheses after the function name (e.g., x |> mean()), whereas %>% can accept the bare function name (e.g., x %>% mean).

If you need to pass the data to a position other than the first argument:

Use a dot . with %>% (e.g., df %>% lm(y ~ x, data = .))

Use an underscore _ with |> in newer R versions, but only for named arguments.

The native pipe |> is slightly faster because it is handled directly by the R parser rather than as a function call.

billboard %>% head(10) # getting 10 first record in data set using %>%

## # A tibble: 10 × 7 ## date rank song artist `last-week` `peak-rank` `weeks-on-board` ## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 1 1 3 ## 2 2021-11-06 2 Stay The K… 2 1 16 ## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14 ## 4 2021-11-06 4 Fancy Like Walke… 4 3 19 ## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18 ## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8 ## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7 ## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24 ## 9 2021-11-06 9 Need To Know Doja … 11 9 20 ## 10 2021-11-06 10 Levitating Dua L… 8 2 56

Q3. Dplyr Functions

-> COLUMN FUCTIONS

select(): is primarily used to pick or exclude specific columns (variables) from a data frame.

mutate(): is used to create new columns or modify existing columns in a data frame

-> ROW FUCTIONS

filter(): is used to pick specific rows from a data frame based on one or more conditions.

distinct(): is used to keep unique rows from a data frame and remove duplicates.

arrange(): is used to reorder (sort) the rows of a data frame based on the values of one or more columns.

-> GROUP FUCTIONS

group_by(): is used to split your data frame into separate groups based on one or more variables. it doesn’t change how the data looks by itself; instead, it changes how other functions (like summarise() or mutate()) operate on the data by making them run group-by-group.

summarise(): it summarize a data frame into a single row of statistical value.

count(): it count the number of rows with each unique value of variable

Q3.1. SELECT() FUNCTION

Common Selection Methods

By Name: List column names without quotes: select(df, column1, column2).

By Position: Use numerical indices: select(df, 1:3) to get the first three columns.

Exclusion: Use a minus sign (-) to drop columns: select(df, -unwanted_column).

Range: Select all columns between two specified names: select(df, start_col:end_col).

Advanced Helper functions for select
You can use selection helpers to find columns based on patterns:

starts_with(“prefix”): Select columns whose name start with a character string.

ends_with(“suffix”): Select columns whose name end with a character string.

contains(“string”): Select columns whose name containing a character string.

matches(“regex”): Select columns whose name matching a regular expression.

everything(): select every column. Useful for reordering columns (e.g., select(df, important_col, everything())).

where(is.numeric): Selects columns based on their data type.

names(billboard)

## [1] "date" "rank" "song" "artist" ## [5] "last-week" "peak-rank" "weeks-on-board"

billboard |> select(date, rank, song, artist, "weeks-on-board")

## # A tibble: 330,087 × 5 ## date rank song artist `weeks-on-board` ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 3 ## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16 ## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14 ## 4 2021-11-06 4 Fancy Like Walker Hayes 19 ## 5 2021-11-06 5 Bad Habits Ed Sheeran 18 ## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8 ## 7 2021-11-06 7 Shivers Ed Sheeran 7 ## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24 ## 9 2021-11-06 9 Need To Know Doja Cat 20 ## 10 2021-11-06 10 Levitating Dua Lipa 56 ## # ℹ 330,077 more rows

billboard |> select(date:artist, "weeks-on-board")

## # A tibble: 330,087 × 5 ## date rank song artist `weeks-on-board` ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 3 ## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16 ## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14 ## 4 2021-11-06 4 Fancy Like Walker Hayes 19 ## 5 2021-11-06 5 Bad Habits Ed Sheeran 18 ## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8 ## 7 2021-11-06 7 Shivers Ed Sheeran 7 ## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24 ## 9 2021-11-06 9 Need To Know Doja Cat 20 ## 10 2021-11-06 10 Levitating Dua Lipa 56 ## # ℹ 330,077 more rows

billboard |> select(1:4, "weeks-on-board")

## # A tibble: 330,087 × 5 ## date rank song artist `weeks-on-board` ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 3 ## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16 ## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14 ## 4 2021-11-06 4 Fancy Like Walker Hayes 19 ## 5 2021-11-06 5 Bad Habits Ed Sheeran 18 ## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8 ## 7 2021-11-06 7 Shivers Ed Sheeran 7 ## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24 ## 9 2021-11-06 9 Need To Know Doja Cat 20 ## 10 2021-11-06 10 Levitating Dua Lipa 56 ## # ℹ 330,077 more rows

billboard |> select(-"last-week", -"peak-rank")

## # A tibble: 330,087 × 5 ## date rank song artist `weeks-on-board` ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 3 ## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16 ## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14 ## 4 2021-11-06 4 Fancy Like Walker Hayes 19 ## 5 2021-11-06 5 Bad Habits Ed Sheeran 18 ## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8 ## 7 2021-11-06 7 Shivers Ed Sheeran 7 ## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24 ## 9 2021-11-06 9 Need To Know Doja Cat 20 ## 10 2021-11-06 10 Levitating Dua Lipa 56 ## # ℹ 330,077 more rows

billboard |> select(date:artist, week_popular = "weeks-on-board")

## # A tibble: 330,087 × 5 ## date rank song artist week_popular ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 1 Easy On Me Adele 3 ## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16 ## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14 ## 4 2021-11-06 4 Fancy Like Walker Hayes 19 ## 5 2021-11-06 5 Bad Habits Ed Sheeran 18 ## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & Young T… 8 ## 7 2021-11-06 7 Shivers Ed Sheeran 7 ## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24 ## 9 2021-11-06 9 Need To Know Doja Cat 20 ## 10 2021-11-06 10 Levitating Dua Lipa 56 ## # ℹ 330,077 more rows

Q3.2. MUTATE() FUNCTION

Advanced Helper functions for select

You can combine mutate() with other dplyr functions for complex data transformations:

across(): Apply the same transformation to multiple columns at once: mutate(df, across(c(col1, col2), as.character)).

if_else() / case_when(): Create conditional variables: mutate(df, age_group = case_when(age < 18 ~ “Minor”, age >= 18 ~ “Adult”)). row_number() / rank(): Add ranking or sequential IDs: mutate(df, rank_id = row_number()).

.before / .after: Control where the new column appears: mutate(df, new_col = x * 2, .before = x).

billboard |> select(date:artist, week_popular = "weeks-on-board") |> mutate(is_collabo = grepl("featuring", artist, ignore.case = T)) |> select(artist, is_collabo, everything())

## # A tibble: 330,087 × 6 ## artist is_collabo date rank song week_popular ## <chr> <lgl> <date> <dbl> <chr> <dbl> ## 1 Adele FALSE 2021-11-06 1 Easy… 3 ## 2 The Kid LAROI & Justin Bieber FALSE 2021-11-06 2 Stay 16 ## 3 Lil Nas X & Jack Harlow FALSE 2021-11-06 3 Indu… 14 ## 4 Walker Hayes FALSE 2021-11-06 4 Fanc… 19 ## 5 Ed Sheeran FALSE 2021-11-06 5 Bad … 18 ## 6 Drake Featuring Future & Youn… TRUE 2021-11-06 6 Way … 8 ## 7 Ed Sheeran FALSE 2021-11-06 7 Shiv… 7 ## 8 Olivia Rodrigo FALSE 2021-11-06 8 Good… 24 ## 9 Doja Cat FALSE 2021-11-06 9 Need… 20 ## 10 Dua Lipa FALSE 2021-11-06 10 Levi… 56 ## # ℹ 330,077 more rows

Q3.3. FILTER() FUNCTION

The basic syntax is filter(.data, condition).

Common Filtering Methods

Logical Comparisons: Match specific values using ==, >, <, >=, <=, or !=: filter(df, age > 21).

Multiple Conditions (AND): Separate conditions with commas or &: filter(df, age > 21, status == “Active”).

Multiple Conditions (OR): Use the vertical bar | for alternative conditions: filter(df, status == “Active” | points > 100).

Value in a List: Use the %in% operator to check against multiple values: filter(df, country %in% c(“USA”, “Canada”, “UK”)).

Negation: Use the exclamation mark ! to invert a condition: filter(df, !country %in% c(“USA”, “Canada”)).

Advanced Helpers

You can use specialized functions inside filter() to handle missing data, strings, or complex rows:

is.na(): Keep or drop missing values: filter(df, !is.na(email)) (removes missing emails).

str_detect(): Filter based on partial string matches: filter(df, str_detect(name, “John”)).

between(): Filter rows within a numeric range: filter(df, between(score, 50, 100)).

if_any() / if_all(): Filter rows across multiple columns at once: filter(df, if_any(c(score1, score2), ~ .x > 90)).

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(week_popular >= 20, artist == "Drake" | artist == "The Weeknd")

## # A tibble: 236 × 5 ## date rank song artist week_popular ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-09-04 20 Blinding Lights The Weeknd 90 ## 2 2021-08-28 21 Blinding Lights The Weeknd 89 ## 3 2021-08-21 18 Blinding Lights The Weeknd 88 ## 4 2021-08-14 16 Blinding Lights The Weeknd 87 ## 5 2021-08-07 17 Blinding Lights The Weeknd 86 ## 6 2021-07-31 17 Blinding Lights The Weeknd 85 ## 7 2021-07-24 17 Blinding Lights The Weeknd 84 ## 8 2021-07-17 15 Blinding Lights The Weeknd 83 ## 9 2021-07-10 18 Blinding Lights The Weeknd 82 ## 10 2021-07-03 15 Blinding Lights The Weeknd 81 ## # ℹ 226 more rows

Q3.4. DISTINCT() FUNCTION

The basic syntax is distinct(.data, …, .keep_all = FALSE).

Common Deduplication Methods

Entire Data Frame: Remove rows where every single column is identical: distinct(df).

Specific Columns: Find unique combinations of specific variables: distinct(df, country, city).

Keep All Columns: By default, specifying columns drops the rest. Use .keep_all = TRUE to keep all other columns for the first unique occurrence found: distinct(df, customer_id, .keep_all = TRUE).

Advanced Helpers & Tricks

On-the-Fly Mutation: You can create a new variable directly inside the function: distinct(df, lower_email = tolower(email)).

Count Unique Values: To just count unique rows rather than extracting them, use n_distinct() inside a summarise() or mutate() function: df %>% summarise(unique_users = n_distinct(user_id)).

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(artist == "Drake")

## # A tibble: 787 × 5 ## date rank song artist week_popular ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 91 No Friends In The Industry Drake 8 ## 2 2021-10-30 87 No Friends In The Industry Drake 7 ## 3 2021-10-30 90 Champagne Poetry Drake 7 ## 4 2021-10-23 74 No Friends In The Industry Drake 6 ## 5 2021-10-23 77 Champagne Poetry Drake 6 ## 6 2021-10-16 64 No Friends In The Industry Drake 5 ## 7 2021-10-16 65 Champagne Poetry Drake 5 ## 8 2021-10-16 98 TSU Drake 5 ## 9 2021-10-09 54 Champagne Poetry Drake 4 ## 10 2021-10-09 60 No Friends In The Industry Drake 4 ## # ℹ 777 more rows

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(artist == "Drake") |> distinct(song)

## # A tibble: 108 × 1 ## song ## <chr> ## 1 No Friends In The Industry ## 2 Champagne Poetry ## 3 TSU ## 4 Pipe Down ## 5 Papi's Home ## 6 Race My Mind ## 7 7am On Bridle Path ## 8 Fucking Fans ## 9 The Remorse ## 10 What's Next ## # ℹ 98 more rows

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(artist == "Drake") |> distinct(song, .keep_all = T)

## # A tibble: 108 × 5 ## date rank song artist week_popular ## <date> <dbl> <chr> <chr> <dbl> ## 1 2021-11-06 91 No Friends In The Industry Drake 8 ## 2 2021-10-30 90 Champagne Poetry Drake 7 ## 3 2021-10-16 98 TSU Drake 5 ## 4 2021-10-09 96 Pipe Down Drake 4 ## 5 2021-10-09 97 Papi's Home Drake 4 ## 6 2021-10-02 89 Race My Mind Drake 3 ## 7 2021-10-02 92 7am On Bridle Path Drake 3 ## 8 2021-09-18 32 Fucking Fans Drake 1 ## 9 2021-09-18 35 The Remorse Drake 1 ## 10 2021-07-17 97 What's Next Drake 16 ## # ℹ 98 more rows

Q3.5. GROUP_BY() FUNCTION

The basic syntax is group_by(.data, …).

Common Grouping Methods

Single Variable: Group data by one column: group_by(df, country).

Multiple Variables: Create subgroups by listing multiple columns: group_by(df, year, month).

On-the-Fly Grouping: Create a new grouping variable directly inside the function: group_by(df, age_group = age > 18).

group_by() is almost always paired with one of these downstream functions:

With summarise() (Collapse Data): Calculates summary statistics for each group (reduces rows):

df %>% group_by(category) %>% summarise(avg_price = mean(price), total_sales = n())

With mutate() (Window Functions): Adds a new column calculated within each group (keeps original rows):

ex: Calculates each employee’s salary relative to their department’s average

df %>% group_by(department) %>% mutate(pct_of_dept_avg = salary / mean(salary))

With filter() (Per-Group Filtering): Filters rows based on a group-level condition:

ex: Keeps only the highest-selling transaction for each store

df %>% group_by(store_id) %>% filter(sales == max(sales))

NB: Always ungroup(): Grouping sticks to the data frame. If you don’t remove it, future operations will accidentally run by group and cause strange bugs. Always end your pipeline with ungroup(): df %>% group_by(region) %>% summarise(total = sum(sales)) %>% ungroup()

Q3.6. SUMMARISE() FUNCTION

The basic syntax is summarise(.data, new_column = function(existing_column)).

Common Summary Functions * Center / Average: Use mean() or median(): summarise(df, avg_age = mean(age)). * Spread: Use sd() or IQR(): summarise(df, spread = sd(score)). * Range / Extremes: Use min() or max(): summarise(df, highest = max(sales)). * Counts: Use n() for total rows, or n_distinct() for unique values: summarise(df, total_rows = n(), unique_users = n_distinct(user_id)).

Advanced Helpers

across(): Summarize multiple columns simultaneously: summarise(df, across(c(sales, profit), mean)).

.groups Argument: Controls the grouping structure of the output. Use .groups = “drop” to completely ungroup the final result:

df %>% group_by(region, year) %>% summarise(total = sum(sales), .groups = “drop”)

n(): is a helper function from the dplyr package used to count the number of observations (rows) in the current group. It is a zero-argument function that only works within specific dplyr verbs like summarise(), mutate(), and filter().

You cannot use n() on its own in the console; it will throw an error stating it “must only be used inside data-masking verbs”.

n() vs. nrow(): While nrow(df) returns the total number of rows in an entire data frame, n() is “group-aware” and returns counts based on defined groups.

n() vs. n_distinct(): n() counts every row regardless of content, whereas n_distinct(column) counts only unique values within a specific column.

n() vs. count(): count() is a shortcut that performs both group_by() and summarise(n = n()) in one step.

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(artist == "Drake") |> group_by(song) |> summarise(total_week_popular = max(week_popular))

## # A tibble: 108 × 2 ## song total_week_popular ## <chr> <dbl> ## 1 0 To 100 / The Catch Up 20 ## 2 10 Bands 13 ## 3 30 For 30 Freestyle 2 ## 4 6 God 1 ## 5 6 Man 1 ## 6 7am On Bridle Path 3 ## 7 8 Out Of 10 3 ## 8 9 5 ## 9 9 AM In Dallas 1 ## 10 Back To Back 20 ## # ℹ 98 more rows

library(dplyr) billboard |> group_by(artist) |> summarise(count = n())

## # A tibble: 10,205 × 2 ## artist count ## <chr> <int> ## 1 "\"Groove\" Holmes" 14 ## 2 "\"Little\" Jimmy Dickens" 10 ## 3 "\"Pookie\" Hudson" 1 ## 4 "\"Weird Al\" Yankovic" 91 ## 5 "'N Sync" 172 ## 6 "'N Sync & Gloria Estefan" 20 ## 7 "'N Sync Featuring Nelly" 20 ## 8 "'Til Tuesday" 53 ## 9 "(+44)" 1 ## 10 "(The Preacher) Bobby Womack" 9 ## # ℹ 10,195 more rows

Q3.7. ARRANGE() FUNCTION

The basic syntax is arrange(.data, column1, column2).

Common Sorting Methods

Ascending Order (A-Z, Low-High): This is the default behavior. Just list the column name: arrange(df, age).

Descending Order (Z-A, High-Low): Wrap the column name in the desc() helper: arrange(df, desc(age)).

Multiple Columns (Tie-Breakers): List multiple columns in order of sorting priority. If there is a tie in the first column, R uses the second: arrange(df, department, desc(salary)).

Missing Values (NA): No matter if you sort ascending or descending, R always puts NA rows at the very bottom of the data frame.

Advanced Helpers & Arguments

.by_group: If your data frame is grouped using group_by(), arrange() normally ignores the groups and sorts the entire dataset globally. Set .by_group = TRUE to sort within each group individually

billboard |> select(date:artist, week_popular = "weeks-on-board") |> filter(artist == "Drake") |> group_by(song) |> summarise(total_week_popular = max(week_popular)) |> arrange(desc(total_week_popular), song) |> head(10)

## # A tibble: 10 × 2 ## song total_week_popular ## <chr> <dbl> ## 1 God's Plan 36 ## 2 Hotline Bling 36 ## 3 Controlla 26 ## 4 Fake Love 25 ## 5 Headlines 25 ## 6 Nice For What 25 ## 7 Best I Ever Had 24 ## 8 In My Feelings 22 ## 9 Nonstop 22 ## 10 Started From The Bottom 22

Q3.8. ARRANGE() FUNCTION

The basic syntax is count(.data, …, wt = NULL, sort = FALSE, name = NULL).

Common Counting Methods

Single Variable: Count how many times each unique value appears: count(df, country).

Multiple Variables: Count unique combinations across columns: count(df, country, status).

Auto-Sorting: Set sort = TRUE to immediately sort the results from highest to lowest frequency: count(df, country, sort = TRUE).

Custom Column Name: By default, the counts are saved in a column named n. Use the name argument to change it: count(df, country, name = “total_users”).

Advanced Helpers

wt (Weighted Count): Instead of counting rows, sum up the values of another column for each group: count(df, country, wt = sales). This behaves exactly like sum(sales).

Count Expressions: You can calculate a logical test inside the function to see how many rows pass a condition: count(df, age > 18).

billboard |> select(date:artist, week_popular = "weeks-on-board") |> count(artist) |> arrange(desc(n))

## # A tibble: 10,205 × 2 ## artist n ## <chr> <int> ## 1 Taylor Swift 1023 ## 2 Elton John 889 ## 3 Madonna 857 ## 4 Drake 787 ## 5 Kenny Chesney 769 ## 6 Tim McGraw 731 ## 7 Keith Urban 673 ## 8 Stevie Wonder 659 ## 9 Rod Stewart 657 ## 10 Mariah Carey 621 ## # ℹ 10,195 more rows

Q4. Use of trace() and recover()

Q4.1. The trace() Function

The trace() function allows you to temporarily insert pieces of code (tracers) into any existing function without manually modifying its source code.

# A function that occasionally receives invalid negative input check <- function(x) { log_val <- log(x) return(log_val * 2) } # Insert a browser breakpoint ONLY if x is negative trace(check, tracer = quote(if(x < 0) browser()), at = 1)

## [1] "check"

check(5)

## Tracing check(5) step 1

## [1] 3.218876

check(-5)

## Tracing check(-5) step 1 ## Called from: eval(expr, p) ## debug: `{` ## debug: log_val <- log(x)

## Warning in log(x): NaNs produced

## debug: return(log_val * 2)

## [1] NaN

untrace(check)

Q4.2. The recover() Function

The recover() function is a specialized error handler designed for navigating the active function call stack right at the exact moment a script fails.

# Setup nested functions func_alpha <- function(a) func_beta(a) func_beta <- function(b) log(b) # Will fail if 'b' is text # Enable recover globally options(error = recover) # Execute the error-prone flow #func_alpha("hello")

Q5. Data Visualization using ggplot2

top_artists <- billboard %>% count(artist, sort = TRUE) %>% head(10) # Bar chart ggplot(top_artists, aes(x = reorder(artist, n), y = n)) + geom_bar(stat = "identity", fill = "skyblue") + coord_flip() + labs( title = "Top 10 Artists by Number of Songs", x = "Artist", y = "Count" ) + theme_minimal()

ggplot(billboard, aes(x = "", y = rank)) + geom_boxplot(fill = "orange") + labs( title = "Distribution of Song Rankings", y = "Rank" ) + theme_minimal()

ggplot(billboard, aes(x = "weeks-on-board", y = rank)) + geom_point(alpha = 0.3) + geom_smooth(color = "blue") + labs( title = "Relationship Between Weeks on Board and Rank", x = "Weeks on Board", y = "Rank" ) + theme_minimal()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(billboard, aes(x = rank)) + geom_histogram(binwidth = 2, color = "white") + labs( title = "Distribution of Song Rankings", x = "Rank", y = "Frequency" ) + theme_minimal()

adele_data <- billboard %>% filter(artist == "Adele") ggplot(adele_data, aes(x = date, y = rank)) + geom_line(color = "blue") + geom_point() + scale_y_reverse() + labs( title = "Adele Song Ranking Trend", x = "Date", y = "Rank" ) + theme_minimal()

top5 <- billboard %>% filter(artist %in% c("Adele", "Ed Sheeran", "Drake")) ggplot(top5, aes(x = "weeks-on-board", y = rank)) + geom_point() + facet_wrap(~artist) + labs( title = "Artist Comparison", x = "Weeks on Board", y = "Rank" ) + theme_minimal()

Q6. Application of function

# Function to calculate summary statistics stats_func <- function(data) { data <- na.omit(data) # Remove missing values # Calculate statistics min_value <- min(data) max_value <- max(data) mean_value <- mean(data) median_value <- median(data) q1 <- quantile(data, 0.25) q2 <- quantile(data, 0.50) q3 <- quantile(data, 0.75) result <- list( Minimum = min_value, Maximum = max_value, Mean = mean_value, Median = median_value, First_Quartile_Q1 = q1, Second_Quartile_Q2 = q2, Third_Quartile_Q3 = q3 ) return(result) } billboard$`weeks-on-board` |> stats_func()

## $Minimum ## [1] 1 ## ## $Maximum ## [1] 90 ## ## $Mean ## [1] 9.161785 ## ## $Median ## [1] 7 ## ## $First_Quartile_Q1 ## 25% ## 4 ## ## $Second_Quartile_Q2 ## 50% ## 7 ## ## $Third_Quartile_Q3 ## 75% ## 13

Apply Functions like sapply, vapply, lapply, tapply

Using lapply() to get class (datatype) of each column

lapply(billboard, class)

## $date ## [1] "Date" ## ## $rank ## [1] "numeric" ## ## $song ## [1] "character" ## ## $artist ## [1] "character" ## ## $`last-week` ## [1] "numeric" ## ## $`peak-rank` ## [1] "numeric" ## ## $`weeks-on-board` ## [1] "numeric"

Using lapply() to calculate mean for numeric columns

numeric_cols <- billboard[sapply(billboard, is.numeric)] numeric_cols |> lapply(mean, na.rm = TRUE)

## $rank ## [1] 50.50093 ## ## $`last-week` ## [1] 47.59163 ## ## $`peak-rank` ## [1] 40.97063 ## ## $`weeks-on-board` ## [1] 9.161785

Using sapply() to count missing values in each column

billboard |> sapply(function(x) sum(is.na(x)))

## date rank song artist last-week ## 0 0 0 0 32312 ## peak-rank weeks-on-board ## 0 0

Using sapply() Find mean of numeric columns

sapply(numeric_cols, mean, na.rm = TRUE)

## rank last-week peak-rank weeks-on-board ## 50.500929 47.591631 40.970629 9.161785

Using vapply() to find mean of numeric columns

vapply( numeric_cols, mean, FUN.VALUE = numeric(1), na.rm = TRUE )

## rank last-week peak-rank weeks-on-board ## 50.500929 47.591631 40.970629 9.161785

Using tapply() to calculate average rank by artist

#tapply( # billboard$rank, #billboard$artist, #mean, #na.rm = TRUE #)

getting Number of songs per artist using tapply()

#tapply( # billboard$song, #billboard$artist, #length #)

20251MBI043 - HOME WORK

NIYIBIZI Shaloom

Importing required library using pacman package manager

Q1. Import Data from different sources

Q1.1. Import data from database

Q1.2. Import from external source like .csv file

Q1.3. Import data from statistical data set like .sav file

Q2. piping technique (%>% or |>)

Q2.1.piping

Key Differences of piping

Q3. Dplyr Functions

-> COLUMN FUCTIONS

-> ROW FUCTIONS

-> GROUP FUCTIONS

Q3.1. SELECT() FUNCTION

Q3.2. MUTATE() FUNCTION

Q3.3. FILTER() FUNCTION

Q3.4. DISTINCT() FUNCTION

Q3.5. GROUP_BY() FUNCTION

Q3.6. SUMMARISE() FUNCTION

Q3.7. ARRANGE() FUNCTION

Q3.8. ARRANGE() FUNCTION

Q4. Use of trace() and recover()

Q4.1. The `trace()` Function

Q4.2. The `recover()` Function

Q5. Data Visualization using ggplot2

Q6. Application of function

20251MBI043 - HOME WORK

NIYIBIZI Shaloom

Importing required library using pacman package manager

Q1. Import Data from different sources

Q1.1. Import data from database

Q1.2. Import from external source like .csv file

Q1.3. Import data from statistical data set like .sav file

Q2. piping technique (%>% or |>)

Q2.1.piping

Key Differences of piping

Q3. Dplyr Functions

-> COLUMN FUCTIONS

-> ROW FUCTIONS

-> GROUP FUCTIONS

Q3.1. SELECT() FUNCTION

Q3.2. MUTATE() FUNCTION

Q3.3. FILTER() FUNCTION

Q3.4. DISTINCT() FUNCTION

Q3.5. GROUP_BY() FUNCTION

Q3.6. SUMMARISE() FUNCTION

Q3.7. ARRANGE() FUNCTION

Q3.8. ARRANGE() FUNCTION

Q4. Use of trace() and recover()

Q4.1. The trace() Function

Q4.2. The recover() Function

Q5. Data Visualization using ggplot2

Q6. Application of function

Q4.1. The `trace()` Function

Q4.2. The `recover()` Function