library(readxl)
library(dplyr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cc_df <- read_excel("C:/Users/user/OneDrive/Desktop/20251MBI051_R_For_Data_Science_Exam/default of credit card clients.xls")
## New names:
## • `` -> `...1`

4.1. Displaying rows and columns, data type and shape

#Top 5 rows
head(cc_df, 5)
## # A tibble: 5 × 25
##   ...1  X1     X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##   <chr> <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID    LIMIT… SEX   EDUC… MARR… AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1     20000  2     2     1     24    2     2     -1    -1    -2    -2    3913 
## 3 2     120000 2     2     2     26    -1    2     0     0     0     2     2682 
## 4 3     90000  2     2     2     34    0     0     0     0     0     0     29239
## 5 4     50000  2     2     1     37    0     0     0     0     0     0     46990
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## #   X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Bottom 10 rows
tail(cc_df, 10)
## # A tibble: 10 × 25
##    ...1  X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 29991 1400… 1     2     1     41    0     0     0     0     0     0     1383…
##  2 29992 2100… 1     2     1     34    3     2     2     2     2     2     2500 
##  3 29993 10000 1     3     1     43    0     0     0     -2    -2    -2    8802 
##  4 29994 1000… 1     1     2     38    0     -1    -1    0     0     0     3042 
##  5 29995 80000 1     2     2     34    2     2     2     2     2     2     72557
##  6 29996 2200… 1     3     1     39    0     0     0     0     0     0     1889…
##  7 29997 1500… 1     3     2     43    -1    -1    -1    -1    0     0     1683 
##  8 29998 30000 1     2     2     37    4     3     2     -1    0     0     3565 
##  9 29999 80000 1     3     1     41    1     -1    0     0     0     -1    -1645
## 10 30000 50000 1     2     1     46    0     0     0     0     0     0     47929
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## #   X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Data type
str(cc_df)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
##  $ X1  : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
##  $ X2  : chr [1:30001] "SEX" "2" "2" "2" ...
##  $ X3  : chr [1:30001] "EDUCATION" "2" "2" "2" ...
##  $ X4  : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
##  $ X5  : chr [1:30001] "AGE" "24" "26" "34" ...
##  $ X6  : chr [1:30001] "PAY_0" "2" "-1" "0" ...
##  $ X7  : chr [1:30001] "PAY_2" "2" "2" "0" ...
##  $ X8  : chr [1:30001] "PAY_3" "-1" "0" "0" ...
##  $ X9  : chr [1:30001] "PAY_4" "-1" "0" "0" ...
##  $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
##  $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
##  $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
##  $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
##  $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
##  $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
##  $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
##  $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
##  $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
##  $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
##  $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
##  $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
##  $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
##  $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
##  $ Y   : chr [1:30001] "default payment next month" "1" "1" "0" ...
# shape of the dataset
sample(cc_df)
## # A tibble: 30,001 × 25
##    X14   X20   X10   X8    X9    X23   X6    X17   ...1  X4    X12   X16   X1   
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 BILL… PAY_… PAY_5 PAY_3 PAY_4 PAY_… PAY_0 BILL… ID    MARR… BILL… BILL… LIMI…
##  2 689   0     -2    -1    -1    0     2     0     1     1     3913  0     20000
##  3 2682  1000  0     0     0     2000  -1    3261  2     2     2682  3455  1200…
##  4 13559 1000  0     0     0     5000  0     15549 3     2     29239 14948 90000
##  5 49291 1200  0     0     0     1000  0     29547 4     1     46990 28959 50000
##  6 35835 10000 0     -1    0     679   -1    19131 5     1     8617  19146 50000
##  7 57608 657   0     0     0     800   0     20024 6     2     64400 19619 50000
##  8 4450… 38000 0     0     0     13770 0     4739… 7     2     3679… 4830… 5000…
##  9 601   0     0     -1    0     1542  0     567   8     2     11876 -159  1000…
## 10 12108 432   0     2     0     1000  0     3719  9     1     11285 11793 1400…
## # ℹ 29,991 more rows
## # ℹ 12 more variables: Y <chr>, X22 <chr>, X3 <chr>, X7 <chr>, X21 <chr>,
## #   X5 <chr>, X15 <chr>, X13 <chr>, X19 <chr>, X2 <chr>, X11 <chr>, X18 <chr>
summary(cc_df)
##         ...1               X1                X2                X3       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :30001   N.unique :   82   N.unique :    3   N.unique :    8  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    5   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    5   Max.nchar:    9   Max.nchar:    3   Max.nchar:    9  
##          X4                X5                X6                X7       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :    5   N.unique :   57   N.unique :   12   N.unique :   12  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    2   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    8   Max.nchar:    3   Max.nchar:    5   Max.nchar:    5  
##          X8                X9               X10               X11       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :   12   N.unique :   12   N.unique :   11   N.unique :   11  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    5   Max.nchar:    5   Max.nchar:    5   Max.nchar:    5  
##         X12               X13               X14               X15       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :22724   N.unique :22347   N.unique :22027   N.unique :21549  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    9   Max.nchar:    9   Max.nchar:    9   Max.nchar:    9  
##         X16               X17               X18               X19       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :21011   N.unique :20605   N.unique : 7944   N.unique : 7900  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    9   Max.nchar:    9   Max.nchar:    8   Max.nchar:    8  
##         X20               X21               X22               X23       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique : 7519   N.unique : 6938   N.unique : 6898   N.unique : 6940  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    8   Max.nchar:    8   Max.nchar:    8   Max.nchar:    8  
##          Y        
##  Length   :30001  
##  N.unique :    3  
##  N.blank  :    0  
##  Min.nchar:    1  
##  Max.nchar:   26

Data cleaning

summary(cc_df)
##         ...1               X1                X2                X3       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :30001   N.unique :   82   N.unique :    3   N.unique :    8  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    5   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    5   Max.nchar:    9   Max.nchar:    3   Max.nchar:    9  
##          X4                X5                X6                X7       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :    5   N.unique :   57   N.unique :   12   N.unique :   12  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    2   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    8   Max.nchar:    3   Max.nchar:    5   Max.nchar:    5  
##          X8                X9               X10               X11       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :   12   N.unique :   12   N.unique :   11   N.unique :   11  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    5   Max.nchar:    5   Max.nchar:    5   Max.nchar:    5  
##         X12               X13               X14               X15       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :22724   N.unique :22347   N.unique :22027   N.unique :21549  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    9   Max.nchar:    9   Max.nchar:    9   Max.nchar:    9  
##         X16               X17               X18               X19       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique :21011   N.unique :20605   N.unique : 7944   N.unique : 7900  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    9   Max.nchar:    9   Max.nchar:    8   Max.nchar:    8  
##         X20               X21               X22               X23       
##  Length   :30001   Length   :30001   Length   :30001   Length   :30001  
##  N.unique : 7519   N.unique : 6938   N.unique : 6898   N.unique : 6940  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    1   Min.nchar:    1   Min.nchar:    1   Min.nchar:    1  
##  Max.nchar:    8   Max.nchar:    8   Max.nchar:    8   Max.nchar:    8  
##          Y        
##  Length   :30001  
##  N.unique :    3  
##  N.blank  :    0  
##  Min.nchar:    1  
##  Max.nchar:   26
new <- cc_df %>% 
  count(is.na(cc_df))
print(new)
## # A tibble: 1 × 2
##   `is.na(cc_df)`[,"...1"] [,"X1"] [,"X2"] [,"X3"] [,"X4"] [,"X5"] [,"X6"]     n
##   <lgl>                   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <int>
## 1 FALSE                   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   30001
## # ℹ 1 more variable: `is.na(cc_df)`[8:25] <lgl>

although there are no duplicates, just to make sure the data is clean

cc_df_clean <- cc_df %>%
  distinct()
cc_df_clean
## # A tibble: 30,001 × 25
##    ...1  X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 ID    LIMI… SEX   EDUC… MARR… AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
##  2 1     20000 2     2     1     24    2     2     -1    -1    -2    -2    3913 
##  3 2     1200… 2     2     2     26    -1    2     0     0     0     2     2682 
##  4 3     90000 2     2     2     34    0     0     0     0     0     0     29239
##  5 4     50000 2     2     1     37    0     0     0     0     0     0     46990
##  6 5     50000 1     2     1     57    -1    0     -1    0     0     0     8617 
##  7 6     50000 1     1     2     37    0     0     0     0     0     0     64400
##  8 7     5000… 1     1     2     29    0     0     0     0     0     0     3679…
##  9 8     1000… 2     2     2     23    0     -1    -1    0     0     -1    11876
## 10 9     1400… 2     3     1     28    0     0     2     0     0     0     11285
## # ℹ 29,991 more rows
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## #   X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>

Summarising

```