library(readxl)
library(dplyr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
cc_df <- read_excel("C:/Users/user/OneDrive/Desktop/20251MBI051_R_For_Data_Science_Exam/default of credit card clients.xls")
## New names:
## • `` -> `...1`
#Top 5 rows
head(cc_df, 5)
## # A tibble: 5 × 25
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMIT… SEX EDUC… MARR… AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913
## 3 2 120000 2 2 2 26 -1 2 0 0 0 2 2682
## 4 3 90000 2 2 2 34 0 0 0 0 0 0 29239
## 5 4 50000 2 2 1 37 0 0 0 0 0 0 46990
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## # X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Bottom 10 rows
tail(cc_df, 10)
## # A tibble: 10 × 25
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 29991 1400… 1 2 1 41 0 0 0 0 0 0 1383…
## 2 29992 2100… 1 2 1 34 3 2 2 2 2 2 2500
## 3 29993 10000 1 3 1 43 0 0 0 -2 -2 -2 8802
## 4 29994 1000… 1 1 2 38 0 -1 -1 0 0 0 3042
## 5 29995 80000 1 2 2 34 2 2 2 2 2 2 72557
## 6 29996 2200… 1 3 1 39 0 0 0 0 0 0 1889…
## 7 29997 1500… 1 3 2 43 -1 -1 -1 -1 0 0 1683
## 8 29998 30000 1 2 2 37 4 3 2 -1 0 0 3565
## 9 29999 80000 1 3 1 41 1 -1 0 0 0 -1 -1645
## 10 30000 50000 1 2 1 46 0 0 0 0 0 0 47929
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## # X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Data type
str(cc_df)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
## $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
## $ X1 : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
## $ X2 : chr [1:30001] "SEX" "2" "2" "2" ...
## $ X3 : chr [1:30001] "EDUCATION" "2" "2" "2" ...
## $ X4 : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
## $ X5 : chr [1:30001] "AGE" "24" "26" "34" ...
## $ X6 : chr [1:30001] "PAY_0" "2" "-1" "0" ...
## $ X7 : chr [1:30001] "PAY_2" "2" "2" "0" ...
## $ X8 : chr [1:30001] "PAY_3" "-1" "0" "0" ...
## $ X9 : chr [1:30001] "PAY_4" "-1" "0" "0" ...
## $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
## $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
## $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
## $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
## $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
## $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
## $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
## $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
## $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
## $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
## $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
## $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
## $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
## $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
## $ Y : chr [1:30001] "default payment next month" "1" "1" "0" ...
# shape of the dataset
sample(cc_df)
## # A tibble: 30,001 × 25
## X14 X20 X10 X8 X9 X23 X6 X17 ...1 X4 X12 X16 X1
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BILL… PAY_… PAY_5 PAY_3 PAY_4 PAY_… PAY_0 BILL… ID MARR… BILL… BILL… LIMI…
## 2 689 0 -2 -1 -1 0 2 0 1 1 3913 0 20000
## 3 2682 1000 0 0 0 2000 -1 3261 2 2 2682 3455 1200…
## 4 13559 1000 0 0 0 5000 0 15549 3 2 29239 14948 90000
## 5 49291 1200 0 0 0 1000 0 29547 4 1 46990 28959 50000
## 6 35835 10000 0 -1 0 679 -1 19131 5 1 8617 19146 50000
## 7 57608 657 0 0 0 800 0 20024 6 2 64400 19619 50000
## 8 4450… 38000 0 0 0 13770 0 4739… 7 2 3679… 4830… 5000…
## 9 601 0 0 -1 0 1542 0 567 8 2 11876 -159 1000…
## 10 12108 432 0 2 0 1000 0 3719 9 1 11285 11793 1400…
## # ℹ 29,991 more rows
## # ℹ 12 more variables: Y <chr>, X22 <chr>, X3 <chr>, X7 <chr>, X21 <chr>,
## # X5 <chr>, X15 <chr>, X13 <chr>, X19 <chr>, X2 <chr>, X11 <chr>, X18 <chr>
summary(cc_df)
## ...1 X1 X2 X3
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :30001 N.unique : 82 N.unique : 3 N.unique : 8
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 5 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 5 Max.nchar: 9 Max.nchar: 3 Max.nchar: 9
## X4 X5 X6 X7
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 5 N.unique : 57 N.unique : 12 N.unique : 12
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 2 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 8 Max.nchar: 3 Max.nchar: 5 Max.nchar: 5
## X8 X9 X10 X11
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 12 N.unique : 12 N.unique : 11 N.unique : 11
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 5 Max.nchar: 5 Max.nchar: 5 Max.nchar: 5
## X12 X13 X14 X15
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :22724 N.unique :22347 N.unique :22027 N.unique :21549
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 9 Max.nchar: 9 Max.nchar: 9 Max.nchar: 9
## X16 X17 X18 X19
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :21011 N.unique :20605 N.unique : 7944 N.unique : 7900
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 9 Max.nchar: 9 Max.nchar: 8 Max.nchar: 8
## X20 X21 X22 X23
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 7519 N.unique : 6938 N.unique : 6898 N.unique : 6940
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 8 Max.nchar: 8 Max.nchar: 8 Max.nchar: 8
## Y
## Length :30001
## N.unique : 3
## N.blank : 0
## Min.nchar: 1
## Max.nchar: 26
summary(cc_df)
## ...1 X1 X2 X3
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :30001 N.unique : 82 N.unique : 3 N.unique : 8
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 5 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 5 Max.nchar: 9 Max.nchar: 3 Max.nchar: 9
## X4 X5 X6 X7
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 5 N.unique : 57 N.unique : 12 N.unique : 12
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 2 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 8 Max.nchar: 3 Max.nchar: 5 Max.nchar: 5
## X8 X9 X10 X11
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 12 N.unique : 12 N.unique : 11 N.unique : 11
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 5 Max.nchar: 5 Max.nchar: 5 Max.nchar: 5
## X12 X13 X14 X15
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :22724 N.unique :22347 N.unique :22027 N.unique :21549
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 9 Max.nchar: 9 Max.nchar: 9 Max.nchar: 9
## X16 X17 X18 X19
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique :21011 N.unique :20605 N.unique : 7944 N.unique : 7900
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 9 Max.nchar: 9 Max.nchar: 8 Max.nchar: 8
## X20 X21 X22 X23
## Length :30001 Length :30001 Length :30001 Length :30001
## N.unique : 7519 N.unique : 6938 N.unique : 6898 N.unique : 6940
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 1 Min.nchar: 1 Min.nchar: 1 Min.nchar: 1
## Max.nchar: 8 Max.nchar: 8 Max.nchar: 8 Max.nchar: 8
## Y
## Length :30001
## N.unique : 3
## N.blank : 0
## Min.nchar: 1
## Max.nchar: 26
new <- cc_df %>%
count(is.na(cc_df))
print(new)
## # A tibble: 1 × 2
## `is.na(cc_df)`[,"...1"] [,"X1"] [,"X2"] [,"X3"] [,"X4"] [,"X5"] [,"X6"] n
## <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <int>
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE 30001
## # ℹ 1 more variable: `is.na(cc_df)`[8:25] <lgl>
cc_df_clean <- cc_df %>%
distinct()
cc_df_clean
## # A tibble: 30,001 × 25
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMI… SEX EDUC… MARR… AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913
## 3 2 1200… 2 2 2 26 -1 2 0 0 0 2 2682
## 4 3 90000 2 2 2 34 0 0 0 0 0 0 29239
## 5 4 50000 2 2 1 37 0 0 0 0 0 0 46990
## 6 5 50000 1 2 1 57 -1 0 -1 0 0 0 8617
## 7 6 50000 1 1 2 37 0 0 0 0 0 0 64400
## 8 7 5000… 1 1 2 29 0 0 0 0 0 0 3679…
## 9 8 1000… 2 2 2 23 0 -1 -1 0 0 -1 11876
## 10 9 1400… 2 3 1 28 0 0 2 0 0 0 11285
## # ℹ 29,991 more rows
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## # X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
```