EXPLORATORY DATA ANALYSIS : LA1
TEAM NAME: DEADLINE SURVIVORS
TEAM NUMBER: 32
TEAM MEMBERS:
ANKTHA KUMARI 1NT23IS027 SECTION A
SANJANA S SONDUR 1NT23IS193 SECTION D
library (tidyverse) # Load core data manipulation and visualization packages
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (readr) # Read CSV and other data files
library (dplyr) # Data manipulation (filter, select, mutate, etc.)
library (ggplot2) # Data visualization
library (skimr) # Summary statistics of dataset
library (janitor) # Data cleaning and formatting tools
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
data <- read_csv ("UCI_Credit_Card.csv" )# Import dataset from CSV file
Rows: 30000 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (25): ID, LIMIT_BAL, SEX, EDUCATION, MARRIAGE, AGE, PAY_0, PAY_2, PAY_3,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View (data) # Open dataset in spreadsheet viewer
str (data) # Display structure (data types and columns)
spc_tbl_ [30,000 × 25] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
$ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
$ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
$ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
$ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
$ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
$ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
$ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
$ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
$ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
$ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
$ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
$ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
$ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
$ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
$ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
$ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
$ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
$ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
$ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
$ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
$ default.payment.next.month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
- attr(*, "spec")=
.. cols(
.. ID = col_double(),
.. LIMIT_BAL = col_double(),
.. SEX = col_double(),
.. EDUCATION = col_double(),
.. MARRIAGE = col_double(),
.. AGE = col_double(),
.. PAY_0 = col_double(),
.. PAY_2 = col_double(),
.. PAY_3 = col_double(),
.. PAY_4 = col_double(),
.. PAY_5 = col_double(),
.. PAY_6 = col_double(),
.. BILL_AMT1 = col_double(),
.. BILL_AMT2 = col_double(),
.. BILL_AMT3 = col_double(),
.. BILL_AMT4 = col_double(),
.. BILL_AMT5 = col_double(),
.. BILL_AMT6 = col_double(),
.. PAY_AMT1 = col_double(),
.. PAY_AMT2 = col_double(),
.. PAY_AMT3 = col_double(),
.. PAY_AMT4 = col_double(),
.. PAY_AMT5 = col_double(),
.. PAY_AMT6 = col_double(),
.. default.payment.next.month = col_double()
.. )
- attr(*, "problems")=<externalptr>
summary (data) # Statistical summary (min, max, mean, quartiles)
ID LIMIT_BAL SEX EDUCATION
Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
Median :15000 Median : 140000 Median :2.000 Median :2.000
Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853
3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000
MARRIAGE AGE PAY_0 PAY_2
Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000
Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338
3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000
PAY_3 PAY_4 PAY_5 PAY_6
Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4
Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000
1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327
Median : 22382 Median : 21200 Median : 20089 Median : 19052
Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263
3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506
Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
Min. :-81334 Min. :-339603 Min. : 0 Min. : 0
1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833
Median : 18105 Median : 17071 Median : 2100 Median : 2009
Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921
3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000
Max. :927171 Max. : 961664 Max. :873552 Max. :1684259
PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8
Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0
Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5
3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0
Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0
default.payment.next.month
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.2212
3rd Qu.:0.0000
Max. :1.0000
head (data) # Show first 6 rows
# A tibble: 6 × 25
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 20000 2 2 1 24 2 2 -1 -1 -2
2 2 120000 2 2 2 26 -1 2 0 0 0
3 3 90000 2 2 2 34 0 0 0 0 0
4 4 50000 2 2 1 37 0 0 0 0 0
5 5 50000 1 2 1 57 -1 0 -1 0 0
6 6 50000 1 1 2 37 0 0 0 0 0
# ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>
tail (data) # Show last 6 rows
# A tibble: 6 × 25
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 29995 80000 1 2 2 34 2 2 2 2 2
2 29996 220000 1 3 1 39 0 0 0 0 0
3 29997 150000 1 3 2 43 -1 -1 -1 -1 0
4 29998 30000 1 2 2 37 4 3 2 -1 0
5 29999 80000 1 3 1 41 1 -1 0 0 0
6 30000 50000 1 2 1 46 0 0 0 0 0
# ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>
dim (data) # Number of rows and columns
nrow (data) # Number of rows
ncol (data) #Number of columns
names (data) # Column names
[1] "ID" "LIMIT_BAL"
[3] "SEX" "EDUCATION"
[5] "MARRIAGE" "AGE"
[7] "PAY_0" "PAY_2"
[9] "PAY_3" "PAY_4"
[11] "PAY_5" "PAY_6"
[13] "BILL_AMT1" "BILL_AMT2"
[15] "BILL_AMT3" "BILL_AMT4"
[17] "BILL_AMT5" "BILL_AMT6"
[19] "PAY_AMT1" "PAY_AMT2"
[21] "PAY_AMT3" "PAY_AMT4"
[23] "PAY_AMT5" "PAY_AMT6"
[25] "default.payment.next.month"
class (data) # Object type
[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
typeof (data$ LIMIT_BAL) # Data type of specific column
data <- mutate (data, newcol = NA ); head (data) # Add new column
# A tibble: 6 × 26
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 20000 2 2 1 24 2 2 -1 -1 -2
2 2 120000 2 2 2 26 -1 2 0 0 0
3 3 90000 2 2 2 34 0 0 0 0 0
4 4 50000 2 2 1 37 0 0 0 0 0
5 5 50000 1 2 1 57 -1 0 -1 0 0
6 6 50000 1 1 2 37 0 0 0 0 0
# ℹ 15 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# newcol <lgl>
data$ newcol <- NULL ; head (data) # Delete column
# A tibble: 6 × 25
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 20000 2 2 1 24 2 2 -1 -1 -2
2 2 120000 2 2 2 26 -1 2 0 0 0
3 3 90000 2 2 2 34 0 0 0 0 0
4 4 50000 2 2 1 37 0 0 0 0 0
5 5 50000 1 2 1 57 -1 0 -1 0 0
6 6 50000 1 1 2 37 0 0 0 0 0
# ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>
data <- rename (data, GENDER = SEX); names (data) # Rename column
[1] "ID" "LIMIT_BAL"
[3] "GENDER" "EDUCATION"
[5] "MARRIAGE" "AGE"
[7] "PAY_0" "PAY_2"
[9] "PAY_3" "PAY_4"
[11] "PAY_5" "PAY_6"
[13] "BILL_AMT1" "BILL_AMT2"
[15] "BILL_AMT3" "BILL_AMT4"
[17] "BILL_AMT5" "BILL_AMT6"
[19] "PAY_AMT1" "PAY_AMT2"
[21] "PAY_AMT3" "PAY_AMT4"
[23] "PAY_AMT5" "PAY_AMT6"
[25] "default.payment.next.month"
data <- select (data, LIMIT_BAL, AGE, everything ()); head (data) # Reorder columns
# A tibble: 6 × 25
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20000 24 1 2 2 1 2 2 -1 -1 -2
2 120000 26 2 2 2 2 -1 2 0 0 0
3 90000 34 3 2 2 2 0 0 0 0 0
4 50000 37 4 2 2 1 0 0 0 0 0
5 50000 57 5 1 2 1 -1 0 -1 0 0
6 50000 37 6 1 1 2 0 0 0 0 0
# ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>
data %>% filter (LIMIT_BAL > 50000 ) # Filter rows
# A tibble: 22,324 × 25
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 120000 26 2 2 2 2 -1 2 0 0 0
2 90000 34 3 2 2 2 0 0 0 0 0
3 500000 29 7 1 1 2 0 0 0 0 0
4 100000 23 8 2 2 2 0 -1 -1 0 0
5 140000 28 9 2 3 1 0 0 2 0 0
6 200000 34 11 2 3 2 0 0 2 0 0
7 260000 51 12 2 1 2 -1 -1 -1 -1 -1
8 630000 41 13 2 2 2 -1 0 -1 -1 -1
9 70000 30 14 1 2 2 1 2 2 0 0
10 250000 29 15 1 1 2 0 0 0 0 0
# ℹ 22,314 more rows
# ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>
data %>% select (LIMIT_BAL, AGE) # Select columns
# A tibble: 30,000 × 2
LIMIT_BAL AGE
<dbl> <dbl>
1 20000 24
2 120000 26
3 90000 34
4 50000 37
5 50000 57
6 50000 37
7 500000 29
8 100000 23
9 140000 28
10 20000 35
# ℹ 29,990 more rows
data$ EDUCATION <- factor (data$ EDUCATION); str (data) # Convert numeric to categorical (factor)
tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
$ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
$ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
$ GENDER : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : Factor w/ 7 levels "0","1","2","3",..: 3 3 3 3 3 2 2 3 4 4 ...
$ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
$ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
$ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
$ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
$ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
$ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
$ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
$ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
$ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
$ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
$ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
$ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
$ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
$ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
$ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
$ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
$ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
$ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
$ default.payment.next.month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
levels (data$ EDUCATION) # Show factor levels
[1] "0" "1" "2" "3" "4" "5" "6"
data$ EDUCATION <- reorder (data$ EDUCATION, data$ LIMIT_BAL); str (data) # Reorder factor based on LIMIT_BAL
tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
$ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
$ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
$ GENDER : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : Factor w/ 7 levels "3","2","6","5",..: 2 2 2 2 2 5 5 2 1 1 ...
..- attr(*, "scores")= num [1:7(1d)] 217143 212956 147062 126550 220894 ...
.. ..- attr(*, "dimnames")=List of 1
.. .. ..$ : chr [1:7] "0" "1" "2" "3" ...
$ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
$ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
$ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
$ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
$ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
$ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
$ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
$ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
$ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
$ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
$ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
$ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
$ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
$ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
$ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
$ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
$ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
$ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
$ default.payment.next.month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
levels (data$ EDUCATION) <- c ("Unknown" ,"Grad" ,"Univ" ,"HighSchool" ,"Others" ,"Other1" ,"Other2" ) # Assign meaningful labels to categories
table (data$ EDUCATION)# Count frequency of each category
Unknown Grad Univ HighSchool Others Other1 Other2
4917 14030 51 280 10585 14 123
data$ EDUCATION <- droplevels (data$ EDUCATION);str (data) # Remove unused levels
tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
$ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
$ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
$ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
$ GENDER : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
$ EDUCATION : Factor w/ 7 levels "Unknown","Grad",..: 2 2 2 2 2 5 5 2 1 1 ...
$ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
$ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
$ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
$ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
$ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
$ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
$ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
$ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
$ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
$ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
$ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
$ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
$ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
$ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
$ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
$ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
$ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
$ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
$ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
$ default.payment.next.month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
unique (data$ GENDER) # Check unique values in GENDER
data$ GENDER <- recode (data$ GENDER, "1" = "Male" , "2" = "Female" ) # Convert numeric gender codes to labels
table (data$ GENDER) # Frequency of gender
prop.table (table (data$ GENDER)) # Proportion of each gender
Female Male
0.6037333 0.3962667
data$ LIMIT_CAT <- cut (data$ LIMIT_BAL, breaks= 3 ) # Convert numeric LIMIT_BAL into categories
table (data$ LIMIT_CAT) # Count observations in each category
(9.01e+03,3.4e+05] (3.4e+05,6.7e+05] (6.7e+05,1e+06]
26412 3552 36
levels (data$ LIMIT_CAT) # View category ranges
[1] "(9.01e+03,3.4e+05]" "(3.4e+05,6.7e+05]" "(6.7e+05,1e+06]"
data <- mutate (data, TOTAL_BILL = BILL_AMT1 + BILL_AMT2) # Create new column as sum of two bill amounts
head (data)
# A tibble: 6 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20000 24 1 Female Grad 1 2 2 -1 -1 -2
2 120000 26 2 Female Grad 2 -1 2 0 0 0
3 90000 34 3 Female Grad 2 0 0 0 0 0
4 50000 37 4 Female Grad 1 0 0 0 0 0
5 50000 57 5 Male Grad 1 -1 0 -1 0 0
6 50000 37 6 Male Others 2 0 0 0 0 0
# ℹ 16 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>
data$ TOTAL_BILL[1 : 10 ] # Display first 10 values of new column
[1] 7015 4407 43266 95223 14287 121469 779988 12256 25381 0
data %>% group_by (GENDER) %>% summarise (avg = mean (LIMIT_BAL))
# A tibble: 2 × 2
GENDER avg
<chr> <dbl>
1 Female 170086.
2 Male 163520.
# Group data by gender and calculate the average credit limit for each group
data %>% group_by (GENDER) %>% summarise (count = n ())
# A tibble: 2 × 2
GENDER count
<chr> <int>
1 Female 18112
2 Male 11888
# Count the number of observations (rows) for each gender group
data %>% group_by (GENDER) %>% summarise (sd = sd (LIMIT_BAL))
# A tibble: 2 × 2
GENDER sd
<chr> <dbl>
1 Female 125232.
2 Male 136250.
# Compute the standard deviation of credit limit for each gender group
data %>% group_by (GENDER) %>% summarise (min_val = min (LIMIT_BAL))
# A tibble: 2 × 2
GENDER min_val
<chr> <dbl>
1 Female 10000
2 Male 10000
# Find the minimum credit limit within each gender group
data %>% group_by (GENDER) %>% summarise (max_val = max (LIMIT_BAL))
# A tibble: 2 × 2
GENDER max_val
<chr> <dbl>
1 Female 1000000
2 Male 800000
# Find the maximum credit limit within each gender group
data %>% group_by (GENDER) %>% mutate (avg_limit = mean (LIMIT_BAL))
# A tibble: 30,000 × 28
# Groups: GENDER [2]
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20000 24 1 Female Grad 1 2 2 -1 -1 -2
2 120000 26 2 Female Grad 2 -1 2 0 0 0
3 90000 34 3 Female Grad 2 0 0 0 0 0
4 50000 37 4 Female Grad 1 0 0 0 0 0
5 50000 57 5 Male Grad 1 -1 0 -1 0 0
6 50000 37 6 Male Others 2 0 0 0 0 0
7 500000 29 7 Male Others 2 0 0 0 0 0
8 100000 23 8 Female Grad 2 0 -1 -1 0 0
9 140000 28 9 Female Unknown 1 0 0 2 0 0
10 20000 35 10 Male Unknown 2 -2 -2 -2 -2 -1
# ℹ 29,990 more rows
# ℹ 17 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>, avg_limit <dbl>
# Add a new column showing the average credit limit for each gender (repeated per row)
data %>% group_by (GENDER) %>% mutate (rank = rank (LIMIT_BAL))
# A tibble: 30,000 × 28
# Groups: GENDER [2]
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20000 24 1 Female Grad 1 2 2 -1 -1 -2
2 120000 26 2 Female Grad 2 -1 2 0 0 0
3 90000 34 3 Female Grad 2 0 0 0 0 0
4 50000 37 4 Female Grad 1 0 0 0 0 0
5 50000 57 5 Male Grad 1 -1 0 -1 0 0
6 50000 37 6 Male Others 2 0 0 0 0 0
7 500000 29 7 Male Others 2 0 0 0 0 0
8 100000 23 8 Female Grad 2 0 -1 -1 0 0
9 140000 28 9 Female Unknown 1 0 0 2 0 0
10 20000 35 10 Male Unknown 2 -2 -2 -2 -2 -1
# ℹ 29,990 more rows
# ℹ 17 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>, rank <dbl>
# Rank credit limit values within each gender group
# Calculate the average (mean) credit limit across all observations
# Find the median (middle value) of the credit limit
# Compute the standard deviation of the credit limit (measure of spread)
# Get the minimum (smallest) credit limit value
# Get the maximum (largest) credit limit value
data[order (data$ LIMIT_BAL), ][1 : 10 , ]
# A tibble: 10 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10000 22 43 Male Grad 2 0 0 0 0 0
2 10000 56 67 Male Grad 1 2 2 2 0 0
3 10000 23 267 Male Unknown 2 0 0 0 0 0
4 10000 27 294 Male Grad 2 0 0 2 0 0
5 10000 24 368 Male Grad 2 -1 2 2 2 0
6 10000 22 519 Female Grad 2 1 2 0 0 0
7 10000 46 533 Male Grad 2 0 0 2 2 2
8 10000 33 655 Male Grad 2 0 0 0 0 0
9 10000 22 703 Female Unknown 2 0 0 0 0 -2
10 10000 22 722 Male Grad 2 0 0 0 0 0
# ℹ 16 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>
# Sort data in ascending order of credit limit and display the first 10
data[order (- data$ LIMIT_BAL), ][1 : 10 , ]
# A tibble: 10 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1000000 47 2198 Female Others 1 0 0 0 -1 0
2 800000 53 14920 Male Grad 1 -1 -1 -1 0 0
3 800000 46 18007 Male Others 2 -2 -2 -2 -1 -1
4 780000 48 24321 Male Others 1 0 0 0 0 0
5 780000 41 28716 Female Grad 2 -2 -2 -2 -2 -2
6 760000 54 21376 Male Unknown 1 0 0 0 0 0
7 750000 43 3734 Female Grad 1 -1 -1 -1 -2 -1
8 750000 40 4604 Female Unknown 1 -2 -2 -1 -1 -1
9 750000 41 10090 Female Others 1 0 0 0 0 -1
10 750000 28 13187 Female Others 2 1 -1 -1 0 0
# ℹ 16 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>
# Sort data in descending order of credit limit and display the first 10
subset (data, LIMIT_BAL > 100000 )
# A tibble: 17,502 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 120000 26 2 Female Grad 2 -1 2 0 0 0
2 500000 29 7 Male Others 2 0 0 0 0 0
3 140000 28 9 Female Unknown 1 0 0 2 0 0
4 200000 34 11 Female Unknown 2 0 0 2 0 0
5 260000 51 12 Female Others 2 -1 -1 -1 -1 -1
6 630000 41 13 Female Grad 2 -1 0 -1 -1 -1
7 250000 29 15 Male Others 2 0 0 0 0 0
8 320000 49 18 Male Others 1 0 0 0 -1 -1
9 360000 49 19 Female Others 1 1 -2 -2 -2 -2
10 180000 29 20 Female Others 2 1 -2 -2 -2 -2
# ℹ 17,492 more rows
# ℹ 16 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>
# Filter dataset to include only rows where credit limit is greater than 100,000
# A tibble: 18,987 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 90000 34 3 Female Grad 2 0 0 0 0 0
2 50000 37 4 Female Grad 1 0 0 0 0 0
3 50000 57 5 Male Grad 1 -1 0 -1 0 0
4 50000 37 6 Male Others 2 0 0 0 0 0
5 20000 35 10 Male Unknown 2 -2 -2 -2 -2 -1
6 200000 34 11 Female Unknown 2 0 0 2 0 0
7 260000 51 12 Female Others 2 -1 -1 -1 -1 -1
8 630000 41 13 Female Grad 2 -1 0 -1 -1 -1
9 320000 49 18 Male Others 1 0 0 0 -1 -1
10 360000 49 19 Female Others 1 1 -2 -2 -2 -2
# ℹ 18,977 more rows
# ℹ 16 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
# BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
# PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
# PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, default.payment.next.month <dbl>,
# LIMIT_CAT <fct>, TOTAL_BILL <dbl>
# Filter dataset to include only individuals older than 30
# Count the total number of missing (NA) values in the dataset
LIMIT_BAL AGE
0 0
ID GENDER
0 0
EDUCATION MARRIAGE
0 0
PAY_0 PAY_2
0 0
PAY_3 PAY_4
0 0
PAY_5 PAY_6
0 0
BILL_AMT1 BILL_AMT2
0 0
BILL_AMT3 BILL_AMT4
0 0
BILL_AMT5 BILL_AMT6
0 0
PAY_AMT1 PAY_AMT2
0 0
PAY_AMT3 PAY_AMT4
0 0
PAY_AMT5 PAY_AMT6
0 0
default.payment.next.month LIMIT_CAT
0 0
TOTAL_BILL
0
# Count the number of missing values in each column
# Check if there are any missing values in the dataset (TRUE/FALSE)
numeric_data <- data[sapply (data, is.numeric)]
# Extract only numeric columns from the dataset
cor (numeric_data)[1 : 5 ,1 : 5 ]
LIMIT_BAL AGE ID MARRIAGE PAY_0
LIMIT_BAL 1.00000000 0.14471280 0.02617916 -0.10813941 -0.27121433
AGE 0.14471280 1.00000000 0.01867773 -0.41416992 -0.03944738
ID 0.02617916 0.01867773 1.00000000 -0.02907943 -0.03057493
MARRIAGE -0.10813941 -0.41416992 -0.02907943 1.00000000 0.01991719
PAY_0 -0.27121433 -0.03944738 -0.03057493 0.01991719 1.00000000
# Compute correlation matrix for all numeric variables
data_long <- pivot_longer (data,
cols = starts_with ("BILL_AMT" ),
names_to= "Month" ,
values_to= "Amount" )
# Convert data from wide format to long format for BILL_AMT columns
pivot_wider (data_long,
names_from= Month,
values_from= Amount)
# A tibble: 30,000 × 27
LIMIT_BAL AGE ID GENDER EDUCATION MARRIAGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
<dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20000 24 1 Female Grad 1 2 2 -1 -1 -2
2 120000 26 2 Female Grad 2 -1 2 0 0 0
3 90000 34 3 Female Grad 2 0 0 0 0 0
4 50000 37 4 Female Grad 1 0 0 0 0 0
5 50000 57 5 Male Grad 1 -1 0 -1 0 0
6 50000 37 6 Male Others 2 0 0 0 0 0
7 500000 29 7 Male Others 2 0 0 0 0 0
8 100000 23 8 Female Grad 2 0 -1 -1 0 0
9 140000 28 9 Female Unknown 1 0 0 2 0 0
10 20000 35 10 Male Unknown 2 -2 -2 -2 -2 -1
# ℹ 29,990 more rows
# ℹ 16 more variables: PAY_6 <dbl>, PAY_AMT1 <dbl>, PAY_AMT2 <dbl>,
# PAY_AMT3 <dbl>, PAY_AMT4 <dbl>, PAY_AMT5 <dbl>, PAY_AMT6 <dbl>,
# default.payment.next.month <dbl>, LIMIT_CAT <fct>, TOTAL_BILL <dbl>,
# BILL_AMT1 <dbl>, BILL_AMT2 <dbl>, BILL_AMT3 <dbl>, BILL_AMT4 <dbl>,
# BILL_AMT5 <dbl>, BILL_AMT6 <dbl>
# Convert data back from long format to wide format
ts_data <- ts (data$ LIMIT_BAL)
# Create a time series object from the LIMIT_BAL variable
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Extract the time index from the time series
as.numeric (time (ts_data))[1 : 20 ]
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Convert the time index to numeric values
as.numeric (ts_data)[1 : 20 ]
[1] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000
[11] 200000 260000 630000 70000 250000 50000 20000 320000 360000 180000
# Convert time series values into a numeric vector