library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Data Default of credit card clients Data Set http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients ï· Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. Youâre encouraged to use a âwideâ structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below. ï· Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!]
Read the data
credit <- read.csv("default of credit card clients.csv", skip = 1)
Few First rows of the data
head(credit)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0
## PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## 1 -2 3913 3102 689 0 0 0
## 2 2 2682 1725 2682 3272 3455 3261
## 3 0 29239 14027 13559 14331 14948 15549
## 4 0 46990 48233 49291 28314 28959 29547
## 5 0 8617 5670 35835 20940 19146 19131
## 6 0 64400 57069 57608 19394 19619 20024
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## 1 0 689 0 0 0 0
## 2 0 1000 1000 1000 0 2000
## 3 1518 1500 1000 1000 1000 5000
## 4 2000 2019 1200 1100 1069 1000
## 5 2000 36681 10000 9000 689 679
## 6 2500 1815 657 1000 1000 800
## default.payment.next.month
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
Get summary of the data set
glimpse(credit)
## Observations: 30,000
## Variables: 25
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ...
## $ LIMIT_BAL <int> 20000, 120000, 90000, 50000, 50000,...
## $ SEX <int> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2,...
## $ EDUCATION <int> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1,...
## $ MARRIAGE <int> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2,...
## $ AGE <int> 24, 26, 34, 37, 57, 37, 29, 23, 28,...
## $ PAY_0 <int> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0,...
## $ PAY_2 <int> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, ...
## $ PAY_3 <int> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2...
## $ PAY_4 <int> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, ...
## $ PAY_5 <int> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, ...
## $ PAY_6 <int> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1...
## $ BILL_AMT1 <int> 3913, 2682, 29239, 46990, 8617, 644...
## $ BILL_AMT2 <int> 3102, 1725, 14027, 48233, 5670, 570...
## $ BILL_AMT3 <int> 689, 2682, 13559, 49291, 35835, 576...
## $ BILL_AMT4 <int> 0, 3272, 14331, 28314, 20940, 19394...
## $ BILL_AMT5 <int> 0, 3455, 14948, 28959, 19146, 19619...
## $ BILL_AMT6 <int> 0, 3261, 15549, 29547, 19131, 20024...
## $ PAY_AMT1 <int> 0, 0, 1518, 2000, 2000, 2500, 55000...
## $ PAY_AMT2 <int> 689, 1000, 1500, 2019, 36681, 1815,...
## $ PAY_AMT3 <int> 0, 1000, 1000, 1200, 10000, 657, 38...
## $ PAY_AMT4 <int> 0, 1000, 1000, 1100, 9000, 1000, 20...
## $ PAY_AMT5 <int> 0, 0, 1000, 1069, 689, 1000, 13750,...
## $ PAY_AMT6 <int> 0, 2000, 5000, 1000, 679, 800, 1377...
## $ default.payment.next.month <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
Correct wrong column name
colnames(credit)[colnames(credit)=="PAY_0"] <- "PAY_1"
Assign value to Sex, Education and Marriage
credit$SEX <- recode(credit$SEX, `1`="Male",`2`="Female")
credit$EDUCATION <- recode(credit$EDUCATION, `1`="Graduate School",`2`="University",
`3`="High School",`4`="Others")
## Warning: Unreplaced values treated as NA as .x is not compatible. Please
## specify replacements exhaustively or supply .default
credit$MARRIAGE <- recode(credit$MARRIAGE, `1`="Married",`2`="Single",`3`="Others")
## Warning: Unreplaced values treated as NA as .x is not compatible. Please
## specify replacements exhaustively or supply .default
Tranform wide to long format
credit %>%
gather("Past Payment","Payment count",PAY_1:PAY_6) %>%
gather(Bill,"Bill Amount",BILL_AMT1:BILL_AMT6)%>%
gather(Payment,"Payment Amount",PAY_AMT1:PAY_AMT6) ->
credit_long
Print first few rows of the data
head(credit_long)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE
## 1 1 20000 Female University Married 24
## 2 2 120000 Female University Single 26
## 3 3 90000 Female University Single 34
## 4 4 50000 Female University Married 37
## 5 5 50000 Male University Married 57
## 6 6 50000 Male Graduate School Single 37
## default.payment.next.month Past Payment Payment count Bill
## 1 1 PAY_1 2 BILL_AMT1
## 2 1 PAY_1 -1 BILL_AMT1
## 3 0 PAY_1 0 BILL_AMT1
## 4 0 PAY_1 0 BILL_AMT1
## 5 0 PAY_1 -1 BILL_AMT1
## 6 0 PAY_1 0 BILL_AMT1
## Bill Amount Payment Payment Amount
## 1 3913 PAY_AMT1 0
## 2 2682 PAY_AMT1 0
## 3 29239 PAY_AMT1 1518
## 4 46990 PAY_AMT1 2000
## 5 8617 PAY_AMT1 2000
## 6 64400 PAY_AMT1 2500
ï· Perform the analysis requested in the discussion item. View summary statistics
summary(credit_long)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 10000 Length:6480000 Length:6480000
## 1st Qu.: 7501 1st Qu.: 50000 Class :character Class :character
## Median :15000 Median : 140000 Mode :character Mode :character
## Mean :15000 Mean : 167484
## 3rd Qu.:22500 3rd Qu.: 240000
## Max. :30000 Max. :1000000
## MARRIAGE AGE default.payment.next.month
## Length:6480000 Min. :21.00 Min. :0.0000
## Class :character 1st Qu.:28.00 1st Qu.:0.0000
## Mode :character Median :34.00 Median :0.0000
## Mean :35.49 Mean :0.2212
## 3rd Qu.:41.00 3rd Qu.:0.0000
## Max. :79.00 Max. :1.0000
## Past Payment Payment count Bill Bill Amount
## Length:6480000 Min. :-2.0000 Length:6480000 Min. :-339603
## Class :character 1st Qu.:-1.0000 Class :character 1st Qu.: 2400
## Mode :character Median : 0.0000 Mode :character Median : 19270
## Mean :-0.1824 Mean : 44977
## 3rd Qu.: 0.0000 3rd Qu.: 57417
## Max. : 8.0000 Max. :1664089
## Payment Payment Amount
## Length:6480000 Min. : 0
## Class :character 1st Qu.: 390
## Mode :character Median : 1900
## Mean : 5275
## 3rd Qu.: 4592
## Max. :1684259
Average of persons is 35.49, average payment count is -0.1824, average Bill amount is 44977 , average payment amount is 5275 . Frequency table of the data
credit_long %>%
select(SEX, EDUCATION, MARRIAGE, `Past Payment`, Bill, Payment,default.payment.next.month)%>%
sapply(table)
## $SEX
##
## Female Male
## 3912192 2567808
##
## $EDUCATION
##
## Graduate School High School Others University
## 2286360 1062072 26568 3030480
##
## $MARRIAGE
##
## Married Others Single
## 2950344 69768 3448224
##
## $`Past Payment`
##
## PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1080000 1080000 1080000 1080000 1080000 1080000
##
## $Bill
##
## BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## 1080000 1080000 1080000 1080000 1080000 1080000
##
## $Payment
##
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## 1080000 1080000 1080000 1080000 1080000 1080000
##
## $default.payment.next.month
##
## 0 1
## 5046624 1433376
Most customers are female, Maximum has Univeristy Education, Most of them are Single.
credit_long%>%
group_by(SEX)%>%
summarise(avg_bill = mean(`Bill Amount`, na.rm = TRUE), avg_pay=mean(`Payment Amount`,na.rm = TRUE))
## # A tibble: 2 x 3
## SEX avg_bill avg_pay
## <chr> <dbl> <dbl>
## 1 Female 43649. 5235.
## 2 Male 47001. 5336.
Average bill amount of female is 43649 and male is 47001. Average Payment amount for female is 5235 and male is 5336.
prop.table(table(credit$SEX, credit$default.payment.next.month))
##
## 0 1
## Female 0.47830000 0.12543333
## Male 0.30050000 0.09576667
Conclusion: From the above analysis we conclude that Most customers are female, Maximum has Univeristy Education, Most of them are Single. Males bill and payment is higher than female. Femlaes are more likely to be credit defaulter than males.
ï· Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.