#read the credit card default data straight from the project repository
#skip = 1 drops the first row of the file, an additional label row whose meaning is not clearly indicated
bank.fraud.raw <- read.csv(
  'https://raw.githubusercontent.com/xiaoxiaogao-DD/DATA606_Project/master/default%20of%20credit%20card%20clients.csv',
  skip = 1
)
#replace the original headers with more descriptive column names
#the numeric suffixes of the delay/bill/paid columns run from 9 down to 4
colnames(bank.fraud.raw) <- c(
  'id', 'limit.balance', 'gender', 'edu.level', 'marital.status', 'age',
  paste0('num.month.delay.', 9:4),
  paste0('bill.amount.', 9:4),
  paste0('paid.amount.', 9:4),
  'default.payment'
)
#recode the categorical codes into readable labels for descriptive analysis
#codes that have no label in the lookups below (edu.level spans 0-6 and marital.status starts at 0 in this data)
#are labelled 'unknown' instead of being left behind as digit strings such as '0'; missing values stay NA
bank.fraud.descriptive <- local({
  #translate codes through a named lookup vector whose names are the codes as strings
  recode.labels <- function(codes, lookup) {
    labels <- unname(lookup[as.character(codes)])
    #a code absent from the lookup yields NA above; keep genuine NAs, flag the rest
    labels[is.na(labels) & !is.na(codes)] <- 'unknown'
    labels
  }
  recoded <- bank.fraud.raw
  recoded$gender <- recode.labels(
    recoded$gender,
    c('1' = 'male', '2' = 'female')
  )
  recoded$edu.level <- recode.labels(
    recoded$edu.level,
    c('1' = 'gradschool', '2' = 'university', '3' = 'highschool', '4' = 'others')
  )
  recoded$marital.status <- recode.labels(
    recoded$marital.status,
    c('1' = 'married', '2' = 'single', '3' = 'others')
  )
  recoded
})
#create another dataset for predictive analysis
#add the total bill statement amount and the total previous payment amount, summed over the six monthly columns
bank.fraud.all <- bank.fraud.raw
#Reduce adds the columns in the order 9, 8, ..., 4, exactly like a chained `+`
bank.fraud.all$bill.amount <- Reduce(`+`, bank.fraud.all[paste0('bill.amount.', 9:4)])
bank.fraud.all$paid.amount <- Reduce(`+`, bank.fraud.all[paste0('paid.amount.', 9:4)])
#columns are selected by name rather than by position so the selection cannot silently shift if the layout changes
#bank.fraud.total keeps the personal/delay columns plus the two totals and the response
bank.fraud.total <- bank.fraud.all[, c(
  'id', 'limit.balance', 'gender', 'edu.level', 'marital.status', 'age',
  paste0('num.month.delay.', 9:4),
  'bill.amount', 'paid.amount',
  'default.payment'
)]
#bank.fraud.all keeps every column, with each total placed right after its monthly columns
bank.fraud.all <- bank.fraud.all[, c(
  'id', 'limit.balance', 'gender', 'edu.level', 'marital.status', 'age',
  paste0('num.month.delay.', 9:4),
  paste0('bill.amount.', 9:4), 'bill.amount',
  paste0('paid.amount.', 9:4), 'paid.amount',
  'default.payment'
)]
#overview of the datasets: first six rows of the labelled (descriptive) data
head(bank.fraud.descriptive, n = 6L)
## id limit.balance gender edu.level marital.status age num.month.delay.9
## 1 1 20000 female university married 24 2
## 2 2 120000 female university single 26 -1
## 3 3 90000 female university single 34 0
## 4 4 50000 female university married 37 0
## 5 5 50000 male university married 57 -1
## 6 6 50000 male gradschool single 37 0
## num.month.delay.8 num.month.delay.7 num.month.delay.6 num.month.delay.5
## 1 2 -1 -1 -2
## 2 2 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 -1 0 0
## 6 0 0 0 0
## num.month.delay.4 bill.amount.9 bill.amount.8 bill.amount.7
## 1 -2 3913 3102 689
## 2 2 2682 1725 2682
## 3 0 29239 14027 13559
## 4 0 46990 48233 49291
## 5 0 8617 5670 35835
## 6 0 64400 57069 57608
## bill.amount.6 bill.amount.5 bill.amount.4 paid.amount.9 paid.amount.8
## 1 0 0 0 0 689
## 2 3272 3455 3261 0 1000
## 3 14331 14948 15549 1518 1500
## 4 28314 28959 29547 2000 2019
## 5 20940 19146 19131 2000 36681
## 6 19394 19619 20024 2500 1815
## paid.amount.7 paid.amount.6 paid.amount.5 paid.amount.4 default.payment
## 1 0 0 0 0 1
## 2 1000 1000 0 2000 1
## 3 1000 1000 1000 5000 0
## 4 1200 1100 1069 1000 0
## 5 10000 9000 689 679 0
## 6 657 1000 1000 800 0
#column-wise summary of the labelled (descriptive) data
summary(object = bank.fraud.descriptive)
## id limit.balance gender edu.level
## Min. : 1 Min. : 10000 Length:30000 Length:30000
## 1st Qu.: 7501 1st Qu.: 50000 Class :character Class :character
## Median :15000 Median : 140000 Mode :character Mode :character
## Mean :15000 Mean : 167484
## 3rd Qu.:22500 3rd Qu.: 240000
## Max. :30000 Max. :1000000
## marital.status age num.month.delay.9 num.month.delay.8
## Length:30000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
## Class :character 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
## Mode :character Median :34.00 Median : 0.0000 Median : 0.0000
## Mean :35.49 Mean :-0.0167 Mean :-0.1338
## 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :79.00 Max. : 8.0000 Max. : 8.0000
## num.month.delay.7 num.month.delay.6 num.month.delay.5 num.month.delay.4
## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
## bill.amount.9 bill.amount.8 bill.amount.7 bill.amount.6
## Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000
## 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327
## Median : 22382 Median : 21200 Median : 20089 Median : 19052
## Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263
## 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506
## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
## bill.amount.5 bill.amount.4 paid.amount.9 paid.amount.8
## Min. :-81334 Min. :-339603 Min. : 0 Min. : 0
## 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833
## Median : 18105 Median : 17071 Median : 2100 Median : 2009
## Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921
## 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000
## Max. :927171 Max. : 961664 Max. :873552 Max. :1684259
## paid.amount.7 paid.amount.6 paid.amount.5 paid.amount.4
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8
## Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0
## Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5
## 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0
## Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0
## default.payment
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2212
## 3rd Qu.:0.0000
## Max. :1.0000
#column-wise summary of the dataset that carries the total bill and paid amounts
summary(object = bank.fraud.total)
## id limit.balance gender edu.level
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median :15000 Median : 140000 Median :2.000 Median :2.000
## Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853
## 3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000
## marital.status age num.month.delay.9 num.month.delay.8
## Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000
## Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338
## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000
## num.month.delay.7 num.month.delay.6 num.month.delay.5 num.month.delay.4
## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
## bill.amount paid.amount default.payment
## Min. :-336259 Min. : 0 Min. :0.0000
## 1st Qu.: 28688 1st Qu.: 6680 1st Qu.:0.0000
## Median : 126311 Median : 14383 Median :0.0000
## Mean : 269862 Mean : 31651 Mean :0.2212
## 3rd Qu.: 342626 3rd Qu.: 33504 3rd Qu.:0.0000
## Max. :5263883 Max. :3764066 Max. :1.0000
#first six rows of the dataset that carries the total bill and paid amounts
head(bank.fraud.total, n = 6L)
## id limit.balance gender edu.level marital.status age num.month.delay.9
## 1 1 20000 2 2 1 24 2
## 2 2 120000 2 2 2 26 -1
## 3 3 90000 2 2 2 34 0
## 4 4 50000 2 2 1 37 0
## 5 5 50000 1 2 1 57 -1
## 6 6 50000 1 1 2 37 0
## num.month.delay.8 num.month.delay.7 num.month.delay.6 num.month.delay.5
## 1 2 -1 -1 -2
## 2 2 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 -1 0 0
## 6 0 0 0 0
## num.month.delay.4 bill.amount paid.amount default.payment
## 1 -2 7704 689 1
## 2 2 17077 5000 1
## 3 0 101653 11018 0
## 4 0 231334 8388 0
## 5 0 109339 59049 0
## 6 0 238114 7772 0
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Are personal information and credit movements predictive of creditability?
What are the cases, and how many are there?
Each case represents an individual consumer and his/her personal information and credit movements (bill statements and payments) during April - September 2005. There are 30,000 observations in the given dataset.
Describe the method of data collection.
The original dataset is provided by I-Cheng Yeh, from Department of Information Management, Chung Hua University, Taiwan and Department of Civil Engineering, Tamkang University, Taiwan.
The original research examined customers' default payments in Taiwan and compared the predictive accuracy of the probability of default among six data mining methods.
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
This dataset is publicly available on the UCI Machine Learning Repository, Center for Machine Learning and Intelligent Systems (https://archive.ics.uci.edu/ml/index.php).
Data source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients.
What is the response variable, and what type is it (numerical/categorical)?
The response variable is Y or default.payment.next.month in the original data file. The variable is categorical, or more specifically, binary (1 = Yes, default of payment; 0 = No.)
What is the explanatory variable, and what type is it (numerical/categorical)?
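The explanatory variables are the consumer's personal information (amount of given credit, gender, education, marital status and age) and his/her history of past payments. Amount of given credit and age are numerical; gender, education and marital status are categorical; the history of past payments is recorded as numerical values.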
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
#distribution of the difference between the total bill amount and the total paid amount
#the histogram uses 100 breaks and limits the x-axis to the range -10000 to 2500000
hist(
  bank.fraud.all$bill.amount - bank.fraud.all$paid.amount,
  breaks = 100,
  xlim = c(-10000, 2500000),
  main = 'diff in bill and paid amount'
)
#boxplot of the same difference, drawn over its full range
boxplot(bank.fraud.all$bill.amount - bank.fraud.all$paid.amount)