* 银行与借款人之间的协议 — 贷款+按揭偿还本金和利息
# Preview the first ten rows of the loan data set.
head(loan.data, n = 10)
## age education year_emp income debt_income cred_debt
## 1 47 Did not complete high school 22 81 5.5 1.505790
## 2 40 Did not complete high school 22 95 3.6 0.632700
## 3 35 Did not complete high school 16 36 3.4 0.178704
## 4 43 Did not complete high school 16 89 0.4 0.159488
## 5 47 Did not complete high school 26 100 12.8 4.582400
## 6 52 Did not complete high school 24 64 10.0 3.929600
## 7 35 Did not complete high school 13 35 4.5 0.431550
## 8 36 Did not complete high school 16 32 10.9 0.544128
## 9 49 High school degree 14 63 15.8 0.935676
## 10 35 High school degree 14 82 0.8 0.468384
## other_debt Loan Logistc PGR_1 Dis_1 Dis1_1 Discrim
## 1 2.949210 No 0.0006254570 No No 0.9884384 0.011561555
## 2 2.787300 No 0.0016333330 No No 0.9733337 0.026666274
## 3 1.045296 No 0.0009656821 No No 0.9888641 0.011135941
## 4 0.196512 No 0.0013862700 No No 0.9741002 0.025899755
## 5 8.217600 No 0.0123452723 No No 0.9319657 0.068034281
## 6 2.470400 No 0.0002818009 No No 0.9916055 0.008394544
## 7 1.143450 No 0.0029283941 No No 0.9764911 0.023508945
## 8 2.943872 <NA> 0.0036198182 No No 0.9703037 0.029696254
## 9 9.018324 <NA> 0.0429850328 No No 0.9000222 0.099977832
## 10 0.187616 No 0.0060533168 No No 0.9011298 0.098870237
# One-way frequency table of the education column (counts and share of total).
CrossTable(x = loan.data$education)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Did not complete high school | High school degree | Some college | College degree |
## |------------------------------|------------------------------|------------------------------|------------------------------|
## | 53 | 25 | 17 | 5 |
## | 0.530 | 0.250 | 0.170 | 0.050 |
## |------------------------------|------------------------------|------------------------------|------------------------------|
##
##
##
##
# Two-way table of education by the Loan column, showing row proportions only.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned. The argument expressions are kept as written: CrossTable()
# uses them as the row/column labels in its printed output.
CrossTable(
  loan.data$education,
  loan.data$Loan,
  prop.r = TRUE,
  prop.c = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 84
##
##
## | loan.data$Loan
## loan.data$education | No | Yes | Row Total |
## -----------------------------|-----------|-----------|-----------|
## Did not complete high school | 35 | 7 | 42 |
## | 0.833 | 0.167 | 0.500 |
## -----------------------------|-----------|-----------|-----------|
## High school degree | 15 | 6 | 21 |
## | 0.714 | 0.286 | 0.250 |
## -----------------------------|-----------|-----------|-----------|
## Some college | 10 | 6 | 16 |
## | 0.625 | 0.375 | 0.190 |
## -----------------------------|-----------|-----------|-----------|
## College degree | 4 | 1 | 5 |
## | 0.800 | 0.200 | 0.060 |
## -----------------------------|-----------|-----------|-----------|
## Column Total | 64 | 20 | 84 |
## -----------------------------|-----------|-----------|-----------|
##
##
# Histogram of the debt-to-income ratio.
hist(
  x = loan.data$debt_income,
  main = "Histogram of debt-to-income ratio (*100)",
  xlab = "Debt-to-income ratio"
)

# Histogram of income with the default binning.
hist(
  x = loan.data$income,
  main = "Histogram of household income in thousands",
  xlab = "Income"
)

# Draw the same histogram again and print the bin boundaries that were chosen.
hist(
  x = loan.data$income,
  main = "Histogram of household income in thousands",
  xlab = "Income"
)$breaks
## [1] 0 20 40 60 80 100 120 140 160 180
# Histogram of income, suggesting about sqrt(n) bins to hist().
hist(
  loan.data$income,
  breaks = sqrt(nrow(loan.data)),
  main = "Histogram of income with breaks argument",
  xlab = "Income"
)

# Flag used by both scatter plots: incomes of 150 or more are drawn as solid
# orange points, everything else as hollow black points.
high.income <- loan.data$income >= 150

# Index plot of income with the high incomes highlighted.
plot(
  loan.data$income,
  col = ifelse(high.income, "orange", "black"),
  pch = ifelse(high.income, 19, 1),
  ylab = "Income"
)

# Approach 1: drop rows above a fixed income threshold.
# NOTE(review): the plots highlight income >= 150 while this filter uses
# income > 150; confirm which boundary is intended.
outlier1 <- which(loan.data$income > 150)
# Keep rows by position instead of loan.data[-outlier1, ]: a negative
# subscript built from an empty index vector selects zero rows, so the
# original form discarded every row whenever no outlier was found.
data.nooutlier1 <- loan.data[setdiff(seq_len(nrow(loan.data)), outlier1), ]

# Approach 2: drop rows above the upper fence Q3 + 1.5 * IQR.
outlier.cutoff <- quantile(loan.data$income, 0.75) + 1.5 * IQR(loan.data$income)
outlier2 <- which(loan.data$income > outlier.cutoff)
data.nooutlier2 <- loan.data[setdiff(seq_len(nrow(loan.data)), outlier2), ]

# Histogram of income after removing the fixed-threshold outliers.
hist(
  data.nooutlier1$income,
  breaks = sqrt(nrow(data.nooutlier1)),
  main = "Histogram of income without outliers",
  xlab = "Income"
)

# Income against years with the current employer, high incomes highlighted.
plot(
  loan.data$year_emp,
  loan.data$income,
  col = ifelse(high.income, "orange", "black"),
  pch = ifelse(high.income, 19, 1),
  main = "Bivariate plot",
  xlab = "Years with current employer",
  ylab = "Income"
)
# For every column, report whether it is free of missing values.
# vapply() always returns a logical vector (sapply() would return an empty
# list for a data frame with no columns), and anyNA() avoids building the
# full is.na() mask just to test for a single NA.
vapply(loan.data, function(x) !anyNA(x), logical(1))
## age education year_emp income debt_income cred_debt
## TRUE TRUE TRUE TRUE TRUE TRUE
## other_debt Loan Logistc PGR_1 Dis_1 Dis1_1
## TRUE FALSE TRUE TRUE TRUE TRUE
## Discrim
## TRUE
# Work on a copy of the data and blank out ten randomly chosen year_emp
# values so the missing-value handling below has something to act on.
temp <- loan.data
set.seed(10)
is.na(temp$year_emp) <- sample.int(nrow(loan.data), 10)
summary(temp$year_emp)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.000 6.500 8.378 13.750 26.000 10
# Row positions where year_emp is missing.
na.index <- which(is.na(temp$year_emp))

# Option 1: drop the incomplete rows. A logical mask is used instead of
# temp[-na.index, ], which would return zero rows if na.index were empty.
data.nona1 <- temp[!is.na(temp$year_emp), ]

# Option 2: replace the missing values with the median of the observed ones.
data.nona2 <- temp
data.nona2$year_emp[na.index] <- median(temp$year_emp, na.rm = TRUE)

# Option 3: bin year_emp into right-closed intervals (..., 5], (5, 10],
# (10, 15], (15, 20], (20, ...) and give missing values their own category.
temp$year_emp_cat <- as.character(
  cut(
    temp$year_emp,
    breaks = c(-Inf, 5, 10, 15, 20, Inf),
    labels = c("0-5", "5-10", "10-15", "15-20", "20+"),
    right = TRUE,
    include.lowest = TRUE
  )
)
temp$year_emp_cat[is.na(temp$year_emp)] <- "Missing"
temp$year_emp_cat <- factor(
  temp$year_emp_cat,
  levels = c("0-5", "5-10", "10-15", "15-20", "20+", "Missing")
)

# Bar chart of the number of rows in each category.
plot(temp$year_emp_cat)
median,分类型数据用频率最高的类别,待以后文章详解);bin 分类;分类型数据创建 NA 类别)
set.seed(1)
# Split the rows into about two thirds for training and the rest for testing.
# NOTE(review): the split is only reproducible if set.seed() is called
# immediately before this block.
# floor() makes the truncation of the fractional sample size explicit, and
# sample.int() avoids the 1:nrow() construction.
train.index <- sample.int(nrow(loan.data), floor(2 / 3 * nrow(loan.data)))
training <- loan.data[train.index, ]
# Select the held-out rows by position rather than loan.data[-train.index, ],
# which would return zero rows if train.index were empty.
test <- loan.data[setdiff(seq_len(nrow(loan.data)), train.index), ]
# Disabled cross-tabulation of test$Loan against model_pred; model_pred is not
# defined in the visible part of the file.
#table(test$Loan, model_pred)