df_old = read.csv(file="C:/Users/joepu/OneDrive/Documents/Northwestern/MSDS 401/Week 8/HMEQ.csv", header=TRUE, sep=",")
head(df_old)
df=df_old
print( head( df ) )
print( tail( df ) )
print( str( df ) )
'data.frame': 5960 obs. of 13 variables:
$ BAD : int 1 1 1 1 0 1 1 1 1 1 ...
$ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
$ MORTDUE: num 25860 70053 13500 NA 97800 ...
$ VALUE : num 39025 68400 16700 NA 112000 ...
$ REASON : Factor w/ 3 levels "","DebtCon","HomeImp": 3 3 3 1 3 3 3 3 3 3 ...
$ JOB : Factor w/ 7 levels "","Mgr","Office",..: 4 4 4 1 3 4 4 4 4 6 ...
$ YOJ : num 10.5 7 4 NA 3 9 5 11 3 16 ...
$ DEROG : int 0 0 0 NA 0 0 3 0 0 0 ...
$ DELINQ : int 0 2 0 NA 0 0 2 0 2 0 ...
$ CLAGE : num 94.4 121.8 149.5 NA 93.3 ...
$ NINQ : int 1 0 1 NA 0 1 1 0 1 0 ...
$ CLNO : int 9 14 10 NA 14 8 17 8 12 13 ...
$ DEBTINC: num NA NA NA NA NA ...
NULL
print( nrow( df ) )
[1] 5960
print( length( df ) )
[1] 13
print( summary( df ) )
BAD LOAN MORTDUE VALUE REASON JOB
Min. :0.0000 Min. : 1100 Min. : 2063 Min. : 8000 : 252 : 279
1st Qu.:0.0000 1st Qu.:11100 1st Qu.: 46276 1st Qu.: 66076 DebtCon:3928 Mgr : 767
Median :0.0000 Median :16300 Median : 65019 Median : 89236 HomeImp:1780 Office : 948
Mean :0.1995 Mean :18608 Mean : 73761 Mean :101776 Other :2388
3rd Qu.:0.0000 3rd Qu.:23300 3rd Qu.: 91488 3rd Qu.:119824 ProfExe:1276
Max. :1.0000 Max. :89900 Max. :399550 Max. :855909 Sales : 109
NA's :518 NA's :112 Self : 193
YOJ DEROG DELINQ CLAGE NINQ
Min. : 0.000 Min. : 0.0000 Min. : 0.0000 Min. : 0.0 Min. : 0.000
1st Qu.: 3.000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 115.1 1st Qu.: 0.000
Median : 7.000 Median : 0.0000 Median : 0.0000 Median : 173.5 Median : 1.000
Mean : 8.922 Mean : 0.2546 Mean : 0.4494 Mean : 179.8 Mean : 1.186
3rd Qu.:13.000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 231.6 3rd Qu.: 2.000
Max. :41.000 Max. :10.0000 Max. :15.0000 Max. :1168.2 Max. :17.000
NA's :515 NA's :708 NA's :580 NA's :308 NA's :510
CLNO DEBTINC
Min. : 0.0 Min. : 0.5245
1st Qu.:15.0 1st Qu.: 29.1400
Median :20.0 Median : 34.8183
Mean :21.3 Mean : 33.7799
3rd Qu.:26.0 3rd Qu.: 39.0031
Max. :71.0 Max. :203.3121
NA's :222 NA's :1267
#mortgage Due
print( summary( df$MORTDUE ) )
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
2063 46276 65019 73761 91488 399550 518
#Debt to Income
#mean
print( with( df, tapply( DEBTINC, BAD, mean, na.rm=TRUE ) ) )
0 1
33.25313 39.38764
#median
print( with( df, tapply( DEBTINC, BAD, median, na.rm=TRUE ) ) )
0 1
34.54167 38.07976
df$M_DEBTINC = is.na( df$DEBTINC )
df$M_DEBTINC = is.na( df$DEBTINC ) + .12
print( with( df, tapply( M_DEBTINC, BAD, mean, na.rm=TRUE ) ) )
0 1
0.2208174 0.7810597
#Credit Line number
print( with( df, tapply( CLNO, BAD, mean, na.rm=TRUE ) ) )
0 1
21.31704 21.21127
mean( df$DEBTINC, na.rm=TRUE )
[1] 33.77992
df$IMP_DEBTINC = df$DEBTINC
df$IMP_DEBTINC = ifelse(is.na( df$IMP_DEBTINC ), 34, df$IMP_DEBTINC )
print( head(df) )
df = subset(df, select = -c( DEBTINC ) )
print( head(df) )
print( with( df, tapply( BAD, JOB, mean, na.rm=TRUE ) ) )
Mgr Office Other ProfExe Sales Self
0.08243728 0.23337679 0.13185654 0.23199330 0.16614420 0.34862385 0.30051813
df$IMP_JOB = df$JOB
df$IMP_JOB = ifelse(df$JOB == "", "UNKNOWN", as.character(df$IMP_JOB) )
#df$IMP_JOB = ifelse(df$JOB == "", "UNKNOWN", df$IMP_JOB )
print( head(df) )
df = subset(df, select = -c( JOB ) )
print( head(df) )
hist( df$IMP_DEBTINC, main= "Debt to Income (w/ Outliers", col="green")

boxplot( df$IMP_DEBTINC, main="Histogram of Debt to Income (w/ Outliers)",col="green" )

#Remove outliers
a = max( df$IMP_DEBTINC, na.rm=TRUE )
z = min( df$IMP_DEBTINC, na.rm=TRUE )
m = mean( df$IMP_DEBTINC, na.rm=TRUE )
s = sd( df$IMP_DEBTINC, na.rm=TRUE )
df$IMP_DEBTINC = ifelse( df$IMP_DEBTINC > m+3*s, m+3*s, df$IMP_DEBTINC )
df$IMP_DEBTINC = ifelse( df$IMP_DEBTINC < m-3*s, m-3*s, df$IMP_DEBTINC )
hist( df$IMP_DEBTINC, main = "Histogram of Debt to Income",col="blue" )

boxplot( df$IMP_DEBTINC,main="Boxplot of Debt to Income", col="blue" )

a = max( df$IMP_DEBTINC, na.rm=TRUE )
z = min( df$IMP_DEBTINC, na.rm=TRUE )
m = mean( df$IMP_DEBTINC, na.rm=TRUE )
s = sd( df$IMP_DEBTINC, na.rm=TRUE )
table(df$IMP_JOB)
Mgr Office Other ProfExe Sales Self UNKNOWN
767 948 2388 1276 109 193 279
pie( table(df$IMP_JOB), main = "Job of Mortgage Applicant")

print( with( df, tapply( BAD, IMP_JOB, mean ) ) )
Mgr Office Other ProfExe Sales Self UNKNOWN
0.23337679 0.13185654 0.23199330 0.16614420 0.34862385 0.30051813 0.08243728
df$FLAG_JOB_RISKY = ifelse(df$IMP_JOB %in% c("Self","Sales"), 1, 0 )
LS0tDQp0aXRsZTogIkltcHV0ZSBNaXNzaW5nIERhdGEgVmFsdWUiDQphdXRob3I6ICJQdXRuaWssIEpvc2VwaCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCmBgYHtyfQ0KZGZfb2xkICA9IHJlYWQuY3N2KGZpbGU9IkM6L1VzZXJzL2pvZXB1L09uZURyaXZlL0RvY3VtZW50cy9Ob3J0aHdlc3Rlcm4vTVNEUyA0MDEvV2VlayA4L0hNRVEuY3N2IiwgaGVhZGVyPVRSVUUsIHNlcD0iLCIpDQpoZWFkKGRmX29sZCkNCg0KZGY9ZGZfb2xkDQojZmlyc3Qgcm93cw0KcHJpbnQoIGhlYWQoIGRmICkgKQ0KI2xhc3Qgcm93cw0KcHJpbnQoIHRhaWwoIGRmICkgKQ0KI3N0cnVjdHVyZSBvZiBkYXRhDQpwcmludCggc3RyKCBkZiApICkNCiNudW1iZXIgb2Ygcm93cw0KcHJpbnQoIG5yb3coIGRmICkgKQ0KI2xlbmd0aCBvZiB0YWJsZQ0KcHJpbnQoIGxlbmd0aCggZGYgKSApDQpgYGANCg0KDQoNCmBgYHtyfQ0KcHJpbnQoIHN1bW1hcnkoIGRmICkgKQ0KI21vcnRnYWdlIER1ZQ0KcHJpbnQoIHN1bW1hcnkoIGRmJE1PUlREVUUgKSApDQpgYGANCg0KYGBge3J9DQojRGVidCB0byBJbmNvbWUNCiNtZWFuDQpwcmludCggd2l0aCggZGYsIHRhcHBseSggREVCVElOQywgQkFELCBtZWFuLCBuYS5ybT1UUlVFICkgKSApDQojbWVkaWFuDQpwcmludCggd2l0aCggZGYsIHRhcHBseSggREVCVElOQywgQkFELCBtZWRpYW4sIG5hLnJtPVRSVUUgKSApICkNCmRmJE1fREVCVElOQyA9IGlzLm5hKCBkZiRERUJUSU5DICkNCmRmJE1fREVCVElOQyA9IGlzLm5hKCBkZiRERUJUSU5DICkgKyAuMTINCnByaW50KCB3aXRoKCBkZiwgdGFwcGx5KCBNX0RFQlRJTkMsIEJBRCwgbWVhbiwgbmEucm09VFJVRSApICkgKQ0KI0NyZWRpdCBMaW5lIG51bWJlcg0KcHJpbnQoIHdpdGgoIGRmLCB0YXBwbHkoIENMTk8sIEJBRCwgbWVhbiwgbmEucm09VFJVRSApICkgKQ0KYGBgDQoNCmBgYHtyfQ0KDQoNCmBgYA0KDQoNCmBgYHtyfQ0KbWVhbiggZGYkREVCVElOQywgbmEucm09VFJVRSApDQpkZiRJTVBfREVCVElOQyA9IGRmJERFQlRJTkMNCmRmJElNUF9ERUJUSU5DID0gaWZlbHNlKGlzLm5hKCBkZiRJTVBfREVCVElOQyApLCAzNCwgZGYkSU1QX0RFQlRJTkMgKQ0KYGBgDQoNCmBgYHtyfQ0KcHJpbnQoIGhlYWQoZGYpICkNCmRmID0gc3Vic2V0KGRmLCBzZWxlY3QgPSAtYyggREVCVElOQyApICkNCnByaW50KCBoZWFkKGRmKSApDQpgYGANCg0KYGBge3J9DQpwcmludCggd2l0aCggZGYsIHRhcHBseSggQkFELCBKT0IsIG1lYW4sIG5hLnJtPVRSVUUgKSApICkNCmRmJElNUF9KT0IgPSBkZiRKT0INCmRmJElNUF9KT0IgPSBpZmVsc2UoZGYkSk9CID09ICIiLCAiVU5LTk9XTiIsIGFzLmNoYXJhY3RlcihkZiRJTVBfSk9CKSApDQojZGYkSU1QX0pPQiA9IGlmZWxzZShkZiRKT0IgPT0gIiIsICJVTktOT1dOIiwgZGYkSU1QX0pPQiApDQpgYGANCmBgYHtyfQ0KcHJpbnQoIGhlYWQoZGYpICkNCmRmID0gc3Vic2V0KGRmLCBzZWxlY3QgPSAtYyggSk9CICkgKQ0KcHJpbnQoIGhlYWQoZGYpICkNCmBgYA0KDQpgYGB7cn0NCmhpc3QoIGRmJElNUF9ERUJUSU5DLCBtYWluPSAiRGVidCB0byBJbmNvbWUgKHcvIE91dGxpZXJzIiwgY29sPSJncmVlbiIpDQpgYGANCg0KYGBge3J9DQpib3hwbG90KCBkZiRJTVBfREVCVElOQywgbWFpbj0iSGlzdG9ncmFtIG9mIERlYnQgdG8gSW5jb21lICh3LyBPdXRsaWVycykiLGNvbD0iZ3JlZW4iICApDQpgYGANCmBgYHtyfQ0KI1JlbW92ZSBvdXRsaWVycw0KYSA9IG1heCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KeiA9IG1pbiggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KbSA9IG1lYW4oIGRmJElNUF9ERUJUSU5DLCBuYS5ybT1UUlVFICkNCnMgPSBzZCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KDQpkZiRJTVBfREVCVElOQyA9IGlmZWxzZSggZGYkSU1QX0RFQlRJTkMgPiBtKzMqcywgbSszKnMsIGRmJElNUF9ERUJUSU5DICkNCmRmJElNUF9ERUJUSU5DID0gaWZlbHNlKCBkZiRJTVBfREVCVElOQyA8IG0tMypzLCBtLTMqcywgZGYkSU1QX0RFQlRJTkMgKQ0KDQpoaXN0KCBkZiRJTVBfREVCVElOQywgbWFpbiA9ICJIaXN0b2dyYW0gb2YgRGVidCB0byBJbmNvbWUiLGNvbD0iYmx1ZSIgICkNCmBgYA0KYGBge3J9DQpib3hwbG90KCBkZiRJTVBfREVCVElOQyxtYWluPSJCb3hwbG90IG9mIERlYnQgdG8gSW5jb21lIiwgY29sPSJibHVlIiAgKQ0KYGBgDQoNCmBgYHtyfQ0KYSA9IG1heCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KeiA9IG1pbiggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KbSA9IG1lYW4oIGRmJElNUF9ERUJUSU5DLCBuYS5ybT1UUlVFICkNCnMgPSBzZCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KDQp0YWJsZShkZiRJTVBfSk9CKQ0KYGBgDQoNCmBgYHtyfQ0KcGllKCB0YWJsZShkZiRJTVBfSk9CKSwgbWFpbiA9ICJKb2Igb2YgTW9ydGdhZ2UgQXBwbGljYW50IikNCmBgYA0KYGBge3J9DQojcmlzayBhc3Nlc3NtZW50IGlmIHNvbWVvbmUgd29ya3MgaW4gc2FsZXMNCnByaW50KCB3aXRoKCBkZiwgdGFwcGx5KCBCQUQsIElNUF9KT0IsIG1lYW4gKSApICkNCmRmJEZMQUdfSk9CX1JJU0tZID0gaWZlbHNlKGRmJElNUF9KT0IgJWluJSBjKCJTZWxmIiwiU2FsZXMiKSwgMSwgMCApDQpgYGANCg0K