df_old  = read.csv(file="C:/Users/joepu/OneDrive/Documents/Northwestern/MSDS 401/Week 8/HMEQ.csv", header=TRUE, sep=",")
head(df_old)

df=df_old

print( head( df ) )
print( tail( df ) )

print( str( df ) )
'data.frame':   5960 obs. of  13 variables:
 $ BAD    : int  1 1 1 1 0 1 1 1 1 1 ...
 $ LOAN   : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
 $ MORTDUE: num  25860 70053 13500 NA 97800 ...
 $ VALUE  : num  39025 68400 16700 NA 112000 ...
 $ REASON : Factor w/ 3 levels "","DebtCon","HomeImp": 3 3 3 1 3 3 3 3 3 3 ...
 $ JOB    : Factor w/ 7 levels "","Mgr","Office",..: 4 4 4 1 3 4 4 4 4 6 ...
 $ YOJ    : num  10.5 7 4 NA 3 9 5 11 3 16 ...
 $ DEROG  : int  0 0 0 NA 0 0 3 0 0 0 ...
 $ DELINQ : int  0 2 0 NA 0 0 2 0 2 0 ...
 $ CLAGE  : num  94.4 121.8 149.5 NA 93.3 ...
 $ NINQ   : int  1 0 1 NA 0 1 1 0 1 0 ...
 $ CLNO   : int  9 14 10 NA 14 8 17 8 12 13 ...
 $ DEBTINC: num  NA NA NA NA NA ...
NULL
print( nrow( df ) )
[1] 5960
print( length( df ) )
[1] 13
print( summary( df ) )
      BAD              LOAN          MORTDUE           VALUE            REASON          JOB      
 Min.   :0.0000   Min.   : 1100   Min.   :  2063   Min.   :  8000          : 252          : 279  
 1st Qu.:0.0000   1st Qu.:11100   1st Qu.: 46276   1st Qu.: 66076   DebtCon:3928   Mgr    : 767  
 Median :0.0000   Median :16300   Median : 65019   Median : 89236   HomeImp:1780   Office : 948  
 Mean   :0.1995   Mean   :18608   Mean   : 73761   Mean   :101776                  Other  :2388  
 3rd Qu.:0.0000   3rd Qu.:23300   3rd Qu.: 91488   3rd Qu.:119824                  ProfExe:1276  
 Max.   :1.0000   Max.   :89900   Max.   :399550   Max.   :855909                  Sales  : 109  
                                  NA's   :518      NA's   :112                     Self   : 193  
      YOJ             DEROG             DELINQ            CLAGE             NINQ       
 Min.   : 0.000   Min.   : 0.0000   Min.   : 0.0000   Min.   :   0.0   Min.   : 0.000  
 1st Qu.: 3.000   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 115.1   1st Qu.: 0.000  
 Median : 7.000   Median : 0.0000   Median : 0.0000   Median : 173.5   Median : 1.000  
 Mean   : 8.922   Mean   : 0.2546   Mean   : 0.4494   Mean   : 179.8   Mean   : 1.186  
 3rd Qu.:13.000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 231.6   3rd Qu.: 2.000  
 Max.   :41.000   Max.   :10.0000   Max.   :15.0000   Max.   :1168.2   Max.   :17.000  
 NA's   :515      NA's   :708       NA's   :580       NA's   :308      NA's   :510     
      CLNO         DEBTINC        
 Min.   : 0.0   Min.   :  0.5245  
 1st Qu.:15.0   1st Qu.: 29.1400  
 Median :20.0   Median : 34.8183  
 Mean   :21.3   Mean   : 33.7799  
 3rd Qu.:26.0   3rd Qu.: 39.0031  
 Max.   :71.0   Max.   :203.3121  
 NA's   :222    NA's   :1267      
#mortgage Due
print( summary( df$MORTDUE ) )
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   2063   46276   65019   73761   91488  399550     518 
#Debt to Income
#mean
print( with( df, tapply( DEBTINC, BAD, mean, na.rm=TRUE ) ) )
       0        1 
33.25313 39.38764 
#median
print( with( df, tapply( DEBTINC, BAD, median, na.rm=TRUE ) ) )
       0        1 
34.54167 38.07976 
df$M_DEBTINC = is.na( df$DEBTINC )
df$M_DEBTINC = is.na( df$DEBTINC ) + .12
print( with( df, tapply( M_DEBTINC, BAD, mean, na.rm=TRUE ) ) )
        0         1 
0.2208174 0.7810597 
#Credit Line number
print( with( df, tapply( CLNO, BAD, mean, na.rm=TRUE ) ) )
       0        1 
21.31704 21.21127 
mean( df$DEBTINC, na.rm=TRUE )
[1] 33.77992
df$IMP_DEBTINC = df$DEBTINC
df$IMP_DEBTINC = ifelse(is.na( df$IMP_DEBTINC ), 34, df$IMP_DEBTINC )
print( head(df) )
df = subset(df, select = -c( DEBTINC ) )
print( head(df) )
print( with( df, tapply( BAD, JOB, mean, na.rm=TRUE ) ) )
                  Mgr     Office      Other    ProfExe      Sales       Self 
0.08243728 0.23337679 0.13185654 0.23199330 0.16614420 0.34862385 0.30051813 
df$IMP_JOB = df$JOB
df$IMP_JOB = ifelse(df$JOB == "", "UNKNOWN", as.character(df$IMP_JOB) )
#df$IMP_JOB = ifelse(df$JOB == "", "UNKNOWN", df$IMP_JOB )
print( head(df) )
df = subset(df, select = -c( JOB ) )
print( head(df) )
hist( df$IMP_DEBTINC, main= "Debt to Income (w/ Outliers", col="green")

boxplot( df$IMP_DEBTINC, main="Histogram of Debt to Income (w/ Outliers)",col="green"  )

#Remove outliers
a = max( df$IMP_DEBTINC, na.rm=TRUE )
z = min( df$IMP_DEBTINC, na.rm=TRUE )
m = mean( df$IMP_DEBTINC, na.rm=TRUE )
s = sd( df$IMP_DEBTINC, na.rm=TRUE )

df$IMP_DEBTINC = ifelse( df$IMP_DEBTINC > m+3*s, m+3*s, df$IMP_DEBTINC )
df$IMP_DEBTINC = ifelse( df$IMP_DEBTINC < m-3*s, m-3*s, df$IMP_DEBTINC )

hist( df$IMP_DEBTINC, main = "Histogram of Debt to Income",col="blue"  )

boxplot( df$IMP_DEBTINC,main="Boxplot of Debt to Income", col="blue"  )

a = max( df$IMP_DEBTINC, na.rm=TRUE )
z = min( df$IMP_DEBTINC, na.rm=TRUE )
m = mean( df$IMP_DEBTINC, na.rm=TRUE )
s = sd( df$IMP_DEBTINC, na.rm=TRUE )

table(df$IMP_JOB)

    Mgr  Office   Other ProfExe   Sales    Self UNKNOWN 
    767     948    2388    1276     109     193     279 
pie( table(df$IMP_JOB), main = "Job of Mortgage Applicant")

print( with( df, tapply( BAD, IMP_JOB, mean ) ) )
       Mgr     Office      Other    ProfExe      Sales       Self    UNKNOWN 
0.23337679 0.13185654 0.23199330 0.16614420 0.34862385 0.30051813 0.08243728 
df$FLAG_JOB_RISKY = ifelse(df$IMP_JOB %in% c("Self","Sales"), 1, 0 )
LS0tDQp0aXRsZTogIkltcHV0ZSBNaXNzaW5nIERhdGEgVmFsdWUiDQphdXRob3I6ICJQdXRuaWssIEpvc2VwaCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCmBgYHtyfQ0KZGZfb2xkICA9IHJlYWQuY3N2KGZpbGU9IkM6L1VzZXJzL2pvZXB1L09uZURyaXZlL0RvY3VtZW50cy9Ob3J0aHdlc3Rlcm4vTVNEUyA0MDEvV2VlayA4L0hNRVEuY3N2IiwgaGVhZGVyPVRSVUUsIHNlcD0iLCIpDQpoZWFkKGRmX29sZCkNCg0KZGY9ZGZfb2xkDQojZmlyc3Qgcm93cw0KcHJpbnQoIGhlYWQoIGRmICkgKQ0KI2xhc3Qgcm93cw0KcHJpbnQoIHRhaWwoIGRmICkgKQ0KI3N0cnVjdHVyZSBvZiBkYXRhDQpwcmludCggc3RyKCBkZiApICkNCiNudW1iZXIgb2Ygcm93cw0KcHJpbnQoIG5yb3coIGRmICkgKQ0KI2xlbmd0aCBvZiB0YWJsZQ0KcHJpbnQoIGxlbmd0aCggZGYgKSApDQpgYGANCg0KDQoNCmBgYHtyfQ0KcHJpbnQoIHN1bW1hcnkoIGRmICkgKQ0KI21vcnRnYWdlIER1ZQ0KcHJpbnQoIHN1bW1hcnkoIGRmJE1PUlREVUUgKSApDQpgYGANCg0KYGBge3J9DQojRGVidCB0byBJbmNvbWUNCiNtZWFuDQpwcmludCggd2l0aCggZGYsIHRhcHBseSggREVCVElOQywgQkFELCBtZWFuLCBuYS5ybT1UUlVFICkgKSApDQojbWVkaWFuDQpwcmludCggd2l0aCggZGYsIHRhcHBseSggREVCVElOQywgQkFELCBtZWRpYW4sIG5hLnJtPVRSVUUgKSApICkNCmRmJE1fREVCVElOQyA9IGlzLm5hKCBkZiRERUJUSU5DICkNCmRmJE1fREVCVElOQyA9IGlzLm5hKCBkZiRERUJUSU5DICkgKyAuMTINCnByaW50KCB3aXRoKCBkZiwgdGFwcGx5KCBNX0RFQlRJTkMsIEJBRCwgbWVhbiwgbmEucm09VFJVRSApICkgKQ0KI0NyZWRpdCBMaW5lIG51bWJlcg0KcHJpbnQoIHdpdGgoIGRmLCB0YXBwbHkoIENMTk8sIEJBRCwgbWVhbiwgbmEucm09VFJVRSApICkgKQ0KYGBgDQoNCmBgYHtyfQ0KDQoNCmBgYA0KDQoNCmBgYHtyfQ0KbWVhbiggZGYkREVCVElOQywgbmEucm09VFJVRSApDQpkZiRJTVBfREVCVElOQyA9IGRmJERFQlRJTkMNCmRmJElNUF9ERUJUSU5DID0gaWZlbHNlKGlzLm5hKCBkZiRJTVBfREVCVElOQyApLCAzNCwgZGYkSU1QX0RFQlRJTkMgKQ0KYGBgDQoNCmBgYHtyfQ0KcHJpbnQoIGhlYWQoZGYpICkNCmRmID0gc3Vic2V0KGRmLCBzZWxlY3QgPSAtYyggREVCVElOQyApICkNCnByaW50KCBoZWFkKGRmKSApDQpgYGANCg0KYGBge3J9DQpwcmludCggd2l0aCggZGYsIHRhcHBseSggQkFELCBKT0IsIG1lYW4sIG5hLnJtPVRSVUUgKSApICkNCmRmJElNUF9KT0IgPSBkZiRKT0INCmRmJElNUF9KT0IgPSBpZmVsc2UoZGYkSk9CID09ICIiLCAiVU5LTk9XTiIsIGFzLmNoYXJhY3RlcihkZiRJTVBfSk9CKSApDQojZGYkSU1QX0pPQiA9IGlmZWxzZShkZiRKT0IgPT0gIiIsICJVTktOT1dOIiwgZGYkSU1QX0pPQiApDQpgYGANCmBgYHtyfQ0KcHJpbnQoIGhlYWQoZGYpICkNCmRmID0gc3Vic2V0KGRmLCBzZWxlY3QgPSAtYyggSk9CICkgKQ0KcHJpbnQoIGhlYWQoZGYpICkNCmBgYA0KDQpgYGB7cn0NCmhpc3QoIGRmJElNUF9ERUJUSU5DLCBtYWluPSAiRGVidCB0byBJbmNvbWUgKHcvIE91dGxpZXJzIiwgY29sPSJncmVlbiIpDQpgYGANCg0KYGBge3J9DQpib3hwbG90KCBkZiRJTVBfREVCVElOQywgbWFpbj0iSGlzdG9ncmFtIG9mIERlYnQgdG8gSW5jb21lICh3LyBPdXRsaWVycykiLGNvbD0iZ3JlZW4iICApDQpgYGANCmBgYHtyfQ0KI1JlbW92ZSBvdXRsaWVycw0KYSA9IG1heCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KeiA9IG1pbiggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KbSA9IG1lYW4oIGRmJElNUF9ERUJUSU5DLCBuYS5ybT1UUlVFICkNCnMgPSBzZCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KDQpkZiRJTVBfREVCVElOQyA9IGlmZWxzZSggZGYkSU1QX0RFQlRJTkMgPiBtKzMqcywgbSszKnMsIGRmJElNUF9ERUJUSU5DICkNCmRmJElNUF9ERUJUSU5DID0gaWZlbHNlKCBkZiRJTVBfREVCVElOQyA8IG0tMypzLCBtLTMqcywgZGYkSU1QX0RFQlRJTkMgKQ0KDQpoaXN0KCBkZiRJTVBfREVCVElOQywgbWFpbiA9ICJIaXN0b2dyYW0gb2YgRGVidCB0byBJbmNvbWUiLGNvbD0iYmx1ZSIgICkNCmBgYA0KYGBge3J9DQpib3hwbG90KCBkZiRJTVBfREVCVElOQyxtYWluPSJCb3hwbG90IG9mIERlYnQgdG8gSW5jb21lIiwgY29sPSJibHVlIiAgKQ0KYGBgDQoNCmBgYHtyfQ0KYSA9IG1heCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KeiA9IG1pbiggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KbSA9IG1lYW4oIGRmJElNUF9ERUJUSU5DLCBuYS5ybT1UUlVFICkNCnMgPSBzZCggZGYkSU1QX0RFQlRJTkMsIG5hLnJtPVRSVUUgKQ0KDQp0YWJsZShkZiRJTVBfSk9CKQ0KYGBgDQoNCmBgYHtyfQ0KcGllKCB0YWJsZShkZiRJTVBfSk9CKSwgbWFpbiA9ICJKb2Igb2YgTW9ydGdhZ2UgQXBwbGljYW50IikNCmBgYA0KYGBge3J9DQojcmlzayBhc3Nlc3NtZW50IGlmIHNvbWVvbmUgd29ya3MgaW4gc2FsZXMNCnByaW50KCB3aXRoKCBkZiwgdGFwcGx5KCBCQUQsIElNUF9KT0IsIG1lYW4gKSApICkNCmRmJEZMQUdfSk9CX1JJU0tZID0gaWZlbHNlKGRmJElNUF9KT0IgJWluJSBjKCJTZWxmIiwiU2FsZXMiKSwgMSwgMCApDQpgYGANCg0K