R Markdown
# Load the libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the CSV file.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (the read.csv default became FALSE in R 4.0.0), so summary(data) keeps
# reporting per-level counts for columns such as Loan.Type and State.
data <- read.csv('C:/Users/Abhay/Desktop/1.csv', stringsAsFactors = TRUE)
# Keep only the first 627 rows (clean data). head() returns at most 627 rows
# and never pads with all-NA rows when the file is shorter, unlike
# data[1:627, ].
data <- head(data, 627)
## Data Summary
## Data Distribution
# Check the data dimensions (rows, columns) and print per-column summary
# statistics; both results are auto-printed.
dim(data)
## [1] 627 27
summary(data)
## CaseNumber Amt.Repaid.at.6.Months Nominal.Loan.Amount
## Min. : 10318666 Min. : 590 Min. : 3000
## 1st Qu.:116714284 1st Qu.: 5322 1st Qu.: 11800
## Median :222113091 Median : 10395 Median : 21400
## Mean :224065574 Mean : 26582 Mean : 37272
## 3rd Qu.:326716390 3rd Qu.: 21602 3rd Qu.: 42500
## Max. :459212486 Max. :4536390 Max. :452000
##
## Total.Amt.to.be.Repaid PRSM Repayment.Percentage
## Min. : 3540 Min. : 0.1563 Min. : 5.00
## 1st Qu.: 14280 1st Qu.: 0.6606 1st Qu.:12.50
## Median : 26010 Median : 0.8237 Median :12.50
## Mean : 44619 Mean : 1.9334 Mean :12.15
## 3rd Qu.: 51642 3rd Qu.: 0.9674 3rd Qu.:12.50
## Max. :535620 Max. :703.5344 Max. :17.50
##
## Commission.Upfront Validated.Monthly.Batch
## Min. : 110.4 Min. : 2420
## 1st Qu.: 1130.8 1st Qu.: 10318
## Median : 2365.9 Median : 19790
## Mean : 3946.7 Mean : 37324
## 3rd Qu.: 4542.9 3rd Qu.: 43107
## Max. :43332.0 Max. :520015
##
## Historical.Monthly.Credit.Card.Receipts Loan.Type Loan.Size.Class
## Min. : 1773 : 0 : 0
## 1st Qu.: 10161 O:407 S1: 87
## Median : 20108 R:220 S2:221
## Mean : 39822 S3:156
## 3rd Qu.: 45544 S4:163
## Max. :520015
##
## FICO Years.In.Business Num.of.Credit.Lines
## Min. :397.0 Min. : 1.000 Min. : 2.00
## 1st Qu.:521.0 1st Qu.: 3.000 1st Qu.: 15.00
## Median :579.0 Median : 6.000 Median : 23.00
## Mean :575.7 Mean : 9.276 Mean : 24.77
## 3rd Qu.:628.5 3rd Qu.:11.000 3rd Qu.: 33.00
## Max. :804.0 Max. :81.000 Max. :119.00
##
## Num.of.Paid.off.Credit.Lines Current.Delinquent.Credit.Lines
## Min. : 0.0 Min. : 0.000
## 1st Qu.: 6.0 1st Qu.: 0.000
## Median :11.0 Median : 2.000
## Mean :12.5 Mean : 3.113
## 3rd Qu.:18.0 3rd Qu.: 4.000
## Max. :75.0 Max. :43.000
##
## Previous.Delinquent.Credit.Lines Business.Entity.Type Num.of.Trade.Lines
## Min. : 0.000 : 0 Min. : 0.000
## 1st Qu.: 1.000 LLC :482 1st Qu.: 0.000
## Median : 4.000 S-Corp:145 Median : 1.000
## Mean : 4.748 Mean : 1.485
## 3rd Qu.: 7.000 3rd Qu.: 2.000
## Max. :51.000 Max. :14.000
##
## Num.of.Derog.Legal.Item Two.Digit.SIC.Code
## Min. : 0.0000 Min. : 7.0
## 1st Qu.: 0.0000 1st Qu.:56.0
## Median : 0.0000 Median :58.0
## Mean : 0.9266 Mean :60.3
## 3rd Qu.: 1.0000 3rd Qu.:72.0
## Max. :17.0000 Max. :99.0
##
## Two.Digit.SIC.Description Population.in.Zip.Code
## Eating and Drinking Places :165 Min. : 79
## Miscellaneous Retail Stores : 65 1st Qu.:14691
## Personal Services : 62 Median :24555
## Auto Repair Services and Parking : 59 Mean :25963
## Apparel and Accessory Stores : 40 3rd Qu.:34795
## Home Furniture and Furnishings Stores: 35 Max. :97086
## (Other) :201
## Average.House.Value.in.Zip.Code Income.Per.Household.in.Zip....
## Min. : 35600 Min. : 12219
## 1st Qu.: 91100 1st Qu.: 34429
## Median : 118000 Median : 42934
## Mean : 149794 Mean : 45904
## 3rd Qu.: 161450 3rd Qu.: 55539
## Max. :1000001 Max. :131250
##
## State ISO.Name
## FL :126 : 0
## NY : 61 Credit Divas:269
## PA : 36 Loan Masters:358
## NJ : 35
## OH : 31
## GA : 28
## (Other):310
# Min / quartiles / mean / max of the PRSM column
summary(data[["PRSM"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1563 0.6606 0.8237 1.9330 0.9674 703.5000
# Remove the outlier from the data.
# The row is located by value (the single largest PRSM, 703.5 in the summary
# printed before this step) rather than by the fixed position 527, so the
# correct row is still dropped if the input file gains, loses or reorders rows.
data <- data[-which.max(data$PRSM), ]
# Work with the cleaned PRSM column as a plain numeric vector from here on.
prsmScores <- data$PRSM
summary(prsmScores)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1563 0.6604 0.8233 0.8126 0.9671 1.6940
# Stem-and-leaf display of the PRSM scores; scale = 2 asks for a longer plot
# than the default scale.
stem(prsmScores,scale = 2)
##
## The decimal point is 1 digit(s) to the left of the |
##
## 1 | 68
## 2 | 1
## 2 | 5678899
## 3 | 133
## 3 | 5556778888
## 4 | 11334444
## 4 | 55566677788888899
## 5 | 00111122222222233334444
## 5 | 55556666677777788888889999999999
## 6 | 0000000000001111111111122222223333444
## 6 | 5555555555666666666667777788888899999999
## 7 | 000000000000001111111111122222222222233333334444444
## 7 | 5555555555666666666666677777777777778888888889999999999
## 8 | 00000000000000111111122222222222333333333333333444444444
## 8 | 555555555556666666677777777777788888888889999999999
## 9 | 0000000000111111111112222222222233333333333334444444444
## 9 | 5555555556666666666777777888888888888888888999999999999999
## 10 | 0000000000011111122222223333344444444
## 10 | 555555666666677778888
## 11 | 00001111111112222222223444
## 11 | 55555566778888999
## 12 | 01244
## 12 | 7889
## 13 | 01123
## 13 |
## 14 | 01
## 14 | 6
## 15 |
## 15 | 7
## 16 |
## 16 | 9
## Histograms
# Histogram of the PRSM scores. The returned object is kept in `h` so the
# bin layout can be reused when the histogram is redrawn later.
h <- hist(
  prsmScores,
  breaks = 7,
  col = "blue",
  border = "black",
  las = 1,
  main = "Histogram for PRSM Scores",
  xlab = "PRSM Score"
)

## Density Function
d <- density(prsmScores)
## Draw the estimated density curve, then fill the region it outlines
plot(d, main = "Density of PRSM Scores")
polygon(d$x, d$y, col = "red", border = "blue")

## Adding Density Lines to Existing Graphs
# Redraw the stored histogram and overlay a normal curve fitted to the data.
plot(h)
# Evaluate the normal density at 20 evenly spaced points across the data range.
xfit <- seq(min(prsmScores), max(prsmScores), length.out = 20)
yfit <- dnorm(xfit, mean = mean(prsmScores), sd = sd(prsmScores))
# Rescale the density by (spacing of the first two bin midpoints) * (sample
# size) so the curve is on the same scale as the histogram's counts.
yfit <- yfit * diff(h$mids[1:2]) * length(prsmScores)
lines(xfit, yfit, col = "red", lwd = 2)

## Types of Data Distributions
# Sample mean and standard deviation; used as the parameters of the normal
# distribution in the four calls below.
prsmMean <- mean(prsmScores)
prsmSD<- sd(prsmScores)
# Cumulative probability P(X <= 1) under that normal distribution
pnorm(1, mean=prsmMean, sd=prsmSD)
## [1] 0.7999677
# Quantile at probability 0.5; for a normal distribution this is its mean
qnorm(0.5, mean=prsmMean, sd=prsmSD)
## [1] 0.8125883
# Ten random draws; no seed is set before this call in the visible script, so
# the values change from run to run
rnorm(10, mean=prsmMean, sd=prsmSD)
## [1] 1.0443174 0.5980357 0.8897985 0.7009144 0.5126269 0.5676045 0.4601230
## [8] 1.0649077 0.9811573 0.6677939
# Normal density evaluated at every observed PRSM score (one value per score)
dnorm(prsmScores, mean=prsmMean, sd=prsmSD)
## [1] 1.5600901323 1.6807087735 1.6570493616 1.7355891972 0.8628407444
## [6] 1.7365193488 0.9978805896 1.6269394672 1.7197103401 1.4139486232
## [11] 1.5248827350 0.2585566903 0.0054407960 1.5611939633 1.7902699442
## [16] 1.5919426151 1.7276517559 0.1695288929 1.4148217170 0.1375598097
## [21] 1.6511889923 1.3708112868 1.1421542432 1.6122511980 0.6722228613
## [26] 1.0596491473 1.4277441128 1.6073171840 0.5174077116 1.7329679646
## [31] 1.5807064652 1.7913047277 1.0734178538 1.1221889874 1.2300021508
## [36] 1.7334143361 1.6772449932 1.6988935822 0.4704388235 1.4064008073
## [41] 1.7767138997 0.9199346887 1.2963437109 1.4849240411 1.3131347912
## [46] 1.7163662935 0.7530791928 1.1293833452 1.4380618442 0.7370238481
## [51] 0.7517442378 0.0007124195 0.5916931868 1.7667429312 0.7602157953
## [56] 1.7037978998 1.5114601532 1.3537172803 1.3182399451 1.6411633308
## [61] 1.6561669951 0.8022366325 0.9767028582 1.2301138391 0.1721899178
## [66] 0.1324282358 1.0028422177 1.7314029014 1.7832968575 1.7896152933
## [71] 1.7034990581 0.8293411381 1.7094650394 1.7010584810 1.3463164766
## [76] 1.5043073711 1.4479607695 1.7877795280 1.1430138185 1.5039081078
## [81] 0.9081434958 1.1866791000 1.6649425968 1.2338312806 1.5518460797
## [86] 1.7498247673 0.0514652726 1.3427646606 1.3527009968 1.3712614696
## [91] 1.6212034392 1.7900693848 1.3717115237 0.7117419284 1.5929024084
## [96] 1.7743580655 1.5578763308 1.0388103750 1.5855707555 1.6826498705
## [101] 1.6605533998 1.5439580034 1.1532248913 1.6711640090 1.7552909647
## [106] 0.6892299058 1.4701024091 1.7869137701 1.7907806949 1.1566259602
## [111] 1.7835972888 1.2117618959 0.9674208894 1.7583645869 1.3047548541
## [116] 1.2083930997 1.7527058753 1.2137979011 1.7881602964 1.4796950279
## [121] 1.6654473703 1.4596723247 1.5771446826 1.7753575212 1.5731153971
## [126] 1.6352557426 1.6386878652 1.5047064370 1.0499004974 1.7377243941
## [131] 0.2852759251 1.7619506656 1.1225626427 1.5658683827 0.4475748956
## [136] 0.4791239376 0.6965195255 1.6968390065 0.6995316495 1.6410914094
## [141] 1.3614337346 0.9491324564 0.7003933755 0.9013852789 1.6511191936
## [146] 0.5712370645 1.7630902941 1.3919306537 1.3145283222 1.4646925310
## [151] 1.2718814154 0.6943717525 1.2511216495 1.3717115237 1.5760715174
## [156] 0.3590719373 1.1455594698 1.6278997985 0.7803308190 1.4734165383
## [161] 0.1147429930 1.7888763811 1.6938667730 1.7795248939 1.7657085527
## [166] 0.4036135752 0.0449122665 1.5942866810 1.7670059098 1.7895719100
## [171] 0.1083975736 1.6663687956 1.7728123309 1.7863334141 1.7890812793
## [176] 0.2000738331 1.7865834656 1.5803513591 1.5677736282 1.5838093476
## [181] 0.7592179986 1.7351967136 0.2422780253 1.7853456446 1.7844479858
## [186] 1.7909125822 1.2699902545 0.2675793593 0.7340420380 1.5091756090
## [191] 1.5462216674 0.9631955216 0.9823892410 0.7668272091 1.7786843152
## [196] 0.5577555375 1.4120986418 1.6073171840 0.6743374728 0.1473095967
## [201] 0.1053215393 0.8219585430 0.4245387114 1.5863555297 0.3495539583
## [206] 1.7746025473 0.4493286544 0.7480857471 1.7311999666 0.9660869919
## [211] 0.4303871519 1.6408548511 1.4584133481 1.3408258482 1.0680526661
## [216] 1.7371460101 1.6282938747 1.4182065478 1.1299840072 1.7877289287
## [221] 1.7811199799 1.3490555291 0.5415319213 1.5318429508 1.7837281714
## [226] 0.8515166342 1.7652718012 1.0718407884 1.3094143104 1.7471013363
## [231] 1.4667768078 1.0602504177 1.5190309186 1.4113257442 1.7799584851
## [236] 1.0840360919 1.5401677502 1.5799131039 1.7877289287 1.4380618442
## [241] 1.7887059881 1.3177763496 0.4576878853 1.5867065466 0.4486578971
## [246] 0.5064090394 1.7704485940 1.7042252137 1.6550555162 0.2141721753
## [251] 1.2828398475 1.7460430686 1.5397875296 0.9181388674 1.3154568384
## [256] 1.6115862015 1.5980730554 1.4659436281 1.7912450659 1.7634793067
## [261] 0.3590719373 0.9707972203 1.7276517559 0.9693499534 1.7563581296
## [266] 1.7845876659 1.1901756446 1.5220667115 1.6058925043 1.1231635868
## [271] 0.6952304863 0.6167969343 1.7508784135 1.5828320767 1.7898511657
## [276] 1.6393083644 1.3131347912 1.7846038615 0.5115693156 1.2958755362
## [281] 1.1916251788 1.5413071110 1.7777921951 1.7409023218 1.6109978999
## [286] 1.6514183604 1.2605155519 1.3945917146 1.1202396147 1.7824226778
## [291] 1.6056339636 0.6091385074 1.7609354558 0.7547567684 1.1654779917
## [296] 1.6809780547 0.5759920404 1.5363558263 0.8655425524 0.0333251198
## [301] 1.5216765402 1.1771128070 1.1823254224 0.4553153265 1.7380636331
## [306] 0.7858643307 1.0542867631 1.0333419104 1.1017113700 1.6417076772
## [311] 0.5690032366 1.7627743774 1.5691410583 1.7841093168 0.6246011379
## [316] 1.0479514147 1.1153652438 0.7060066497 1.3754120001 1.2094681342
## [321] 1.7846569704 0.5578447678 0.0916015150 1.6002052931 1.7816066945
## [326] 1.7413185915 1.5978100925 1.7871905067 1.7911588154 0.7489737615
## [331] 1.3513348368 1.2552914773 1.1775971358 1.7817727351 0.1062253709
## [336] 1.6307664403 1.2648935160 1.5283713604 1.5514726728 0.7526340963
## [341] 1.7878297683 1.3984707669 1.2065792598 0.8655425524 1.2371784511
## [346] 0.7340420380 1.6904739611 1.1857119061 1.6259767520 0.9810516696
## [351] 1.5405477541 1.3976898945 1.7906402209 1.1294969439 1.7893241176
## [356] 0.6233804296 1.7897855746 1.7822623258 1.7327675598 1.5942866810
## [361] 0.9409744178 1.6763601709 1.4906230931 0.4452359092 1.7606423086
## [366] 1.7357849473 0.7441985615 0.6613773435 1.1217016657 0.5709397618
## [371] 0.1790264699 1.7277004177 0.5608209149 1.1829221824 0.2023851271
## [376] 1.0617131592 1.5956670093 0.6545896330 0.5670704216 1.7905494149
## [381] 1.7879669007 1.1475047801 1.6744984273 0.7130450923 1.6008873906
## [386] 1.7679154154 1.7886726677 0.2833734998 0.0233079732 1.3340468422
## [391] 0.7292061932 1.7824226778 1.5318429508 1.3014874907 0.6024228397
## [396] 1.7744932891 1.3541721604 1.7865834656 1.4051849356 1.0690280960
## [401] 1.7124903204 1.3412840314 1.4213505904 1.6866256905 1.7212973123
## [406] 0.9549001204 1.0743934181 1.3362377987 1.7903443451 1.6825875241
## [411] 1.2581420877 1.3644953373 1.4846108424 1.7305892116 1.6254086722
## [416] 0.9559742943 1.7426126707 1.6993317724 0.9915826725 1.6456867126
## [421] 1.7600519023 0.5778508319 1.1006220962 1.7705700972 1.2747157281
## [426] 1.3731657673 1.6308404398 1.7893904052 1.7676874631 0.8981770772
## [431] 0.8614362582 1.4173358040 1.3892645870 1.3726112449 0.0255495106
## [436] 0.0767691864 0.7409979173 1.7667736695 1.3595155382 1.1192648446
## [441] 1.7899079091 1.2358558462 1.0923294358 1.5867065466 0.9309186952
## [446] 1.1061008547 1.5252711966 0.2044809423 1.3650526901 1.7899468502
## [451] 1.6069026141 0.2104772057 1.3558845091 0.2194200569 0.6706319571
## [456] 1.0552616344 1.3914866556 1.4316240411 1.7327675598 1.7071574249
## [461] 1.7203705658 1.5251806156 1.6454533074 1.7317606458 0.5739552134
## [466] 1.5447134353 1.6938074357 1.3200932971 1.7208091224 1.7263938417
## [471] 1.3499676070 0.7718723288 0.6056221098 1.6705388467 1.7539715685
## [476] 1.0742796532 1.7663458659 1.2132048701 1.4252513329 0.9762198644
## [481] 1.4999982603 1.6368198564 1.0294490100 1.4911233322 1.6126607323
## [486] 1.2506456592 0.4553942497 1.7658116000 1.7832968575 1.4268801710
## [491] 1.2794329892 1.7599034446 1.0227533324 0.4300608720 1.7206411365
## [496] 0.2025306805 1.5439580034 1.3015964283 1.5817703490 1.7842357881
## [501] 0.1773077046 0.4687136975 1.7442104233 0.4073062539 1.7809283242
## [506] 1.6860380146 0.0787809337 1.6012280641 0.7172951961 1.6198245145
## [511] 1.7888343223 1.7667429312 1.4457388606 1.0841498704 0.2617692826
## [516] 1.4312939832 1.3182399451 1.4755782270 1.6122511980 1.6475075084
## [521] 1.7788515879 1.5287579367 1.7909891752 1.7657085527 1.7006240860
## [526] 0.1115470509 1.7357393225 0.7056749867 1.5581596999 1.7479740244
## [531] 0.8843541607 1.0231261443 1.4174373757 1.6591663675 1.7861055192
## [536] 1.4986998213 1.0563502473 0.2882335036 1.2262808843 1.5845968173
## [541] 1.7583645869 0.8362833293 0.0551446355 1.3289813665 1.7164188343
## [546] 1.2879026579 1.3210193519 0.4864135926 1.0963457786 1.6392360618
## [551] 1.7860298034 1.6663024297 0.5399411076 1.3932618129 1.1663351782
## [556] 0.7566439617 0.9038685434 0.4946495862 1.6317171873 1.7855897150
## [561] 0.5261532955 1.7377243941 1.2333528586 0.7980251281 1.7583645869
## [566] 1.7458655210 1.3554300832 1.2705734491 1.7319626802 0.9295957172
## [571] 0.7291038145 1.1362005908 1.6960852219 1.1483638826 1.1163402188
## [576] 0.1480503246 0.5505137786 0.9761072233 1.4943613588 1.1498224639
## [581] 0.6180108186 1.4304326422 1.7442104233 1.7199820899 0.4513447652
## [586] 0.4252614855 1.4407263482 1.2643091563 1.0427063289 1.0982969725
## [591] 1.0568377287 1.0787836750 0.9167092399 1.7875107543 0.8068073129
## [596] 1.6408548511 1.7647264747 1.2017604114 1.7206411365 1.7604952208
## [601] 1.7701759914 1.0192372063 1.5702277710 1.3297958499 0.9305520054
## [606] 1.4068399866 1.1688735710 1.4796950279 1.7795951835 0.7216586592
## [611] 1.7270731389 1.4742432651 0.7264722160 1.1921082684 1.1625662060
## [616] 1.7887919038 1.5299164016 1.6881972046 0.2730080728 0.9916956072
## [621] 1.7832968575 1.7600519023 1.4423334384 1.6912477564 1.3900501600
## [626] 0.3624843512
## Normal Distribution
# Simulate 1000 values from a normal distribution that has the same mean and
# standard deviation as the observed PRSM scores.
dt.norm <- rnorm(1000, mean(data$PRSM), sd(data$PRSM))
# freq = FALSE draws the histogram on the density scale so the kernel density
# of the observed scores can be overlaid on it. FALSE is written out because
# the shorthand F is an ordinary variable that can be reassigned.
# NOTE(review): the x axis shows simulated PRSM values although the label says
# 'Data size classes' - confirm the intended label.
hist(dt.norm, freq = FALSE, border = 'gray50', main = 'Comparing two distributions', xlab = 'Data size classes')
lines(density(data$PRSM), lwd = 2)

## Other distributions
# Poisson: 50 random counts with rate 1.5, drawn as a line against their index
dt.pois <- rpois(n = 50, lambda = 1.5)
plot(dt.pois, type = "l")

# Binomial: cumulative probabilities P(X <= q) for q = 3, 6, 9, 12 with
# 17 trials and success probability 0.5
dt.binom <- pbinom(q = seq(3, 12, by = 3), size = 17, prob = 0.5)
plot(dt.binom, type = "l")

# Uniform: 10 random values between 0 and 10; the call stays inline so the
# default axis label derived from the expression is unchanged
plot(runif(10, min = 0, max = 10), type = "l")

# Random Number Generation and Control
# Select the Super-Duper generator, seed it with 10 and draw ten uniforms
set.seed(seed = 10, kind = "Super-Duper")
runif(n = 10)
## [1] 0.18647349 0.91369213 0.07930381 0.65855213 0.83744303 0.90648074
## [7] 0.03602256 0.31998906 0.52243956 0.48279625
# Switch back to the default generator, reuse the same seed and draw again:
# the same seed gives a different stream under a different generator
set.seed(seed = 10, kind = "default")
runif(n = 10)
## [1] 0.50747820 0.30676851 0.42690767 0.69310208 0.08513597 0.22543662
## [7] 0.27453052 0.27230507 0.61582931 0.42967153
# Random Numbers and Sampling
# Fix the seed, then draw 200 of the PRSM scores without replacement
set.seed(seed = 10)
x <- sample(prsmScores, 200)
# Shapiro-Wilk Test for Normality, run on the full score vector
# NOTE(review): the sample `x` drawn above is not used by this test - confirm
# that testing all of prsmScores is intended.
shapiro.test(prsmScores)
##
## Shapiro-Wilk normality test
##
## data: prsmScores
## W = 0.99624, p-value = 0.1433
# Kolmogorov-Smirnov Test
# One-sample test of the PRSM scores against a normal distribution. The
# reference distribution is given by name ("pnorm") with its parameters.
# Passing the call pnorm(1000, 20, 5) instead hands ks.test() the single
# number 1, which turns this into a two-sample test against that one value.
# NOTE(review): mean and sd are estimated from the same sample, so the p-value
# is only approximate (see ?ks.test). The output recorded below came from the
# earlier two-sample call and must be regenerated.
ks.test(prsmScores, "pnorm", mean = mean(prsmScores), sd = sd(prsmScores))
## Warning in ks.test(prsmScores, pnorm(1000, 20, 5)): cannot compute exact p-
## value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: prsmScores and pnorm(1000, 20, 5)
## D = 0.81949, p-value = 0.5138
## alternative hypothesis: two-sided
# Normal quantile-quantile plot of the PRSM scores with default labels
qqnorm(prsmScores)

# The same plot with a custom title and axis labels, plus a dashed
# reference line
qqnorm(
  prsmScores,
  main = "QQ plot of PRSM data",
  xlab = "Theoretical",
  ylab = "Quantiles for dataset"
)
qqline(prsmScores, lty = 2, lwd = 2)

# Quantile-quantile plot of the PRSM scores against 50 random normal draws
# (mean 5, sd 2), with a least-squares line through the paired quantiles
qqp <- qqplot(prsmScores, rnorm(50, 5, 2))
abline(lm(y ~ x, data = qqp))
