R Markdown

# Load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Read the CSV file.
# stringsAsFactors = TRUE pins the pre-R-4.0 default so that text columns
# (Loan.Type, State, ...) are read as factors on every R version; summary()
# only tabulates level counts for factors.
data <- read.csv('C:/Users/Abhay/Desktop/1.csv', stringsAsFactors = TRUE)

# Keep the first 627 rows (the clean data).
# head() never returns more rows than exist, whereas data[1:627, ] would pad
# a shorter file with all-NA rows.
data <- head(data, 627)
## Data Summary
## Data Distribution
# Check the data dimensions (rows, columns) and print per-column summary
# statistics for the data frame
dim(data)
## [1] 627  27
summary(data)
##    CaseNumber        Amt.Repaid.at.6.Months Nominal.Loan.Amount
##  Min.   : 10318666   Min.   :    590        Min.   :  3000     
##  1st Qu.:116714284   1st Qu.:   5322        1st Qu.: 11800     
##  Median :222113091   Median :  10395        Median : 21400     
##  Mean   :224065574   Mean   :  26582        Mean   : 37272     
##  3rd Qu.:326716390   3rd Qu.:  21602        3rd Qu.: 42500     
##  Max.   :459212486   Max.   :4536390        Max.   :452000     
##                                                                
##  Total.Amt.to.be.Repaid      PRSM          Repayment.Percentage
##  Min.   :  3540         Min.   :  0.1563   Min.   : 5.00       
##  1st Qu.: 14280         1st Qu.:  0.6606   1st Qu.:12.50       
##  Median : 26010         Median :  0.8237   Median :12.50       
##  Mean   : 44619         Mean   :  1.9334   Mean   :12.15       
##  3rd Qu.: 51642         3rd Qu.:  0.9674   3rd Qu.:12.50       
##  Max.   :535620         Max.   :703.5344   Max.   :17.50       
##                                                                
##  Commission.Upfront Validated.Monthly.Batch
##  Min.   :  110.4    Min.   :  2420         
##  1st Qu.: 1130.8    1st Qu.: 10318         
##  Median : 2365.9    Median : 19790         
##  Mean   : 3946.7    Mean   : 37324         
##  3rd Qu.: 4542.9    3rd Qu.: 43107         
##  Max.   :43332.0    Max.   :520015         
##                                            
##  Historical.Monthly.Credit.Card.Receipts Loan.Type Loan.Size.Class
##  Min.   :  1773                           :  0       :  0         
##  1st Qu.: 10161                          O:407     S1: 87         
##  Median : 20108                          R:220     S2:221         
##  Mean   : 39822                                    S3:156         
##  3rd Qu.: 45544                                    S4:163         
##  Max.   :520015                                                   
##                                                                   
##       FICO       Years.In.Business Num.of.Credit.Lines
##  Min.   :397.0   Min.   : 1.000    Min.   :  2.00     
##  1st Qu.:521.0   1st Qu.: 3.000    1st Qu.: 15.00     
##  Median :579.0   Median : 6.000    Median : 23.00     
##  Mean   :575.7   Mean   : 9.276    Mean   : 24.77     
##  3rd Qu.:628.5   3rd Qu.:11.000    3rd Qu.: 33.00     
##  Max.   :804.0   Max.   :81.000    Max.   :119.00     
##                                                       
##  Num.of.Paid.off.Credit.Lines Current.Delinquent.Credit.Lines
##  Min.   : 0.0                 Min.   : 0.000                 
##  1st Qu.: 6.0                 1st Qu.: 0.000                 
##  Median :11.0                 Median : 2.000                 
##  Mean   :12.5                 Mean   : 3.113                 
##  3rd Qu.:18.0                 3rd Qu.: 4.000                 
##  Max.   :75.0                 Max.   :43.000                 
##                                                              
##  Previous.Delinquent.Credit.Lines Business.Entity.Type Num.of.Trade.Lines
##  Min.   : 0.000                         :  0           Min.   : 0.000    
##  1st Qu.: 1.000                   LLC   :482           1st Qu.: 0.000    
##  Median : 4.000                   S-Corp:145           Median : 1.000    
##  Mean   : 4.748                                        Mean   : 1.485    
##  3rd Qu.: 7.000                                        3rd Qu.: 2.000    
##  Max.   :51.000                                        Max.   :14.000    
##                                                                          
##  Num.of.Derog.Legal.Item Two.Digit.SIC.Code
##  Min.   : 0.0000         Min.   : 7.0      
##  1st Qu.: 0.0000         1st Qu.:56.0      
##  Median : 0.0000         Median :58.0      
##  Mean   : 0.9266         Mean   :60.3      
##  3rd Qu.: 1.0000         3rd Qu.:72.0      
##  Max.   :17.0000         Max.   :99.0      
##                                            
##                          Two.Digit.SIC.Description Population.in.Zip.Code
##  Eating and Drinking Places           :165         Min.   :   79         
##  Miscellaneous Retail Stores          : 65         1st Qu.:14691         
##  Personal Services                    : 62         Median :24555         
##  Auto Repair Services and Parking     : 59         Mean   :25963         
##  Apparel and Accessory Stores         : 40         3rd Qu.:34795         
##  Home Furniture and Furnishings Stores: 35         Max.   :97086         
##  (Other)                              :201                               
##  Average.House.Value.in.Zip.Code Income.Per.Household.in.Zip....
##  Min.   :  35600                 Min.   : 12219                 
##  1st Qu.:  91100                 1st Qu.: 34429                 
##  Median : 118000                 Median : 42934                 
##  Mean   : 149794                 Mean   : 45904                 
##  3rd Qu.: 161450                 3rd Qu.: 55539                 
##  Max.   :1000001                 Max.   :131250                 
##                                                                 
##      State             ISO.Name  
##  FL     :126               :  0  
##  NY     : 61   Credit Divas:269  
##  PA     : 36   Loan Masters:358  
##  NJ     : 35                     
##  OH     : 31                     
##  GA     : 28                     
##  (Other):310
# Summary of PRSM scores before cleaning
summary(data$PRSM)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.1563   0.6606   0.8237   1.9330   0.9674 703.5000
# Remove the outlier from the data.
# The extreme PRSM value is located with which.max() instead of a hard-coded
# row number, so the step still drops the right record if rows are added,
# removed, or reordered upstream.
outlier_row <- which.max(data$PRSM)
# which.max() returns integer(0) when there is no non-NA value; negative
# indexing with integer(0) would silently drop every row, so fail loudly.
stopifnot(length(outlier_row) == 1L)
data <- data[-outlier_row, ]

# Work with the cleaned PRSM column from here on
prsmScores <- data$PRSM
summary(prsmScores)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1563  0.6604  0.8233  0.8126  0.9671  1.6940
# Stem-and-leaf display of the cleaned scores
stem(prsmScores, scale = 2)
## 
##   The decimal point is 1 digit(s) to the left of the |
## 
##    1 | 68
##    2 | 1
##    2 | 5678899
##    3 | 133
##    3 | 5556778888
##    4 | 11334444
##    4 | 55566677788888899
##    5 | 00111122222222233334444
##    5 | 55556666677777788888889999999999
##    6 | 0000000000001111111111122222223333444
##    6 | 5555555555666666666667777788888899999999
##    7 | 000000000000001111111111122222222222233333334444444
##    7 | 5555555555666666666666677777777777778888888889999999999
##    8 | 00000000000000111111122222222222333333333333333444444444
##    8 | 555555555556666666677777777777788888888889999999999
##    9 | 0000000000111111111112222222222233333333333334444444444
##    9 | 5555555556666666666777777888888888888888888999999999999999
##   10 | 0000000000011111122222223333344444444
##   10 | 555555666666677778888
##   11 | 00001111111112222222223444
##   11 | 55555566778888999
##   12 | 01244
##   12 | 7889
##   13 | 01123
##   13 | 
##   14 | 01
##   14 | 6
##   15 | 
##   15 | 7
##   16 | 
##   16 | 9
## Histograms
# Draw the PRSM histogram and keep the returned histogram object in `h`
# (its bin midpoints are reused when a normal curve is overlaid).
h <- hist(
  prsmScores,
  breaks = 7,
  main = "Histogram for PRSM Scores",
  xlab = "PRSM Score",
  col = "blue",
  border = "black",
  las = 1
)

## Density Function
# Kernel density estimate of the PRSM scores
d <- density(x = prsmScores)
## Density functions to draw a graph
# Plot the estimate, then fill the area under the curve
plot(x = d, main = "Density of PRSM Scores")
polygon(x = d, border = "blue", col = "red")

## Adding Density Lines to Existing Graphs
# Redraw the stored histogram and overlay the normal curve implied by the
# sample mean and standard deviation.
plot(h)
# x grid spanning the observed range; `length.out` is spelled out in full
# instead of relying on partial matching of `length`.
xfit <- seq(min(prsmScores), max(prsmScores), length.out = 20)
yfit <- dnorm(xfit, mean = mean(prsmScores), sd = sd(prsmScores))
# Rescale the density to the histogram's count axis: bin width * sample size
yfit <- yfit * diff(h$mids[1:2]) * length(prsmScores)
lines(xfit, yfit, col = "red", lwd = 2)

## Types of Data Distributions
# Sample mean and standard deviation parameterise the fitted normal below
prsmMean <- mean(x = prsmScores)
prsmSD <- sd(x = prsmScores)

# P(X <= 1) under the fitted normal
pnorm(q = 1, mean = prsmMean, sd = prsmSD)
## [1] 0.7999677
# 50th percentile of the fitted normal
qnorm(p = 0.5, mean = prsmMean, sd = prsmSD)
## [1] 0.8125883
# Ten random draws from the fitted normal
rnorm(n = 10, mean = prsmMean, sd = prsmSD)
##  [1] 1.0443174 0.5980357 0.8897985 0.7009144 0.5126269 0.5676045 0.4601230
##  [8] 1.0649077 0.9811573 0.6677939
# Fitted normal density evaluated at every observed score
dnorm(x = prsmScores, mean = prsmMean, sd = prsmSD)
##   [1] 1.5600901323 1.6807087735 1.6570493616 1.7355891972 0.8628407444
##   [6] 1.7365193488 0.9978805896 1.6269394672 1.7197103401 1.4139486232
##  [11] 1.5248827350 0.2585566903 0.0054407960 1.5611939633 1.7902699442
##  [16] 1.5919426151 1.7276517559 0.1695288929 1.4148217170 0.1375598097
##  [21] 1.6511889923 1.3708112868 1.1421542432 1.6122511980 0.6722228613
##  [26] 1.0596491473 1.4277441128 1.6073171840 0.5174077116 1.7329679646
##  [31] 1.5807064652 1.7913047277 1.0734178538 1.1221889874 1.2300021508
##  [36] 1.7334143361 1.6772449932 1.6988935822 0.4704388235 1.4064008073
##  [41] 1.7767138997 0.9199346887 1.2963437109 1.4849240411 1.3131347912
##  [46] 1.7163662935 0.7530791928 1.1293833452 1.4380618442 0.7370238481
##  [51] 0.7517442378 0.0007124195 0.5916931868 1.7667429312 0.7602157953
##  [56] 1.7037978998 1.5114601532 1.3537172803 1.3182399451 1.6411633308
##  [61] 1.6561669951 0.8022366325 0.9767028582 1.2301138391 0.1721899178
##  [66] 0.1324282358 1.0028422177 1.7314029014 1.7832968575 1.7896152933
##  [71] 1.7034990581 0.8293411381 1.7094650394 1.7010584810 1.3463164766
##  [76] 1.5043073711 1.4479607695 1.7877795280 1.1430138185 1.5039081078
##  [81] 0.9081434958 1.1866791000 1.6649425968 1.2338312806 1.5518460797
##  [86] 1.7498247673 0.0514652726 1.3427646606 1.3527009968 1.3712614696
##  [91] 1.6212034392 1.7900693848 1.3717115237 0.7117419284 1.5929024084
##  [96] 1.7743580655 1.5578763308 1.0388103750 1.5855707555 1.6826498705
## [101] 1.6605533998 1.5439580034 1.1532248913 1.6711640090 1.7552909647
## [106] 0.6892299058 1.4701024091 1.7869137701 1.7907806949 1.1566259602
## [111] 1.7835972888 1.2117618959 0.9674208894 1.7583645869 1.3047548541
## [116] 1.2083930997 1.7527058753 1.2137979011 1.7881602964 1.4796950279
## [121] 1.6654473703 1.4596723247 1.5771446826 1.7753575212 1.5731153971
## [126] 1.6352557426 1.6386878652 1.5047064370 1.0499004974 1.7377243941
## [131] 0.2852759251 1.7619506656 1.1225626427 1.5658683827 0.4475748956
## [136] 0.4791239376 0.6965195255 1.6968390065 0.6995316495 1.6410914094
## [141] 1.3614337346 0.9491324564 0.7003933755 0.9013852789 1.6511191936
## [146] 0.5712370645 1.7630902941 1.3919306537 1.3145283222 1.4646925310
## [151] 1.2718814154 0.6943717525 1.2511216495 1.3717115237 1.5760715174
## [156] 0.3590719373 1.1455594698 1.6278997985 0.7803308190 1.4734165383
## [161] 0.1147429930 1.7888763811 1.6938667730 1.7795248939 1.7657085527
## [166] 0.4036135752 0.0449122665 1.5942866810 1.7670059098 1.7895719100
## [171] 0.1083975736 1.6663687956 1.7728123309 1.7863334141 1.7890812793
## [176] 0.2000738331 1.7865834656 1.5803513591 1.5677736282 1.5838093476
## [181] 0.7592179986 1.7351967136 0.2422780253 1.7853456446 1.7844479858
## [186] 1.7909125822 1.2699902545 0.2675793593 0.7340420380 1.5091756090
## [191] 1.5462216674 0.9631955216 0.9823892410 0.7668272091 1.7786843152
## [196] 0.5577555375 1.4120986418 1.6073171840 0.6743374728 0.1473095967
## [201] 0.1053215393 0.8219585430 0.4245387114 1.5863555297 0.3495539583
## [206] 1.7746025473 0.4493286544 0.7480857471 1.7311999666 0.9660869919
## [211] 0.4303871519 1.6408548511 1.4584133481 1.3408258482 1.0680526661
## [216] 1.7371460101 1.6282938747 1.4182065478 1.1299840072 1.7877289287
## [221] 1.7811199799 1.3490555291 0.5415319213 1.5318429508 1.7837281714
## [226] 0.8515166342 1.7652718012 1.0718407884 1.3094143104 1.7471013363
## [231] 1.4667768078 1.0602504177 1.5190309186 1.4113257442 1.7799584851
## [236] 1.0840360919 1.5401677502 1.5799131039 1.7877289287 1.4380618442
## [241] 1.7887059881 1.3177763496 0.4576878853 1.5867065466 0.4486578971
## [246] 0.5064090394 1.7704485940 1.7042252137 1.6550555162 0.2141721753
## [251] 1.2828398475 1.7460430686 1.5397875296 0.9181388674 1.3154568384
## [256] 1.6115862015 1.5980730554 1.4659436281 1.7912450659 1.7634793067
## [261] 0.3590719373 0.9707972203 1.7276517559 0.9693499534 1.7563581296
## [266] 1.7845876659 1.1901756446 1.5220667115 1.6058925043 1.1231635868
## [271] 0.6952304863 0.6167969343 1.7508784135 1.5828320767 1.7898511657
## [276] 1.6393083644 1.3131347912 1.7846038615 0.5115693156 1.2958755362
## [281] 1.1916251788 1.5413071110 1.7777921951 1.7409023218 1.6109978999
## [286] 1.6514183604 1.2605155519 1.3945917146 1.1202396147 1.7824226778
## [291] 1.6056339636 0.6091385074 1.7609354558 0.7547567684 1.1654779917
## [296] 1.6809780547 0.5759920404 1.5363558263 0.8655425524 0.0333251198
## [301] 1.5216765402 1.1771128070 1.1823254224 0.4553153265 1.7380636331
## [306] 0.7858643307 1.0542867631 1.0333419104 1.1017113700 1.6417076772
## [311] 0.5690032366 1.7627743774 1.5691410583 1.7841093168 0.6246011379
## [316] 1.0479514147 1.1153652438 0.7060066497 1.3754120001 1.2094681342
## [321] 1.7846569704 0.5578447678 0.0916015150 1.6002052931 1.7816066945
## [326] 1.7413185915 1.5978100925 1.7871905067 1.7911588154 0.7489737615
## [331] 1.3513348368 1.2552914773 1.1775971358 1.7817727351 0.1062253709
## [336] 1.6307664403 1.2648935160 1.5283713604 1.5514726728 0.7526340963
## [341] 1.7878297683 1.3984707669 1.2065792598 0.8655425524 1.2371784511
## [346] 0.7340420380 1.6904739611 1.1857119061 1.6259767520 0.9810516696
## [351] 1.5405477541 1.3976898945 1.7906402209 1.1294969439 1.7893241176
## [356] 0.6233804296 1.7897855746 1.7822623258 1.7327675598 1.5942866810
## [361] 0.9409744178 1.6763601709 1.4906230931 0.4452359092 1.7606423086
## [366] 1.7357849473 0.7441985615 0.6613773435 1.1217016657 0.5709397618
## [371] 0.1790264699 1.7277004177 0.5608209149 1.1829221824 0.2023851271
## [376] 1.0617131592 1.5956670093 0.6545896330 0.5670704216 1.7905494149
## [381] 1.7879669007 1.1475047801 1.6744984273 0.7130450923 1.6008873906
## [386] 1.7679154154 1.7886726677 0.2833734998 0.0233079732 1.3340468422
## [391] 0.7292061932 1.7824226778 1.5318429508 1.3014874907 0.6024228397
## [396] 1.7744932891 1.3541721604 1.7865834656 1.4051849356 1.0690280960
## [401] 1.7124903204 1.3412840314 1.4213505904 1.6866256905 1.7212973123
## [406] 0.9549001204 1.0743934181 1.3362377987 1.7903443451 1.6825875241
## [411] 1.2581420877 1.3644953373 1.4846108424 1.7305892116 1.6254086722
## [416] 0.9559742943 1.7426126707 1.6993317724 0.9915826725 1.6456867126
## [421] 1.7600519023 0.5778508319 1.1006220962 1.7705700972 1.2747157281
## [426] 1.3731657673 1.6308404398 1.7893904052 1.7676874631 0.8981770772
## [431] 0.8614362582 1.4173358040 1.3892645870 1.3726112449 0.0255495106
## [436] 0.0767691864 0.7409979173 1.7667736695 1.3595155382 1.1192648446
## [441] 1.7899079091 1.2358558462 1.0923294358 1.5867065466 0.9309186952
## [446] 1.1061008547 1.5252711966 0.2044809423 1.3650526901 1.7899468502
## [451] 1.6069026141 0.2104772057 1.3558845091 0.2194200569 0.6706319571
## [456] 1.0552616344 1.3914866556 1.4316240411 1.7327675598 1.7071574249
## [461] 1.7203705658 1.5251806156 1.6454533074 1.7317606458 0.5739552134
## [466] 1.5447134353 1.6938074357 1.3200932971 1.7208091224 1.7263938417
## [471] 1.3499676070 0.7718723288 0.6056221098 1.6705388467 1.7539715685
## [476] 1.0742796532 1.7663458659 1.2132048701 1.4252513329 0.9762198644
## [481] 1.4999982603 1.6368198564 1.0294490100 1.4911233322 1.6126607323
## [486] 1.2506456592 0.4553942497 1.7658116000 1.7832968575 1.4268801710
## [491] 1.2794329892 1.7599034446 1.0227533324 0.4300608720 1.7206411365
## [496] 0.2025306805 1.5439580034 1.3015964283 1.5817703490 1.7842357881
## [501] 0.1773077046 0.4687136975 1.7442104233 0.4073062539 1.7809283242
## [506] 1.6860380146 0.0787809337 1.6012280641 0.7172951961 1.6198245145
## [511] 1.7888343223 1.7667429312 1.4457388606 1.0841498704 0.2617692826
## [516] 1.4312939832 1.3182399451 1.4755782270 1.6122511980 1.6475075084
## [521] 1.7788515879 1.5287579367 1.7909891752 1.7657085527 1.7006240860
## [526] 0.1115470509 1.7357393225 0.7056749867 1.5581596999 1.7479740244
## [531] 0.8843541607 1.0231261443 1.4174373757 1.6591663675 1.7861055192
## [536] 1.4986998213 1.0563502473 0.2882335036 1.2262808843 1.5845968173
## [541] 1.7583645869 0.8362833293 0.0551446355 1.3289813665 1.7164188343
## [546] 1.2879026579 1.3210193519 0.4864135926 1.0963457786 1.6392360618
## [551] 1.7860298034 1.6663024297 0.5399411076 1.3932618129 1.1663351782
## [556] 0.7566439617 0.9038685434 0.4946495862 1.6317171873 1.7855897150
## [561] 0.5261532955 1.7377243941 1.2333528586 0.7980251281 1.7583645869
## [566] 1.7458655210 1.3554300832 1.2705734491 1.7319626802 0.9295957172
## [571] 0.7291038145 1.1362005908 1.6960852219 1.1483638826 1.1163402188
## [576] 0.1480503246 0.5505137786 0.9761072233 1.4943613588 1.1498224639
## [581] 0.6180108186 1.4304326422 1.7442104233 1.7199820899 0.4513447652
## [586] 0.4252614855 1.4407263482 1.2643091563 1.0427063289 1.0982969725
## [591] 1.0568377287 1.0787836750 0.9167092399 1.7875107543 0.8068073129
## [596] 1.6408548511 1.7647264747 1.2017604114 1.7206411365 1.7604952208
## [601] 1.7701759914 1.0192372063 1.5702277710 1.3297958499 0.9305520054
## [606] 1.4068399866 1.1688735710 1.4796950279 1.7795951835 0.7216586592
## [611] 1.7270731389 1.4742432651 0.7264722160 1.1921082684 1.1625662060
## [616] 1.7887919038 1.5299164016 1.6881972046 0.2730080728 0.9916956072
## [621] 1.7832968575 1.7600519023 1.4423334384 1.6912477564 1.3900501600
## [626] 0.3624843512
## Normal Distribution
# Simulate 1000 draws from a normal with the sample's mean and sd, plot them
# as a density-scaled histogram, and overlay the kernel density of the
# observed PRSM scores for comparison.
dt.norm <- rnorm(1000, mean = mean(data$PRSM), sd = sd(data$PRSM))
# freq = FALSE (not the reassignable shorthand F) puts the histogram on the
# density scale so the overlaid curve is comparable; the x axis carries PRSM
# values, so it is labelled as such.
hist(dt.norm, freq = FALSE, border = "gray50",
     main = "Comparing two distributions", xlab = "PRSM Score")
lines(density(data$PRSM), lwd = 2)

## Other distributions
# Poisson Distribution: 50 random counts with rate 1.5, drawn as a line
dt.pois <- rpois(n = 50, lambda = 1.5)
plot(dt.pois, type = "l")

# Binomial Distribution: cumulative probabilities P(X <= q) for 17 trials
# with success probability 0.5
dt.binom <- pbinom(q = c(3, 6, 9, 12), size = 17, prob = 0.5)
plot(dt.binom, type = "l")

# Uniform Distribution: 10 random values between 0 and 10, drawn as a line
plot(runif(n = 10, min = 0, max = 10), type = "l")

# Random Number Generation and Control
# Select the Super-Duper generator by its full name (the abbreviation 'Super'
# only resolved through partial matching of the kind) and seed it.
set.seed(10, kind = "Super-Duper")
runif(10)
##  [1] 0.18647349 0.91369213 0.07930381 0.65855213 0.83744303 0.90648074
##  [7] 0.03602256 0.31998906 0.52243956 0.48279625
# Switch back to R's default generator and reseed with the same value
set.seed(10, kind = "default")
runif(10)
##  [1] 0.50747820 0.30676851 0.42690767 0.69310208 0.08513597 0.22543662
##  [7] 0.27453052 0.27230507 0.61582931 0.42967153
# Random Numbers and Sampling
# Reseed, then draw 200 observations without replacement (sample()'s default)
set.seed(seed = 10)
x <- sample(x = prsmScores, size = 200)

# Shapiro-Wilk Test for Normality (null hypothesis: the sample is normal)
shapiro.test(prsmScores)
## 
##  Shapiro-Wilk normality test
## 
## data:  prsmScores
## W = 0.99624, p-value = 0.1433
# Kolmogorov-Smirnov Test
# One-sample test against a normal distribution with the sample's own mean
# and sd. ks.test() takes the reference distribution as the *name* of a
# cumulative distribution function plus its parameters. The earlier second
# argument, pnorm(1000, 20, 5), evaluates to the single number 1, which made
# this a two-sample comparison against a one-element sample.
# Any output recorded for that earlier call must be regenerated.
# NOTE: mean and sd are estimated from the same data, so the reported
# p-value is conservative.
ks.test(prsmScores, "pnorm", mean = mean(prsmScores), sd = sd(prsmScores))
## Warning in ks.test(prsmScores, pnorm(1000, 20, 5)): cannot compute exact p-
## value with ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  prsmScores and pnorm(1000, 20, 5)
## D = 0.81949, p-value = 0.5138
## alternative hypothesis: two-sided
# A Basic Normal Quantile-Quantile Plot
qqnorm(y = prsmScores)

# Same plot with a custom title and axis labels, plus a dashed reference line
qqnorm(
  y = prsmScores,
  main = "QQ plot of PRSM data",
  xlab = "Theoretical",
  ylab = "Quantiles for dataset"
)
qqline(y = prsmScores, lty = 2, lwd = 2)

# Plotting the distribution of one sample against another
# qqplot() returns the sorted x/y quantile pairs as a list; keep it (assigned
# with `<-`, not `=`) and draw the least-squares line through those pairs.
qqp <- qqplot(prsmScores, rnorm(50, 5, 2))
abline(lm(y ~ x, data = qqp))