# Basic functions and operations in R
# Load the libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(fBasics)
## Loading required package: timeDate
## Loading required package: timeSeries
##
## Rmetrics Package fBasics
## Analysing Markets and calculating Basic Statistics
## Copyright (C) 2005-2014 Rmetrics Association Zurich
## Educational Software for Financial Engineering and Computational Science
## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.
## https://www.rmetrics.org --- Mail to: info@rmetrics.org
# Load the loan data set from a CSV file on the local desktop.
# NOTE(review): the path is absolute and user-specific; the script only runs
# on a machine where this exact file exists.
data <- read.csv(file = "C:/Users/Abhay/Desktop/1.csv")
# Preview the first rows to check that the columns were parsed as expected.
head(x = data)
## CaseNumber Amt.Repaid.at.6.Months Nominal.Loan.Amount
## 1 344012260 7879 19200
## 2 378114786 19392 37000
## 3 351015636 14253 26600
## 4 79719718 6934 15600
## 5 104110253 6199 18400
## 6 371413226 21268 40000
## Total.Amt.to.be.Repaid PRSM Repayment.Percentage Commission.Upfront
## 1 22656 0.6955 12.5 1344.00
## 2 43475 0.8921 12.5 1498.50
## 3 31654 0.9005 12.5 3670.80
## 4 18330 0.7566 12.5 421.20
## 5 22816 0.5434 12.5 3540.16
## 6 49000 0.8681 14.0 800.00
## Validated.Monthly.Batch Historical.Monthly.Credit.Card.Receipts
## 1 18711.92 21963.00
## 2 36229.58 36230.11
## 3 20988.40 20988.50
## 4 19360.00 21128.00
## 5 23629.62 15620.00
## 6 28714.96 7124.00
## Loan.Type Loan.Size.Class FICO Years.In.Business Num.of.Credit.Lines
## 1 O S2 550 10 19
## 2 R S3 501 6 28
## 3 O S3 579 29 9
## 4 R S2 589 6 26
## 5 R S2 645 2 46
## 6 R S3 552 4 21
## Num.of.Paid.off.Credit.Lines Current.Delinquent.Credit.Lines
## 1 6 5
## 2 11 3
## 3 3 1
## 4 21 2
## 5 10 5
## 6 6 12
## Previous.Delinquent.Credit.Lines Business.Entity.Type Num.of.Trade.Lines
## 1 9 LLC 0
## 2 10 LLC 1
## 3 2 LLC 1
## 4 20 LLC 0
## 5 5 LLC 7
## 6 2 LLC 0
## Num.of.Derog.Legal.Item Two.Digit.SIC.Code
## 1 0 55
## 2 0 58
## 3 0 58
## 4 11 58
## 5 0 51
## 6 0 58
## Two.Digit.SIC.Description Population.in.Zip.Code
## 1 Automotive Dealers and Service Stations 36785
## 2 Eating and Drinking Places 42503
## 3 Eating and Drinking Places 38066
## 4 Eating and Drinking Places 10243
## 5 Wholesale Trade-Nondurable Goods 13388
## 6 Eating and Drinking Places 41099
## Average.House.Value.in.Zip.Code Income.Per.Household.in.Zip.... State
## 1 69500 29997 FL
## 2 170400 60296 MD
## 3 92300 47825 GA
## 4 107700 45672 PA
## 5 137400 56057 FL
## 6 181200 61097 IL
## ISO.Name
## 1 Loan Masters
## 2 Loan Masters
## 3 Credit Divas
## 4 Loan Masters
## 5 Credit Divas
## 6 Loan Masters
# Keep the first 628 rows as the training set, then drop row 527 of that
# subset, which the analysis treats as an outlier.
data <- data[seq_len(628), ]
data <- data[-527, ]
# Check if the data follows a normal distribution
# One plot per loan-amount column; qqnormPlot is presumably the normal Q-Q
# plot helper from fBasics (loaded above) -- TODO confirm.
qqnormPlot(data$Nominal.Loan.Amount)

qqnormPlot(data$Total.Amt.to.be.Repaid)

# Two-sample t-test comparing the mean nominal loan amount with the mean
# total amount to be repaid. var.equal is left at its default, so this is the
# Welch (unequal-variance) variant.
t.test(
  x = data$Nominal.Loan.Amount,
  y = data$Total.Amt.to.be.Repaid
)
##
## Welch Two Sample t-test
##
## data: data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1215.8, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13313.23 -1416.90
## sample estimates:
## mean of x mean of y
## 37372.30 44737.36
# Same two-sample comparison with the unequal-variance (Welch) choice made
# explicit.
t.test(
  x = data$Nominal.Loan.Amount,
  y = data$Total.Amt.to.be.Repaid,
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1215.8, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13313.23 -1416.90
## sample estimates:
## mean of x mean of y
## 37372.30 44737.36
# Same two-sample comparison, this time assuming both groups share a common
# variance (pooled-variance t-test).
t.test(
  x = data$Nominal.Loan.Amount,
  y = data$Total.Amt.to.be.Repaid,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1252, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13313.059 -1417.072
## sample estimates:
## mean of x mean of y
## 37372.30 44737.36
# One-sample t-test: does the mean of PRSM differ from the hypothesised
# value 1.5 (two-sided)?
t.test(x = data$PRSM, mu = 1.5)
##
## One Sample t-test
##
## data: data$PRSM
## t = -77.339, df = 626, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 1.5
## 95 percent confidence interval:
## 0.7952067 0.8301120
## sample estimates:
## mean of x
## 0.8126593
# Directional (one-sided) version of the one-sample test: is the mean of PRSM
# less than 1.5?
t.test(x = data$PRSM, alternative = "less", mu = 1.5)
##
## One Sample t-test
##
## data: data$PRSM
## t = -77.339, df = 626, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 1.5
## 95 percent confidence interval:
## -Inf 0.8272994
## sample estimates:
## mean of x
## 0.8126593
# The Wilcoxon U-Test (Mann-Whitney)
# Fix the RNG seed first: sample() is random, so without a seed every run
# draws different values and no result built on sample.1 / sample.2 can be
# reproduced. Output recorded from earlier unseeded runs will differ.
set.seed(2024)
# Two draws of 200 PRSM values each, taken without replacement.
sample.1 <- sample(data$PRSM, size = 200)
sample.2 <- sample(data$PRSM, size = 200)
# Two-sample rank sum test comparing the two draws.
wilcox.test(sample.1, sample.2)
##
## Wilcoxon rank sum test with continuity correction
##
## data: sample.1 and sample.2
## W = 19840, p-value = 0.8906
## alternative hypothesis: true location shift is not equal to 0
# Two-sample U test (rank sum) on the two PRSM samples; this is the same call
# as the one issued right after the samples were drawn.
wilcox.test(x = sample.1, y = sample.2)
##
## Wilcoxon rank sum test with continuity correction
##
## data: sample.1 and sample.2
## W = 19840, p-value = 0.8906
## alternative hypothesis: true location shift is not equal to 0
# One-sample signed rank test of sample.1 against the default location 0.
# exact = FALSE asks for the normal approximation instead of an exact p-value.
wilcox.test(x = sample.1, exact = FALSE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: sample.1
## V = 20100, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
# One-sided, one-sample signed rank test: is the location of sample.1 below 8?
# conf.int = TRUE additionally reports a confidence interval and the
# (pseudo)median estimate; exact = FALSE uses the normal approximation.
# `alternative` is spelled out in full: the abbreviated `alt` only worked
# through partial argument matching.
wilcox.test(sample.1, mu = 8, exact = FALSE, conf.int = TRUE,
            alternative = "less")
##
## Wilcoxon signed rank test with continuity correction
##
## data: sample.1
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location is less than 8
## 95 percent confidence interval:
## -Inf 0.8343854
## sample estimates:
## (pseudo)median
## 0.8087367
# Draw two samples of 200 values each (without replacement) from the
# six-month repayment amounts.
# NOTE(review): the original heading mentions formula syntax and subsetting,
# but no test is run on these samples in this part of the script.
sample.3 <- sample(x = data$Amt.Repaid.at.6.Months, size = 200)
sample.4 <- sample(x = data$Amt.Repaid.at.6.Months, size = 200)
# Paired signed rank test on the two PRSM samples (sample.2 versus sample.1),
# using the normal approximation.
# NOTE(review): the two vectors are separate random draws, so observations are
# paired by position only -- confirm that this is intended.
wilcox.test(x = sample.2, y = sample.1, paired = TRUE, exact = FALSE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: sample.2 and sample.1
## V = 10442, p-value = 0.5461
## alternative hypothesis: true location shift is not equal to 0
# Correlation and covariance
# Pearson correlation test between PRSM and the total amount to be repaid.
cor.test(x = data$PRSM, y = data$Total.Amt.to.be.Repaid)
##
## Pearson's product-moment correlation
##
## data: data$PRSM and data$Total.Amt.to.be.Repaid
## t = 4.8445, df = 625, p-value = 1.603e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1136329 0.2646002
## sample estimates:
## cor
## 0.190241
# Spearman rank correlation between the same two columns; returns only the
# coefficient, with no significance test.
cor(x = data$PRSM, y = data$Total.Amt.to.be.Repaid, method = "spearman")
## [1] 0.155926
# Sample covariance between the two PRSM samples.
cov(x = sample.1, y = sample.2)
## [1] 0.0003811295
# Significance test for the (Pearson) correlation between the two PRSM
# samples.
cor.test(x = sample.1, y = sample.2)
##
## Pearson's product-moment correlation
##
## data: sample.1 and sample.2
## t = 0.12409, df = 198, p-value = 0.9014
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1300821 0.1473788
## sample estimates:
## cor
## 0.008818053
# Tests for association
# Chi-squared test on the PRSM column. Given a single vector, chisq.test runs
# a goodness-of-fit test against equal probabilities.
# NOTE(review): PRSM holds non-integer values rather than category counts --
# confirm that a chi-squared test is appropriate for this column.
chisq.test(x = data$PRSM)
## Warning in chisq.test(data$PRSM): Chi-squared approximation may be
## incorrect
##
## Chi-squared test for given probabilities
##
## data: data$PRSM
## X-squared = 38.149, df = 626, p-value = 1
# Same test with the p-value estimated by Monte Carlo simulation over 2500
# replicates.
# NOTE(review): the recorded run of this call shows a warning and a p-value
# above 1, which suggests the non-count PRSM input is unsuitable here.
chisq.test(x = data$PRSM, B = 2500, simulate.p.value = TRUE)
## Warning in matrix(sample.int(nx, B * n, TRUE, prob = p), nrow = n): data
## length [1273843] is not a sub-multiple or multiple of the number of rows
## [509]
##
## Chi-squared test for given probabilities with simulated p-value
## (based on 2500 replicates)
##
## data: data$PRSM
## X-squared = 38.149, df = NA, p-value = 1.001
# Yates continuity correction for 2 x 2 tables.
# NOTE(review): the input here is a plain vector, not a 2 x 2 table, so
# confirm whether a contingency table was meant to be passed instead.
chisq.test(x = data$PRSM, correct = TRUE)
## Warning in chisq.test(data$PRSM, correct = TRUE): Chi-squared approximation
## may be incorrect
##
## Chi-squared test for given probabilities
##
## data: data$PRSM
## X-squared = 38.149, df = 626, p-value = 1
# Single-category goodness-of-fit test.
# sample.1 is passed as the observed values and sample.2 as the expected
# proportions; rescale.p = TRUE rescales sample.2 so that it sums to 1.
# NOTE(review): both vectors hold sampled PRSM values rather than counts, so
# confirm that a chi-squared test is appropriate here.
# TRUE is written out because the shorthand T is an ordinary variable that can
# be reassigned.
chisq.test(sample.1, p = sample.2, rescale.p = TRUE)
## Warning in chisq.test(sample.1, p = sample.2, rescale.p = T): Chi-squared
## approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: sample.1
## X-squared = 25.281, df = 199, p-value = 1