Basic functions and operations on data in R

Basic functions and operations in R

# Load the libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(fBasics)

## Loading required package: timeDate

## Loading required package: timeSeries

##

## Rmetrics Package fBasics

## Analysing Markets and calculating Basic Statistics

## Copyright (C) 2005-2014 Rmetrics Association Zurich

## Educational Software for Financial Engineering and Computational Science

## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.

## https://www.rmetrics.org --- Mail to: info@rmetrics.org

# Load the loan records from the CSV export into a data frame
data <- read.csv(file = "C:/Users/Abhay/Desktop/1.csv")

# Preview the first six rows of the loaded data
head(data, n = 6L)

##   CaseNumber Amt.Repaid.at.6.Months Nominal.Loan.Amount
## 1  344012260                   7879               19200
## 2  378114786                  19392               37000
## 3  351015636                  14253               26600
## 4   79719718                   6934               15600
## 5  104110253                   6199               18400
## 6  371413226                  21268               40000
##   Total.Amt.to.be.Repaid   PRSM Repayment.Percentage Commission.Upfront
## 1                  22656 0.6955                 12.5            1344.00
## 2                  43475 0.8921                 12.5            1498.50
## 3                  31654 0.9005                 12.5            3670.80
## 4                  18330 0.7566                 12.5             421.20
## 5                  22816 0.5434                 12.5            3540.16
## 6                  49000 0.8681                 14.0             800.00
##   Validated.Monthly.Batch Historical.Monthly.Credit.Card.Receipts
## 1                18711.92                                21963.00
## 2                36229.58                                36230.11
## 3                20988.40                                20988.50
## 4                19360.00                                21128.00
## 5                23629.62                                15620.00
## 6                28714.96                                 7124.00
##   Loan.Type Loan.Size.Class FICO Years.In.Business Num.of.Credit.Lines
## 1         O              S2  550                10                  19
## 2         R              S3  501                 6                  28
## 3         O              S3  579                29                   9
## 4         R              S2  589                 6                  26
## 5         R              S2  645                 2                  46
## 6         R              S3  552                 4                  21
##   Num.of.Paid.off.Credit.Lines Current.Delinquent.Credit.Lines
## 1                            6                               5
## 2                           11                               3
## 3                            3                               1
## 4                           21                               2
## 5                           10                               5
## 6                            6                              12
##   Previous.Delinquent.Credit.Lines Business.Entity.Type Num.of.Trade.Lines
## 1                                9                  LLC                  0
## 2                               10                  LLC                  1
## 3                                2                  LLC                  1
## 4                               20                  LLC                  0
## 5                                5                  LLC                  7
## 6                                2                  LLC                  0
##   Num.of.Derog.Legal.Item Two.Digit.SIC.Code
## 1                       0                 55
## 2                       0                 58
## 3                       0                 58
## 4                      11                 58
## 5                       0                 51
## 6                       0                 58
##                 Two.Digit.SIC.Description Population.in.Zip.Code
## 1 Automotive Dealers and Service Stations                  36785
## 2              Eating and Drinking Places                  42503
## 3              Eating and Drinking Places                  38066
## 4              Eating and Drinking Places                  10243
## 5        Wholesale Trade-Nondurable Goods                  13388
## 6              Eating and Drinking Places                  41099
##   Average.House.Value.in.Zip.Code Income.Per.Household.in.Zip.... State
## 1                           69500                           29997    FL
## 2                          170400                           60296    MD
## 3                           92300                           47825    GA
## 4                          107700                           45672    PA
## 5                          137400                           56057    FL
## 6                          181200                           61097    IL
##       ISO.Name
## 1 Loan Masters
## 2 Loan Masters
## 3 Credit Divas
## 4 Loan Masters
## 5 Credit Divas
## 6 Loan Masters

# Keep the first 628 rows as the training set, then drop the single row at
# position 527, which this analysis treats as an outlier.
# seq_len(min(...)) is used instead of 1:628 so that a file with fewer than
# 628 rows is not padded with all-NA rows.
n_train <- 628L
outlier_row <- 527L
data <- data[seq_len(min(n_train, nrow(data))), ]
data <- data[-outlier_row, ]

# Check if the data follows a normal distribution
# Draws one normal Q-Q plot per amount column. qqnormPlot() is not base R;
# presumably it comes from the fBasics package attached earlier - confirm.
qqnormPlot(data$Nominal.Loan.Amount)

qqnormPlot(data$Total.Amt.to.be.Repaid)

# Two-sample t-test with default settings.
# With no var.equal argument, t.test() runs Welch's test (unequal
# variances), not the classical pooled-variance Student's t-test.
# NOTE(review): both columns appear to describe the same loans, so a paired
# test may be the more appropriate comparison - confirm.
t.test(data$Nominal.Loan.Amount, data$Total.Amt.to.be.Repaid)

## 
##  Welch Two Sample t-test
## 
## data:  data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1215.8, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13313.23  -1416.90
## sample estimates:
## mean of x mean of y 
##  37372.30  44737.36

# Two sample t-test with Unequal Variance (Welch)
# var.equal = FALSE is already the default, so this is the same test as a
# call without the argument.
t.test(data$Nominal.Loan.Amount, data$Total.Amt.to.be.Repaid, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1215.8, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13313.23  -1416.90
## sample estimates:
## mean of x mean of y 
##  37372.30  44737.36

# Two sample t-test with Equal Variance
# var.equal = TRUE pools the two variances; this is the classical
# Student's two-sample t-test.
t.test(data$Nominal.Loan.Amount, data$Total.Amt.to.be.Repaid, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  data$Nominal.Loan.Amount and data$Total.Amt.to.be.Repaid
## t = -2.4293, df = 1252, p-value = 0.01527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13313.059  -1417.072
## sample estimates:
## mean of x mean of y 
##  37372.30  44737.36

# One sample t-test
# Two-sided test of whether the mean of PRSM differs from the hypothesised
# value mu = 1.5.
t.test(data$PRSM, mu=1.5)

## 
##  One Sample t-test
## 
## data:  data$PRSM
## t = -77.339, df = 626, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 1.5
## 95 percent confidence interval:
##  0.7952067 0.8301120
## sample estimates:
## mean of x 
## 0.8126593

# Using a directional (one-sided) hypothesis
# alternative = 'less' tests whether the mean of PRSM is below mu = 1.5.
t.test(data$PRSM, mu=1.5, alternative='less')

## 
##  One Sample t-test
## 
## data:  data$PRSM
## t = -77.339, df = 626, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 1.5
## 95 percent confidence interval:
##       -Inf 0.8272994
## sample estimates:
## mean of x 
## 0.8126593

# The Wilcoxon U-Test (Mann-Whitney)
# Draw two samples of 200 PRSM values each (sample() draws without
# replacement by default). Both come from the same column, so no location
# shift between them is expected.
# The seed is fixed so that every later test on sample.1 / sample.2 gives
# the same result on each run; figures produced by an earlier unseeded run
# will not match.
set.seed(42)
sample.1 <- sample(data$PRSM, size = 200)
sample.2 <- sample(data$PRSM, size = 200)
wilcox.test(sample.1, sample.2)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sample.1 and sample.2
## W = 19840, p-value = 0.8906
## alternative hypothesis: true location shift is not equal to 0

# Two sample U test
# Same call as the Mann-Whitney example: two unpaired vectors give the
# Wilcoxon rank sum test.
wilcox.test(sample.1, sample.2)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sample.1 and sample.2
## W = 19840, p-value = 0.8906
## alternative hypothesis: true location shift is not equal to 0

# One sample Wilcoxon signed rank test
# With a single vector, wilcox.test() runs the signed rank test (not the
# rank sum / U test) against the default location mu = 0.
# exact = FALSE requests the normal approximation instead of an exact p-value.
wilcox.test(sample.1, exact = FALSE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sample.1
## V = 20100, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0

# Using a directional hypothesis
# One-sided signed rank test of whether the location of sample.1 is below
# mu = 8; conf.int = TRUE also reports a confidence interval and the
# (pseudo)median estimate.
# The argument is spelled out as `alternative` rather than relying on
# partial matching of `alt`.
wilcox.test(sample.1, mu = 8, exact = FALSE, conf.int = TRUE,
            alternative = "less")

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sample.1
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location is less than 8
## 95 percent confidence interval:
##       -Inf 0.8343854
## sample estimates:
## (pseudo)median 
##      0.8087367

# Samples for the formula-syntax / subsetting form of the U test
# Draws two samples of 200 values each from Amt.Repaid.at.6.Months.
# NOTE(review): no formula-based wilcox.test() call follows, and sample.3 /
# sample.4 are not used in the visible part of this file - confirm whether a
# step is missing.
sample.3 <- sample(data$Amt.Repaid.at.6.Months, size = 200)
sample.4 <- sample(data$Amt.Repaid.at.6.Months, size = 200)

# Paired test (Wilcoxon signed rank on the pairwise differences)
# paired = TRUE matches the two vectors element by element. Only the
# Wilcoxon version is run here; no paired t-test is shown.
# NOTE(review): sample.1 and sample.2 are independent random draws, so the
# pairing is arbitrary and serves only to demonstrate the syntax.
wilcox.test(sample.2, sample.1, exact = FALSE, paired = TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sample.2 and sample.1
## V = 10442, p-value = 0.5461
## alternative hypothesis: true location shift is not equal to 0

# Correlation and Covariance
# cor.test() with default settings: Pearson's product-moment correlation
# between PRSM and Total.Amt.to.be.Repaid, with a significance test and
# confidence interval.
cor.test(data$PRSM,data$Total.Amt.to.be.Repaid)

## 
##  Pearson's product-moment correlation
## 
## data:  data$PRSM and data$Total.Amt.to.be.Repaid
## t = 4.8445, df = 625, p-value = 1.603e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1136329 0.2646002
## sample estimates:
##      cor 
## 0.190241

# Simple correlation coefficient (no significance test)
# method = 'spearman' computes Spearman's rank correlation, not Pearson's.
cor(data$PRSM, data$Total.Amt.to.be.Repaid, method='spearman')

## [1] 0.155926

# Covariance
# Covariance between the two PRSM samples, paired by position.
cov(sample.1,sample.2)

## [1] 0.0003811295

# Significance testing in correlation tests
# Pearson correlation test between the two PRSM samples, paired by position.
cor.test(sample.1,sample.2)

## 
##  Pearson's product-moment correlation
## 
## data:  sample.1 and sample.2
## t = 0.12409, df = 198, p-value = 0.9014
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1300821  0.1473788
## sample estimates:
##         cor 
## 0.008818053

# Tests for Association
# Multiple categories Chi square test
# NOTE(review): chisq.test() treats a plain numeric vector as a vector of
# category counts. PRSM holds fractional measurements, not counts, so this
# result is not a meaningful test of association. A table of a categorical
# column, e.g. table(data$Loan.Type), is probably what was intended - confirm.
chisq.test(data$PRSM)

## Warning in chisq.test(data$PRSM): Chi-squared approximation may be
## incorrect

## 
##  Chi-squared test for given probabilities
## 
## data:  data$PRSM
## X-squared = 38.149, df = 626, p-value = 1

# Monte Carlo Simulation
# simulate.p.value = TRUE computes the p-value from B = 2500 simulated
# replicates instead of the chi-squared approximation.
# NOTE(review): the input is the fractional PRSM vector rather than integer
# counts; the matrix() warning and the reported p-value above 1 are symptoms
# of that - confirm the intended input.
chisq.test(data$PRSM, simulate.p.value = TRUE, B = 2500)

## Warning in matrix(sample.int(nx, B * n, TRUE, prob = p), nrow = n): data
## length [1273843] is not a sub-multiple or multiple of the number of rows
## [509]

## 
##  Chi-squared test for given probabilities with simulated p-value
##  (based on 2500 replicates)
## 
## data:  data$PRSM
## X-squared = 38.149, df = NA, p-value = 1.001

# Yates correction for 2 x 2 tables
# `correct` only applies the continuity correction to 2 x 2 contingency
# tables. The input here is a plain vector, so the argument has no effect
# and the result equals that of chisq.test(data$PRSM).
chisq.test(data$PRSM, correct = TRUE)

## Warning in chisq.test(data$PRSM, correct = TRUE): Chi-squared approximation
## may be incorrect

## 
##  Chi-squared test for given probabilities
## 
## data:  data$PRSM
## X-squared = 38.149, df = 626, p-value = 1

# Single Category Goodness of Fit tests
# Tests sample.1 against expected proportions taken from sample.2;
# rescale.p = TRUE rescales those values so they sum to 1.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
# NOTE(review): both vectors hold continuous PRSM values rather than category
# counts, so this only demonstrates the call syntax - confirm intended input.
chisq.test(sample.1, p = sample.2, rescale.p = TRUE)

## Warning in chisq.test(sample.1, p = sample.2, rescale.p = T): Chi-squared
## approximation may be incorrect

## 
##  Chi-squared test for given probabilities
## 
## data:  sample.1
## X-squared = 25.281, df = 199, p-value = 1

Basic functions and operations on data in R

Abhay Padda

7 January 2017

Basic functions and operations in R