Introduction

Since the original data set was split into 9 data sets on Github, I load each data set individually into R and then combine them into one large data set titled bankLoan.

loan01 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational09.csv", header = TRUE)[, -1]
bankLoan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)

Choosing our Stratification Variable

Next, I decide to use ApprovalFY for my stratification variable. ApprovalFY is the fiscal year of commitment for each observation.

Below is the breakdown of the last 2 digits of the ApprovalFY year and their distributions across the data set.

ApprovalFY.4.digits = substr(bankLoan$ApprovalFY, 1, 4)
bankLoan$ApprovalFY4Digit = ApprovalFY.4.digits
ftable = table(bankLoan$ApprovalFY4Digit)
kable(t(ftable))
1962 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
1 1 1 2 2 4 8 20 27 52 42 30 84 137 242 352 477 630 719 1684 2022 1944 2118 2218 1898 13248 14859 15666 20885 23305 31598 45758 40112 37748 36016 37363 37381 37350 44391 58193 68290 77525 76040 71876 39540 19126 16848 12608 5997 2458 268

As you can see above, several years have relatively small sizes (particularly before the 1990’s). Therefore, I will combine the observations from the 1960’s, 1970’s, and 1980’s into a new category called 1989.

cate.pre.90s = c("1962", "1965", "1966", "1967", "1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", "1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", "1984", "1985", "1986", "1987", "1988", "1989")

ApprovalFY4Digit0 = bankLoan$ApprovalFY4Digit
ApprovalFY4Digit = bankLoan$ApprovalFY4Digit

logic.pre.90s = ApprovalFY4Digit %in% cate.pre.90s
ApprovalFY4Digit[logic.pre.90s] = 1989

bankLoan$strApprovalFY = ApprovalFY4Digit
x.table = table(bankLoan$strApprovalFY, bankLoan$MIS_Status)

no.lab = x.table[,1]
default = x.table[,2]
no.default = x.table[,3]
default.rate = round(100*default/(default+no.default),1)
default.status.rate = cbind(no.lab = no.lab, default = default, no.default = no.default, default.rate = default.rate)

year.name = c("1989 and Before", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014")

#rownames(default.status.rate) = year.name
kable(default.status.rate)
no.lab default no.default default.rate
1989 84 7791 20088 27.9
1990 0 663 14196 4.5
1991 6 443 15217 2.8
1992 10 450 20425 2.2
1993 6 434 22865 1.9
1994 14 696 30888 2.2
1995 70 1296 44392 2.8
1996 91 1646 38375 4.1
1997 30 2247 35471 6.0
1998 11 2970 33035 8.2
1999 15 3694 33654 9.9
2000 29 4266 33086 11.4
2001 33 4450 32867 11.9
2002 84 5187 39120 11.7
2003 193 8425 49575 14.5
2004 95 12306 55889 18.0
2005 567 19479 57479 25.3
2006 284 26517 49239 35.0
2007 227 30658 40991 42.8
2008 82 16250 23208 41.2
2009 23 3971 15132 20.8
2010 20 2312 14516 13.7
2011 15 989 11604 7.9
2012 5 343 5649 5.7
2013 3 70 2385 2.9
2014 0 5 263 1.9

As seen above, the observations from the 1960’s, 1970’s, and 1980’s have been combined into a new category called 1989.

study.pop = bankLoan
kable(t(table(study.pop$strApprovalFY)))
1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
27963 14859 15666 20885 23305 31598 45758 40112 37748 36016 37363 37381 37350 44391 58193 68290 77525 76040 71876 39540 19126 16848 12608 5997 2458 268

We have now defined our study population and can move on to sampling!

Type 1: Simple Random Sampling

study.pop$sampling.frame = 1:length(study.pop$GrAppv)

sampled.list = sample(1:length(study.pop$GrAppv), 4000)
SRS.sample = study.pop[sampled.list,]

dimension.SRS = dim(SRS.sample)
names(dimension.SRS) = c("Size", "Variable.Count")
kable(t(dimension.SRS))
Size Variable.Count
4000 30

Above I defined a sampling list and added it to the study population for simple random sampling.

Type 2: Systematic Sampling

jump.size = dim(study.pop)[1]%/%4000

random.starting.point = sample(1:jump.size, 1)
sampling.id = seq(random.starting.point, dim(study.pop)[1], jump.size)

sys.sample = study.pop[sampling.id,]
sys.sample.dim = dim(sys.sample)

names(sys.sample.dim) = c("Size", "Variable.Count")
kable(t(sys.sample.dim))
Size Variable.Count
4014 30

Since the jump size involves rounding error and we have a large population, the actual systematic sample size is slightly different from the target size of 4000.

Type 3: Stratified Sampling

First, I calculate the SRS size for each stratum and then take the SRS from the corresponding stratum.

freq.table = table(study.pop$strApprovalFY)
rel.freq = freq.table/sum(freq.table)
strata.size = round(rel.freq*4000)
strata.names = names(strata.size)
kable(t(strata.size))
1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
124 66 70 93 104 141 204 178 168 160 166 166 166 197 259 304 345 338 320 176 85 75 56 27 11 1

Next, I take stratified samples.

strata.sample = study.pop[1,]
strata.sample$add.id = 1

for (i in 1:length(strata.names)){
  ith.strata.names = strata.names[i]
  ith.strata.size = strata.size[i]
  
  ith.sampling.id = which(study.pop$strApprovalFY == ith.strata.names)
  ith.strata = study.pop[ith.sampling.id,]
  ith.strata$add.id = 1:dim(ith.strata)[1]
  
  ith.sampling.id = sample(1:dim(ith.strata)[1], ith.strata.size)
  
  ith.sample = ith.strata[ith.strata$add.id %in% ith.sampling.id,]
  strata.sample = rbind(strata.sample, ith.sample)
}

strat.sample.final = strata.sample[-1,]

kable(head(strat.sample.final))
LoanNr_ChkDgt Name City State Zip Bank BankState NAICS ApprovalDate ApprovalFY Term NoEmp NewExist CreateJob RetainedJob FranchiseCode UrbanRural RevLineCr LowDoc ChgOffDate DisbursementDate DisbursementGross BalanceGross MIS_Status ChgOffPrinGr GrAppv SBA_Appv ApprovalFY4Digit strApprovalFY sampling.frame add.id
44 1000653000 LARRY SCHOETTMER FORD INC EDINBURGH IN 46124 JPMORGAN CHASE BANK NATL ASSOC IN 0 11-Jun-80 1980 120 16 2 0 0 0 0 Y N 4-Oct-89 31-Jul-80 $197,485.00 $0.00 CHGOFF $44,374.00 $200,000.00 $150,000.00 1980 1989 44 2
11342 1081373000 GREENHILL NURSERY & LANDSCAPE JOHNSON CITY TN 37601 FEDERAL DEPOSIT INSUR CORP TN 0 3-Oct-80 1981 25 12 1 0 0 0 0 N N 14-Dec-88 6-Jan-81 $150,000.00 $0.00 CHGOFF $29,573.00 $150,000.00 $135,000.00 1981 1989 11342 162
18583 1136093008 IDEAL PROFESSIONAL COLOR LAB MOUNTLAKE TERRACE WA 98043 BANK OF AMERICA NATL ASSOC NC 0 30-Dec-80 1981 102 15 2 0 0 0 0 N N 17-May-91 17-Feb-81 $150,000.00 $0.00 CHGOFF $19,662.00 $150,000.00 $135,000.00 1981 1989 18583 258
32124 1242433007 MADERA FOOD MART MADERA CA 93637 WESTAMERICA BANK CA 0 3-Jun-81 1981 99 35 1 0 0 0 0 N N 23-Jun-93 8-Jul-81 $360,000.00 $0.00 CHGOFF $1,954.00 $360,000.00 $324,000.00 1981 1989 32124 467
45751 1354643005 ASTRO JOHNNY/TYLER FIBLS.SYS. TYLERR TX 75710 FEDERAL DEPOSIT INSUR CORP DC 0 3-Mar-82 1982 66 6 1 0 0 0 0 N N 27-Sep-91 30-Mar-82 $110,000.00 $0.00 CHGOFF $82,666.00 $110,000.00 $97,900.00 1982 1989 45751 737
48541 1375853008 J L MOORE BLDG CONTRACTOR INC ODESSA TX 79760 LOANS FROM OLD CLOSED LENDERS DC 0 29-Apr-82 1982 240 21 1 0 0 0 0 N N 25-Jan-89 21-May-82 $380,000.00 $0.00 CHGOFF $190,319.00 $380,000.00 $342,000.00 1982 1989 48541 828

As seen above, the stratified sample is taken and completed!

Performance Analysis

I use the population level industry specific rates as a reference and compare them to the sample level industry rates. Below is the same table that I created from above (for reference).

x.table = table(bankLoan$strApprovalFY, bankLoan$MIS_Status)

no.lab = x.table[,1]
default = x.table[,2]
no.default = x.table[,3]
default.rate = round(100*default/(default+no.default),1)
default.status.rate = cbind(no.lab = no.lab, default = default, no.default = no.default, default.rate = default.rate)

rownames(default.status.rate) = year.name
kable(default.status.rate)
no.lab default no.default default.rate
1989 and Before 84 7791 20088 27.9
1990 0 663 14196 4.5
1991 6 443 15217 2.8
1992 10 450 20425 2.2
1993 6 434 22865 1.9
1994 14 696 30888 2.2
1995 70 1296 44392 2.8
1996 91 1646 38375 4.1
1997 30 2247 35471 6.0
1998 11 2970 33035 8.2
1999 15 3694 33654 9.9
2000 29 4266 33086 11.4
2001 33 4450 32867 11.9
2002 84 5187 39120 11.7
2003 193 8425 49575 14.5
2004 95 12306 55889 18.0
2005 567 19479 57479 25.3
2006 284 26517 49239 35.0
2007 227 30658 40991 42.8
2008 82 16250 23208 41.2
2009 23 3971 15132 20.8
2010 20 2312 14516 13.7
2011 15 989 11604 7.9
2012 5 343 5649 5.7
2013 3 70 2385 2.9
2014 0 5 263 1.9

Simple Random Sampling Check

Below, I create the following table based on the Simple Random Sampling.

x.table = table(SRS.sample$strApprovalFY, SRS.sample$MIS_Status)

no.lab.srs = x.table[,1]
default.srs = x.table[,2]
no.default.srs = x.table[,3]
default.rate.srs = round(100*default.srs/(default.srs+no.default.srs),1)

year.code = names(default.rate.srs)

default.rate.pop = default.rate[year.code]

SRS.pop.rates = cbind(default.rate.pop, default.rate.srs)
rownames(SRS.pop.rates) = year.name
kable(SRS.pop.rates)
default.rate.pop default.rate.srs
1989 and Before 27.9 29.1
1990 4.5 1.8
1991 2.8 2.9
1992 2.2 2.2
1993 1.9 0.0
1994 2.2 1.4
1995 2.8 4.4
1996 4.1 3.0
1997 6.0 5.4
1998 8.2 5.6
1999 9.9 7.6
2000 11.4 9.7
2001 11.9 10.8
2002 11.7 11.2
2003 14.5 13.9
2004 18.0 18.6
2005 25.3 24.7
2006 35.0 34.7
2007 42.8 42.6
2008 41.2 41.6
2009 20.8 23.8
2010 13.7 15.6
2011 7.9 7.3
2012 5.7 15.6
2013 2.9 0.0
2014 1.9 0.0

Some of the years are similar but others are significantly different. There could be a better sampling technique for our data set. I will continue to compare sampling techniques below.

Systematic Sampling Check

Below, I create the following table based on the Systematic Sampling.

x.table = table(sys.sample$strApprovalFY, sys.sample$MIS_Status)

no.lab.sys = x.table[,1]
default.sys = x.table[,2]
no.default.sys = x.table[,3]
default.rate.sys = round(100*default.sys/(default.sys + no.default.sys), 1)
sys.SRS.pop.rates = cbind(default.rate.pop, default.rate.srs, default.rate.sys)

rownames(SRS.pop.rates) = year.name
kable(sys.SRS.pop.rates)
default.rate.pop default.rate.srs default.rate.sys
1989 27.9 29.1 29.7
1990 4.5 1.8 1.5
1991 2.8 2.9 0.0
1992 2.2 2.2 3.5
1993 1.9 0.0 0.0
1994 2.2 1.4 2.9
1995 2.8 4.4 2.0
1996 4.1 3.0 3.4
1997 6.0 5.4 7.1
1998 8.2 5.6 9.6
1999 9.9 7.6 10.9
2000 11.4 9.7 12.0
2001 11.9 10.8 9.1
2002 11.7 11.2 9.6
2003 14.5 13.9 12.6
2004 18.0 18.6 15.6
2005 25.3 24.7 27.1
2006 35.0 34.7 35.1
2007 42.8 42.6 40.6
2008 41.2 41.6 42.0
2009 20.8 23.8 24.1
2010 13.7 15.6 14.5
2011 7.9 7.3 7.9
2012 5.7 15.6 8.3
2013 2.9 0.0 0.0
2014 1.9 0.0 0.0

It seems that the systematic sample performs similarly to the SRS sample. I will do one last comparison next.

Stratified Sampling Check

In this final comparison, I put all three sampling techniques into one table

x.table = table(strat.sample.final$strApprovalFY, strat.sample.final$MIS_Status)

no.lab.str = x.table[,1]
default.str = x.table[,2]
no.default.str = x.table[,3]
default.rate.str = round(100*default.str/(default.str + no.default.str),1)
str.SRS.pop.rates = cbind(default.rate.pop, default.rate.srs, default.rate.sys, default.rate.str)

rownames(str.SRS.pop.rates) = year.name
kable(str.SRS.pop.rates)
default.rate.pop default.rate.srs default.rate.sys default.rate.str
1989 and Before 27.9 29.1 29.7 26.6
1990 4.5 1.8 1.5 3.0
1991 2.8 2.9 0.0 4.3
1992 2.2 2.2 3.5 2.2
1993 1.9 0.0 0.0 1.9
1994 2.2 1.4 2.9 2.8
1995 2.8 4.4 2.0 2.0
1996 4.1 3.0 3.4 2.8
1997 6.0 5.4 7.1 5.4
1998 8.2 5.6 9.6 9.4
1999 9.9 7.6 10.9 13.9
2000 11.4 9.7 12.0 11.4
2001 11.9 10.8 9.1 12.0
2002 11.7 11.2 9.6 11.2
2003 14.5 13.9 12.6 17.4
2004 18.0 18.6 15.6 16.8
2005 25.3 24.7 27.1 26.9
2006 35.0 34.7 35.1 36.5
2007 42.8 42.6 40.6 43.4
2008 41.2 41.6 42.0 41.5
2009 20.8 23.8 24.1 18.8
2010 13.7 15.6 14.5 12.0
2011 7.9 7.3 7.9 5.4
2012 5.7 15.6 8.3 3.8
2013 2.9 0.0 0.0 0.0
2014 1.9 0.0 0.0 0.0

Visualization

Now I create a statistical graphic to compare the default rates among the samples.

n = length(default.rate.pop)

plot(NULL, xlim = c(0,n), ylim=c(0,60), xlab = "Approval Year", ylab = "Default Rates (Percentage)", axes = FALSE)

title("Comparison of Industry-Specific Default Rates")
points(1:n, as.vector(default.rate.pop), pch = 16, col = "blue4", cex = 0.8)
lines(1:n, as.vector(default.rate.pop), lty = 1, col = "blue4", cex = 0.8)

points(1:n, as.vector(default.rate.srs), pch = 17, col = "blue", cex = 0.8)
lines(1:n, as.vector(default.rate.srs), lty = 2, col = "blue", cex = 0.8)

points(1:n, as.vector(default.rate.sys), pch = 19, col = "deepskyblue", cex = 0.8)
lines(1:n, as.vector(default.rate.sys), lty = 3, col = "deepskyblue", cex = 0.8)

points(1:n, as.vector(default.rate.str), pch = 20, col = "cyan3", cex = 0.8)
lines(1:n, as.vector(default.rate.str), lty = 4, col = "cyan3", cex = 0.8)

axis(1, at = 1:n, label = year.code)
axis(2, las = 2)

clr = c("blue4", "blue", "deepskyblue", "cyan3")
rowMax = apply(str.SRS.pop.rates, 1, max)
segments(1:n, rep(0,n), 1:n, rowMax, lty = 2, col = "lightgray", lwd = 0.5)
legend("topleft", c("Population", "Simple Random Sampling", "Systematic Sampling", "Stratified Sampling"), lty = 1:4, col = clr, pch = c(16, 17, 19, 20), cex = 0.6, bty = "n")

The following table is used for ease of interpretation

kable(cbind(year.name, year.code))
year.name year.code
1989 and Before 1989
1990 1990
1991 1991
1992 1992
1993 1993
1994 1994
1995 1995
1996 1996
1997 1997
1998 1998
1999 1999
2000 2000
2001 2001
2002 2002
2003 2003
2004 2004
2005 2005
2006 2006
2007 2007
2008 2008
2009 2009
2010 2010
2011 2011
2012 2012
2013 2013
2014 2014