Assignment #4

Introduction

Since the original data set was split into 9 data sets on Github, I load each data set individually into R and then combine them into one large data set titled bankLoan.

loan01 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/w06-SBAnational09.csv", header = TRUE)[, -1]
bankLoan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)

Choosing our Stratification Variable

Next, I decide to use ApprovalFY for my stratification variable. ApprovalFY is the fiscal year of commitment for each observation.

Below is the breakdown of the last 2 digits of the ApprovalFY year and their distributions across the data set.

ApprovalFY.4.digits = substr(bankLoan$ApprovalFY, 1, 4)
bankLoan$ApprovalFY4Digit = ApprovalFY.4.digits
ftable = table(bankLoan$ApprovalFY4Digit)
kable(t(ftable))

1962	1965	1966	1967	1968	1969	1970	1971	1972	1973	1974	1975	1976	1977	1978	1979	1980	1981	1982	1983	1984	1985	1986	1987	1988	1989	1990	1991	1992	1993	1994	1995	1996	1997	1998	1999	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014
1	1	1	2	2	4	8	20	27	52	42	30	84	137	242	352	477	630	719	1684	2022	1944	2118	2218	1898	13248	14859	15666	20885	23305	31598	45758	40112	37748	36016	37363	37381	37350	44391	58193	68290	77525	76040	71876	39540	19126	16848	12608	5997	2458	268

As you can see above, several years have relatively small sizes (particularly before the 1990’s). Therefore, I will combine the observations from the 1960’s, 1970’s, and 1980’s into a new category called 1989.

cate.pre.90s = c("1962", "1965", "1966", "1967", "1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", "1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", "1984", "1985", "1986", "1987", "1988", "1989")

ApprovalFY4Digit0 = bankLoan$ApprovalFY4Digit
ApprovalFY4Digit = bankLoan$ApprovalFY4Digit

logic.pre.90s = ApprovalFY4Digit %in% cate.pre.90s
ApprovalFY4Digit[logic.pre.90s] = 1989

bankLoan$strApprovalFY = ApprovalFY4Digit

x.table = table(bankLoan$strApprovalFY, bankLoan$MIS_Status)

no.lab = x.table[,1]
default = x.table[,2]
no.default = x.table[,3]
default.rate = round(100*default/(default+no.default),1)
default.status.rate = cbind(no.lab = no.lab, default = default, no.default = no.default, default.rate = default.rate)

year.name = c("1989 and Before", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014")

#rownames(default.status.rate) = year.name
kable(default.status.rate)

	no.lab	default	no.default	default.rate
1989	84	7791	20088	27.9
1990	0	663	14196	4.5
1991	6	443	15217	2.8
1992	10	450	20425	2.2
1993	6	434	22865	1.9
1994	14	696	30888	2.2
1995	70	1296	44392	2.8
1996	91	1646	38375	4.1
1997	30	2247	35471	6.0
1998	11	2970	33035	8.2
1999	15	3694	33654	9.9
2000	29	4266	33086	11.4
2001	33	4450	32867	11.9
2002	84	5187	39120	11.7
2003	193	8425	49575	14.5
2004	95	12306	55889	18.0
2005	567	19479	57479	25.3
2006	284	26517	49239	35.0
2007	227	30658	40991	42.8
2008	82	16250	23208	41.2
2009	23	3971	15132	20.8
2010	20	2312	14516	13.7
2011	15	989	11604	7.9
2012	5	343	5649	5.7
2013	3	70	2385	2.9
2014	0	5	263	1.9

As seen above, the observations from the 1960’s, 1970’s, and 1980’s have been combined into a new category called 1989.

study.pop = bankLoan
kable(t(table(study.pop$strApprovalFY)))

1989	1990	1991	1992	1993	1994	1995	1996	1997	1998	1999	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014
27963	14859	15666	20885	23305	31598	45758	40112	37748	36016	37363	37381	37350	44391	58193	68290	77525	76040	71876	39540	19126	16848	12608	5997	2458	268

We have now defined our study population and can move on to sampling!

Type 1: Simple Random Sampling

study.pop$sampling.frame = 1:length(study.pop$GrAppv)

sampled.list = sample(1:length(study.pop$GrAppv), 4000)
SRS.sample = study.pop[sampled.list,]

dimension.SRS = dim(SRS.sample)
names(dimension.SRS) = c("Size", "Variable.Count")
kable(t(dimension.SRS))

Size	Variable.Count
4000	30

Above I defined a sampling list and added it to the study population for simple random sampling.

Type 2: Systematic Sampling

jump.size = dim(study.pop)[1]%/%4000

random.starting.point = sample(1:jump.size, 1)
sampling.id = seq(random.starting.point, dim(study.pop)[1], jump.size)

sys.sample = study.pop[sampling.id,]
sys.sample.dim = dim(sys.sample)

names(sys.sample.dim) = c("Size", "Variable.Count")
kable(t(sys.sample.dim))

Size	Variable.Count
4014	30

Since the jump size involves rounding error and we have a large population, the actual systematic sample size is slightly different from the target size of 4000.

Type 3: Stratified Sampling

First, I calculate the SRS size for each stratum and then take the SRS from the corresponding stratum.

freq.table = table(study.pop$strApprovalFY)
rel.freq = freq.table/sum(freq.table)
strata.size = round(rel.freq*4000)
strata.names = names(strata.size)
kable(t(strata.size))

1989	1990	1991	1992	1993	1994	1995	1996	1997	1998	1999	2000	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014
124	66	70	93	104	141	204	178	168	160	166	166	166	197	259	304	345	338	320	176	85	75	56	27	11	1

Next, I take stratified samples.

strata.sample = study.pop[1,]
strata.sample$add.id = 1

for (i in 1:length(strata.names)){
  ith.strata.names = strata.names[i]
  ith.strata.size = strata.size[i]
  
  ith.sampling.id = which(study.pop$strApprovalFY == ith.strata.names)
  ith.strata = study.pop[ith.sampling.id,]
  ith.strata$add.id = 1:dim(ith.strata)[1]
  
  ith.sampling.id = sample(1:dim(ith.strata)[1], ith.strata.size)
  
  ith.sample = ith.strata[ith.strata$add.id %in% ith.sampling.id,]
  strata.sample = rbind(strata.sample, ith.sample)
}

strat.sample.final = strata.sample[-1,]

kable(head(strat.sample.final))

	LoanNr_ChkDgt	Name	City	State	Zip	Bank	BankState	ApprovalDate	ApprovalFY	Term	NoEmp	NewExist	RevLineCr	LowDoc	ChgOffDate	DisbursementDate	DisbursementGross	BalanceGross	MIS_Status	ChgOffPrinGr	GrAppv	SBA_Appv	ApprovalFY4Digit	strApprovalFY	sampling.frame	add.id
44	1000653000	LARRY SCHOETTMER FORD INC	EDINBURGH	IN	46124	JPMORGAN CHASE BANK NATL ASSOC	IN	11-Jun-80	1980	120	16	2	Y	N	4-Oct-89	31-Jul-80	$197,485.00	$0.00	CHGOFF	$44,374.00	$200,000.00	$150,000.00	1980	1989	44	2
11342	1081373000	GREENHILL NURSERY & LANDSCAPE	JOHNSON CITY	TN	37601	FEDERAL DEPOSIT INSUR CORP	TN	3-Oct-80	1981	25	12	1	N	N	14-Dec-88	6-Jan-81	$150,000.00	$0.00	CHGOFF	$29,573.00	$150,000.00	$135,000.00	1981	1989	11342	162
18583	1136093008	IDEAL PROFESSIONAL COLOR LAB	MOUNTLAKE TERRACE	WA	98043	BANK OF AMERICA NATL ASSOC	NC	30-Dec-80	1981	102	15	2	N	N	17-May-91	17-Feb-81	$150,000.00	$0.00	CHGOFF	$19,662.00	$150,000.00	$135,000.00	1981	1989	18583	258
32124	1242433007	MADERA FOOD MART	MADERA	CA	93637	WESTAMERICA BANK	CA	3-Jun-81	1981	99	35	1	N	N	23-Jun-93	8-Jul-81	$360,000.00	$0.00	CHGOFF	$1,954.00	$360,000.00	$324,000.00	1981	1989	32124	467
45751	1354643005	ASTRO JOHNNY/TYLER FIBLS.SYS.	TYLERR	TX	75710	FEDERAL DEPOSIT INSUR CORP	DC	3-Mar-82	1982	66	6	1	N	N	27-Sep-91	30-Mar-82	$110,000.00	$0.00	CHGOFF	$82,666.00	$110,000.00	$97,900.00	1982	1989	45751	737
48541	1375853008	J L MOORE BLDG CONTRACTOR INC	ODESSA	TX	79760	LOANS FROM OLD CLOSED LENDERS	DC	29-Apr-82	1982	240	21	1	N	N	25-Jan-89	21-May-82	$380,000.00	$0.00	CHGOFF	$190,319.00	$380,000.00	$342,000.00	1982	1989	48541	828

As seen above, the stratified sample is taken and completed!

Performance Analysis

I use the population level industry specific rates as a reference and compare them to the sample level industry rates. Below is the same table that I created from above (for reference).

x.table = table(bankLoan$strApprovalFY, bankLoan$MIS_Status)

no.lab = x.table[,1]
default = x.table[,2]
no.default = x.table[,3]
default.rate = round(100*default/(default+no.default),1)
default.status.rate = cbind(no.lab = no.lab, default = default, no.default = no.default, default.rate = default.rate)

rownames(default.status.rate) = year.name
kable(default.status.rate)

	no.lab	default	no.default	default.rate
1989 and Before	84	7791	20088	27.9
1990	0	663	14196	4.5
1991	6	443	15217	2.8
1992	10	450	20425	2.2
1993	6	434	22865	1.9
1994	14	696	30888	2.2
1995	70	1296	44392	2.8
1996	91	1646	38375	4.1
1997	30	2247	35471	6.0
1998	11	2970	33035	8.2
1999	15	3694	33654	9.9
2000	29	4266	33086	11.4
2001	33	4450	32867	11.9
2002	84	5187	39120	11.7
2003	193	8425	49575	14.5
2004	95	12306	55889	18.0
2005	567	19479	57479	25.3
2006	284	26517	49239	35.0
2007	227	30658	40991	42.8
2008	82	16250	23208	41.2
2009	23	3971	15132	20.8
2010	20	2312	14516	13.7
2011	15	989	11604	7.9
2012	5	343	5649	5.7
2013	3	70	2385	2.9
2014	0	5	263	1.9

Simple Random Sampling Check

Below, I create the following table based on the Simple Random Sampling.

x.table = table(SRS.sample$strApprovalFY, SRS.sample$MIS_Status)

no.lab.srs = x.table[,1]
default.srs = x.table[,2]
no.default.srs = x.table[,3]
default.rate.srs = round(100*default.srs/(default.srs+no.default.srs),1)

year.code = names(default.rate.srs)

default.rate.pop = default.rate[year.code]

SRS.pop.rates = cbind(default.rate.pop, default.rate.srs)
rownames(SRS.pop.rates) = year.name
kable(SRS.pop.rates)

	default.rate.pop	default.rate.srs
1989 and Before	27.9	29.1
1990	4.5	1.8
1991	2.8	2.9
1992	2.2	2.2
1993	1.9	0.0
1994	2.2	1.4
1995	2.8	4.4
1996	4.1	3.0
1997	6.0	5.4
1998	8.2	5.6
1999	9.9	7.6
2000	11.4	9.7
2001	11.9	10.8
2002	11.7	11.2
2003	14.5	13.9
2004	18.0	18.6
2005	25.3	24.7
2006	35.0	34.7
2007	42.8	42.6
2008	41.2	41.6
2009	20.8	23.8
2010	13.7	15.6
2011	7.9	7.3
2012	5.7	15.6
2013	2.9	0.0
2014	1.9	0.0

Some of the years are similar but others are significantly different. There could be a better sampling technique for our data set. I will continue to compare sampling techniques below.

Systematic Sampling Check

Below, I create the following table based on the Systematic Sampling.

x.table = table(sys.sample$strApprovalFY, sys.sample$MIS_Status)

no.lab.sys = x.table[,1]
default.sys = x.table[,2]
no.default.sys = x.table[,3]
default.rate.sys = round(100*default.sys/(default.sys + no.default.sys), 1)
sys.SRS.pop.rates = cbind(default.rate.pop, default.rate.srs, default.rate.sys)

rownames(SRS.pop.rates) = year.name
kable(sys.SRS.pop.rates)

	default.rate.pop	default.rate.srs	default.rate.sys
1989	27.9	29.1	29.7
1990	4.5	1.8	1.5
1991	2.8	2.9	0.0
1992	2.2	2.2	3.5
1993	1.9	0.0	0.0
1994	2.2	1.4	2.9
1995	2.8	4.4	2.0
1996	4.1	3.0	3.4
1997	6.0	5.4	7.1
1998	8.2	5.6	9.6
1999	9.9	7.6	10.9
2000	11.4	9.7	12.0
2001	11.9	10.8	9.1
2002	11.7	11.2	9.6
2003	14.5	13.9	12.6
2004	18.0	18.6	15.6
2005	25.3	24.7	27.1
2006	35.0	34.7	35.1
2007	42.8	42.6	40.6
2008	41.2	41.6	42.0
2009	20.8	23.8	24.1
2010	13.7	15.6	14.5
2011	7.9	7.3	7.9
2012	5.7	15.6	8.3
2013	2.9	0.0	0.0
2014	1.9	0.0	0.0

It seems that the systematic sample performs similarly to the SRS sample. I will do one last comparison next.

Stratified Sampling Check

In this final comparison, I put all three sampling techniques into one table

x.table = table(strat.sample.final$strApprovalFY, strat.sample.final$MIS_Status)

no.lab.str = x.table[,1]
default.str = x.table[,2]
no.default.str = x.table[,3]
default.rate.str = round(100*default.str/(default.str + no.default.str),1)
str.SRS.pop.rates = cbind(default.rate.pop, default.rate.srs, default.rate.sys, default.rate.str)

rownames(str.SRS.pop.rates) = year.name
kable(str.SRS.pop.rates)

	default.rate.pop	default.rate.srs	default.rate.sys	default.rate.str
1989 and Before	27.9	29.1	29.7	26.6
1990	4.5	1.8	1.5	3.0
1991	2.8	2.9	0.0	4.3
1992	2.2	2.2	3.5	2.2
1993	1.9	0.0	0.0	1.9
1994	2.2	1.4	2.9	2.8
1995	2.8	4.4	2.0	2.0
1996	4.1	3.0	3.4	2.8
1997	6.0	5.4	7.1	5.4
1998	8.2	5.6	9.6	9.4
1999	9.9	7.6	10.9	13.9
2000	11.4	9.7	12.0	11.4
2001	11.9	10.8	9.1	12.0
2002	11.7	11.2	9.6	11.2
2003	14.5	13.9	12.6	17.4
2004	18.0	18.6	15.6	16.8
2005	25.3	24.7	27.1	26.9
2006	35.0	34.7	35.1	36.5
2007	42.8	42.6	40.6	43.4
2008	41.2	41.6	42.0	41.5
2009	20.8	23.8	24.1	18.8
2010	13.7	15.6	14.5	12.0
2011	7.9	7.3	7.9	5.4
2012	5.7	15.6	8.3	3.8
2013	2.9	0.0	0.0	0.0
2014	1.9	0.0	0.0	0.0

Visualization

Now I create a statistical graphic to compare the default rates among the samples.

n = length(default.rate.pop)

plot(NULL, xlim = c(0,n), ylim=c(0,60), xlab = "Approval Year", ylab = "Default Rates (Percentage)", axes = FALSE)

title("Comparison of Industry-Specific Default Rates")
points(1:n, as.vector(default.rate.pop), pch = 16, col = "blue4", cex = 0.8)
lines(1:n, as.vector(default.rate.pop), lty = 1, col = "blue4", cex = 0.8)

points(1:n, as.vector(default.rate.srs), pch = 17, col = "blue", cex = 0.8)
lines(1:n, as.vector(default.rate.srs), lty = 2, col = "blue", cex = 0.8)

points(1:n, as.vector(default.rate.sys), pch = 19, col = "deepskyblue", cex = 0.8)
lines(1:n, as.vector(default.rate.sys), lty = 3, col = "deepskyblue", cex = 0.8)

points(1:n, as.vector(default.rate.str), pch = 20, col = "cyan3", cex = 0.8)
lines(1:n, as.vector(default.rate.str), lty = 4, col = "cyan3", cex = 0.8)

axis(1, at = 1:n, label = year.code)
axis(2, las = 2)

clr = c("blue4", "blue", "deepskyblue", "cyan3")
rowMax = apply(str.SRS.pop.rates, 1, max)
segments(1:n, rep(0,n), 1:n, rowMax, lty = 2, col = "lightgray", lwd = 0.5)
legend("topleft", c("Population", "Simple Random Sampling", "Systematic Sampling", "Stratified Sampling"), lty = 1:4, col = clr, pch = c(16, 17, 19, 20), cex = 0.6, bty = "n")

The following table is used for ease of interpretation

kable(cbind(year.name, year.code))

year.name	year.code
1989 and Before	1989
1990	1990
1991	1991
1992	1992
1993	1993
1994	1994
1995	1995
1996	1996
1997	1997
1998	1998
1999	1999
2000	2000
2001	2001
2002	2002
2003	2003
2004	2004
2005	2005
2006	2006
2007	2007
2008	2008
2009	2009
2010	2010
2011	2011
2012	2012
2013	2013
2014	2014