We will take several samples from SBA bank loan data: a simple random sample, a systematic sample, a stratified sample based on a categorical variable, and a cluster sample.
To begin, we read in the data, which is split across nine CSV files. The combined dataset has 27 variables and roughly 900,000 loan records.
library(dplyr)   # for %>%, group_by, summarize, mutate, filter
library(knitr)   # for kable
loan01 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational09.csv", header = TRUE)[, -1]
loan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)
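Since the nine files share the same naming pattern, they can equivalently be read in a loop; the following is just an alternative sketch of the same step.
# Alternative: read the nine files in a loop and stack them
base.url = "https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational0%d.csv"
loan.list = lapply(1:9, function(i) read.csv(sprintf(base.url, i), header = TRUE)[, -1])
loan = do.call(rbind, loan.list)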
First, we delete all records in which MIS_Status is missing (recorded as an empty string).
loan <- loan[!(loan$MIS_Status %in% ""),]
loan %>% group_by(MIS_Status) %>% summarize(n=n())
## # A tibble: 2 × 2
## MIS_Status n
## <chr> <int>
## 1 CHGOFF 157558
## 2 P I F 739609
Next, we look at the distribution of the categorical variable State, which we will use to define the strata.
loan %>% group_by(State) %>% summarize(n = n())
## # A tibble: 52 × 2
## State n
## <chr> <int>
## 1 "" 13
## 2 "AK" 2403
## 3 "AL" 8360
## 4 "AR" 6333
## 5 "AZ" 17624
## 6 "CA" 130488
## 7 "CO" 20598
## 8 "CT" 12127
## 9 "DC" 1613
## 10 "DE" 2193
## # ℹ 42 more rows
We then collapse the categorical variable State into nine U.S. regions: Northeast, Middle Atlantic, South Atlantic, East South Central, West South Central, East North Central, West North Central, Mountain, and Pacific. We exclude the 13 records with a missing State. No individual state has so few observations that it needs to be excluded entirely.
loan <- loan[!(loan$State %in% ""),]
loan <- loan %>% mutate(
StateRegion = case_when(
State %in% c("MA", "ME", "VT", "NH", "CT", "RI") ~ "Northeast",
State %in% c("NY", "PA", "NJ") ~ "Middle Atlantic",
State %in% c("WV", "MD", "DE", "DC", "VA", "NC", "GA", "FL", "SC") ~ "South Atlantic",
State %in% c("KY", "TN", "AL", "MS") ~ "East South Central",
State %in% c("OK", "AR", "LA", "TX") ~ "West South Central",
State %in% c("OH", "IN", "IL", "MI", "WI") ~ "East North Central",
State %in% c("MN", "IA", "MO", "KS", "NE", "SD", "ND") ~ "West North Central",
State %in% c("MT", "WY", "CO", "UT", "AZ", "NM","NV", "ID") ~ "Mountain",
State %in% c("AK", "OR", "CA", "AK", "HI","WA") ~ "Pacific",
TRUE ~ "Other"
)
)
loan %>% group_by(StateRegion) %>% summarize(n = n())
## # A tibble: 9 × 2
## StateRegion n
## <chr> <int>
## 1 East North Central 117685
## 2 East South Central 33148
## 3 Middle Atlantic 116205
## 4 Mountain 92113
## 5 Northeast 69560
## 6 Pacific 170794
## 7 South Atlantic 116902
## 8 West North Central 84498
## 9 West South Central 96249
Finally, we will define clusters based on the variable Zip. In a U.S. zip code, the first digit identifies a group of states, the second and third digits together identify a region within that group, and the fourth and fifth digits identify a group of delivery addresses within that region. We will create grouping variables so that there is one cluster level for each state grouping, three region groups within each state grouping, and two postal groups within each region group.
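The step that actually builds these grouping variables is not shown in the original code, so the sketch below is an assumed construction: the variable names stategroup, regiongroup, and postalgroup are chosen to match the cluster-sampling loop used later, and the particular bin boundaries for the second/third and fourth/fifth digits are assumptions (any split into three and two groups, respectively, would serve).
# Assumed construction of the cluster grouping variables from Zip
zip5 = sprintf("%05d", as.integer(loan$Zip))          # pad Zip to five digits (assumes Zip is stored numerically)
loan$stategroup = as.integer(substr(zip5, 1, 1)) + 1  # first digit (0-9), shifted to 1-10
loan$regiongroup = as.integer(cut(as.integer(substr(zip5, 2, 3)),
                                  breaks = c(-1, 33, 66, 99), labels = 1:3))  # digits 2-3 binned into 3 groups
loan$postalgroup = ifelse(as.integer(substr(zip5, 4, 5)) < 50, 1, 2)          # digits 4-5 split into 2 groups
With the grouping variables in place, we take the four samples. We begin with a simple random sample of 4,000 observations.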
loan$sampling.frame = 1:length(loan$GrAppv)          # add a sampling-frame ID to each observation
# names(loan)                                        # check that the sampling-frame variable was added
sampled.list = sample(1:length(loan$GrAppv), 4000)   # randomly select 4000 IDs from the frame
SRS.sample = loan[sampled.list, ]                    # extract the selected observations
## dimension check
dimension.SRS = dim(SRS.sample)
names(dimension.SRS) = c("Size", "Var.count")
kable(t(dimension.SRS)) # checking the sample size
| Size | Var.count |
|------|-----------|
| 4000 | 32 |
Next, we perform a systematic sample.
jump.size = dim(loan)[1] %/% 4000              # jump size: population size divided by the target sample size
# jump.size                                    # check the jump size
rand.starting.pt = sample(1:jump.size, 1)      # random starting point within the first jump
sampling.id = seq(rand.starting.pt, dim(loan)[1], jump.size)   # systematic sampling IDs
# length(sampling.id)                          # check how many IDs were selected
sys.sample = loan[sampling.id, ]               # extract the sampling units of the systematic sample
sys.Sample.dim = dim(sys.sample)
names(sys.Sample.dim) = c("Size", "Var.count")
kable(t(sys.Sample.dim))
| Size | Var.count |
|------|-----------|
| 4005 | 32 |
The actual systematic sample size (4,005) is slightly larger than the target of 4,000 because the jump size is the population size divided by 4,000 and rounded down, so slightly more than 4,000 selection points fit within the population.
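We can confirm the count from the jump size and starting point; this is only a quick sanity check of the arithmetic.
# expected number of selected IDs: floor((N - start) / jump) + 1
floor((dim(loan)[1] - rand.starting.pt) / jump.size) + 1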
Next, we perform a stratified sample, taking an SRS within each stratum, with sample sizes approximately proportional to the stratum sizes. We first compute the allocation.
freq.table = table(loan$StateRegion)       # frequency table of StateRegion
rel.freq = freq.table/sum(freq.table)      # relative frequencies
strata.size = round(rel.freq*4000)         # proportional allocation of the 4000 sampled units
strata.names = names(strata.size)
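To inspect the allocation, we can display the stratum sizes; because each stratum size is rounded, the total is only approximately 4,000.
kable(t(strata.size))   # allocated sample size per region
sum(strata.size)        # total allocation, approximately 4000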
Then, we take stratified samples.
strata.sample = loan[1, ]                   # reference data frame holding a placeholder row
strata.sample$add.id = 1                    # temporary within-stratum ID used inside the loop
# i = 2                                     # for testing a single iteration
for (i in 1:length(strata.names)){
  ith.strata.name = strata.names[i]         # name of the i-th stratum
  ith.strata.size = strata.size[i]          # allocated sample size for the i-th stratum
  # identify the observations belonging to the i-th stratum
  ith.sampling.id = which(loan$StateRegion == ith.strata.name)
  ith.strata = loan[ith.sampling.id, ]      # i-th stratum (sub-population)
  ith.strata$add.id = 1:dim(ith.strata)[1]  # within-stratum sampling frame
  # randomly select the allocated number of IDs within the stratum
  ith.sampling.id = sample(1:dim(ith.strata)[1], ith.strata.size)
  # select the sampled rows -- note the operator %in%
  ith.sample = ith.strata[ith.strata$add.id %in% ith.sampling.id, ]
  # dim(ith.sample)                         # check the stratum sample
  strata.sample = rbind(strata.sample, ith.sample)   # stack the stratum samples
}
# dim(strata.sample)
strat.sample.final = strata.sample[-1, ]    # drop the placeholder row
strat.sample.final$add.id = NULL            # remove the temporary ID column
# kable(head(strat.sample.final))           # accuracy check
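As with the earlier samples, we can check the dimensions of the stratified sample; the row count should match the total allocation, which is approximately 4,000.
strat.sample.dim = dim(strat.sample.final)
names(strat.sample.dim) = c("Size", "Var.count")
kable(t(strat.sample.dim))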
For our cluster sample, within each of the ten state groupings we randomly choose two of the three region groups and, for each chosen region group, one of the two postal groups; within each of the resulting clusters we then take a simple random sample of 200 observations.
cluster.sample = loan[1, ]                   # reference data frame holding a placeholder row
for (i in 1:10){
  rand1 <- sample(1:3, 2, replace = FALSE)   # choose two of the three region groups
  rand2 <- sample(1:2, 2, replace = FALSE)   # choose a postal group for each chosen region group
  # the two selected clusters within the i-th state grouping
  sample1 <- loan %>% filter(stategroup == i & regiongroup == rand1[1] & postalgroup == rand2[1])
  sample2 <- loan %>% filter(stategroup == i & regiongroup == rand1[2] & postalgroup == rand2[2])
  # simple random sample of 200 observations within each cluster
  sampled.list1 = sample(1:nrow(sample1), 200)
  sampled.list2 = sample(1:nrow(sample2), 200)
  SRS.sample1 = sample1[sampled.list1, ]     # draw from the cluster itself, not from the full data set
  SRS.sample2 = sample2[sampled.list2, ]
  cluster.sample = rbind(cluster.sample, SRS.sample1)
  cluster.sample = rbind(cluster.sample, SRS.sample2)
}
cluster.sample.final = cluster.sample[-1, ]  # drop the placeholder row
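Finally, a dimension check analogous to the earlier samples. With ten state groupings, two clusters per grouping, and 200 observations per cluster, the cluster sample should contain 4,000 rows, assuming every selected cluster has at least 200 observations.
cluster.sample.dim = dim(cluster.sample.final)
names(cluster.sample.dim) = c("Size", "Var.count")
kable(t(cluster.sample.dim))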