We will take several samples from SBA bank loan data: a simple random sample, a systematic sample, a stratified sample based on a categorical variable, and a cluster sample.
To begin, we read in the data, which is split across nine CSV files. The combined dataset has 27 variables and roughly 900,000 loan records.
library(dplyr)   # for %>%, group_by, summarize, mutate, filter
library(knitr)   # for kable
loan01 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational01.csv", header = TRUE)[, -1]
loan02 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational02.csv", header = TRUE)[, -1]
loan03 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational03.csv", header = TRUE)[, -1]
loan04 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational04.csv", header = TRUE)[, -1]
loan05 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational05.csv", header = TRUE)[, -1]
loan06 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational06.csv", header = TRUE)[, -1]
loan07 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational07.csv", header = TRUE)[, -1]
loan08 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational08.csv", header = TRUE)[, -1]
loan09 = read.csv("https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational09.csv", header = TRUE)[, -1]
loan = rbind(loan01, loan02, loan03, loan04, loan05, loan06, loan07, loan08, loan09)
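Since the nine files share the same naming pattern, they can equivalently be read in a loop; the following is just an alternative sketch of the same step.
# Alternative: read the nine files in a loop and stack them
base.url = "https://pengdsci.github.io/datasets/SBAloan/w06-SBAnational0%d.csv"
loan.list = lapply(1:9, function(i) read.csv(sprintf(base.url, i), header = TRUE)[, -1])
loan = do.call(rbind, loan.list)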
First, we delete all records in which MIS_Status is missing (recorded as an empty string).
loan <- loan[!(loan$MIS_Status %in% ""),]
loan %>% group_by(MIS_Status) %>% summarize(n=n())
## # A tibble: 2 × 2
## MIS_Status n
## <chr> <int>
## 1 CHGOFF 157558
## 2 P I F 739609
Next, we look at the distribution of the categorical variable State, which we will use to define the strata.
loan %>% group_by(State) %>% summarize(n = n())
## # A tibble: 52 × 2
## State n
## <chr> <int>
## 1 "" 13
## 2 "AK" 2403
## 3 "AL" 8360
## 4 "AR" 6333
## 5 "AZ" 17624
## 6 "CA" 130488
## 7 "CO" 20598
## 8 "CT" 12127
## 9 "DC" 1613
## 10 "DE" 2193
## # ℹ 42 more rows
We then collapse the categorical variable State into nine U.S. regions: Northeast, Middle Atlantic, South Atlantic, East South Central, West South Central, East North Central, West North Central, Mountain, and Pacific. We exclude the 13 records with a missing State. No individual state has so few observations that it needs to be excluded entirely.
loan <- loan[!(loan$State %in% ""),]
loan <- loan %>% mutate(
StateRegion = case_when(
State %in% c("MA", "ME", "VT", "NH", "CT", "RI") ~ "Northeast",
State %in% c("NY", "PA", "NJ") ~ "Middle Atlantic",
State %in% c("WV", "MD", "DE", "DC", "VA", "NC", "GA", "FL", "SC") ~ "South Atlantic",
State %in% c("KY", "TN", "AL", "MS") ~ "East South Central",
State %in% c("OK", "AR", "LA", "TX") ~ "West South Central",
State %in% c("OH", "IN", "IL", "MI", "WI") ~ "East North Central",
State %in% c("MN", "IA", "MO", "KS", "NE", "SD", "ND") ~ "West North Central",
State %in% c("MT", "WY", "CO", "UT", "AZ", "NM","NV", "ID") ~ "Mountain",
State %in% c("AK", "OR", "CA", "AK", "HI","WA") ~ "Pacific",
TRUE ~ "Other"
)
)
loan %>% group_by(StateRegion) %>% summarize(n = n())
## # A tibble: 9 × 2
## StateRegion n
## <chr> <int>
## 1 East North Central 117685
## 2 East South Central 33148
## 3 Middle Atlantic 116205
## 4 Mountain 92113
## 5 Northeast 69560
## 6 Pacific 170794
## 7 South Atlantic 116902
## 8 West North Central 84498
## 9 West South Central 96249
Finally, we will define clusters based on the variable Zip. In a U.S. zip code, the first digit identifies a group of states, the second and third digits together identify a region within that group, and the fourth and fifth digits identify a group of delivery addresses within that region. We will create grouping variables so that there is one cluster level for each state grouping, three region groups within each state grouping, and two postal groups within each region group.
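The step that actually builds these grouping variables is not shown in the original code, so the sketch below is an assumed construction: the variable names stategroup, regiongroup, and postalgroup are chosen to match the cluster-sampling loop used later, and the particular bin boundaries for the second/third and fourth/fifth digits are assumptions (any split into three and two groups, respectively, would serve).
# Assumed construction of the cluster grouping variables from Zip
zip5 = sprintf("%05d", as.integer(loan$Zip))          # pad Zip to five digits (assumes Zip is stored numerically)
loan$stategroup = as.integer(substr(zip5, 1, 1)) + 1  # first digit (0-9), shifted to 1-10
loan$regiongroup = as.integer(cut(as.integer(substr(zip5, 2, 3)),
                                  breaks = c(-1, 33, 66, 99), labels = 1:3))  # digits 2-3 binned into 3 groups
loan$postalgroup = ifelse(as.integer(substr(zip5, 4, 5)) < 50, 1, 2)          # digits 4-5 split into 2 groups
With the grouping variables in place, we take the four samples. We begin with a simple random sample of 4,000 observations.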
loan$sampling.frame = 1:length(loan$GrAppv)          # add a sampling-frame ID to each observation
# names(loan)                                        # check that the sampling-frame variable was added
sampled.list = sample(1:length(loan$GrAppv), 4000)   # randomly select 4000 IDs from the frame
SRS.sample = loan[sampled.list, ]                    # extract the selected observations
## dimension check
dimension.SRS = dim(SRS.sample)
names(dimension.SRS) = c("Size", "Var.count")
kable(t(dimension.SRS)) # checking the sample size
| Size | Var.count |
|------|-----------|
| 4000 | 32 |
Next, we perform a systematic sample.
jump.size = dim(loan)[1] %/% 4000              # jump size: population size divided by the target sample size
# jump.size                                    # check the jump size
rand.starting.pt = sample(1:jump.size, 1)      # random starting point within the first jump
sampling.id = seq(rand.starting.pt, dim(loan)[1], jump.size)   # systematic sampling IDs
# length(sampling.id)                          # check how many IDs were selected
sys.sample = loan[sampling.id, ]               # extract the sampling units of the systematic sample
sys.Sample.dim = dim(sys.sample)
names(sys.Sample.dim) = c("Size", "Var.count")
kable(t(sys.Sample.dim))
| Size | Var.count |
|------|-----------|
| 4005 | 32 |
The actual systematic sample size (4,005) is slightly larger than the target of 4,000 because the jump size is the population size divided by 4,000 and rounded down, so slightly more than 4,000 selection points fit within the population.
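We can confirm the count from the jump size and starting point; this is only a quick sanity check of the arithmetic.
# expected number of selected IDs: floor((N - start) / jump) + 1
floor((dim(loan)[1] - rand.starting.pt) / jump.size) + 1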
Next, we perform a stratified sample, taking an SRS within each stratum, with sample sizes approximately proportional to the stratum sizes. We first compute the allocation.
freq.table = table(loan$StateRegion)       # frequency table of StateRegion
rel.freq = freq.table/sum(freq.table)      # relative frequencies
strata.size = round(rel.freq*4000)         # proportional allocation of the 4000 sampled units
strata.names = names(strata.size)
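To inspect the allocation, we can display the stratum sizes; because each stratum size is rounded, the total is only approximately 4,000.
kable(t(strata.size))   # allocated sample size per region
sum(strata.size)        # total allocation, approximately 4000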
Then, we take stratified samples.
strata.sample = loan[1, ]                   # reference data frame holding a placeholder row
strata.sample$add.id = 1                    # temporary within-stratum ID used inside the loop
# i = 2                                     # for testing a single iteration
for (i in 1:length(strata.names)){
  ith.strata.name = strata.names[i]         # name of the i-th stratum
  ith.strata.size = strata.size[i]          # allocated sample size for the i-th stratum
  # identify the observations belonging to the i-th stratum
  ith.sampling.id = which(loan$StateRegion == ith.strata.name)
  ith.strata = loan[ith.sampling.id, ]      # i-th stratum (sub-population)
  ith.strata$add.id = 1:dim(ith.strata)[1]  # within-stratum sampling frame
  # randomly select the allocated number of IDs within the stratum
  ith.sampling.id = sample(1:dim(ith.strata)[1], ith.strata.size)
  # select the sampled rows -- note the operator %in%
  ith.sample = ith.strata[ith.strata$add.id %in% ith.sampling.id, ]
  # dim(ith.sample)                         # check the stratum sample
  strata.sample = rbind(strata.sample, ith.sample)   # stack the stratum samples
}
# dim(strata.sample)
strat.sample.final = strata.sample[-1, ]    # drop the placeholder row
strat.sample.final$add.id = NULL            # remove the temporary ID column
# kable(head(strat.sample.final))           # accuracy check
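As with the earlier samples, we can check the dimensions of the stratified sample; the row count should match the total allocation, which is approximately 4,000.
strat.sample.dim = dim(strat.sample.final)
names(strat.sample.dim) = c("Size", "Var.count")
kable(t(strat.sample.dim))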
For our cluster sample, within each of the ten state groupings we randomly choose two of the three region groups and, for each chosen region group, one of the two postal groups; within each of the resulting clusters we then take a simple random sample of 200 observations.
cluster.sample = loan[1, ]                   # reference data frame holding a placeholder row
for (i in 1:10){
  rand1 <- sample(1:3, 2, replace = FALSE)   # choose two of the three region groups
  rand2 <- sample(1:2, 2, replace = FALSE)   # choose a postal group for each chosen region group
  # the two selected clusters within the i-th state grouping
  sample1 <- loan %>% filter(stategroup == i & regiongroup == rand1[1] & postalgroup == rand2[1])
  sample2 <- loan %>% filter(stategroup == i & regiongroup == rand1[2] & postalgroup == rand2[2])
  # simple random sample of 200 observations within each cluster
  sampled.list1 = sample(1:nrow(sample1), 200)
  sampled.list2 = sample(1:nrow(sample2), 200)
  SRS.sample1 = sample1[sampled.list1, ]     # draw from the cluster itself, not from the full data set
  SRS.sample2 = sample2[sampled.list2, ]
  cluster.sample = rbind(cluster.sample, SRS.sample1)
  cluster.sample = rbind(cluster.sample, SRS.sample2)
}
cluster.sample.final = cluster.sample[-1, ]  # drop the placeholder row
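Finally, a dimension check analogous to the earlier samples. With ten state groupings, two clusters per grouping, and 200 observations per cluster, the cluster sample should contain 4,000 rows, assuming every selected cluster has at least 200 observations.
cluster.sample.dim = dim(cluster.sample.final)
names(cluster.sample.dim) = c("Size", "Var.count")
kable(t(cluster.sample.dim))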