# SAMPLING METHODS
# By, Eralda Gjika
# Linkedin: https://al.linkedin.com/in/eralda-dhamo-gjika-71879128
# Department of Applied Mathematics, Faculty of Natural Science, University of Tirana, Albania
#
# Working with sampling methods in R
# For this work I will use the dataset credit which you may find here:
# Download the CSV over HTTP and parse it into a data frame.
credit <- read.csv(
  file = url("http://statmath.wu.ac.at/~vana/Intro_Data_Analytics_R/credit.csv")
)
# Inspect the structure (column names and types) of the credit dataset.
str(credit)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: chr "No" "No" "No" "No" ...
## $ student: chr "No" "Yes" "No" "No" ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Descriptive statistics for every variable in the credit dataset.
summary(credit)
## default student balance income
## Length:10000 Length:10000 Min. : 0.0 Min. : 772
## Class :character Class :character 1st Qu.: 481.7 1st Qu.:21340
## Mode :character Mode :character Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
# Libraries we need
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(manipulate)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Loading required package: ggformula
## Loading required package: ggplot2
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Have you tried the ggformula package for your plots?
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
############################################################
# SAMPLING
#########################################################
# SIMPLE RANDOM SAMPLING
# Simple Random Sampling (SRS)
# In simple random sampling, for a given sample size n, every set of n members of the population has the same chance of being the sample that is actually selected.
# Fix the random number generator state so the draw below is reproducible.
set.seed(123)
# Draw 5 values of the "balance" variable (column 3 of credit) at random.
# Sampling is done with replacement, so the same individual can be drawn
# more than once.
Zgjedhja.1 <- sample(credit[["balance"]], size = 5, replace = TRUE)
#
# STRATIFIED SAMPLING
# stratified sampling using the dplyr library
# Split credit into strata by the student variable and draw 15 rows at
# random from each stratum.
credit_sample_1 <- credit %>%
  group_by(student) %>%
  sample_n(size = 15)
# Print the stratified sample: 15 individuals per category of student.
credit_sample_1
## # A tibble: 30 x 4
## # Groups: student [2]
## default student balance income
## <chr> <chr> <dbl> <dbl>
## 1 No No 115. 27365.
## 2 Yes No 1512. 53507.
## 3 No No 213. 29592.
## 4 No No 654. 12697.
## 5 No No 1559. 38880.
## 6 No No 598. 42734.
## 7 No No 1890. 32510.
## 8 No No 1491. 39596.
## 9 No No 774. 40428.
## 10 No No 43.9 48544.
## # ... with 20 more rows
# Same approach, this time with the default variable defining the strata:
# 15 rows drawn at random from each category of default.
credit_sample_2 <- credit %>%
  group_by(default) %>%
  sample_n(size = 15)
credit_sample_2
## # A tibble: 30 x 4
## # Groups: default [2]
## default student balance income
## <chr> <chr> <dbl> <dbl>
## 1 No No 0 36337.
## 2 No No 1115. 40084.
## 3 No Yes 272. 14081.
## 4 No No 353. 17626.
## 5 No No 319. 27401.
## 6 No No 1168. 46905.
## 7 No No 1067. 49630.
## 8 No Yes 811. 17252.
## 9 No No 915. 23683.
## 10 No No 1387. 56325.
## # ... with 20 more rows
#
# Seed the generator, then draw a sample of 5 individuals from the population.
set.seed(123)
sample.1 <- popsamp(5, credit)
# Try executing the function below. You may change the size of the sample and
# the variable in the dataset, and observe how the histogram changes.
# SimpleRandom()
# Numerical characteristics of the sample, set against the population values.
m1 <- mean(~income, data = sample.1) # mean income in the sample
m1
## [1] 38696.98
m <- mean(~income, data = credit) # mean income in the population, to compare
m
## [1] 33516.98
# The same comparison for the standard deviation; other statistics work alike.
s1 <- sd(~income, data = sample.1)
s1
## [1] 22359.1
s <- sd(~income, data = credit)
s
## [1] 13336.64
## [1] 13336.64
#
# Systematic Sampling
# In a systematic sample, the members of the population are put in a row. Then 1 out of every k members is selected. The starting point is randomly chosen from the first k elements, and then elements are sampled at the same location in each of the subsequent segments of size k.
#
# Systematic sample with sampling interval k = 50: choose a random start among
# the first k rows, then take every k-th row after it.
step.size <- 50 # sampling interval k; defined once so all steps stay in sync
set.seed(1234)
# Random start position in 1..k. Careful: without the fixed seed above you
# would get a different start position every time you sample again.
start.position <- sample(seq_len(step.size), 1)
start.position # from here we move forward k positions at a time
## [1] 28
# Row indices start, start + k, start + 2k, ... up to the last row of credit.
i <- seq(from = start.position, to = nrow(credit), by = step.size)
sample.2 <- credit[i, ] # keep only the rows whose index is in i
head(sample.2)
## default student balance income
## 28 No No 1454.8633 32189.09
## 78 No No 728.3733 45131.72
## 128 No No 928.2370 33722.16
## 178 No Yes 927.8877 22473.38
## 228 No Yes 850.0961 18618.99
## 278 No No 908.7716 45032.85
#
#
# Stratified Sampling
# In a stratified sample, the population must first be separated into homogeneous groups, or strata. Each element belongs to exactly one stratum, and each stratum consists of elements that are alike in some way. A simple random sample is then drawn from each stratum, and these samples are combined to make the stratified sample.
#
# Let's obtain two samples, one per stratum, based on the variable "student"
# Create a subset for each category of student (Yes, No)
# First stratum: the rows of credit whose student value is "Yes".
student.po <- subset(credit, student == "Yes")
head(student.po)
## default student balance income
## 2 No Yes 817.1804 12106.135
## 6 No Yes 919.5885 7491.559
## 8 No Yes 808.6675 17600.451
## 11 No Yes 0.0000 21871.073
## 12 No Yes 1220.5838 13268.562
## 18 No Yes 527.5402 17636.540
# Seed the generator right before sampling, then draw a sample of size k = 10
# from the subset obtained above.
set.seed(143)
sampl.student.po <- popsamp(10, student.po)
head(sampl.student.po)
## default student balance income
## 1241 No Yes 502.7977 13376.053
## 1748 No Yes 1190.5441 20291.172
## 9283 No Yes 1225.3460 5524.375
## 2993 No Yes 1031.3039 12195.380
## 3047 No Yes 709.4142 11355.113
## 9213 No Yes 1032.6258 21122.957
#
# Second stratum: the rows of credit whose student value is "No".
student.no <- subset(credit, student == "No")
head(student.no)
## default student balance income
## 1 No No 729.5265 44361.63
## 3 No No 1073.5492 31767.14
## 4 No No 529.2506 35704.49
## 5 No No 785.6559 38463.50
## 7 No No 825.5133 24905.23
## 9 No No 1161.0579 37468.53
# Seed the generator right before sampling, then draw 10 rows from this
# stratum as well.
set.seed(134)
sampl.student.no <- popsamp(10, student.no)
head(sampl.student.no)
## default student balance income
## 6979 No No 443.9010 50768.44
## 236 No No 964.8203 34390.75
## 616 No No 1052.3933 37637.66
## 112 No No 596.9641 58088.36
## 4156 No No 588.7802 58764.36
## 3293 No No 618.8012 25927.57
#
# Stack the two per-stratum samples row-wise; the result is the stratified
# sample (10 students followed by 10 non-students).
sample.3 <- rbind(sampl.student.po, sampl.student.no)
sample.3
## default student balance income
## 1241 No Yes 502.7977 13376.053
## 1748 No Yes 1190.5441 20291.172
## 9283 No Yes 1225.3460 5524.375
## 2993 No Yes 1031.3039 12195.380
## 3047 No Yes 709.4142 11355.113
## 9213 No Yes 1032.6258 21122.957
## 5516 No Yes 757.2728 18876.089
## 6612 No Yes 944.8328 23323.823
## 4723 No Yes 1069.6931 10722.249
## 564 No Yes 663.2499 20454.617
## 6979 No No 443.9010 50768.445
## 236 No No 964.8203 34390.746
## 616 No No 1052.3933 37637.659
## 112 No No 596.9641 58088.360
## 4156 No No 588.7802 58764.364
## 3293 No No 618.8012 25927.572
## 9341 No No 1003.6014 24978.085
## 7106 No No 549.8538 50084.390
## 9867 No No 599.7188 48085.884
## 8602 No No 695.4718 25544.892
#
#
# Comparing samples
# Summary statistics of income (min, quartiles, max, mean, sd, n, missing)
# for each of the three samples and for the whole population.
favstats(sample.1[["income"]])
## min Q1 median Q3 max mean sd n missing
## 12158.04 17648.2 47271.35 55219.52 61187.81 38696.98 22359.1 5 0
favstats(sample.2[["income"]])
## min Q1 median Q3 max mean sd n missing
## 8231.037 21298.19 34888.5 45222.54 64952.61 34187.57 13626.29 200 0
favstats(sample.3[["income"]])
## min Q1 median Q3 max mean sd n missing
## 5524.375 17501.08 24150.95 40249.72 58764.36 28575.61 16576.79 20 0
favstats(credit[["income"]])
## min Q1 median Q3 max mean sd n missing
## 771.9677 21340.46 34552.64 43807.73 73554.23 33516.98 13336.64 10000 0
# Boxplots give a clear view of the variation and spread of the observations:
# one panel per sample plus one for the population, side by side.
# par() returns the previous values of the settings it changes; keep them so
# the 1x4 layout does not leak into plots drawn later in the session.
old.par <- par(mfrow = c(1, 4))
# Use one y-axis range for all four panels so the boxes can be compared
# directly. Every sample is drawn from credit, so the population range covers
# all of them.
income.limits <- range(credit$income, na.rm = TRUE)
boxplot(sample.1$income, main = "Sample 1", ylim = income.limits)
boxplot(sample.2$income, main = "Sample 2", ylim = income.limits)
boxplot(sample.3$income, main = "Sample 3", ylim = income.limits)
boxplot(credit$income, main = "Population", col = "red", ylim = income.limits)
par(old.par) # restore the previous plotting layout

# Thank you for reading and sharing!
# E.Gjika
# https://al.linkedin.com/in/eralda-dhamo-gjika-71879128