# Sampling-methods-in-R.R

# SAMPLING METHODS 
# By, Eralda Gjika
# Linkedin: https://al.linkedin.com/in/eralda-dhamo-gjika-71879128
# Department of Applied Mathematics, Faculty of Natural Science, University of Tirana, Albania
#
# Working with sampling methods in R
# This work uses the "credit" dataset, which is downloaded from the URL below:
credit <- read.csv(
  file = url("http://statmath.wu.ac.at/~vana/Intro_Data_Analytics_R/credit.csv")
)
str(credit) # structure of the credit dataset: dimensions, column names and types

## 'data.frame':    10000 obs. of  4 variables:
##  $ default: chr  "No" "No" "No" "No" ...
##  $ student: chr  "No" "Yes" "No" "No" ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

summary(credit) # descriptive statistics for each variable in the credit dataset

##    default            student             balance           income     
##  Length:10000       Length:10000       Min.   :   0.0   Min.   :  772  
##  Class :character   Class :character   1st Qu.: 481.7   1st Qu.:21340  
##  Mode  :character   Mode  :character   Median : 823.6   Median :34553  
##                                        Mean   : 835.4   Mean   :33517  
##                                        3rd Qu.:1166.3   3rd Qu.:43808  
##                                        Max.   :2654.3   Max.   :73554

# Libraries we need
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(manipulate)
library(tigerstats)

## Loading required package: abd

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## Loading required package: lattice

## Loading required package: grid

## Loading required package: mosaic

## Loading required package: ggformula

## Loading required package: ggplot2

## Loading required package: ggstance

## 
## Attaching package: 'ggstance'

## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh

## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")

## Loading required package: mosaicData

## Loading required package: Matrix

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## Have you tried the ggformula package for your plots?

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

############################################################
#                             SAMPLING
#########################################################
# SIMPLE RANDOM SAMPLING
# Simple Random Sampling (SRS)
# In simple random sampling, for a given sample size n, every set of n members of the population has the same chance of being the sample that is actually selected.
# Fix the random seed so that the draw below is reproducible
set.seed(123)
# Random sample of 5 values of the "balance" variable (3rd column of credit),
# drawn with replacement, so the same individual may be selected more than once
Zgjedhja.1 <- sample(credit$balance, size = 5, replace = TRUE)
#
# STRATIFIED SAMPLING
# stratified sampling using the dplyr library
# Stratify by "student": group the rows by category, then draw 15 rows per group
credit_sample_1 <- credit %>%
  group_by(student) %>%
  sample_n(size = 15)
credit_sample_1 # stratified sample with 15 individuals from each category of the student variable

## # A tibble: 30 x 4
## # Groups:   student [2]
##    default student balance income
##    <chr>   <chr>     <dbl>  <dbl>
##  1 No      No        115.  27365.
##  2 Yes     No       1512.  53507.
##  3 No      No        213.  29592.
##  4 No      No        654.  12697.
##  5 No      No       1559.  38880.
##  6 No      No        598.  42734.
##  7 No      No       1890.  32510.
##  8 No      No       1491.  39596.
##  9 No      No        774.  40428.
## 10 No      No         43.9 48544.
## # ... with 20 more rows

# Same approach, stratified by "default": draw 15 rows from each category
credit_sample_2 <- credit %>%
  group_by(default) %>%
  sample_n(size = 15)
credit_sample_2

## # A tibble: 30 x 4
## # Groups:   default [2]
##    default student balance income
##    <chr>   <chr>     <dbl>  <dbl>
##  1 No      No           0  36337.
##  2 No      No        1115. 40084.
##  3 No      Yes        272. 14081.
##  4 No      No         353. 17626.
##  5 No      No         319. 27401.
##  6 No      No        1168. 46905.
##  7 No      No        1067. 49630.
##  8 No      Yes        811. 17252.
##  9 No      No         915. 23683.
## 10 No      No        1387. 56325.
## # ... with 20 more rows

#
# Draw a reproducible sample of 5 individuals from the population
set.seed(123)
sample.1 <- popsamp(5, credit)
# Try to execute the function below
# SimpleRandom() # You may change the sample size and the variable in the dataset; also observe how the histogram changes
# Numerical characteristics of the sample, compared with those of the population
m1 <- mean(~income, data = sample.1) # mean income in the sample
m1

## [1] 38696.98

m <- mean(~income, data = credit) # mean income in the population, to compare with the sample mean
m

## [1] 33516.98

# The same comparison for the standard deviation (other statistics work alike)
s1 <- sd(~income, data = sample.1)
s1

## [1] 22359.1

s <- sd(~income, data = credit)
s

## [1] 13336.64

#
# Systematic Sampling
# In a systematic sample, the members of the population are put in a row. Then 1 out of every k members is selected. The starting point is randomly chosen from the first k elements, and then elements are sampled at the same location in each of the subsequent segments of size k.
# 
# Systematic sample with sampling interval k = 50.
# The interval is stored once so that the random start and the step always agree.
k <- 50
# First we have to choose (randomly) where the sampling starts, among the first k rows
set.seed(1234)
start.position <- sample(seq_len(k), 1)
start.position # from here we move by +k positions

## [1] 28

# careful: without the seed above you get a different start position every time you sample
i <- seq(from = start.position, to = nrow(credit), by = k) # row indices: start, start + k, start + 2k, ...
sample.2 <- credit[i, ] # keep only the rows indexed by i
head(sample.2)

##     default student   balance   income
## 28       No      No 1454.8633 32189.09
## 78       No      No  728.3733 45131.72
## 128      No      No  928.2370 33722.16
## 178      No     Yes  927.8877 22473.38
## 228      No     Yes  850.0961 18618.99
## 278      No      No  908.7716 45032.85

#
#
# Stratified Sampling
# In a stratified sample, the population must first be separated into homogeneous groups, or strata. Each element belongs to only one stratum, and each stratum consists of elements that are alike in some way. A simple random sample is then drawn from each stratum, and these samples are combined to make the stratified sample.
# 
# Let's obtain two stratified samples based on the variable "student".
# First stratum: the subset of individuals with student == "Yes"
student.po <- subset(credit, subset = student == "Yes")
head(student.po)

##    default student   balance    income
## 2       No     Yes  817.1804 12106.135
## 6       No     Yes  919.5885  7491.559
## 8       No     Yes  808.6675 17600.451
## 11      No     Yes    0.0000 21871.073
## 12      No     Yes 1220.5838 13268.562
## 18      No     Yes  527.5402 17636.540

# Draw a sample of size 10 from this stratum; the seed makes the draw reproducible
set.seed(143)
sampl.student.po <- popsamp(10, student.po)
head(sampl.student.po)

##      default student   balance    income
## 1241      No     Yes  502.7977 13376.053
## 1748      No     Yes 1190.5441 20291.172
## 9283      No     Yes 1225.3460  5524.375
## 2993      No     Yes 1031.3039 12195.380
## 3047      No     Yes  709.4142 11355.113
## 9213      No     Yes 1032.6258 21122.957

#
# Second stratum: the subset of individuals with student == "No"
student.no <- subset(credit, subset = student == "No")
head(student.no)

##   default student   balance   income
## 1      No      No  729.5265 44361.63
## 3      No      No 1073.5492 31767.14
## 4      No      No  529.2506 35704.49
## 5      No      No  785.6559 38463.50
## 7      No      No  825.5133 24905.23
## 9      No      No 1161.0579 37468.53

# Draw a sample of size 10 from this stratum; the seed makes the draw reproducible
set.seed(134)
sampl.student.no <- popsamp(10, student.no)
head(sampl.student.no)

##      default student   balance   income
## 6979      No      No  443.9010 50768.44
## 236       No      No  964.8203 34390.75
## 616       No      No 1052.3933 37637.66
## 112       No      No  596.9641 58088.36
## 4156      No      No  588.7802 58764.36
## 3293      No      No  618.8012 25927.57

#
# Stack the two per-stratum samples row-wise to form the stratified sample
sample.3 <- rbind(sampl.student.po, sampl.student.no)
sample.3

##      default student   balance    income
## 1241      No     Yes  502.7977 13376.053
## 1748      No     Yes 1190.5441 20291.172
## 9283      No     Yes 1225.3460  5524.375
## 2993      No     Yes 1031.3039 12195.380
## 3047      No     Yes  709.4142 11355.113
## 9213      No     Yes 1032.6258 21122.957
## 5516      No     Yes  757.2728 18876.089
## 6612      No     Yes  944.8328 23323.823
## 4723      No     Yes 1069.6931 10722.249
## 564       No     Yes  663.2499 20454.617
## 6979      No      No  443.9010 50768.445
## 236       No      No  964.8203 34390.746
## 616       No      No 1052.3933 37637.659
## 112       No      No  596.9641 58088.360
## 4156      No      No  588.7802 58764.364
## 3293      No      No  618.8012 25927.572
## 9341      No      No 1003.6014 24978.085
## 7106      No      No  549.8538 50084.390
## 9867      No      No  599.7188 48085.884
## 8602      No      No  695.4718 25544.892

#
#
# Comparing samples
# favstats() reports the five-number summary together with mean, sd, n and the
# number of missing values; we compute it for every sample and for the population
favstats(sample.1[["income"]])

##       min      Q1   median       Q3      max     mean      sd n missing
##  12158.04 17648.2 47271.35 55219.52 61187.81 38696.98 22359.1 5       0

favstats(sample.2[["income"]])

##       min       Q1  median       Q3      max     mean       sd   n missing
##  8231.037 21298.19 34888.5 45222.54 64952.61 34187.57 13626.29 200       0

favstats(sample.3[["income"]])

##       min       Q1   median       Q3      max     mean       sd  n missing
##  5524.375 17501.08 24150.95 40249.72 58764.36 28575.61 16576.79 20       0

favstats(credit[["income"]])

##       min       Q1   median       Q3      max     mean       sd     n missing
##  771.9677 21340.46 34552.64 43807.73 73554.23 33516.98 13336.64 10000       0

# Or we may use boxplots for a clear view of the variation and spread of the observations.
# par() returns the previous graphical settings; they are restored after the
# four panels are drawn so that later plots are not forced into the 1 x 4 layout.
old_par <- par(mfrow = c(1, 4))
boxplot(sample.1$income, main = "Sample 1")
boxplot(sample.2$income, main = "Sample 2")
boxplot(sample.3$income, main = "Sample 3")
boxplot(credit$income, main = "Population", col = "red")
par(old_par)

# Thank you for reading and sharing!
# E.Gjika
# https://al.linkedin.com/in/eralda-dhamo-gjika-71879128

# Sampling-methods-in-R.R

# user

# 2020-06-17