Sampling and important statistical tests

knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(message = FALSE)

Sampling methods

simple sampling

library(tidyverse)
library(infer)
library(rstatix)

starwars %>% 
  slice_sample(n=10)

## # A tibble: 10 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Qui-Gon Ji…    193  89   brown   fair    blue       92   male  mascu… <NA>   
##  2 Jocasta Nu     167  NA   white   fair    blue       NA   fema… femin… Corusc…
##  3 Luminara U…    170  56.2 black   yellow  blue       58   fema… femin… Mirial 
##  4 Adi Gallia     184  50   none    dark    blue       NA   fema… femin… Corusc…
##  5 Finis Valo…    170  NA   blond   fair    blue       91   male  mascu… Corusc…
##  6 Wilhuff Ta…    180  NA   auburn… fair    blue       64   male  mascu… Eriadu 
##  7 Tion Medon     206  80   none    grey    black      NA   male  mascu… Utapau 
##  8 Dormé          165  NA   brown   light   brown      NA   fema… femin… Naboo  
##  9 Kit Fisto      196  87   none    green   black      NA   male  mascu… Glee A…
## 10 Anakin Sky…    188  84   blond   fair    blue       41.9 male  mascu… Tatooi…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

stratified_sampling

starwars %>% 
  group_by(gender) %>% 
  slice_sample(n = 2)

## # A tibble: 6 × 14
## # Groups:   gender [3]
##   name         height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##   <chr>         <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
## 1 Mon Mothma      150  NA   auburn  fair    blue       48   fema… femin… Chandr…
## 2 Shaak Ti        178  57   none    red, b… black      NA   fema… femin… Shili  
## 3 Boba Fett       183  78.2 black   fair    brown      31.5 male  mascu… Kamino 
## 4 Tarfful         234 136   brown   brown   blue       NA   male  mascu… Kashyy…
## 5 Sly Moore       178  48   none    pale    white      NA   <NA>  <NA>   Umbara 
## 6 Captain Pha…     NA  NA   unknown unknown unknown    NA   <NA>  <NA>   <NA>   
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

cluster sampling

starwars

## # A tibble: 87 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader    202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa    150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White…    165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark…    183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke…    182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld

sampl=sample(starwars$homeworld,size=3)
starwars %>% 
  filter(homeworld %in% sampl) %>% 
  group_by(homeworld) %>% 
  slice_sample(n=1) %>% 
  ungroup()

## # A tibble: 3 × 14
##   name         height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##   <chr>         <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
## 1 Ratts Tyere…     79    15 none    grey, … unknown      NA male  mascu… Aleen …
## 2 Ayla Secura     178    55 none    blue    hazel        48 fema… femin… Ryloth 
## 3 Yoda             66    17 white   green   brown       896 male  mascu… <NA>   
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## #   ³eye_color, ⁴birth_year, ⁵homeworld

now we will talk about relative error relative_error_in_simple_sampling

simple_sampling=starwars %>% 
  slice_sample(n=5) %>% 
  summarise(mean_sample=mean(mass,na.rm=TRUE))
pop_mean_height=starwars %>% 
  summarise(mean_pop=mean(height,na.rm=TRUE))
relative_error_simple_sampling=abs(pop_mean_height-simple_sampling)*100/pop_mean_height
relative_error_simple_sampling

##   mean_pop
## 1 41.49968

creating a sampling distribution

sampling_dist=replicate(
  n=1000,
  expr=(
    starwars %>% 
      slice_sample(n=10) %>% 
      summarise(mean_sample=mean(height,na.rm=TRUE))
  ) %>% 
    pull(mean_sample)
  
)

sampling_dist=tibble(sampling_dist)
a2=ggplot(sampling_dist,aes(x=sampling_dist))+
  geom_histogram(binwidth = 4)+scale_x_continuous(expand = c(0,0))
a2

now lets see bootstraping bootstrapping is all about samplin with distribution bootstrap is all about estimating population parameter with sample

#bootstarping
starwars %>% 
  slice_sample(n=4,replace = TRUE)

## # A tibble: 4 × 14
##   name     height  mass hair_color skin_c…¹ eye_c…² birth…³ sex   gender homew…⁴
##   <chr>     <int> <dbl> <chr>      <chr>    <chr>     <dbl> <chr> <chr>  <chr>  
## 1 Tarfful     234   136 brown      brown    blue         NA male  mascu… Kashyy…
## 2 Grievous    216   159 none       brown, … green,…      NA male  mascu… Kalee  
## 3 Cordé       157    NA brown      light    brown        NA fema… femin… Naboo  
## 4 Lobot       175    79 none       light    blue         37 male  mascu… Bespin 
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## #   starships <list>, and abbreviated variable names ¹skin_color, ²eye_color,
## #   ³birth_year, ⁴homeworld

bootstraping_dist=replicate(
  n=1000,
  expr = {
    starwars %>% 
      slice_sample(n=4,replace=TRUE) %>% 
      summarise(mean_bootstrap=mean(height,na.rm=TRUE))
  } %>% 
  pull(mean_bootstrap)
)


population_parameter=tibble(bootstraping_dist)
population_parameter

## # A tibble: 1,000 × 1
##    bootstraping_dist
##                <dbl>
##  1              180.
##  2              185.
##  3              164.
##  4              139.
##  5              162.
##  6              164.
##  7              168.
##  8              194 
##  9              165.
## 10              180 
## # … with 990 more rows

a1=ggplot(population_parameter,aes(bootstraping_dist,fill=bootstraping_dist))+
  geom_histogram(binwidth=3)
a1

library(patchwork)
a1/a2

Hypothesis testing: lets ask a question if there is a significant difference in height between male and female lets assume null hupothesis: there is no diff alternative: there is a difference

first doing t test some assumption must be followed 1) there should

starwars %>% 
  filter(height >=2000)

## # A tibble: 0 × 14
## # … with 14 variables: name <chr>, height <int>, mass <dbl>, hair_color <chr>,
## #   skin_color <chr>, eye_color <chr>, birth_year <dbl>, sex <chr>,
## #   gender <chr>, homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>

library(rstatix)
drop=starwars %>% 
  identify_outliers(height) %>% 
 filter(is.extreme %in% TRUE)
unique(drop$height)

## [1]  66  88  94 264  79

data_change=starwars %>% 
  filter(!height %in% c(96 , 97,   66,  88, 94, 264, 79))

data_change %>% 
  shapiro_test(height)

## # A tibble: 1 × 3
##   variable statistic       p
##   <chr>        <dbl>   <dbl>
## 1 height       0.953 0.00869

x1=ggplot(data_change %>% 
  filter(sex %in% c("male","female")),aes(x=height,color=sex,fill=sex))+
  geom_histogram()

x2=ggplot(data_change %>% 
  filter(sex %in% c("male","female")),aes(x=height,color=sex,fill=sex))+
  geom_histogram()+facet_wrap(~sex)

x1/x2

data_change %>%
  filter(sex %in% c("male","female")) %>% 
  t.test(data=.,height~sex,alternative = "two.sided") %>% 
  broom::tidy()

## # A tibble: 1 × 10
##   estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method alter…⁸
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  <chr>  
## 1   -15.7    169.    185.   -3.13 0.00368    32.9   -25.9   -5.48 Welch… two.si…
## # … with abbreviated variable names ¹estimate, ²estimate1, ³estimate2,
## #   ⁴statistic, ⁵parameter, ⁶conf.low, ⁷conf.high, ⁸alternative

which means the which means the null hypothesis is rejected,meaning there is diff in height between male and female height

data_change %>%
  filter(sex %in% c("male","female")) %>% 
  t.test(data=.,height~sex,alternative = "less") %>% 
  broom::tidy()

## # A tibble: 1 × 10
##   estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method alter…⁸
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  <chr>  
## 1   -15.7    169.    185.   -3.13 0.00184    32.9    -Inf   -7.20 Welch… less   
## # … with abbreviated variable names ¹estimate, ²estimate1, ³estimate2,
## #   ⁴statistic, ⁵parameter, ⁶conf.low, ⁷conf.high, ⁸alternative

meaning male height is greater than female height

what if we want to know if there is a association between species and height

unique(data_change$species)

##  [1] "Human"        "Droid"        "Wookiee"      "Rodian"       "Hutt"        
##  [6] "Trandoshan"   "Mon Calamari" "Sullustan"    "Neimodian"    "Gungan"      
## [11] NA             "Toydarian"    "Dug"          "Zabrak"       "Twi'lek"     
## [16] "Xexto"        "Toong"        "Cerean"       "Nautolan"     "Tholothian"  
## [21] "Iktotchi"     "Kel Dor"      "Chagrian"     "Geonosian"    "Mirialan"    
## [26] "Clawdite"     "Besalisk"     "Kaminoan"     "Skakoan"      "Muun"        
## [31] "Togruta"      "Kaleesh"      "Pau'an"

now lets ask another question ,if there is a connection between speicies and growth h0: there is no association h1 : there is a association

data_change %>%
  mutate(height=cut(height,breaks=3,labels=c("small","medium","large"))) %>% 
  select(height,species) %>% table() %>% chisq.test()

## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 102.46, df = 62, p-value = 0.0009372

the p value suggests that there is no association between height and species,meaning categorical variables are independent

library(DataExplorer)
data_change$species[is.na(data_change$species)]="others"
data_change$species[which(data_change$species %in% c("Besalisk"   ,  "Cerean"    ,   "Chagrian"  ,   "Clawdite"  ,   "Dug"    ,      "Geonosian"  ,  "Hutt" ,       
 "Iktotchi"  ,   "Kaleesh"    ,  "Kel Dor"   ,   "Mon Calamari", "Muun"  ,       "Nautolan"   ,  "Neimodian" ,  "Pau'an" ,     
 "Rodian"   ,    "Skakoan"   ,   "Sullustan"   , "Tholothian" ,  "Togruta" ,     "Toong"  ,      "Toydarian"   , "Trandoshan"  ,
 "Xexto" ))]="others"
data_change %>% 
  mutate(species=as.factor(species)) %>% 
  select(height,species) %>% 
  na.omit() %>% 
  plot_correlation()

data_change %>%
  select(sex,height) %>% 
  aov(data=.,height~sex) %>% 
  tukey_hsd()

## # A tibble: 6 × 9
##   term  group1         group2     null.…¹ estim…² conf.…³ conf.…⁴  p.adj p.adj…⁵
## * <chr> <chr>          <chr>        <dbl>   <dbl>   <dbl>   <dbl>  <dbl> <chr>  
## 1 sex   female         hermaphro…       0    5.73 -51.4      62.9 0.993  ns     
## 2 sex   female         male             0   15.7   -0.527    31.9 0.0615 ns     
## 3 sex   female         none             0   14.2  -27.4      55.9 0.805  ns     
## 4 sex   hermaphroditic male             0    9.96 -45.9      65.8 0.965  ns     
## 5 sex   hermaphroditic none             0    8.50 -59.3      76.3 0.987  ns     
## 6 sex   male           none             0   -1.46 -41.3      38.4 1      ns     
## # … with abbreviated variable names ¹null.value, ²estimate, ³conf.low,
## #   ⁴conf.high, ⁵p.adj.signif

bar=data_change %>% 
  count(species,sort = TRUE)
bar$species[is.na(bar$species)]="others"
bar$species[which(bar$species %in% c("Besalisk"   ,  "Cerean"    ,   "Chagrian"  ,   "Clawdite"  ,   "Dug"    ,      "Geonosian"  ,  "Hutt" ,       
 "Iktotchi"  ,   "Kaleesh"    ,  "Kel Dor"   ,   "Mon Calamari", "Muun"  ,       "Nautolan"   ,  "Neimodian" ,  "Pau'an" ,     
 "Rodian"   ,    "Skakoan"   ,   "Sullustan"   , "Tholothian" ,  "Togruta" ,     "Toong"  ,      "Toydarian"   , "Trandoshan"  ,
 "Xexto" ))]="others"
ggplot(bar,aes(x=species,y=n,color=species,fill=species))+
  geom_bar(stat = "identity")+
  coord_polar() + theme(panel.background = element_rect(fill = "gray96"))+ylab("pie charts of species")

wilcox test for camparing median, this is a not parametric test when data is not normal and has extreme outlier

starwars %>% 
  filter(sex %in% c("male","female")) %>% 
  wilcox.test(height~sex,data=.,alternative= "two.sided") %>% 
  broom::tidy()

## # A tibble: 1 × 4
##   statistic p.value method                                            alternat…¹
##       <dbl>   <dbl> <chr>                                             <chr>     
## 1       212 0.00285 Wilcoxon rank sum test with continuity correction two.sided 
## # … with abbreviated variable name ¹alternative

kruskal test : an extension of wilcox test like anova tests for t_test

starwars %>% 
  select(height,species) %>%
  kruskal.test(height~species,data = .) %>% 
  tidy()

## # A tibble: 1 × 4
##   statistic p.value parameter method                      
##       <dbl>   <dbl>     <int> <chr>                       
## 1      55.4  0.0204        36 Kruskal-Wallis rank sum test

Sampling and important statistical tests

omon das

2023-03-04