knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(message = FALSE)
Sampling methods
simple sampling
library(tidyverse)
library(infer)
library(rstatix)
starwars %>%
slice_sample(n=10)
## # A tibble: 10 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Qui-Gon Ji… 193 89 brown fair blue 92 male mascu… <NA>
## 2 Jocasta Nu 167 NA white fair blue NA fema… femin… Corusc…
## 3 Luminara U… 170 56.2 black yellow blue 58 fema… femin… Mirial
## 4 Adi Gallia 184 50 none dark blue NA fema… femin… Corusc…
## 5 Finis Valo… 170 NA blond fair blue 91 male mascu… Corusc…
## 6 Wilhuff Ta… 180 NA auburn… fair blue 64 male mascu… Eriadu
## 7 Tion Medon 206 80 none grey black NA male mascu… Utapau
## 8 Dormé 165 NA brown light brown NA fema… femin… Naboo
## 9 Kit Fisto 196 87 none green black NA male mascu… Glee A…
## 10 Anakin Sky… 188 84 blond fair blue 41.9 male mascu… Tatooi…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
stratified_sampling
starwars %>%
group_by(gender) %>%
slice_sample(n = 2)
## # A tibble: 6 × 14
## # Groups: gender [3]
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Mon Mothma 150 NA auburn fair blue 48 fema… femin… Chandr…
## 2 Shaak Ti 178 57 none red, b… black NA fema… femin… Shili
## 3 Boba Fett 183 78.2 black fair brown 31.5 male mascu… Kamino
## 4 Tarfful 234 136 brown brown blue NA male mascu… Kashyy…
## 5 Sly Moore 178 48 none pale white NA <NA> <NA> Umbara
## 6 Captain Pha… NA NA unknown unknown unknown NA <NA> <NA> <NA>
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
cluster sampling
starwars
## # A tibble: 87 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
sampl=sample(starwars$homeworld,size=3)
starwars %>%
filter(homeworld %in% sampl) %>%
group_by(homeworld) %>%
slice_sample(n=1) %>%
ungroup()
## # A tibble: 3 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Ratts Tyere… 79 15 none grey, … unknown NA male mascu… Aleen …
## 2 Ayla Secura 178 55 none blue hazel 48 fema… femin… Ryloth
## 3 Yoda 66 17 white green brown 896 male mascu… <NA>
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
now we will talk about relative error relative_error_in_simple_sampling
simple_sampling=starwars %>%
slice_sample(n=5) %>%
summarise(mean_sample=mean(mass,na.rm=TRUE))
pop_mean_height=starwars %>%
summarise(mean_pop=mean(height,na.rm=TRUE))
relative_error_simple_sampling=abs(pop_mean_height-simple_sampling)*100/pop_mean_height
relative_error_simple_sampling
## mean_pop
## 1 41.49968
creating a sampling distribution
sampling_dist=replicate(
n=1000,
expr=(
starwars %>%
slice_sample(n=10) %>%
summarise(mean_sample=mean(height,na.rm=TRUE))
) %>%
pull(mean_sample)
)
sampling_dist=tibble(sampling_dist)
a2=ggplot(sampling_dist,aes(x=sampling_dist))+
geom_histogram(binwidth = 4)+scale_x_continuous(expand = c(0,0))
a2
now lets see bootstraping bootstrapping is all about samplin with
distribution bootstrap is all about estimating population parameter with
sample
#bootstarping
starwars %>%
slice_sample(n=4,replace = TRUE)
## # A tibble: 4 × 14
## name height mass hair_color skin_c…¹ eye_c…² birth…³ sex gender homew…⁴
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Tarfful 234 136 brown brown blue NA male mascu… Kashyy…
## 2 Grievous 216 159 none brown, … green,… NA male mascu… Kalee
## 3 Cordé 157 NA brown light brown NA fema… femin… Naboo
## 4 Lobot 175 79 none light blue 37 male mascu… Bespin
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹skin_color, ²eye_color,
## # ³birth_year, ⁴homeworld
bootstraping_dist=replicate(
n=1000,
expr = {
starwars %>%
slice_sample(n=4,replace=TRUE) %>%
summarise(mean_bootstrap=mean(height,na.rm=TRUE))
} %>%
pull(mean_bootstrap)
)
population_parameter=tibble(bootstraping_dist)
population_parameter
## # A tibble: 1,000 × 1
## bootstraping_dist
## <dbl>
## 1 180.
## 2 185.
## 3 164.
## 4 139.
## 5 162.
## 6 164.
## 7 168.
## 8 194
## 9 165.
## 10 180
## # … with 990 more rows
a1=ggplot(population_parameter,aes(bootstraping_dist,fill=bootstraping_dist))+
geom_histogram(binwidth=3)
a1
library(patchwork)
a1/a2
Hypothesis testing: lets ask a question if there is a significant difference in height between male and female lets assume null hupothesis: there is no diff alternative: there is a difference
first doing t test some assumption must be followed 1) there should
starwars %>%
filter(height >=2000)
## # A tibble: 0 × 14
## # … with 14 variables: name <chr>, height <int>, mass <dbl>, hair_color <chr>,
## # skin_color <chr>, eye_color <chr>, birth_year <dbl>, sex <chr>,
## # gender <chr>, homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
library(rstatix)
drop=starwars %>%
identify_outliers(height) %>%
filter(is.extreme %in% TRUE)
unique(drop$height)
## [1] 66 88 94 264 79
data_change=starwars %>%
filter(!height %in% c(96 , 97, 66, 88, 94, 264, 79))
data_change %>%
shapiro_test(height)
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 height 0.953 0.00869
x1=ggplot(data_change %>%
filter(sex %in% c("male","female")),aes(x=height,color=sex,fill=sex))+
geom_histogram()
x2=ggplot(data_change %>%
filter(sex %in% c("male","female")),aes(x=height,color=sex,fill=sex))+
geom_histogram()+facet_wrap(~sex)
x1/x2
data_change %>%
filter(sex %in% c("male","female")) %>%
t.test(data=.,height~sex,alternative = "two.sided") %>%
broom::tidy()
## # A tibble: 1 × 10
## estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method alter…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 -15.7 169. 185. -3.13 0.00368 32.9 -25.9 -5.48 Welch… two.si…
## # … with abbreviated variable names ¹estimate, ²estimate1, ³estimate2,
## # ⁴statistic, ⁵parameter, ⁶conf.low, ⁷conf.high, ⁸alternative
which means the which means the null hypothesis is rejected,meaning there is diff in height between male and female height
data_change %>%
filter(sex %in% c("male","female")) %>%
t.test(data=.,height~sex,alternative = "less") %>%
broom::tidy()
## # A tibble: 1 × 10
## estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method alter…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 -15.7 169. 185. -3.13 0.00184 32.9 -Inf -7.20 Welch… less
## # … with abbreviated variable names ¹estimate, ²estimate1, ³estimate2,
## # ⁴statistic, ⁵parameter, ⁶conf.low, ⁷conf.high, ⁸alternative
meaning male height is greater than female height
what if we want to know if there is a association between species and height
unique(data_change$species)
## [1] "Human" "Droid" "Wookiee" "Rodian" "Hutt"
## [6] "Trandoshan" "Mon Calamari" "Sullustan" "Neimodian" "Gungan"
## [11] NA "Toydarian" "Dug" "Zabrak" "Twi'lek"
## [16] "Xexto" "Toong" "Cerean" "Nautolan" "Tholothian"
## [21] "Iktotchi" "Kel Dor" "Chagrian" "Geonosian" "Mirialan"
## [26] "Clawdite" "Besalisk" "Kaminoan" "Skakoan" "Muun"
## [31] "Togruta" "Kaleesh" "Pau'an"
now lets ask another question ,if there is a connection between speicies and growth h0: there is no association h1 : there is a association
data_change %>%
mutate(height=cut(height,breaks=3,labels=c("small","medium","large"))) %>%
select(height,species) %>% table() %>% chisq.test()
##
## Pearson's Chi-squared test
##
## data: .
## X-squared = 102.46, df = 62, p-value = 0.0009372
the p value suggests that there is no association between height and species,meaning categorical variables are independent
library(DataExplorer)
data_change$species[is.na(data_change$species)]="others"
data_change$species[which(data_change$species %in% c("Besalisk" , "Cerean" , "Chagrian" , "Clawdite" , "Dug" , "Geonosian" , "Hutt" ,
"Iktotchi" , "Kaleesh" , "Kel Dor" , "Mon Calamari", "Muun" , "Nautolan" , "Neimodian" , "Pau'an" ,
"Rodian" , "Skakoan" , "Sullustan" , "Tholothian" , "Togruta" , "Toong" , "Toydarian" , "Trandoshan" ,
"Xexto" ))]="others"
data_change %>%
mutate(species=as.factor(species)) %>%
select(height,species) %>%
na.omit() %>%
plot_correlation()
data_change %>%
select(sex,height) %>%
aov(data=.,height~sex) %>%
tukey_hsd()
## # A tibble: 6 × 9
## term group1 group2 null.…¹ estim…² conf.…³ conf.…⁴ p.adj p.adj…⁵
## * <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 sex female hermaphro… 0 5.73 -51.4 62.9 0.993 ns
## 2 sex female male 0 15.7 -0.527 31.9 0.0615 ns
## 3 sex female none 0 14.2 -27.4 55.9 0.805 ns
## 4 sex hermaphroditic male 0 9.96 -45.9 65.8 0.965 ns
## 5 sex hermaphroditic none 0 8.50 -59.3 76.3 0.987 ns
## 6 sex male none 0 -1.46 -41.3 38.4 1 ns
## # … with abbreviated variable names ¹null.value, ²estimate, ³conf.low,
## # ⁴conf.high, ⁵p.adj.signif
bar=data_change %>%
count(species,sort = TRUE)
bar$species[is.na(bar$species)]="others"
bar$species[which(bar$species %in% c("Besalisk" , "Cerean" , "Chagrian" , "Clawdite" , "Dug" , "Geonosian" , "Hutt" ,
"Iktotchi" , "Kaleesh" , "Kel Dor" , "Mon Calamari", "Muun" , "Nautolan" , "Neimodian" , "Pau'an" ,
"Rodian" , "Skakoan" , "Sullustan" , "Tholothian" , "Togruta" , "Toong" , "Toydarian" , "Trandoshan" ,
"Xexto" ))]="others"
ggplot(bar,aes(x=species,y=n,color=species,fill=species))+
geom_bar(stat = "identity")+
coord_polar() + theme(panel.background = element_rect(fill = "gray96"))+ylab("pie charts of species")
wilcox test for camparing median, this is a not parametric test when
data is not normal and has extreme outlier
starwars %>%
filter(sex %in% c("male","female")) %>%
wilcox.test(height~sex,data=.,alternative= "two.sided") %>%
broom::tidy()
## # A tibble: 1 × 4
## statistic p.value method alternat…¹
## <dbl> <dbl> <chr> <chr>
## 1 212 0.00285 Wilcoxon rank sum test with continuity correction two.sided
## # … with abbreviated variable name ¹alternative
kruskal test : an extension of wilcox test like anova tests for t_test
starwars %>%
select(height,species) %>%
kruskal.test(height~species,data = .) %>%
tidy()
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 55.4 0.0204 36 Kruskal-Wallis rank sum test