Ancova

Jasiek

31/01/2022

WPROWADZENIE

ggplot(Salaries,aes(sex,salary))+geom_boxplot()

prop.test(table(Salaries[c(1,2)]))
#> 
#>  3-sample test for equality of proportions without continuity
#>  correction
#> 
#> data:  table(Salaries[c(1, 2)])
#> X-squared = 4.6487, df = 2, p-value = 0.09785
#> alternative hypothesis: two.sided
#> sample estimates:
#>    prop 1    prop 2    prop 3 
#> 0.3582090 0.4062500 0.4924812
prop.test(table(Salaries[c(1,5)]))
#> 
#>  3-sample test for equality of proportions without continuity
#>  correction
#> 
#> data:  table(Salaries[c(1, 5)])
#> X-squared = 8.5259, df = 2, p-value = 0.01408
#> alternative hypothesis: two.sided
#> sample estimates:
#>     prop 1     prop 2     prop 3 
#> 0.16417910 0.15625000 0.06766917
prop.test(table(Salaries[c(2,5)]))
#> 
#>  2-sample test for equality of proportions with continuity correction
#> 
#> data:  table(Salaries[c(2, 5)])
#> X-squared = 2.7708e-30, df = 1, p-value = 1
#> alternative hypothesis: two.sided
#> 95 percent confidence interval:
#>  -0.05883604  0.06328663
#> sample estimates:
#>     prop 1     prop 2 
#> 0.09944751 0.09722222

Wszystkie zmienne są nie zależnie między sobą

PODPUNKT 1

Salaries %>%
  group_by(rank,discipline,sex) %>%
  shapiro_test(salary)
#> # A tibble: 12 × 6
#>    rank      discipline sex    variable statistic        p
#>    <fct>     <fct>      <fct>  <chr>        <dbl>    <dbl>
#>  1 AsstProf  A          Female salary       0.870 0.226   
#>  2 AsstProf  A          Male   salary       0.941 0.300   
#>  3 AsstProf  B          Female salary       0.889 0.354   
#>  4 AsstProf  B          Male   salary       0.941 0.0458  
#>  5 AssocProf A          Female salary       0.863 0.269   
#>  6 AssocProf A          Male   salary       0.878 0.0113  
#>  7 AssocProf B          Female salary       0.635 0.00117 
#>  8 AssocProf B          Male   salary       0.967 0.416   
#>  9 Prof      A          Female salary       0.934 0.549   
#> 10 Prof      A          Male   salary       0.952 0.000259
#> 11 Prof      B          Female salary       0.974 0.923   
#> 12 Prof      B          Male   salary       0.978 0.0435

rozkłady nie są normalne

wartosci.odstajace <- Salaries %>%
  group_by(rank,discipline,sex) %>%
  identify_outliers(salary)
wartosci.odstajace
#> # A tibble: 18 × 8
#>    rank      discipline sex    yrs.since.phd yrs.service salary is.outlier is.extreme
#>    <fct>     <fct>      <fct>          <int>       <int>  <int> <lgl>      <lgl>     
#>  1 AsstProf  A          Female             7           6  63100 TRUE       FALSE     
#>  2 AsstProf  A          Male               3           1  63900 TRUE       FALSE     
#>  3 AsstProf  A          Male               2           0  85000 TRUE       TRUE      
#>  4 AsstProf  A          Male               8           4  81035 TRUE       FALSE     
#>  5 AssocProf A          Female            25          22  62884 TRUE       FALSE     
#>  6 AssocProf A          Male              14           8 100102 TRUE       FALSE     
#>  7 AssocProf A          Male               9           7  70000 TRUE       FALSE     
#>  8 AssocProf A          Male              11           1 104800 TRUE       FALSE     
#>  9 AssocProf A          Male              45          39  70700 TRUE       FALSE     
#> 10 AssocProf A          Male              10           1 108413 TRUE       FALSE     
#> 11 AssocProf A          Male              11           8 104121 TRUE       FALSE     
#> 12 AssocProf B          Female            14           7 109650 TRUE       TRUE      
#> 13 AssocProf B          Female            12           9  71065 TRUE       TRUE      
#> 14 AssocProf B          Male              13          11 126431 TRUE       FALSE     
#> 15 Prof      A          Male              29           7 204000 TRUE       FALSE     
#> 16 Prof      A          Male              42          18 194800 TRUE       FALSE     
#> 17 Prof      A          Male              43          43 205500 TRUE       FALSE     
#> 18 Prof      B          Male              38          38 231545 TRUE       FALSE
library(extraoperators)
Salaries$salary[Salaries$salary%in%wartosci.odstajace$salary] <- NA
library(stats)
Salaries <- na.omit(Salaries)

usuwamy wartości oddstające

Salaries <- mutate(Salaries,log=log10(salary))

logarytm przy podstawie 10 z wartości zmienej ‘salary’

Salaries %>%
  group_by(rank,discipline,sex) %>%
  shapiro_test(log)
#> # A tibble: 12 × 6
#>    rank      discipline sex    variable statistic      p
#>    <fct>     <fct>      <fct>  <chr>        <dbl>  <dbl>
#>  1 AsstProf  A          Female log          0.813 0.104 
#>  2 AsstProf  A          Male   log          0.952 0.560 
#>  3 AsstProf  B          Female log          0.896 0.387 
#>  4 AsstProf  B          Male   log          0.931 0.0218
#>  5 AssocProf A          Female log          0.978 0.717 
#>  6 AssocProf A          Male   log          0.891 0.0587
#>  7 AssocProf B          Female log          0.916 0.517 
#>  8 AssocProf B          Male   log          0.981 0.840 
#>  9 Prof      A          Female log          0.936 0.575 
#> 10 Prof      A          Male   log          0.985 0.212 
#> 11 Prof      B          Female log          0.976 0.939 
#> 12 Prof      B          Male   log          0.986 0.236

rozkłady nowej zmiennej są normalne

##test ANCOVA

Salaries %>%
  anova_test(log~rank*discipline*sex)
#> Coefficient covariances computed by hccm()
#> ANOVA Table (type II tests)
#> 
#>                Effect DFn DFd       F        p p<.05      ges
#> 1                rank   2 366 189.135 3.89e-57     * 0.508000
#> 2          discipline   1 366  57.685 2.59e-13     * 0.136000
#> 3                 sex   1 366   0.312 5.77e-01       0.000851
#> 4     rank:discipline   2 366   1.527 2.19e-01       0.008000
#> 5            rank:sex   2 366   0.089 9.15e-01       0.000486
#> 6      discipline:sex   1 366   0.713 3.99e-01       0.002000
#> 7 rank:discipline:sex   2 366   0.315 7.30e-01       0.002000

zachodzą istotne różnice pomiędzy zmiennymi rank i discpipline

Salaries %>%
  tukey_hsd(log~rank)
#> # A tibble: 3 × 9
#>   term  group1    group2    null.value estimate conf.low conf.high    p.adj
#> * <chr> <chr>     <chr>          <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
#> 1 rank  AsstProf  AssocProf          0   0.0620   0.0270    0.0971 0.000113
#> 2 rank  AsstProf  Prof               0   0.182    0.155     0.208  0       
#> 3 rank  AssocProf Prof               0   0.120    0.0915    0.148  0       
#> # … with 1 more variable: p.adj.signif <chr>

każda grupa różni się istotnie

Salaries %>%
  tukey_hsd(log~discipline)
#> # A tibble: 1 × 9
#>   term       group1 group2 null.value estimate conf.low conf.high   p.adj p.adj.signif
#> * <chr>      <chr>  <chr>       <dbl>    <dbl>    <dbl>     <dbl>   <dbl> <chr>       
#> 1 discipline A      B               0   0.0369   0.0152    0.0587 0.00093 ***

grupy różnią się istotnie

#PODPUNKT 2

ggplot(Salaries,aes(yrs.since.phd,salary))+geom_point()

Salaries %>%
  cor_test(yrs.since.phd,salary,method="pearson")
#> # A tibble: 1 × 8
#>   var1          var2     cor statistic        p conf.low conf.high method 
#>   <chr>         <chr>  <dbl>     <dbl>    <dbl>    <dbl>     <dbl> <chr>  
#> 1 yrs.since.phd salary   0.4      8.53 3.76e-16    0.314     0.484 Pearson
ggplot(Salaries,aes(yrs.service,salary))+geom_point()

Salaries %>%
  cor_test(yrs.service,salary,method="pearson")
#> # A tibble: 1 × 8
#>   var1        var2     cor statistic        p conf.low conf.high method 
#>   <chr>       <chr>  <dbl>     <dbl>    <dbl>    <dbl>     <dbl> <chr>  
#> 1 yrs.service salary  0.33      6.72 6.57e-11    0.235     0.415 Pearson

zmienne są skorelowane ale słabo

#PODPUNKT 3

ggplot(Salaries,aes(yrs.since.phd,salary,color=rank))+geom_point()

Salaries %>%
  group_by(rank) %>%
  cor_test(yrs.since.phd,salary,method="pearson")
#> # A tibble: 3 × 9
#>   rank      var1          var2     cor statistic      p conf.low conf.high method
#>   <fct>     <chr>         <chr>  <dbl>     <dbl>  <dbl>    <dbl>     <dbl> <chr> 
#> 1 AsstProf  yrs.since.phd salary -0.17    -1.33  0.188    -0.399    0.0830 Pears…
#> 2 AssocProf yrs.since.phd salary -0.23    -1.70  0.0957   -0.468    0.0412 Pears…
#> 3 Prof      yrs.since.phd salary -0.05    -0.809 0.419    -0.171    0.0716 Pears…

w poszczególnych podgrupach zmienne nie są skorelowane

ggplot(Salaries,aes(yrs.service,salary,color=rank))+geom_point()

Salaries %>%
  group_by(rank) %>%
  cor_test(yrs.service,salary,method="pearson")
#> # A tibble: 3 × 9
#>   rank      var1        var2      cor statistic      p conf.low conf.high method
#>   <fct>     <chr>       <chr>   <dbl>     <dbl>  <dbl>    <dbl>     <dbl> <chr> 
#> 1 AsstProf  yrs.service salary  0.24       1.95 0.0554 -0.00546    0.463  Pears…
#> 2 AssocProf yrs.service salary -0.21      -1.55 0.128  -0.452      0.0617 Pears…
#> 3 Prof      yrs.service salary -0.077     -1.24 0.215  -0.197      0.0449 Pears…

w poszczególnych podgrupach zmienne nie są skorelowane