library(plyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(ggplot2)
library(ggdist)
library(gghalves)
library(broom)
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:ggdist':
## 
##     Mode
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
setwd("C:/Users/lhomm/OneDrive/Documents/R")
Dat <- read.table("C:/Users/lhomm/OneDrive/Documents/R/productivitydata.txt",header=FALSE) 

colnames(Dat) <-  c("Productivity", "AERND", "Count", "Location")
factor(Dat$AERND)
##  [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
## Levels: 1 2 3
Dat$AERND[Dat$AERND=="1"] <- "1Low"
Dat$AERND[Dat$AERND=="2"] <- "2Med"
Dat$AERND[Dat$AERND=="3"] <- "3High"
factor(Dat$Location)
##  [1] 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 1 0
## Levels: 0 1
Dat$Location[Dat$Location=="1"] <- "1U.S."
Dat$Location[Dat$Location=="0"] <- "2Europe"
means <- tapply(Dat$Productivity, Dat$AERND, mean)
means
##     1Low     2Med    3High 
## 6.877778 8.133333 9.200000
SDs <- tapply(Dat$Productivity, Dat$AERND, sd)
SDs
##      1Low      2Med     3High 
## 0.8135997 0.7571878 0.8671793
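### A tidyverse sketch of the same group summary in one pass (means and SDs by AERND level), assuming the column names defined above (not run here): ###
# Dat %>%
#   group_by(AERND) %>%
#   summarise(mean_prod = mean(Productivity), sd_prod = sd(Productivity))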
stripchart(Dat$Productivity ~ Dat$AERND, data=Dat, method="stack", vertical=TRUE,
           pch=1, cex=1.5, xlab="Factor", ylab="Productivity", main="Dotplot of Productivity by Average RND Expenditure")
title(sub="Exploratory dot-plot", adj=0, cex=5/6)

points(tapply(Dat$Productivity, Dat$AERND, mean),col=2,pch=8)
legend("topleft", legend =  c("Observations", " Trt Mean"), col = c(1,2), text.col= "black",
       lty=c(0,0),pch=c(1,8),bg='gray90', )

Advanced_Plot <- ggplot(Dat, aes(x = Dat$AERND, y = Dat$Productivity, fill=Dat$AERND)) + 
  ggdist::stat_halfeye(
    adjust = .9, 
    width = .6, 
    .width = 0, 
    justification = -.3, 
    point_colour = NA) + 
  geom_boxplot(
    width = .25, 
    outlier.shape = NA
  ) +
  geom_point(
    size = 1.3,
    alpha = .3,
    position = position_jitter(
      seed = 1, width = .1
    )
  ) + 
  coord_cartesian(xlim = c(1.2, NA), clip = "off")
Advanced_Plot <- Advanced_Plot + labs(x = "Average RND Expenditure",  y = "Productivity", fill = "RND Level")
Advanced_Plot
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
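### The warnings above come from mapping columns as Dat$AERND inside aes(); since data = Dat is already supplied, the idiomatic call uses bare column names. A sketch of the equivalent mapping (not run here): ###
# ggplot(Dat, aes(x = AERND, y = Productivity, fill = AERND)) +
#   ggdist::stat_halfeye(adjust = .9, width = .6, .width = 0, justification = -.3, point_colour = NA) +
#   geom_boxplot(width = .25, outlier.shape = NA) +
#   geom_point(size = 1.3, alpha = .3, position = position_jitter(seed = 1, width = .1)) +
#   coord_cartesian(xlim = c(1.2, NA), clip = "off") +
#   labs(x = "Average RND Expenditure", y = "Productivity", fill = "RND Level")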

Anova1 <- aov(Dat$Productivity ~ Dat$AERND, data = Dat)
Anova1
## Call:
##    aov(formula = Dat$Productivity ~ Dat$AERND, data = Dat)
## 
## Terms:
##                 Dat$AERND Residuals
## Sum of Squares   20.12518  15.36222
## Deg. of Freedom         2        24
## 
## Residual standard error: 0.8000579
## Estimated effects may be unbalanced
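### Printing the aov object only shows sums of squares and degrees of freedom; summary() would add the mean squares, F statistic, and p-value for the AERND effect (not run here): ###
# summary(Anova1)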
StudentANova <- rstudent(Anova1)
StudentANova
##           1           2           3           4           5           6 
##  0.95574307  1.83766574 -0.10096362 -1.46233725  0.02884088 -0.36152734 
##           7           8           9          10          11          12 
## -0.75918652  1.09453127 -1.17276071 -1.98208267 -0.04260171  1.71973234 
##          13          14          15          16          17          18 
##  0.60106628 -0.42769096 -0.55753010  1.00091218 -0.29877835  0.21321055 
##          19          20          21          22          23          24 
##  0.73260142 -1.37370864  0.34166401 -0.95675396  0.67683065  1.24641682 
##          25          26          27 
## -2.03910796  0.53953368  0.40353458
source("https://raw.githubusercontent.com/athienit/STA4210material/main/check.R")
check(Anova1,tests=TRUE)
## Loading required package: lawstat
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:lawstat':
## 
##     levene.test
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## Warning in leveneTest.default(lm.object$model[, 1], lm.object$model[, 2]):
## lm.object$model[, 2] coerced to factor.

## $Independence
## $Independence[[1]]
## 
##  Runs Test - Two sided
## 
## data:  re
## Standardized Runs Statistic = 0.20382, p-value = 0.8385
## 
## 
## $Independence[[2]]
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.09048974      2.141167   0.984
##  Alternative hypothesis: rho != 0
## 
## 
## $Normality
## 
##  Shapiro-Wilk normality test
## 
## data:  re
## W = 0.97425, p-value = 0.7164
## 
## 
## [[3]]
## [1] "Constant Variance only valid if data are in groups"
## 
## $ConstantVar
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.0245 0.9758
##       24
ncvTest(lm(Dat$Productivity ~ Dat$AERND, data = Dat))
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.001721269, Df = 1, p = 0.96691
sdr <- rstudent(Anova1)
sdr[which(abs(sdr)>=abs(qt(0.05/length(sdr),Anova1$df.residual)))]
## named numeric(0)
hat <- hatvalues(Anova1)
hat[which(hat > 2*5 / length(sdr))]
## named numeric(0)
dftrt <- length(levels(Dat$AERND)) - 1
dffits(Anova1)[which(dffits(Anova1)>2*sqrt(dftrt/length(sdr)))]
## Warning in sqrt(dftrt/length(sdr)): NaNs produced
## named numeric(0)
cd <- cooks.distance(Anova1)
cd[which(cd>1)] 
## named numeric(0)
cd[which(cd>qf(0.5,dftrt,df.residual(Anova1)))]
## Warning in qf(0.5, dftrt, df.residual(Anova1)): NaNs produced
## named numeric(0)
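### The "NaNs produced" warnings above occur because Dat$AERND is still a character vector, so levels(Dat$AERND) is NULL and dftrt evaluates to -1. A sketch of the same influence cutoffs with the treatment degrees of freedom computed directly (assuming the three AERND groups); note the DFFITS rule is usually applied to the absolute value, and the 2p/n leverage rule would use p = 3 fitted parameters for this one-way model (not run here): ###
# dftrt <- length(unique(Dat$AERND)) - 1      # = 2
# dffits(Anova1)[which(abs(dffits(Anova1)) > 2*sqrt(dftrt/length(sdr)))]
# cd[which(cd > qf(0.5, dftrt, df.residual(Anova1)))]
# hat[which(hat > 2*3/length(sdr))]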
### It appears no assumptions were violated. The diagnostic plots look reasonable, the Shapiro-Wilk test supports normality, and Levene's test and the non-constant variance score test both support equal variances across groups. The residuals also appear independent, although because we are not sure of the chronology of the observations we cannot draw firm conclusions from the runs test. In addition, the studentized deleted residuals, hat values, DFFITS, and Cook's distances flag no observations, which suggests there are no outliers or influential points. That said, for thoroughness we will apply some remediation measures. ###
### Remediation ###
oneway.test(Dat$Productivity ~ Dat$AERND, data = Dat, var.equal = FALSE)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  Dat$Productivity and Dat$AERND
## F = 13.665, num df = 2.000, denom df = 12.507, p-value = 0.0007138
Bonferroni <- PostHocTest(Anova1, method = "bonferroni", conf.level = .9)
Bonferroni
## 
##   Posthoc multiple comparisons of means : Bonferroni 
##     90% family-wise confidence level
## 
## $`Dat$AERND`
##                diff    lwr.ci   upr.ci    pval    
## 2Med-1Low  1.255556 0.4590387 2.052072  0.0048 ** 
## 3High-1Low 2.322222 1.3702025 3.274242 3.5e-05 ***
## 3High-2Med 1.066667 0.1635014 1.969832  0.0405 *  
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
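### As a cross-check on the Bonferroni comparisons, base R's Tukey HSD procedure could be applied to the same fit (not run here): ###
# TukeyHSD(Anova1, conf.level = 0.9)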
Datiwls <- rlm(Dat$Productivity ~ Dat$AERND, data = Dat,wt.method="inv.var",method="M") 
summary(Datiwls)
## 
## Call: rlm(formula = Dat$Productivity ~ Dat$AERND, data = Dat, method = "M", 
##     wt.method = "inv.var")
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46085 -0.49973  0.05053  0.50000  1.35053 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)     6.8495  0.2889    23.7070
## Dat$AERND2Med   1.3005  0.3822     3.4027
## Dat$AERND3High  2.4114  0.4568     5.2786
## 
## Residual standard error: 0.8147 on 24 degrees of freedom
anova(Datiwls)
## Analysis of Variance Table
## 
## Response: Dat$Productivity
##           Df Sum Sq Mean Sq F value Pr(>F)
## Dat$AERND  2 21.017  10.508               
## Residuals    15.395
boxT <- powerTransform(Dat$Productivity ~ Dat$AERND)
summary(boxT)
## bcPower Transformation to Normality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1     1.215           1      -1.1016       3.5317
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                            LRT df    pval
## LR test, lambda = (0) 1.072026  1 0.30049
## 
## Likelihood ratio test that no transformation is needed
##                              LRT df    pval
## LR test, lambda = (1) 0.03321187  1 0.85539
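### Note: bcPower(Dat$Productivity, 0) below is the log transformation (lambda = 0); the LR tests above do not reject lambda = 1 either, so the transformed analysis is used purely as a robustness check. ###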
Dat$ProductivityB <- bcPower(Dat$Productivity,0)
ANova2 <- aov(Dat$ProductivityB~Dat$AERND,data=Dat)

kruskal.test(Dat$ProductivityB~Dat$AERND,data=Dat)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Dat$ProductivityB by Dat$AERND
## Kruskal-Wallis chi-squared = 14.956, df = 2, p-value = 0.0005655
### All of our remediation techniques give results consistent with the original analysis, which suggests the data already satisfy the model assumptions. ###
supp.labs <- c("Europe", "U.S.")
Advanced_Plot2 <- ggplot(Dat, aes(x = Dat$AERND, y = Dat$Productivity, fill=Dat$AERND)) + 
  ggdist::stat_halfeye(
    adjust = .9, 
    width = .6, 
    .width = 0, 
    justification = -.3, 
    point_colour = NA) + 
  geom_boxplot(
    width = .25, 
    outlier.shape = NA
  ) +
  geom_point(
    size = 1.3,
    alpha = .3,
    position = position_jitter(
      seed = 1, width = .1
    )
  ) + 
  coord_cartesian(xlim = c(1.2, NA), clip = "off")
Advanced_Plot2 <- Advanced_Plot2 + labs(x = "Average RND Expenditure", y = "Productivity", fill = "RND Level")
Advanced_Plot2 <- Advanced_Plot2 + facet_grid(. ~ Dat$Location)
Advanced_Plot2
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
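### supp.labs defined above is never applied to the facets; a sketch of using it to relabel the panels via a labeller, assuming the Location coding "1U.S." / "2Europe" (not run here): ###
# names(supp.labs) <- c("2Europe", "1U.S.")
# Advanced_Plot2 + facet_grid(. ~ Location, labeller = labeller(Location = supp.labs))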

Anova3 <- aov(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat)
Anova3
## Call:
##    aov(formula = Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat)
## 
## Terms:
##                 Dat$AERND Dat$Location Residuals
## Sum of Squares  20.125185     0.085988 15.276234
## Deg. of Freedom         2            1        23
## 
## Residual standard error: 0.8149749
## Estimated effects may be unbalanced
StudentANova3 <- rstudent(Anova3)
StudentANova3
##           1           2           3           4           5           6 
##  1.02412884  1.75797037 -0.18450666 -1.57359190 -0.05361679 -0.29473388 
##           7           8           9          10          11          12 
## -0.68991089  1.16432287 -1.10003933 -2.05603362 -0.10271783  1.64286701 
##          13          14          15          16          17          18 
##  0.53663140 -0.34529341 -0.47579137  1.10401301 -0.21558827  0.30126115 
##          19          20          21          22          23          24 
##  0.66696514 -1.43748278  0.27930863 -1.04318071  0.75897535  1.16895322 
##          25          26          27 
## -1.95806293  0.62004267  0.32784721
source("https://raw.githubusercontent.com/athienit/STA4210material/main/check.R")
check(Anova3,tests=TRUE)
## Warning in leveneTest.default(lm.object$model[, 1], lm.object$model[, 2]):
## lm.object$model[, 2] coerced to factor.

## $Independence
## $Independence[[1]]
## 
##  Runs Test - Two sided
## 
## data:  re
## Standardized Runs Statistic = 0.20382, p-value = 0.8385
## 
## 
## $Independence[[2]]
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.07988954      2.116816   0.812
##  Alternative hypothesis: rho != 0
## 
## 
## $Normality
## 
##  Shapiro-Wilk normality test
## 
## data:  re
## W = 0.96702, p-value = 0.5253
## 
## 
## [[3]]
## [1] "Constant Variance only valid if data are in groups"
## 
## $ConstantVar
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.0245 0.9758
##       24
ncvTest(lm(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat))
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.004745563, Df = 1, p = 0.94508
sdr <- rstudent(Anova3)
sdr[which(abs(sdr)>=abs(qt(0.05/length(sdr),Anova3$df.residual)))]
## named numeric(0)
hat <- hatvalues(Anova3)
hat[which(hat > 2*5 / length(sdr))]
## named numeric(0)
dftrt <- length(levels(Dat$AERND)) - 1
dffits(Anova3)[which(dffits(Anova3)>2*sqrt(dftrt/length(sdr)))]
## Warning in sqrt(dftrt/length(sdr)): NaNs produced
## named numeric(0)
cd <- cooks.distance(Anova3)
cd[which(cd>1)] 
## named numeric(0)
cd[which(cd>qf(0.5,dftrt,df.residual(Anova3)))]
## Warning in qf(0.5, dftrt, df.residual(Anova3)): NaNs produced
## named numeric(0)
### It appears that again all of the assumptions are met: every plot and test above suggests no assumption was violated (the "NaNs produced" warnings come from the same unassigned factor issue noted earlier, not from the diagnostics themselves). To ensure thoroughness we will again apply some remediation measures. ###
### Remediation ###
oneway.test(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat, var.equal = FALSE)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  Dat$Productivity and Dat$AERND + Dat$Location
## F = 4.1517, num df = 5.0000, denom df = 7.7549, p-value = 0.03883
Bonferroni <- PostHocTest(Anova3, method = "bonferroni", conf.level = .9)
Bonferroni
## 
##   Posthoc multiple comparisons of means : Bonferroni 
##     90% family-wise confidence level
## 
## $`Dat$AERND`
##                diff    lwr.ci   upr.ci    pval    
## 2Med-1Low  1.255556 0.4420391 2.069072  0.0059 ** 
## 3High-1Low 2.322222 1.3498841 3.294560 5.1e-05 ***
## 3High-2Med 1.066667 0.1442258 1.989108  0.0462 *  
## 
## $`Dat$Location`
##                    diff     lwr.ci    upr.ci   pval    
## 2Europe-1U.S. 0.1120879 -0.4258953 0.6500711 0.7243    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Datiwls <- rlm(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat,wt.method="inv.var",method="M") 
summary(Datiwls)
## 
## Call: rlm(formula = Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat, 
##     method = "M", wt.method = "inv.var")
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.509679 -0.466490 -0.009282  0.480839  1.290718 
## 
## Coefficients:
##                     Value   Std. Error t value
## (Intercept)          6.8163  0.3516    19.3855
## Dat$AERND2Med        1.3004  0.4162     3.1245
## Dat$AERND3High       2.4124  0.4942     4.8816
## Dat$Location2Europe  0.0930  0.3636     0.2557
## 
## Residual standard error: 0.727 on 23 degrees of freedom
anova(Datiwls)
## Analysis of Variance Table
## 
## Response: Dat$Productivity
##              Df  Sum Sq Mean Sq F value Pr(>F)
## Dat$AERND     2 20.9005 10.4503               
## Dat$Location  1  0.0547  0.0547               
## Residuals       15.3336
boxT <- powerTransform(Dat$Productivity ~ Dat$AERND + Dat$Location)
summary(boxT)
## bcPower Transformation to Normality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1    1.2659           1      -1.0615       3.5932
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                            LRT df    pval
## LR test, lambda = (0) 1.152434  1 0.28304
## 
## Likelihood ratio test that no transformation is needed
##                              LRT df    pval
## LR test, lambda = (1) 0.05032793  1 0.82249
Dat$ProductivityB <- bcPower(Dat$Productivity,0)
ANova4 <- aov(Dat$ProductivityB~Dat$AERND + Dat$Location,data=Dat)
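### ANova4 is created but not inspected; summary(ANova4) would report the F tests for AERND and Location on the log scale (not run here): ###
# summary(ANova4)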

### All of our remediation techniques give results consistent with the original analysis, which suggests the data already satisfy the model assumptions. ###
### 18.23 ###
### A. ####
  ### H0: all treatment means are equal; H1: at least one mean differs ###
  Decision_Rule <- qf(0.05, 2, 24, lower.tail=FALSE) ### Decision rule: critical value ###
  Decision_Rule
## [1] 3.402826
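  ### The Kruskal-Wallis statistic is referred to a chi-squared distribution with k - 1 = 2 degrees of freedom, so the matching critical value would be (not run here): ###
  # qchisq(0.95, df = 2)   # approximately 5.99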
  Krusk <- kruskal.test(Dat$Productivity~Dat$AERND,data=Dat)
  Krusk 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Dat$Productivity by Dat$AERND
## Kruskal-Wallis chi-squared = 14.956, df = 2, p-value = 0.0005655
  ### Given our decision rule and an alpha level of 0.05, the results favor H1, the alternative hypothesis that not all of the means are equal: the test statistic (14.956) exceeds the critical value and the p-value is smaller than alpha. ###
  ### These results are consistent with the parametric test from 16.7; both tests favor H1. Because the equal-variance assumption was not violated, the nonparametric test was probably not strictly necessary, but there is no downside to also running a Kruskal-Wallis test, and the consistent results lend extra weight to our conclusions. ###