library(plyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(ggplot2)
library(ggdist)
library(ggplot2)
library(gghalves)
library(broom)
library(DescTools)
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:ggdist':
##
## Mode
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
setwd("C:/Users/lhomm/OneDrive/Documents/R")
Dat <- read.table("C:/Users/lhomm/OneDrive/Documents/R/productivitydata.txt",header=FALSE)
colnames(Dat) <- c("Productivity", "AERND", "Count", "Location")
factor(Dat$AERND)
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
## Levels: 1 2 3
Dat$AERND[Dat$AERND=="1"] <- "1Low"
Dat$AERND[Dat$AERND=="2"] <- "2Med"
Dat$AERND[Dat$AERND=="3"] <- "3High"
factor(Dat$Location)
## [1] 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 1 0
## Levels: 0 1
Dat$Location[Dat$Location=="1"] <- "1U.S."
Dat$Location[Dat$Location=="0"] <- "2Europe"
means <- tapply(Dat$Productivity, Dat$AERND, mean)
means
## 1Low 2Med 3High
## 6.877778 8.133333 9.200000
SDs <- tapply(Dat$Productivity, Dat$AERND, sd)
SDs
## 1Low 2Med 3High
## 0.8135997 0.7571878 0.8671793
stripchart(Dat$Productivity ~ Dat$AERND, data=Dat, method="stack", vertical=TRUE,
pch=1, cex=1.5, xlab="Factor", ylab="Productivity", main="Dotplot of Average RND Expenditures by Productivity")
title(sub="Exploratory dot-plot", adj=0, cex=5/6)
points(tapply(Dat$Productivity, Dat$AERND, mean),col=2,pch=8)
legend("topleft", legend = c("Observations", " Trt Mean"), col = c(1,2), text.col= "black",
lty=c(0,0),pch=c(1,8),bg='gray90', )

Advanced_Plot <- ggplot(Dat, aes(x = Dat$AERND, y = Dat$Productivity, fill=Dat$AERND)) +
ggdist::stat_halfeye(
adjust = .9,
width = .6,
.width = 0,
justification = -.3,
point_colour = NA) +
geom_boxplot(
width = .25,
outlier.shape = NA
) +
geom_point(
size = 1.3,
alpha = .3,
position = position_jitter(
seed = 1, width = .1
)
) +
coord_cartesian(xlim = c(1.2, NA), clip = "off")
Advanced_Plot <- Advanced_Plot + labs(x = "Average RND Expenditure", y = "Productivity", fill = "RND Level")
Advanced_Plot
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

Anova1 <- aov(Dat$Productivity ~ Dat$AERND, data = Dat)
Anova1
## Call:
## aov(formula = Dat$Productivity ~ Dat$AERND, data = Dat)
##
## Terms:
## Dat$AERND Residuals
## Sum of Squares 20.12518 15.36222
## Deg. of Freedom 2 24
##
## Residual standard error: 0.8000579
## Estimated effects may be unbalanced
StudentANova <- rstudent(Anova1)
StudentANova
## 1 2 3 4 5 6
## 0.95574307 1.83766574 -0.10096362 -1.46233725 0.02884088 -0.36152734
## 7 8 9 10 11 12
## -0.75918652 1.09453127 -1.17276071 -1.98208267 -0.04260171 1.71973234
## 13 14 15 16 17 18
## 0.60106628 -0.42769096 -0.55753010 1.00091218 -0.29877835 0.21321055
## 19 20 21 22 23 24
## 0.73260142 -1.37370864 0.34166401 -0.95675396 0.67683065 1.24641682
## 25 26 27
## -2.03910796 0.53953368 0.40353458
source("https://raw.githubusercontent.com/athienit/STA4210material/main/check.R")
check(Anova1,tests=TRUE)
## Loading required package: lawstat
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:lawstat':
##
## levene.test
## The following object is masked from 'package:DescTools':
##
## Recode
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Warning in leveneTest.default(lm.object$model[, 1], lm.object$model[, 2]):
## lm.object$model[, 2] coerced to factor.

## $Independence
## $Independence[[1]]
##
## Runs Test - Two sided
##
## data: re
## Standardized Runs Statistic = 0.20382, p-value = 0.8385
##
##
## $Independence[[2]]
## lag Autocorrelation D-W Statistic p-value
## 1 -0.09048974 2.141167 0.984
## Alternative hypothesis: rho != 0
##
##
## $Normality
##
## Shapiro-Wilk normality test
##
## data: re
## W = 0.97425, p-value = 0.7164
##
##
## [[3]]
## [1] "Constant Variance only valid if data are in groups"
##
## $ConstantVar
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.0245 0.9758
## 24
ncvTest(lm(Dat$Productivity ~ Dat$AERND, data = Dat))
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.001721269, Df = 1, p = 0.96691
sdr <- rstudent(Anova1)
sdr[which(abs(sdr)>=abs(qt(0.05/length(sdr),Anova1$df.residual)))]
## named numeric(0)
hat <- hatvalues(Anova1)
hat[which(hat > 2*5 / length(sdr))]
## named numeric(0)
dftrt <- length(levels(Dat$AERND)) - 1
dffits(Anova1)[which(dffits(Anova1)>2*sqrt(dftrt/length(sdr)))]
## Warning in sqrt(dftrt/length(sdr)): NaNs produced
## named numeric(0)
cd <- cooks.distance(Anova1)
cd[which(cd>1)]
## named numeric(0)
cd[which(cd>qf(0.5,dftrt,df.residual(Anova1)))]
## Warning in qf(0.5, dftrt, df.residual(Anova1)): NaNs produced
## named numeric(0)
### It appears no assumptions were violated. All of the plots look correct, the Shapiro test of normality supports the normality of our data, the Levees test and the Constant Variance test both support the equality of variances of our data and it appears that our data are independent but because we are not sure of the chronology of our data we can not draw conclusions from the runs test. Additionally the Sdr, hat, dftr, and cooks distances all are zero which suggests there are no outliers. All of that being said for thoroughness we will apply some remediation measures. ###
### Remediation ###
oneway.test(Dat$Productivity ~ Dat$AERND, data = Dat, var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: Dat$Productivity and Dat$AERND
## F = 13.665, num df = 2.000, denom df = 12.507, p-value = 0.0007138
Bonferroni <- PostHocTest(Anova1, method = "bonferroni", conf.level = .9)
Bonferroni
##
## Posthoc multiple comparisons of means : Bonferroni
## 90% family-wise confidence level
##
## $`Dat$AERND`
## diff lwr.ci upr.ci pval
## 2Med-1Low 1.255556 0.4590387 2.052072 0.0048 **
## 3High-1Low 2.322222 1.3702025 3.274242 3.5e-05 ***
## 3High-2Med 1.066667 0.1635014 1.969832 0.0405 *
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Datiwls <- rlm(Dat$Productivity ~ Dat$AERND, data = Dat,wt.method="inv.var",method="M")
summary(Datiwls)
##
## Call: rlm(formula = Dat$Productivity ~ Dat$AERND, data = Dat, method = "M",
## wt.method = "inv.var")
## Residuals:
## Min 1Q Median 3Q Max
## -1.46085 -0.49973 0.05053 0.50000 1.35053
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 6.8495 0.2889 23.7070
## Dat$AERND2Med 1.3005 0.3822 3.4027
## Dat$AERND3High 2.4114 0.4568 5.2786
##
## Residual standard error: 0.8147 on 24 degrees of freedom
anova(Datiwls)
## Analysis of Variance Table
##
## Response: Dat$Productivity
## Df Sum Sq Mean Sq F value Pr(>F)
## Dat$AERND 2 21.017 10.508
## Residuals 15.395
boxT <- powerTransform(Dat$Productivity ~ Dat$AERND)
summary(boxT)
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 1.215 1 -1.1016 3.5317
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 1.072026 1 0.30049
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 0.03321187 1 0.85539
Dat$ProductivityB <- bcPower(Dat$Productivity,0)
ANova2 <- aov(Dat$ProductivityB~Dat$AERND,data=Dat)
kruskal.test(Dat$ProductivityB~Dat$AERND,data=Dat)
##
## Kruskal-Wallis rank sum test
##
## data: Dat$ProductivityB by Dat$AERND
## Kruskal-Wallis chi-squared = 14.956, df = 2, p-value = 0.0005655
### All of our remedation techniques are consitant with the data and suggests that our data already follows our assumptions. ###
supp.labs <- c("Europe", "U.S.")
Advanced_Plot2 <- ggplot(Dat, aes(x = Dat$AERND, y = Dat$Productivity, fill=Dat$AERND)) +
ggdist::stat_halfeye(
adjust = .9,
width = .6,
.width = 0,
justification = -.3,
point_colour = NA) +
geom_boxplot(
width = .25,
outlier.shape = NA
) +
geom_point(
size = 1.3,
alpha = .3,
position = position_jitter(
seed = 1, width = .1
)
) +
coord_cartesian(xlim = c(1.2, NA), clip = "off")
Advanced_Plot2 <- Advanced_Plot2 + labs(x = "Average RND Expenditure", y = "Productivity", fill = "RND Level", )
Advanced_Plot2 <- Advanced_Plot2 + facet_grid(. ~ Dat$Location)
Advanced_Plot2
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.
## Warning: Use of `Dat$Productivity` is discouraged. Use `Productivity` instead.
## Warning: Use of `Dat$AERND` is discouraged. Use `AERND` instead.

Anova3 <- aov(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat)
Anova3
## Call:
## aov(formula = Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat)
##
## Terms:
## Dat$AERND Dat$Location Residuals
## Sum of Squares 20.125185 0.085988 15.276234
## Deg. of Freedom 2 1 23
##
## Residual standard error: 0.8149749
## Estimated effects may be unbalanced
StudentANova3 <- rstudent(Anova3)
StudentANova3
## 1 2 3 4 5 6
## 1.02412884 1.75797037 -0.18450666 -1.57359190 -0.05361679 -0.29473388
## 7 8 9 10 11 12
## -0.68991089 1.16432287 -1.10003933 -2.05603362 -0.10271783 1.64286701
## 13 14 15 16 17 18
## 0.53663140 -0.34529341 -0.47579137 1.10401301 -0.21558827 0.30126115
## 19 20 21 22 23 24
## 0.66696514 -1.43748278 0.27930863 -1.04318071 0.75897535 1.16895322
## 25 26 27
## -1.95806293 0.62004267 0.32784721
source("https://raw.githubusercontent.com/athienit/STA4210material/main/check.R")
check(Anova3,tests=TRUE)
## Warning in leveneTest.default(lm.object$model[, 1], lm.object$model[, 2]):
## lm.object$model[, 2] coerced to factor.

## $Independence
## $Independence[[1]]
##
## Runs Test - Two sided
##
## data: re
## Standardized Runs Statistic = 0.20382, p-value = 0.8385
##
##
## $Independence[[2]]
## lag Autocorrelation D-W Statistic p-value
## 1 -0.07988954 2.116816 0.812
## Alternative hypothesis: rho != 0
##
##
## $Normality
##
## Shapiro-Wilk normality test
##
## data: re
## W = 0.96702, p-value = 0.5253
##
##
## [[3]]
## [1] "Constant Variance only valid if data are in groups"
##
## $ConstantVar
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.0245 0.9758
## 24
ncvTest(lm(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat))
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.004745563, Df = 1, p = 0.94508
sdr <- rstudent(Anova3)
sdr[which(abs(sdr)>=abs(qt(0.05/length(sdr),Anova3$df.residual)))]
## named numeric(0)
hat <- hatvalues(Anova3)
hat[which(hat > 2*5 / length(sdr))]
## named numeric(0)
dftrt <- length(levels(Dat$AERND)) - 1
dffits(Anova3)[which(dffits(Anova3)>2*sqrt(dftrt/length(sdr)))]
## Warning in sqrt(dftrt/length(sdr)): NaNs produced
## named numeric(0)
cd <- cooks.distance(Anova3)
cd[which(cd>1)]
## named numeric(0)
cd[which(cd>qf(0.5,dftrt,df.residual(Anova3)))]
## Warning in qf(0.5, dftrt, df.residual(Anova3)): NaNs produced
## named numeric(0)
### It appears that again all of our assumptions have been met. All of our plots and tests suggests that no assumption was violated. And again to ensure thoroughness we will still apply some remediation measures. ###
### Remediation ###
oneway.test(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat, var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: Dat$Productivity and Dat$AERND + Dat$Location
## F = 4.1517, num df = 5.0000, denom df = 7.7549, p-value = 0.03883
Bonferroni <- PostHocTest(Anova3, method = "bonferroni", conf.level = .9)
Bonferroni
##
## Posthoc multiple comparisons of means : Bonferroni
## 90% family-wise confidence level
##
## $`Dat$AERND`
## diff lwr.ci upr.ci pval
## 2Med-1Low 1.255556 0.4420391 2.069072 0.0059 **
## 3High-1Low 2.322222 1.3498841 3.294560 5.1e-05 ***
## 3High-2Med 1.066667 0.1442258 1.989108 0.0462 *
##
## $`Dat$Location`
## diff lwr.ci upr.ci pval
## 2Europe-1U.S. 0.1120879 -0.4258953 0.6500711 0.7243
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Datiwls <- rlm(Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat,wt.method="inv.var",method="M")
summary(Datiwls)
##
## Call: rlm(formula = Dat$Productivity ~ Dat$AERND + Dat$Location, data = Dat,
## method = "M", wt.method = "inv.var")
## Residuals:
## Min 1Q Median 3Q Max
## -1.509679 -0.466490 -0.009282 0.480839 1.290718
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 6.8163 0.3516 19.3855
## Dat$AERND2Med 1.3004 0.4162 3.1245
## Dat$AERND3High 2.4124 0.4942 4.8816
## Dat$Location2Europe 0.0930 0.3636 0.2557
##
## Residual standard error: 0.727 on 23 degrees of freedom
anova(Datiwls)
## Analysis of Variance Table
##
## Response: Dat$Productivity
## Df Sum Sq Mean Sq F value Pr(>F)
## Dat$AERND 2 20.9005 10.4503
## Dat$Location 1 0.0547 0.0547
## Residuals 15.3336
boxT <- powerTransform(Dat$Productivity ~ Dat$AERND + Dat$Location)
summary(boxT)
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 1.2659 1 -1.0615 3.5932
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 1.152434 1 0.28304
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 0.05032793 1 0.82249
Dat$ProductivityB <- bcPower(Dat$Productivity,0)
ANova4 <- aov(Dat$ProductivityB~Dat$AERND + Dat$Location,data=Dat)
### All of our remedation techniques are consitant with the data and suggests that our data already follows our assumptions. ###
### 18.23 ###
### A. ####
### Ho: All means are equal H1: At least one mean is different ###
Decsion_Rule <- qf(0.05, 2, 24, lower.tail=FALSE) ### Decision Rule ###
Decsion_Rule
## [1] 3.402826
Krusk <- kruskal.test(Dat$Productivity~Dat$AERND,data=Dat)
Krusk
##
## Kruskal-Wallis rank sum test
##
## data: Dat$Productivity by Dat$AERND
## Kruskal-Wallis chi-squared = 14.956, df = 2, p-value = 0.0005655
### Given our decision rule and alpha level of .05 our results are favor H1 the alternative hypothesis that not all of the means are equal. Our Test stat is larger than our decision rule and our p-value is smaller than our alpha level. ###
### These results are consistent with the parametric test from 16.7; both tests favor H1. Given that the results are significant and because the assumption of equal variances was not violated it is unlikely that the parametric test was necessary, but there is no down side to also conducting a KW test and the consistent results lends extra weight to our conclusions. ###