QUESTION 1

# Load the ANOVA data set (a weight column and a Team column) from a local CSV.
# NOTE(review): the first column is referred to as `ï..Weight` throughout; that
# prefix looks like a UTF-8 byte-order mark that was not stripped on import.
# Presumably fileEncoding = "UTF-8-BOM" would yield a clean `Weight` name, but
# every downstream reference would have to be renamed together - confirm.
projectdata1<-read.csv(file="C://Users//TOSHIBA//Desktop//Statistics Course//StatisticsDataFiles//Project 4 ANOVA.csv", header=TRUE)
# Show the team names stored as the levels of the Team factor.
levels(projectdata1$Team)
## [1] "Chicago"      "Dallas"       "Denver"       "New England" 
## [5] "Philadelphia"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-team descriptive statistics for player weight.
# na.rm = TRUE is passed to every aggregate so that a missing weight cannot
# turn min/max into NA while mean/sd silently skip it. `count` is the number
# of rows per team.
projectdata1 %>%
  group_by(Team) %>%
  summarise(
    count = n(),
    mean = mean(ï..Weight, na.rm = TRUE),
    sd = sd(ï..Weight, na.rm = TRUE),
    min = min(ï..Weight, na.rm = TRUE),
    max = max(ï..Weight, na.rm = TRUE)
  )
## # A tibble: 5 x 6
##   Team         count  mean    sd   min   max
##   <fct>        <int> <dbl> <dbl> <int> <int>
## 1 Chicago         17  247.  15.3   220   266
## 2 Dallas          17  257.  14.3   216   281
## 3 Denver          17  249.  16.8   210   268
## 4 New England     17  238.  17.5   208   271
## 5 Philadelphia    17  251   17.1   222   275
library(psych)
# psych descriptive table (n, mean, sd, median, skew, kurtosis, se, ...)
# computed separately for every team.
describeBy(x = projectdata1, group = projectdata1$Team)
## 
##  Descriptive statistics by group 
## group: Chicago
##           vars  n   mean    sd median trimmed  mad min max range skew
## ï..Weight    1 17 247.18 15.34    250  247.73 7.41 220 266    46 -0.6
## Team*        2 17   1.00  0.00      1    1.00 0.00   1   1     0  NaN
##           kurtosis   se
## ï..Weight    -1.01 3.72
## Team*          NaN 0.00
## -------------------------------------------------------- 
## group: Dallas
##           vars  n   mean    sd median trimmed mad min max range  skew
## ï..Weight    1 17 257.06 14.33    256   258.2 8.9 216 281    65 -1.04
## Team*        2 17   2.00  0.00      2     2.0 0.0   2   2     0   NaN
##           kurtosis   se
## ï..Weight     1.68 3.48
## Team*          NaN 0.00
## -------------------------------------------------------- 
## group: Denver
##           vars  n   mean    sd median trimmed   mad min max range skew
## ï..Weight    1 17 249.06 16.76    253   250.4 16.31 210 268    58 -0.8
## Team*        2 17   3.00  0.00      3     3.0  0.00   3   3     0  NaN
##           kurtosis   se
## ï..Weight    -0.55 4.07
## Team*          NaN 0.00
## -------------------------------------------------------- 
## group: New England
##           vars  n   mean   sd median trimmed   mad min max range skew
## ï..Weight    1 17 237.71 17.5    234  237.47 19.27 208 271    63 0.13
## Team*        2 17   4.00  0.0      4    4.00  0.00   4   4     0  NaN
##           kurtosis   se
## ï..Weight    -1.22 4.24
## Team*          NaN 0.00
## -------------------------------------------------------- 
## group: Philadelphia
##           vars  n mean    sd median trimmed   mad min max range  skew
## ï..Weight    1 17  251 17.08    254  251.33 19.27 222 275    53 -0.41
## Team*        2 17    5  0.00      5    5.00  0.00   5   5     0   NaN
##           kurtosis   se
## ï..Weight    -1.24 4.14
## Team*          NaN 0.00
library("ggpubr")
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: magrittr
# Side-by-side boxplots of weight for the five teams, one colour per team.
ggboxplot(
  data = projectdata1,
  x = "Team",
  y = "ï..Weight",
  xlab = "Team",
  ylab = "Weight",
  order = c("Chicago", "Dallas", "Denver", "New England", "Philadelphia"),
  color = "Team",
  palette = c("#00AFBB", "#E7B800", "#FC4E07", "maroon2", "seagreen3")
)

# Normal Q-Q plot with its reference line for each team's weights, drawn in
# the order Chicago, Dallas, Denver, New England, Philadelphia.
invisible(lapply(
  c("Chicago", "Dallas", "Denver", "New England", "Philadelphia"),
  function(team) {
    team_weight <- projectdata1$ï..Weight[projectdata1$Team == team]
    qqnorm(team_weight)
    qqline(team_weight)
  }
))

# Histogram of player weight, one plot per team.
# NOTE(review): hist() derives its default title and x-label from the argument
# expression, so the subsetting is kept verbatim in each call; refactoring into
# a loop would presumably change the plot titles - confirm before changing.
hist(projectdata1[projectdata1$Team =="Chicago",]$ï..Weight)

hist(projectdata1[projectdata1$Team =="Dallas",]$ï..Weight)

hist(projectdata1[projectdata1$Team =="Denver",]$ï..Weight)

hist(projectdata1[projectdata1$Team =="New England",]$ï..Weight)

hist(projectdata1[projectdata1$Team =="Philadelphia",]$ï..Weight)

The weight distribution of the Chicago group does not appear to be normal (M = 247.18, SD = 15.34, min = 220, max = 266). Running a Shapiro-Wilk test for each team:

# Shapiro-Wilk normality test on the Chicago players' weights.
shapiro.test(projectdata1[projectdata1$Team =="Chicago",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "Chicago", ]$ï..Weight
## W = 0.87711, p-value = 0.02855
# Shapiro-Wilk normality test on the Dallas players' weights.
shapiro.test(projectdata1[projectdata1$Team =="Dallas",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "Dallas", ]$ï..Weight
## W = 0.89936, p-value = 0.06627
# Shapiro-Wilk normality test on the Denver players' weights.
shapiro.test(projectdata1[projectdata1$Team =="Denver",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "Denver", ]$ï..Weight
## W = 0.89888, p-value = 0.06506
# Shapiro-Wilk normality test on the New England players' weights.
shapiro.test(projectdata1[projectdata1$Team =="New England",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "New England", ]$ï..Weight
## W = 0.95771, p-value = 0.589
# Shapiro-Wilk normality test on the Philadelphia players' weights.
shapiro.test(projectdata1[projectdata1$Team =="Philadelphia",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "Philadelphia", ]$ï..Weight
## W = 0.9226, p-value = 0.1634

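The Shapiro-Wilk test indicated that the weight distribution of the Chicago team is significantly different from a normal distribution (W = 0.877, p = .029). The weight distributions of the other four teams did not differ significantly from normality (all p > .05).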

Winsorizing the weight distribution for the Chicago team; loading the required package:

library(robustHD)
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:psych':
## 
##     cushny

Running winsorize function:

# Store the weights as doubles before writing winsorized values back.
projectdata1$ï..Weight <- as.numeric(projectdata1$ï..Weight)

# Winsorize the Chicago weights only, using the median as centre, the MAD as
# scale and a cut-off constant of 2; the other teams are left untouched.
projectdata1$ï..Weight[projectdata1$Team == "Chicago"] <- robustHD::winsorize(
  projectdata1$ï..Weight[projectdata1$Team == "Chicago"],
  standardized = FALSE,
  centerFun = median,
  scaleFun = mad,
  const = 2
)

Checking the winsorized distribution of weights for the level Chicago:

# Boxplot of the Chicago weights after winsorizing.
boxplot(projectdata1[projectdata1$Team =="Chicago",]$ï..Weight)

# Repeat the Shapiro-Wilk normality test on the winsorized Chicago weights.
shapiro.test(projectdata1[projectdata1$Team =="Chicago",]$ï..Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata1[projectdata1$Team == "Chicago", ]$ï..Weight
## W = 0.90275, p-value = 0.07553
library(psych)
# Descriptive statistics of the winsorized Chicago weights.
describe(projectdata1$ï..Weight[projectdata1$Team == "Chicago"])
##    vars  n   mean    sd median trimmed  mad    min    max range  skew
## X1    1 17 249.96 10.56    250  249.96 7.41 235.17 264.83 29.65 -0.07
##    kurtosis   se
## X1     -1.3 2.56

The Shapiro-Wilk test for the winsorized distribution of weights (level = “Chicago”) was not significant (W = 0.903, p = .076; M = 249.96, SD = 10.56, min = 235.17, max = 264.83).

Testing for homogeneity of variance:

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
# Levene's test for equal weight variances across the five teams.
with(projectdata1, leveneTest(y = ï..Weight, group = Team))
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  4   1.401 0.2412
##       80

Levene's test is not significant (F(4, 80) = 1.40, p = .241), therefore we can assume homogeneity of variance.

Running the one-way ANOVA:

# One-way ANOVA of weight on team. By this point the Chicago weights in
# projectdata1 have been replaced by their winsorized values.
ANOVA.model <- aov(ï..Weight ~ Team, data=projectdata1)
# Print the ANOVA table (Df, sums of squares, F value, p-value).
summary(ANOVA.model)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Team         4   3356   839.0   3.509 0.0108 *
## Residuals   80  19131   239.1                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking the effect size by omega-squared:

library(sjstats)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## Attaching package: 'sjstats'
## The following objects are masked from 'package:psych':
## 
##     pca, phi

Running omega-squared function:

# Omega-squared effect size for the Team term of the one-way ANOVA.
# NOTE(review): omega_sq() appears to have been dropped from recent sjstats
# releases in favour of effectsize::omega_squared() - confirm against the
# installed version before re-running.
omega_sq(ANOVA.model)
##   term omegasq
## 1 Team   0.106

ω² is .106, which indicates a medium effect size (Field (2013) suggests that values of omega squared between 0.06 and 0.14 can be considered a medium effect size).

A one-way analysis of variance was used to test for differences in weight between the players of five NFL teams. Significant differences were observed between the means of the five groups (F(4, 80) = 3.51, p = .011, ω² = .106).

Post-hoc test: Since the sample sizes are equal and Levene's test for homogeneity of variance is non-significant, Tukey’s HSD test can be applied. Loading the necessary packages:

library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:robustbase':
## 
##     heart
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
# All pairwise (Tukey) contrasts between teams on the fitted ANOVA model.
posthoc.model <- glht(ANOVA.model, linfct = mcp(Team = "Tukey"))
# Print estimates, standard errors and adjusted p-values for each contrast.
summary(posthoc.model)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = ï..Weight ~ Team, data = projectdata1)
## 
## Linear Hypotheses:
##                                 Estimate Std. Error t value Pr(>|t|)   
## Dallas - Chicago == 0             7.0972     5.3042   1.338  0.66846   
## Denver - Chicago == 0            -0.9028     5.3042  -0.170  0.99981   
## New England - Chicago == 0      -12.2558     5.3042  -2.311  0.15234   
## Philadelphia - Chicago == 0       1.0384     5.3042   0.196  0.99967   
## Denver - Dallas == 0             -8.0000     5.3042  -1.508  0.56021   
## New England - Dallas == 0       -19.3529     5.3042  -3.649  0.00414 **
## Philadelphia - Dallas == 0       -6.0588     5.3042  -1.142  0.78360   
## New England - Denver == 0       -11.3529     5.3042  -2.140  0.21358   
## Philadelphia - Denver == 0        1.9412     5.3042   0.366  0.99610   
## Philadelphia - New England == 0  13.2941     5.3042   2.506  0.09939 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
# Tukey HSD on the same model: pairwise mean differences between teams with
# 95% family-wise confidence intervals and adjusted p-values.
tukey.test <- TukeyHSD(ANOVA.model, which = "Team", conf.level = 0.95)
print(tukey.test)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ï..Weight ~ Team, data = projectdata1)
## 
## $Team
##                                 diff        lwr       upr     p adj
## Dallas-Chicago             7.0971765  -7.706591 21.900944 0.6684707
## Denver-Chicago            -0.9028235 -15.706591 13.900944 0.9998078
## New England-Chicago      -12.2557647 -27.059532  2.548003 0.1522568
## Philadelphia-Chicago       1.0383529 -13.765414 15.842120 0.9996653
## Denver-Dallas             -8.0000000 -22.803767  6.803767 0.5602442
## New England-Dallas       -19.3529412 -34.156709 -4.549174 0.0041642
## Philadelphia-Dallas       -6.0588235 -20.862591  8.744944 0.7835870
## New England-Denver       -11.3529412 -26.156709  3.450826 0.2135037
## Philadelphia-Denver        1.9411765 -12.862591 16.744944 0.9960973
## Philadelphia-New England  13.2941176  -1.509650 28.097885 0.0993971

Tukey post hoc comparisons of the groups indicated that the average weight of the New England players (M = 237.71, SD = 17.50) was significantly lower than the average weight of the Dallas players (M = 257.06, SD = 14.33), mean difference = -19.35, 95% CI [-34.16, -4.55], adjusted p = .004. Confidence intervals for the individual group means could not be calculated, so the interval for the difference from the Tukey output is reported instead. A comparison between the weights of the other teams’ players revealed no significant differences.

QUESTION 2.1

# Load the regression data set from a local CSV; later code uses its
# ï..Height, Weight and PTS columns.
# NOTE(review): the `ï..` prefix on the first column name looks like an
# unstripped UTF-8 byte-order mark - confirm the file encoding.
projectdata2<-read.csv(file="C://Users//TOSHIBA//Desktop//Statistics Course//StatisticsDataFiles//Project 4 MR.csv", header=TRUE)

Checking the distributions:

# Descriptive statistics for player height.
describe(projectdata2[["ï..Height"]])
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 54 6.59 0.46   6.65    6.58 0.52 5.7 7.6   1.9 0.02    -0.87 0.06
# Descriptive statistics for player weight.
describe(projectdata2[["Weight"]])
##    vars  n   mean    sd median trimmed   mad min max range  skew kurtosis
## X1    1 54 209.91 30.27  212.5  211.34 33.36 105 263   158 -0.72     0.83
##      se
## X1 4.12
# Descriptive statistics for points scored.
describe(projectdata2[["PTS"]])
##    vars  n  mean  sd median trimmed  mad min  max range skew kurtosis  se
## X1    1 54 11.79 5.9  10.75   11.26 4.23 2.8 27.4  24.6 0.86     0.15 0.8
# Histograms of height, weight and points to inspect the shape of each
# distribution.
hist(projectdata2$ï..Height)

hist(projectdata2$Weight)

hist(projectdata2$PTS)

# Boxplots of the same three variables to look for outliers.
boxplot(projectdata2$ï..Height)

boxplot(projectdata2$Weight)

boxplot(projectdata2$PTS)

# Shapiro-Wilk normality test for height.
shapiro.test(projectdata2$ï..Height)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata2$ï..Height
## W = 0.97479, p-value = 0.3102
# Shapiro-Wilk normality test for weight.
shapiro.test(projectdata2$Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata2$Weight
## W = 0.93677, p-value = 0.006853
# Shapiro-Wilk normality test for points scored.
shapiro.test(projectdata2$PTS)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata2$PTS
## W = 0.92046, p-value = 0.001556

Shapiro-Wilk test result is significant for Weight and Points distributions of NBA Players. Outlier treatment for Weight and Points distributions by winsorizing the data:

# Add winsorized copies of Weight and PTS as new columns Weight.w and PTS.w
# (median centre, MAD scale, cut-off constant 2); the raw columns are kept.
projectdata2[c("Weight.w", "PTS.w")] <- lapply(
  projectdata2[c("Weight", "PTS")],
  robustHD::winsorize,
  standardized = FALSE,
  centerFun = median,
  scaleFun = mad,
  const = 2
)

Checking with Shapiro-Wilk test:

# Shapiro-Wilk normality test for the winsorized weights.
shapiro.test(projectdata2$Weight.w)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata2$Weight.w
## W = 0.95466, p-value = 0.03983
# Shapiro-Wilk normality test for the winsorized points.
shapiro.test(projectdata2$PTS.w)
## 
##  Shapiro-Wilk normality test
## 
## data:  projectdata2$PTS.w
## W = 0.93467, p-value = 0.005621

After winsorizing the data, the Shapiro-Wilk test results are still significant for both Weight and Points.

# Scatterplot matrix of height, winsorized weight and winsorized points,
# laid out with the first variable in the bottom row.
pairs(
  projectdata2[, c("ï..Height", "Weight.w", "PTS.w")],
  row1attop = FALSE
)

The scatterplot indicates a positive linear relation between height and weight.

# Keep only the three analysis columns, then compute the pairwise
# correlations with their p-values.
variables <- projectdata2[, c("ï..Height", "Weight.w", "PTS.w")]
corr.test(variables)
## Warning in abbreviate(colnames(r), minlength = minlength): abbreviate used
## with non-ASCII chars
## Call:corr.test(x = variables)
## Correlation matrix 
##           ï..Height Weight.w PTS.w
## ï..Height      1.00     0.86 -0.12
## Weight.w       0.86     1.00 -0.09
## PTS.w         -0.12    -0.09  1.00
## Sample Size 
## [1] 54
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           ï..Height Weight.w PTS.w
## ï..Height       0.0     0.00   0.8
## Weight.w        0.0     0.00   0.8
## PTS.w           0.4     0.53   0.0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Descriptive statistics of the winsorized weights (used in the write-up).
psych::describe(projectdata2[["Weight.w"]])
##    vars  n   mean    sd median trimmed   mad    min max  range  skew
## X1    1 54 210.66 28.02  212.5  211.34 33.36 145.78 263 117.22 -0.24
##    kurtosis   se
## X1     -0.9 3.81

Results of the Pearson correlation indicated that there was a significant positive association between weights (M = 210.66, SD = 28.02) and heights (M = 6.59, SD = 0.46) of NBA players (r(52) = .86, p < .001). No significant relationship was found between the points scored by NBA players and their weights, r(52) = -.09, n.s. No significant relationship was found between the points scored by NBA players and their heights, r(52) = -.12, n.s.

QUESTION 2.2

# Scatterplot of winsorized points against height with a fitted linear
# regression line. Written with ggplot() because qplot() is deprecated in
# current ggplot2 releases; the resulting plot is the same.
ggplot(projectdata2, aes(x = ï..Height, y = PTS.w)) +
  geom_point() +
  stat_smooth(method = "lm")

# Pearson correlation between height and winsorized points.
with(projectdata2, cor(ï..Height, PTS.w))
## [1] -0.1170158

Results of the Pearson correlation indicated that there was not a significant association between the heights of NBA players and the points scored by them, r=-.12, n.s.

# Simple linear regression: winsorized points predicted from height.
reg1 <- lm(PTS.w ~ ï..Height, data=projectdata2)
# Print coefficients, R-squared and the overall F test.
summary(reg1)
## 
## Call:
## lm(formula = PTS.w ~ ï..Height, data = projectdata2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0490 -3.1623 -0.9165  2.1427  8.7168 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   19.244      9.454   2.035   0.0469 *
## ï..Height     -1.217      1.432  -0.850   0.3994  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.784 on 52 degrees of freedom
## Multiple R-squared:  0.01369,    Adjusted R-squared:  -0.005275 
## F-statistic: 0.7219 on 1 and 52 DF,  p-value: 0.3994
# 95% confidence intervals for the intercept and the height slope.
confint(reg1, level = 0.95)
##                  2.5 %    97.5 %
## (Intercept)  0.2718786 38.215441
## ï..Height   -4.0899542  1.656713

Simple linear regression analysis was used to test if the heights of NBA players significantly predicted the points scored by NBA players.

The results of the regression indicated that the heights of NBA players explained 1.4% of the variance in the points scored by NBA players and did not significantly predict the points scored by NBA players (R² = .014, F(1, 52) = 0.72, p = .399).

QUESTION 2.3

# Multiple linear regression: winsorized points predicted from winsorized
# weight and height.
reg2 <- lm(PTS.w ~ Weight.w + ï..Height, data = projectdata2)
# Print coefficients, R-squared and the overall F test.
summary(reg2)
## 
## Call:
## lm(formula = PTS.w ~ Weight.w + ï..Height, data = projectdata2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.073 -3.271 -0.845  2.066  8.825 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 20.383734  11.420338   1.785   0.0802 .
## Weight.w     0.008429   0.046375   0.182   0.8565  
## ï..Height   -1.659272   2.832000  -0.586   0.5605  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.829 on 51 degrees of freedom
## Multiple R-squared:  0.01433,    Adjusted R-squared:  -0.02432 
## F-statistic: 0.3708 on 2 and 51 DF,  p-value: 0.6921
# Coefficient table (estimate, standard error, t value, p-value) at full
# precision. The element is spelled out as `coefficients` rather than relying
# on `$` partial matching of the abbreviated name.
summary(reg2)$coefficients
##                 Estimate  Std. Error    t value   Pr(>|t|)
## (Intercept) 20.383734247 11.42033776  1.7848626 0.08023174
## Weight.w     0.008429046  0.04637473  0.1817595 0.85649200
## ï..Height   -1.659271584  2.83200043 -0.5859009 0.56052619

Multiple regression analysis was used to test if the height and weight of NBA players significantly predicted the points scored by NBA players.

The results of the regression indicated that the two predictors explained 1.4% of the variance (R² = .014, F(2, 51) = 0.37, p = .692). It was found that the heights of NBA players did not significantly predict the points scored by NBA players (b = -1.66, p = .561), and neither did the weights of NBA players (b = 0.008, p = .857).

References: Field, A. (2013). Discovering statistics using IBM SPSS statistics. Sage.