## Question 1 - ANOVA: importing data

NFL <- read.csv("/Users/Lorraine/Desktop/Project 4 ANOVA.csv")
library(rmarkdown)
library(car)
library(compute.es)
library(ggplot2)
library(lme4)
library(multcomp)
library(pastecs)
library(psych)
library(sjstats)
library(WRS2)
library(lessR)
library(lm.beta)
library(QuantPsyc)

Checking for homogeneity of variance: the assumption was met (Levene's test, p = .598).

leveneTest(NFL$Weight, NFL$Team, center = median)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  4  0.6939 0.5984
##       80

Separating the 5 NFL teams into groups, one weight vector per team (an alternative using split() is sketched after the assignments below).

Chicago <- NFL[c(1:17),1]        # column 1 holds the Weight values; rows 1-17 are the Chicago players
Philadelphia <- NFL[c(18:34),1]
Dallas <- NFL[c(35:51),1]
Denver <- NFL[c(52:68),1]
NewEngland <- NFL[c(69:85),1]
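
A less index-dependent way to build the same groups is split(); a minimal sketch, assuming column 1 of NFL is the Weight column and that Team carries the five team labels used by leveneTest() above (output not shown):

teams <- split(NFL$Weight, NFL$Team)   # named list with one weight vector per team
Chicago_alt <- teams[["Chicago"]]      # "Chicago" is a hypothetical level name; use the labels actually stored in Team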

Checking the normality assumption for each team: only Chicago did not meet it (Shapiro-Wilk p = .029).

shapiro.test(Chicago)
## 
##  Shapiro-Wilk normality test
## 
## data:  Chicago
## W = 0.87711, p-value = 0.02855
shapiro.test(Philadelphia)
## 
##  Shapiro-Wilk normality test
## 
## data:  Philadelphia
## W = 0.9226, p-value = 0.1634
shapiro.test(Dallas)
## 
##  Shapiro-Wilk normality test
## 
## data:  Dallas
## W = 0.89936, p-value = 0.06627
shapiro.test(Denver)
## 
##  Shapiro-Wilk normality test
## 
## data:  Denver
## W = 0.89888, p-value = 0.06506
shapiro.test(NewEngland)
## 
##  Shapiro-Wilk normality test
## 
## data:  NewEngland
## W = 0.95771, p-value = 0.589
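
The five tests above could also be run in one pass over the Team grouping; a minimal sketch, assuming that grouping matches the row ranges used earlier (output not shown):

sapply(split(NFL$Weight, NFL$Team),
       function(w) shapiro.test(w)$p.value)   # Shapiro-Wilk p-value for each team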

Normalizing the Chicago distribution: because the dataset is small, I removed only two of the flagged outliers (220 and 221), which was enough to rectify the normality issue.

boxplot(Chicago)

boxplot(Chicago, plot = FALSE)$out   # values flagged as outliers by the boxplot rule
## [1] 228 221 223 220
Chicagotrimmed <- Chicago[Chicago > 221]   # keep weights above 221, dropping the two low outliers (220 and 221)
shapiro.test(Chicagotrimmed)
## 
##  Shapiro-Wilk normality test
## 
## data:  Chicagotrimmed
## W = 0.89142, p-value = 0.07046

Creating a new dataset for the normalized NFL data.

NFLN <- data.frame(cbind(Chicagotrimmed, Philadelphia, Dallas, Denver, NewEngland))
Stack_NFLN <- stack(NFLN)
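
One caveat with cbind() here: Chicagotrimmed has 15 values while the other vectors have 17, so cbind() recycles its first two values (with a warning), which is why the ANOVA below still shows 85 observations. A minimal sketch of a long-format construction that keeps the trimmed group at its true length (same objects assumed; Stack_alt is a new name, and with 83 rows the F statistic would differ slightly from the output shown):

Stack_alt <- stack(list(Chicago = Chicagotrimmed, Philadelphia = Philadelphia,
                        Dallas = Dallas, Denver = Denver, NewEngland = NewEngland))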

Running the ANOVA using the normalized Chicago data and the original data from the other 4 teams.

NFLModel <- aov(values ~ ind, data=Stack_NFLN)
summary(NFLModel)
##             Df Sum Sq Mean Sq F value Pr(>F)
## ind          4   3403   850.7   3.487 0.0112
## Residuals   80  19516   244.0
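
Since Chicago's normality problem was handled by trimming, another option, given that WRS2 is already loaded, would be a robust one-way ANOVA on trimmed means; a minimal sketch (output not shown):

t1way(values ~ ind, data = Stack_NFLN)   # heteroscedastic one-way ANOVA on 20%-trimmed means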

Because the ANOVA p-value (0.0112) was less than 0.05, running a Tukey post hoc test to identify which group(s) differ.

postHocs <- glht(NFLModel, linfct = mcp(ind = "Tukey"))
summary(postHocs)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = values ~ ind, data = Stack_NFLN)
## 
## Linear Hypotheses:
##                                     Estimate Std. Error t value Pr(>|t|)
## Philadelphia - Chicagotrimmed == 0   0.05882    5.35729   0.011  1.00000
## Dallas - Chicagotrimmed == 0         6.11765    5.35729   1.142  0.78378
## Denver - Chicagotrimmed == 0        -1.88235    5.35729  -0.351  0.99667
## NewEngland - Chicagotrimmed == 0   -13.23529    5.35729  -2.471  0.10777
## Dallas - Philadelphia == 0           6.05882    5.35729   1.131  0.78970
## Denver - Philadelphia == 0          -1.94118    5.35729  -0.362  0.99625
## NewEngland - Philadelphia == 0     -13.29412    5.35729  -2.482  0.10512
## Denver - Dallas == 0                -8.00000    5.35729  -1.493  0.56981
## NewEngland - Dallas == 0           -19.35294    5.35729  -3.612  0.00467
## NewEngland - Denver == 0           -11.35294    5.35729  -2.119  0.22218
## (Adjusted p values reported -- single-step method)
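
Base R's TukeyHSD() gives the same family of comparisons with adjusted confidence intervals for each pairwise difference; a minimal sketch (output not shown):

TukeyHSD(NFLModel)   # pairwise mean differences with 95% family-wise confidence intervals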

The only significant pairwise difference is between New England and Dallas (adjusted p = .0047); one-sample t-tests are run below to obtain each of these groups' mean and 95% CI for reporting.

t.test(NewEngland)
## 
##  One Sample t-test
## 
## data:  NewEngland
## t = 56.019, df = 16, p-value < 0.00000000000000022
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  228.7105 246.7013
## sample estimates:
## mean of x 
##  237.7059
t.test(Dallas)
## 
##  One Sample t-test
## 
## data:  Dallas
## t = 73.947, df = 16, p-value < 0.00000000000000022
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  249.6895 264.4282
## sample estimates:
## mean of x 
##  257.0588
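
The group means for all five teams can also be pulled directly from the stacked data; a minimal sketch (output not shown):

tapply(Stack_NFLN$values, Stack_NFLN$ind, mean)   # mean weight per group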

A one-way analysis of variance was used to test for differences in average player weight across these 5 NFL teams: Chicago, Philadelphia, Dallas, Denver, and New England. Significant weight differences were observed among the means of the five groups (F(4, 80) = 3.487, p < .05). Tukey post hoc comparisons indicated no significant differences between any pair of groups except New England (M = 237.71, 95% CI [228.71, 246.70]) and Dallas (M = 257.06, 95% CI [249.69, 264.43]). The comparison between these two groups revealed a significant difference in player weight (in lbs).

## Question 2 - Correlations: importing data and running correlations

NBA <- read.csv("/Users/Lorraine/Desktop/Project 4 MR.csv")
library(psych)
corr.test(NBA)
## Call:corr.test(x = NBA)
## Correlation matrix 
##        Height Weight   PTS
## Height   1.00   0.83 -0.07
## Weight   0.83   1.00 -0.01
## PTS     -0.07  -0.01  1.00
## Sample Size 
## [1] 54
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##        Height Weight PTS
## Height   0.00   0.00   1
## Weight   0.00   0.00   1
## PTS      0.62   0.94   0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

A strong positive correlation was found between Weight and Height (r = 0.83), a weak correlation between Height and PTS (r = -0.069), and the weakest correlation between Weight and PTS (r = -0.0098). The only significant correlation was between Height and Weight (p < .05).
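
As the corr.test() output notes, confidence intervals for these correlations can be printed with the short = FALSE option; a minimal sketch (output not shown):

print(corr.test(NBA), short = FALSE)   # adds confidence intervals for each correlation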

## Question 3 - Simple Regression: running a simple regression analysis predicting points from height

reg <- lm(PTS ~ Height, data=NBA)
summary(reg)
## 
## Call:
## lm(formula = PTS ~ Height, data = NBA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.713 -3.864 -1.395  1.644 15.444 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  17.6256    11.7428   1.501    0.139
## Height       -0.8858     1.7785  -0.498    0.621
## 
## Residual standard error: 5.942 on 52 degrees of freedom
## Multiple R-squared:  0.004748,   Adjusted R-squared:  -0.01439 
## F-statistic: 0.2481 on 1 and 52 DF,  p-value: 0.6205

The result was not significant (F(1, 52) = 0.2481, p > .05). The proportion of variance in points explained by height is only 0.0047 (about 0.5%). Points tend to decrease slightly as height increases: predicted PTS = 17.6256 - 0.8858(Height).
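
A quick visual check of this near-flat relationship can be made with ggplot2, which is already loaded; a minimal sketch (plot not shown):

ggplot(NBA, aes(x = Height, y = PTS)) +
  geom_point() +
  geom_smooth(method = "lm")   # fitted line for PTS ~ Height with its confidence band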

## Question 4 - Multiple Regression: running a multiple regression analysis predicting points from both height and weight

stat.desc(NBA)
##                    Height        Weight         PTS
## nbr.val       54.00000000    54.0000000  54.0000000
## nbr.null       0.00000000     0.0000000   0.0000000
## nbr.na         0.00000000     0.0000000   0.0000000
## min            5.70000000   105.0000000   2.8000000
## max            7.60000000   263.0000000  27.4000000
## range          1.90000000   158.0000000  24.6000000
## sum          355.70000000 11335.0000000 636.7000000
## median         6.65000000   212.5000000  10.7500000
## mean           6.58703704   209.9074074  11.7907407
## SE.mean        0.06244753     4.1185497   0.8027872
## CI.mean.0.95   0.12525389     8.2607646   1.6101872
## var            0.21058351   915.9723969  34.8012334
## std.dev        0.45889379    30.2650359   5.8992570
## coef.var       0.06966619     0.1441828   0.5003296
Mreg <- lm(PTS ~ Height + Weight, data=NBA)
summary(Mreg)
## 
## Call:
## lm(formula = PTS ~ Height + Weight, data = NBA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.801 -3.630 -1.222  1.326 14.668 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.28743   13.99607   1.592    0.117
## Height      -2.56738    3.24536  -0.791    0.433
## Weight       0.03056    0.04921   0.621    0.537
## 
## Residual standard error: 5.977 on 51 degrees of freedom
## Multiple R-squared:  0.01222,    Adjusted R-squared:  -0.02652 
## F-statistic: 0.3154 on 2 and 51 DF,  p-value: 0.7309
coefficients(reg) # coefficients of the simple regression model (reg) from Question 3
## (Intercept)      Height 
##  17.6256168  -0.8858119
confint(reg, level=0.95) # 95% CIs for the simple regression parameters
##                 2.5 %    97.5 %
## (Intercept) -5.938055 41.189288
## Height      -4.454601  2.682978
anova(reg) # ANOVA table for the simple regression
## Analysis of Variance Table
## 
## Response: PTS
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Height     1    8.76   8.758  0.2481 0.6205
## Residuals 52 1835.71  35.302
lm.beta(reg) # standardized (beta) weight for Height in the simple regression
##     Height 
## -0.0689059
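
The four calls above are run on reg, the simple regression model from Question 3; the same diagnostics apply to the multiple regression model Mreg, a minimal sketch (output not shown):

coefficients(Mreg)             # multiple regression coefficients
confint(Mreg, level = 0.95)    # 95% CIs for the Height and Weight coefficients
lm.beta(Mreg)                  # standardized (beta) weights for both predictors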

The overall model was not significant (F(2, 51) = 0.3154, p > .05), and neither predictor was significant individually. Height and weight together explained only about 1.2% of the variance in points (R-squared = 0.0122). Holding weight constant, points tend to decrease as height increases, and holding height constant, points tend to increase slightly as weight increases: predicted PTS = 22.28743 - 2.56738(Height) + 0.03056(Weight).
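
Because reg is nested in Mreg (both predict PTS for the same 54 cases), the two models can also be compared directly with an F test for the added predictor; a minimal sketch (output not shown):

anova(reg, Mreg)   # tests whether adding Weight significantly improves the fit over Height alone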