library(tidyverse)
library(palmerpenguins)
library(ggfortify)
library(dplyr)
# ---- Squirrels: male vs. female weights ----
# Read the squirrel weights. The str() output below shows a data frame with
# two numeric columns, MALE and FEMALE, and 50 rows.
squirrels <- read.csv(
  "C:/Users/paul/OneDrive/Documents/ES3307/squirrels.csv",
  header = TRUE
)

# Quick look at the first rows, the column types and the summary statistics.
head(squirrels)
## MALE FEMALE
## 1 0.41 0.40
## 2 0.38 0.39
## 3 1.13 0.59
## 4 0.61 0.62
## 5 0.73 0.53
## 6 0.56 0.34
str(squirrels)
## 'data.frame': 50 obs. of 2 variables:
## $ MALE : num 0.41 0.38 1.13 0.61 0.73 0.56 0.62 0.75 0.5 0.75 ...
## $ FEMALE: num 0.4 0.39 0.59 0.62 0.53 0.34 0.41 0.59 0.46 0.63 ...
summary(squirrels)
## MALE FEMALE
## Min. :0.2900 Min. :0.320
## 1st Qu.:0.4350 1st Qu.:0.410
## Median :0.5600 Median :0.495
## Mean :0.5908 Mean :0.518
## 3rd Qu.:0.7025 3rd Qu.:0.605
## Max. :1.1300 Max. :0.850

# Force both weight columns to numeric before plotting and testing.
squirrels[["FEMALE"]] <- as.numeric(squirrels[["FEMALE"]])
squirrels[["MALE"]] <- as.numeric(squirrels[["MALE"]])

# Histograms side by side (1 row, 2 columns), then back to one panel.
par(mfrow = c(1, 2))
hist(
  squirrels[["FEMALE"]],
  main = "Female Squirrels", xlab = "Weight",
  col = "lightpink", border = "black"
)
hist(
  squirrels[["MALE"]],
  main = "Male Squirrels", xlab = "Weight",
  col = "lightblue", border = "black"
)
par(mfrow = c(1, 1))

# Box plots of the two groups on a common axis to compare spread.
boxplot(
  squirrels[["FEMALE"]], squirrels[["MALE"]],
  main = "Weight of Squirrels",
  ylab = "Weight",
  names = c("Female", "Male"),
  col = c("lightpink", "lightblue")
)

# Parametric comparison of the two columns.
# NOTE(review): this t-test is paired (row i of MALE is matched with row i of
# FEMALE), whereas the Wilcoxon test further down is the unpaired rank-sum
# form. Confirm whether the rows really are matched pairs and make the two
# tests use the same design.
t.test(squirrels$MALE, squirrels$FEMALE, paired = TRUE)
##
## Paired t-test
##
## data: squirrels$MALE and squirrels$FEMALE
## t = 2.1415, df = 49, p-value = 0.03723
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.004483996 0.141116004
## sample estimates:
## mean difference
## 0.0728

# Non-parametric comparison (unpaired Wilcoxon rank-sum / Mann-Whitney).
wilcox.test(squirrels$FEMALE, squirrels$MALE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: squirrels$FEMALE and squirrels$MALE
## W = 1012, p-value = 0.1014
## alternative hypothesis: true location shift is not equal to 0
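#The variances around the two means are visibly different in the box plot, suggesting heterogeneity of variance. The histograms are slightly right-skewed, suggesting the data are not normally distributed
#The paired t-test reports a significant difference between the weights of male and female squirrels (paired t-test: t49=2.14, p=0.037)
#The Wilcoxon rank sum test finds no significant difference between the weights of female and male squirrels (Wilcoxon rank sum: W=1012, p=0.101)
#NOTE(review): an earlier version of these notes quoted t49=1.95, p=0.057 and W=1019, P=0.112, which do not match the output above; the figures here are taken from that output
#The t-test is the wrong test for these data because its assumptions are not met. The notes originally concluded a Type III error (null hypothesis correctly accepted, but for the wrong reason because the wrong test was used); with the output above the t-test actually rejects the null hypothesis while the Wilcoxon test does not, so that conclusion should be revisited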
# ---- Melons: does yield differ among varieties? ----
# Read the melon data; the analysis below uses the columns YIELDM (yield)
# and VARIETY (variety code).
melons <- read.csv("C:/Users/paul/OneDrive/Documents/ES3307/melons.csv", header = TRUE)

# VARIETY is a group label, so it is stored as a factor and stays a factor.
# Turning it back into a number makes lm() fit one straight-line slope through
# the variety codes (1 numerator df) rather than comparing the variety means.
melons$VARIETY <- as.factor(melons$VARIETY)
melons$YIELDM <- as.numeric(melons$YIELDM)

# Yield distribution per variety, one colour per factor level.
boxplot(YIELDM ~ VARIETY, data = melons,
        col = rainbow(nlevels(melons$VARIETY)),
        main = "Yield by Melon Variety",
        xlab = "Variety",
        ylab = "Yield (kg)")

# Summary statistics for yield and the number of observations per variety.
summary(melons)

# A t.test() on the whole data frame was removed: it pooled the yields and the
# variety codes into a single sample and tested whether their mean is zero,
# which says nothing about differences between varieties.

# One-way ANOVA fitted as a linear model with VARIETY as a factor.
# anova() gives the F test across all varieties; summary() gives the
# per-variety contrasts against the first level.
model1 <- lm(YIELDM ~ VARIETY, data = melons)
anova(model1)
summary(model1)

# NOTE: the console output that used to be pasted in this section (a
# one-sample t-test with df = 43 and an lm() fit with "1 and 20 DF") came from
# the numeric coding of VARIETY and no longer applies. Re-run the script to
# regenerate the output for the calls above.

# Standard lm diagnostic plots in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so later plots are not squeezed
# into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)

# Interpretation of the diagnostic plots.
# NOTE(review): these observations were written for the earlier fit in which
# VARIETY was numeric; re-check them against the plots of the factor model.
# The Residuals vs Fitted plot doesn't show evenly spread points and a
# "zig-zag" pattern can be seen, suggesting heterogeneity of variance. The
# Q-Q Residuals plot shows the residuals following the reference line
# relatively well; however, it is unclear from this plot whether the data are
# normally distributed, as there is some deviation from the line. The
# Scale-Location plot suggests heterogeneity of the residuals, as a
# "v-shaped" pattern can be seen. The Residuals vs Leverage plot shows a small
# group of influential points which may influence the regression results.
# Overall, the diagnostic plots suggest that a non-parametric test should be
# used on this data (kruskal.test(YIELDM ~ VARIETY, data = melons) is the
# rank-based counterpart of the one-way ANOVA).

# Conclusion.
# TODO: report F, both df and p from anova(model1) above. The original
# write-up concluded that there is no significant difference between the melon
# yields among the different varieties and quoted F1,20=0.854, p=0.367; those
# figures came from the fit with VARIETY as a numeric code, so confirm the
# conclusion with the factor model before reporting it.
# ---- Trees: number of flowers by sex ----
# Read the tree data. Note that the name `trees` shadows the built-in
# datasets::trees for the rest of the session.
trees <- read.csv(
  "C:/Users/paul/OneDrive/Documents/ES3307/trees.csv",
  header = TRUE
)

# Recode SEX from the numeric codes 1/2 to a labelled factor.
# Any value other than 1 or 2 becomes NA.
trees[["SEX"]] <- factor(
  trees[["SEX"]],
  levels = c(1, 2),
  labels = c("Male", "Female")
)

# Flower counts for each sex.
boxplot(
  FLOWERS ~ SEX,
  data = trees,
  main = "Number of Flowers by Sex",
  xlab = "Sex",
  ylab = "Number of Flowers",
  col = c("lightblue", "lightpink")
)

# Shapiro-Wilk normality test run separately within each sex.
by(trees$FLOWERS, trees$SEX, shapiro.test)
## trees$SEX: Male
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.91808, p-value = 0.09102
##
## ------------------------------------------------------------
## trees$SEX: Female
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.83177, p-value = 0.0002634

# Histogram of FLOWERS for all trees together (not split by sex).
hist(
  trees$FLOWERS,
  main = "histogram of number of flowers",
  xlab = "number of flowers"
)
# The Shapiro-Wilk test rejects normality for the female trees (W=0.832,
# p<0.001) but not for the male trees (W=0.918, p=0.091). The original notes
# also read the histogram as non-normal, so the non-parametric Mann-Whitney
# (Wilcoxon rank sum) test is used to compare the sexes.
wilcox.test(
  FLOWERS ~ SEX,
  data = trees,
  alternative = "two.sided"
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: FLOWERS by SEX
## W = 298, p-value = 0.9763
## alternative hypothesis: true location shift is not equal to 0
#Male and female trees do not produce significantly different numbers of flowers (Wilcoxon rank sum test: W=298, p=0.976)