Q1)

a)

# Load the tomato data and separate the yields by fertiliser group
# (group codes 1 and 2 in the `fertiliser` column).
tomato <- read.table(file = "tomato.txt", header = TRUE)
yield1 <- tomato$yield[tomato$fertiliser == 1]
yield2 <- tomato$yield[tomato$fertiliser == 2]

# Sample means and variances

paste(mean(yield1), var(yield1))

## [1] "10.262 8.95591428571429"

paste(mean(yield2), var(yield2))

## [1] "12.8484444444444 13.4430043434343"

paste(length(yield1), length(yield2)) #n1 and n2

## [1] "50 45"

# Side-by-side normal Q-Q plots, one panel per fertiliser group, each with
# its reference line.
par(mfrow = c(1, 2))
yield_groups <- list(yield1, yield2)
for (i in seq_along(yield_groups)) {
  qqnorm(yield_groups[[i]], main = paste("Yield", i, "Normal Q-Q plot"))
  qqline(yield_groups[[i]])
}

Normal quantile plots assessing normality

shapiro.test(yield1); shapiro.test(yield2)

## 
##  Shapiro-Wilk normality test
## 
## data:  yield1
## W = 0.96399, p-value = 0.1307

## 
##  Shapiro-Wilk normality test
## 
## data:  yield2
## W = 0.98832, p-value = 0.9257

Both yield distributions appear to be roughly normally distributed, as the data lie close to the reference line in each Q-Q plot. There are some questionable observations for yield 1 in the bottom left-hand corner, but the large p-values for both yields give no evidence to suggest that the distributions are not normally distributed.
Therefore,

b)

# F-distribution quantiles
qf(0.025, 49, 44) # Lower quantile

## [1] 0.5608959

qf(0.975, 49, 44) # Upper quantile

## [1] 1.799381

Therefore, I am 95% confident that the variance of fertilizer one is between 0.3737 and 1.1989 times the variance of fertilizer two. This suggests that tomato yields from the first fertilizer have more variability than those from fertilizer two. Note that the lower limit is close to zero; with a different sample we could see a different result, with no difference in the population variances.

# Confidence interval for the ratio of two population variances,
# (variance of population 1) / (variance of population 2).
#
# Arguments:
#   x1, x2  numeric samples from populations 1 and 2
#   C       confidence level as a percentage (default 95)
#
# Returns a list with the lower limit `ll` and the upper limit `ul`.
CIforRatioVar <- function(x1, x2, C = 95) {
  s1_sq <- var(x1)
  s2_sq <- var(x2)

  # The F quantiles take their degrees of freedom in the order
  # (n2 - 1, n1 - 1), i.e. reversed relative to the variance ratio, so they
  # can multiply var1 / var2 directly rather than divide it.
  num_df <- length(x2) - 1
  den_df <- length(x1) - 1

  tail_prob <- (1 - C / 100) / 2
  lower_f <- qf(tail_prob, num_df, den_df)
  upper_f <- qf(1 - tail_prob, num_df, den_df)

  list(
    ll = lower_f * s1_sq / s2_sq,
    ul = upper_f * s1_sq / s2_sq
  )
}
CIforRatioVar(yield1, yield2, 95)

## $ll
## [1] 0.370246
## 
## $ul
## [1] 1.187767

I assume my hand calculations don’t exactly match the verification from the CIforRatioVar function because I rounded numbers early to save time and space in the hand calculations, though the results are very similar.

c)

No, it is not reasonable to assume that the variances of the tomato yields are equal between yields one and two. If there were no difference in variability between yields one and two, the confidence interval would pass through zero, which it doesn’t.

d)

I am 95% confident that the difference in population averages between yields one and two is in the closed interval [-3.944, -1.228], meaning that the average yield for fertilizer 1 is between roughly 1.23kg and 3.94kg lower than the average yield for fertilizer 2.

qt(0.975, 93)

## [1] 1.985802

# Confidence interval for the difference in population means, mean1 - mean2,
# from two samples.
#
# Arguments:
#   x1, x2  numeric samples from populations 1 and 2
#   C       confidence level as a percentage (default 95)
#   DF      method used for the degrees of freedom and standard error:
#             "Welch"  - Welch-Satterthwaite degrees of freedom, unpooled SE
#             "simple" - min(n1, n2) - 1 degrees of freedom, unpooled SE
#             "pooled" - n1 + n2 - 2 degrees of freedom, pooled SE
#
# Returns a list with the lower limit `ll` and the upper limit `ul`.
# Stops with an error if `DF` is not one of the three methods above.
CIfordiffmu <- function(x1, x2, C = 95, DF = "Welch") {
  # Reject unknown methods up front. Without this check the degrees of
  # freedom would never be assigned and qt() would fail with an unrelated
  # "Non-numeric argument" error.
  methods <- c("Welch", "simple", "pooled")
  if (!is.character(DF) || length(DF) != 1 || !(DF %in% methods)) {
    stop('DF must be one of "Welch", "simple" or "pooled"', call. = FALSE)
  }

  xbar1 <- mean(x1)
  xbar2 <- mean(x2)
  var1 <- var(x1)
  var2 <- var(x2)
  n1 <- length(x1)
  n2 <- length(x2)
  alpha <- 1 - C / 100

  if (DF == "pooled") {
    dof <- n1 + n2 - 2
    s_p <- sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    me <- qt(1 - alpha / 2, dof) * s_p * sqrt(1 / n1 + 1 / n2)
  } else {
    if (DF == "Welch") {
      dof <- ((var1 / n1 + var2 / n2)^2) /
        ((var1 / n1)^2 / (n1 - 1) + (var2 / n2)^2 / (n2 - 1))
    } else {
      dof <- min(n1, n2) - 1
    }
    me <- qt(1 - alpha / 2, dof) * sqrt(var1 / n1 + var2 / n2)
  }

  mean_diff <- xbar1 - xbar2
  list(ll = mean_diff - me, ul = mean_diff + me)
}

# Note: CIfordiffmu function was provided in computer labs, only used for
# verification


CIfordiffmu(yield1, yield2, C = 95, DF = "pooled")

## $ll
## [1] -3.944613
## 
## $ul
## [1] -1.228276

e)

sum(yield1>=10)/length(yield1)

## [1] 0.52

sum(yield2>=10)/length(yield2)

## [1] 0.7555556

Q2

a)

b)

c)

# Load the UV sample. `header` is written out in full rather than relying on
# partial matching of the abbreviated argument name `head`.
uv <- read.table("UVsample.txt", header = TRUE)
mean(uv$x)

## [1] 4.781873

sd(uv$x)

## [1] 2.23131

# Method-of-moments estimate of theta, computed as mean(X) * sqrt(2 / pi).
# NOTE(review): this form presumably comes from E[X] = theta * sqrt(pi / 2);
# confirm against the derivation in part a).
#
# Arguments:
#   X  numeric sample
#
# Returns a list with a single element `theta`.
uvMoM <- function(X) {
  # Use R's built-in `pi`; a truncated literal such as 3.14159 loses
  # precision in the estimate.
  theta <- mean(X) * sqrt(2 / pi)
  list(theta = theta)
}
uvMoM(uv$x)

## $theta
## [1] 3.815384

# Maximum-likelihood estimate of theta, computed as
# sqrt(sum(x^2) / (2 * n)) where n is the sample size.
#
# Arguments:
#   x  numeric sample
#
# Returns a list with a single element `thetaMLE`.
uvMLE <- function(x) {
  sum_sq <- sum(x^2)
  n_obs <- length(x)
  list(thetaMLE = sqrt(sum_sq / (2 * n_obs)))
}
uvMLE(uv$x)

## $thetaMLE
## [1] 3.729622

Q3

a)

The approximate probability distribution of the sample mean can be assumed to be normal, as the sample size is large enough for the central limit theorem to apply. This implies that the sample mean follows a normal distribution with mean \(\mu\) and variance \(\frac{\sigma^2}{64}\).
Therefore,
\[\overline{X}\sim N\left(\mu, \frac{\sigma^2}{64}\right)\]

b)

c)

1-pnorm(1.181818182)

## [1] 0.1186389

d)

4)

a)

# Load the Newcastle October maximum temperatures and report the sample
# mean, standard deviation and size. The summaries are stored under names
# that do not mask the base functions mean() and sd().
Newytemp <- read.table("NewcastleOctTemp.txt", header = TRUE)
temp_mean <- mean(Newytemp$maxtemp)
temp_sd <- sd(Newytemp$maxtemp)
list(mean = temp_mean, sd = temp_sd, size = length(Newytemp$maxtemp))

## $mean
## [1] 22.29
## 
## $sd
## [1] 4.448679
## 
## $size
## [1] 60

qt(0.975, 59)

## [1] 2.000995

I am 95% confident that the true population mean of maximum temperature in October at Nobbys Signal Station is between 21.14 and 23.44 degrees Celsius.

b)

It is reasonable to assume that the average temperature in October at Nobbys Signal Station is 22.6 degrees Celsius, as that value lies within the 95% confidence interval above. If this assumption were not adequate, the confidence interval would lie entirely above or entirely below 22.6 degrees Celsius; the interval would not pass through this value.

c)

# Two-sided t confidence interval for a population mean.
#
# Arguments:
#   x  numeric sample
#   c  confidence level as a percentage (default 95)
#
# Returns a list with the lower limit `low.lim` and upper limit `upper.lim`.
CItemp <- function(x, c = 95) {
  n_obs <- length(x)
  centre <- mean(x)

  # Upper-tail t quantile with n - 1 degrees of freedom.
  tail_prob <- (1 - c / 100) / 2
  t_quant <- qt(1 - tail_prob, n_obs - 1)

  half_width <- t_quant * sd(x) / sqrt(n_obs)
  list(low.lim = centre - half_width, upper.lim = centre + half_width)
}
CItemp(Newytemp$maxtemp)

## $low.lim
## [1] 21.14078
## 
## $upper.lim
## [1] 23.43922

The confidence interval function matches the interval derived through hand calculations in part a): the true average temperature in October at Nobbys Signal Station is between 21.14 and 23.44 degrees Celsius.

EDA and Statistics Fundamentals 3

Timor Atkins c3332818

24/09/2021

Q1)

a)

b)

c)

d)

e)

Q2

a)

b)

c)

Q3

a)

c)

d)

4)

a)

b)

c)