Assignment description:

Desc<-browseURL("http://uc-media.rhi.hi.is/tmp/waben/tv3_verkefnalysing.html")

a)

r2d <-function(r)
{
    lat <- floor(r/100)
    lon <- (r - lat * 100) %% 50
    halfb <- (r - 100 * lat - lon)/100
    lon <-  - (lon + 0.5)
    lat <- lat + 60 + halfb + 0.25
    data.frame(lat = lat, lon = lon)
}

#av<-read_csv("http://hi.is/~gunnar/LikTol/data98.csv")
av<-read.csv("data98.csv", header = T)
reitir<-unique(av$reit) # Fjarlægjum tvítekin gildi til þess að myndin verði fallegri

#view(av)

summary(av)
##      recid             reit            smrt          tog_nr      
##  Min.   : 99606   Min.   :315.0   Min.   :1.00   Min.   : 1.000  
##  1st Qu.: 99856   1st Qu.:462.0   1st Qu.:1.00   1st Qu.: 2.000  
##  Median :100113   Median :614.0   Median :2.00   Median :11.000  
##  Mean   :100125   Mean   :558.5   Mean   :2.43   Mean   : 8.664  
##  3rd Qu.:100337   3rd Qu.:666.0   3rd Qu.:3.00   3rd Qu.:13.000  
##  Max.   :100742   Max.   :721.0   Max.   :4.00   Max.   :54.000  
##       dag             man           dyp_min         dyp_max            vf    
##  Min.   : 3.00   Min.   :3.000   Min.   : 23.0   Min.   : 20.0   Min.   :73  
##  1st Qu.: 8.00   1st Qu.:3.000   1st Qu.:143.0   1st Qu.:137.0   1st Qu.:73  
##  Median :11.00   Median :3.000   Median :182.0   Median :180.0   Median :73  
##  Mean   :11.49   Mean   :3.036   Mean   :189.4   Mean   :188.9   Mean   :73  
##  3rd Qu.:14.00   3rd Qu.:3.000   3rd Qu.:230.0   3rd Qu.:234.0   3rd Qu.:73  
##  Max.   :21.00   Max.   :4.000   Max.   :516.0   Max.   :456.0   Max.   :73  
##        nr               le               ky             kt       
##  Min.   : 1.000   Min.   : 20.00   Min.   :1.00   Min.   : 1.00  
##  1st Qu.: 3.000   1st Qu.: 50.00   1st Qu.:1.00   1st Qu.: 1.00  
##  Median : 5.000   Median : 62.00   Median :2.00   Median : 1.00  
##  Mean   : 6.961   Mean   : 61.73   Mean   :1.51   Mean   : 1.43  
##  3rd Qu.:10.000   3rd Qu.: 73.00   3rd Qu.:2.00   3rd Qu.: 2.00  
##  Max.   :30.000   Max.   :145.00   Max.   :2.00   Max.   :22.00  
##      aldur             osl              sl              li        
##  Min.   : 1.000   Min.   :   64   Min.   :   58   Min.   :   0.0  
##  1st Qu.: 4.000   1st Qu.: 1080   1st Qu.:  922   1st Qu.:  37.0  
##  Median : 5.000   Median : 2088   Median : 1748   Median : 107.0  
##  Mean   : 5.209   Mean   : 2704   Mean   : 2189   Mean   : 158.6  
##  3rd Qu.: 6.000   3rd Qu.: 3428   3rd Qu.: 2832   3rd Qu.: 215.0  
##  Max.   :15.000   Max.   :34484   Max.   :26063   Max.   :2800.0
x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
plot(x,y,type='n') # Teikna mynd af hafsvæðum
text(x,y,as.character(reitir))

is.na(av) %>% sum() # no missing values
## [1] 0
av$hafsvaediSN<-ifelse(r2d(av$reit)$lat>65,"N","S")
av$hafsvaediAV<-ifelse(r2d(av$reit)$lon>-19,"A","V")
av$hafsvaedi<-paste(av$hafsvaediSN, av$hafsvaediAV, sep="") %>% as.factor
av<-dplyr::select(av,-hafsvaediAV, -hafsvaediSN)



str(av$hafsvaedi)
##  Factor w/ 4 levels "NA","NV","SA",..: 4 4 4 4 4 4 4 4 4 4 ...
summary(av$hafsvaedi)
##   NA   NV   SA   SV 
## 1196 1328  428  729
sample_n(av,6) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
recid reit smrt tog_nr dag man dyp_min dyp_max vf nr le ky kt aldur osl sl li hafsvaedi
100509 624 1 16 12 3 81 87 73 3 63 1 1 5 1713 1621 7 NV
99813 613 1 14 8 3 225 210 73 5 53 2 1 4 1246 1060 85 NA
100403 525 2 12 6 3 100 101 73 2 92 1 2 8 7605 6330 163 NV
100426 576 3 13 8 3 195 212 73 1 61 2 1 5 2228 1848 256 NV
100145 621 4 13 15 3 92 115 73 5 53 1 2 5 1617 1203 73 NV
100089 619 2 4 11 3 207 221 73 5 50 2 1 4 944 796 51 NV
#view(av)

summary(av$kt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    1.43    2.00   22.00
av$kt2<-ifelse(av$kt==1,"ókynþroska","kynþroska") %>% as.factor()
summary(av$kt2)
##  kynþroska ókynþroska 
##       1341       2340
av<-dplyr::select(av, -kt)

b)

tb<-table(av$kt2,av$hafsvaedi) # every obs corresponds to a single fish.
tb %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
NA NV SA SV
kynþroska 215 394 224 508
ókynþroska 981 934 204 221
t(t(tb)/apply(tb,2,sum)) %>% formatC(digits=2, format="f") %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
NA NV SA SV
kynþroska 0.18 0.30 0.52 0.70
ókynþroska 0.82 0.70 0.48 0.30
tb %>% data.frame() %>%
  dplyr::rename(Ocean_area=Var2, Kynþroski=Var1) %>% 
  ggplot(mapping=aes(x=Ocean_area,y=Freq, fill=Kynþroski)) +
  geom_bar(stat="identity",position=position_dodge()) + 
  labs(x="Ocean Area", y="Number of fish")

c)

ald.tab<-av %>% group_by(Aldur=aldur) %>%
  dplyr::summarise(Count = n(),
            AVG_length = mean(le),
            AVG_weigth.osl = mean(osl),
            STD_length = sd(le),
            STD_weight.osl = sd(osl)) %>%
  data.frame() 

ald.tab %>% 
  kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Aldur Count AVG_length AVG_weigth.osl STD_length STD_weight.osl
1 3 22.66667 102.0000 2.081666 40.84116
2 76 26.30263 165.5395 3.723868 91.64198
3 632 38.55063 529.3180 5.277765 245.12797
4 367 51.26158 1339.2316 7.902564 1213.28725
5 1211 61.44426 2132.9744 6.819454 935.46430
6 750 71.43333 3424.9480 8.701102 1508.17421
7 262 79.19466 4767.2710 8.877858 2010.69268
8 209 85.25837 6407.2871 9.991354 2977.61492
9 135 88.51852 7266.0667 9.816109 2987.08337
10 27 93.85185 9392.5185 12.877242 5882.18933
11 6 93.66667 10665.3333 24.311863 7293.15245
13 1 132.00000 28282.0000 NaN NaN
14 1 145.00000 34484.0000 NaN NaN
15 1 110.00000 12750.0000 NaN NaN

First thing we note is the fact the table shows some NAs, this is due to the fact that it is meaningless to calculate the standard deviation when we have only 1 observation. Aldur 5 is the level with the highest number of fishes. Looking at the counting in the other levels we can venture to say that the distribution of fish age looks normal. On the other hand average length and average weight increase as the age increases. It seems also that as length increases the standard deviation increases as well. This might means that if we plot length against age, higher values of ages, we would expect to see “fewer” points and more spread out compared to lower value of age. The same can be said for weight.

av %>% ggplot(mapping=aes(x=aldur,y=le)) +
  geom_point(size=2, alpha = 1/10, position = "jitter") +
  scale_x_continuous(breaks=seq(1:15)) +
  geom_point(data=ald.tab,
             mapping=aes(x=Aldur, y=AVG_length,
             col="Average Length"),
             size=3) +
  labs(col="",x="Aldur", y="Length")

#merge(x=av, y=ald.tab, by.x="aldur", by.y="Aldur", all.x = T) %>% 
#  ggplot(mapping=aes(x=aldur,y=le)) +
#  geom_point(size=2.5) + geom_point(mapping=aes(x=aldur,y=AVG_length,col="Average length"),size=1) + scale_x_continuous(breaks=seq(1:15)) +
#  labs(col="",x="Aldur", y="Length")
av %>% ggplot(mapping=aes(x=as.factor(aldur),y=le)) +
  geom_boxplot() +
  stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") +
  scale_x_discrete(breaks=seq(1:15)) +
  labs(col="",x="Aldur", y="Length")

When ‘aldur’ is treated as a continuous variable and we plot the obs as dots/points we can get an idea of the number of fishes for each level (I am using the word ‘level’ even though ‘aldur’ is not treated as a factor). As a result of this, we can see that there are no fish for aldur=12. This plot gives us an idea on how the single observations are scattered. Arguments like alpha = 1/10, position = “jitter” help us distinguishing the single observation.

Boxplot carries similar info but, in my opinion, in a better and compct format. The main advantage of a boxplot is that we can see the outliers, the median and how spread is the data for each level in terms of quartiles (1st, 2nd and 3rd). Whiskers length and IQ range add info/details about how the data is spread.

d)

set.seed(0908)
r.haf<-sample(levels(av$hafsvaedi),2)
r.haf
## [1] "NV" "SA"
r.av1<-av %>% filter(av$hafsvaedi==r.haf[1]) %>% sample_n(50)
r.av2<-av %>% filter(av$hafsvaedi==r.haf[2]) %>% sample_n(50)
r.av<-rbind(r.av1,r.av2)


sample_n(r.av, 5) %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
recid reit smrt tog_nr dag man dyp_min dyp_max vf nr le ky aldur osl sl li hafsvaedi kt2
100487 626 4 11 10 3 275 244 73 5 77 2 6 4626 3456 428 NV kynþroska
100318 413 4 16 19 3 160 155 73 3 60 1 4 2006 1665 148 SA ókynþroska
100319 412 3 13 19 3 195 250 73 7 70 2 6 3454 2889 254 SA kynþroska
100318 413 4 16 19 3 160 155 73 13 68 2 6 3008 2468 264 SA kynþroska
100297 462 4 2 17 3 220 220 73 3 62 1 5 2454 1953 244 SA kynþroska

e)

n<-nrow(r.av1)
m<-nrow(r.av2)
alpha<-0.05

xbar<-mean(r.av1$le) # maximum likelihood estimator for mu1 (population mean)
xbar
## [1] 57.94
ybar<-mean(r.av2$le) # maximum likelihood estimator for mu2 (population mean)
ybar
## [1] 71.54

Let’s assume:

\(X\): is random variable measuring fish length (le) in area NV-> independent and normally distributed.

\(Y\): is random variable measuring fish length (le) in area SA-> independent and normally distributed.

For both populations - fishes in NV and fishes in SA - mean and st.dev. are unknown.

The sample mean suggests that the 2 populations have different means. Sample mean for SA is higher than the ones for NV. We want to test whether the population means are equal or not.

\[H_0 : \mu_x = \mu_y \hspace{50pt} H_1: \mu_x \neq \mu_y\]

Let’s assume that the variances of the 2 populations is the same:

\[\sigma_{x}^2 = \sigma_{y}^2\] The test statistic is:

\[ T = \frac{\overline{X} - \overline{Y}}{\sqrt{S_p^2(1/n + 1/m)}}\] Under \(H_0\) \[T \sim t_{n+m-2}\]

We will reject \(H_0\) if \(|T| > t_{1-\alpha/2,n+m-2}\)

We will fail to reject \(H_0\) otherwise.

where \(S_p^2\) is the pooled estimator of the common variance \(\sigma^2\) equal to:

\[ S_p^2 = \frac{(n-1)S_X^2 + (m-1)S_Y^2}{n+m-2}\]

s2x<-var(r.av1$le)
s2y<-var(r.av2$le)

s2p<-((n-1)*s2x + (m-1)*s2y)/(n+m-2)

tstat<-(xbar-ybar)/sqrt(s2p*(1/n + 1/m))

abs(tstat)>qt(1-alpha/2, n+m-2)
## [1] TRUE
pval<-2*pt(tstat,n+m-2)

cr.rg<-c(-qt(1-alpha/2, n+m-2), +qt(1-alpha/2, n+m-2))

pval
## [1] 5.861386e-05
tt<-t.test(data = r.av,
           le ~ hafsvaedi,
           alternative = "two.sided",
           mu = 0, paired = FALSE,
           var.equal = T,
           conf.level = 0.95) #yields the same results
tt
## 
##  Two Sample t-test
## 
## data:  le by hafsvaedi
## t = -4.2011, df = 98, p-value = 5.861e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -20.024187  -7.175813
## sample estimates:
## mean in group NV mean in group SA 
##            57.94            71.54

Since the absolute value of the test statistic, built under the assumption that the populations have the same mean (\(\mu_x=\mu_y\)), is greater than the t-student quantile (leaving alpha/2 on the right) we reject the null hypothesis. In other words the observed statistic fall in the critical region based on alpha = 0.05.

P-value 5.861386^{-5} is close to 0. This means that the probability of obtaining a result (difference in the sample mean) that is at least as extreme as ours (under the assumption that the populations have same mean) is extremely low. P-value is lower than the significance level 0.05, therefore we reject the null hypothesis. This is another way to evaluate the hypothesis test.

The plot below shows that the test statistic - green line - falls in the critical region (tails outside the marks in blue).

ggplot(data=data.frame(T=c(-5,5)), aes(T)) +
  stat_function(fun = dt, args = list(df=n+m-2)) +
  geom_vline(aes(xintercept = 0), col= 'red', size = 0.5) + 
  geom_vline(aes(xintercept = tstat), col= 'green', size = 0.5) + 
  geom_vline(aes(xintercept = cr.rg), col= 'blue', size = 0.5)

Let’s define U as: \[\overline{X} - \overline{Y} = U \sim (\mu_u ; S_u)\] where \(S_u = \sqrt{S_p^2(1/n + 1/m)}\) is the standard deviation of our random variable U.

We don’t know the exact mean \(\mu_u\). However we can estimate the interval that “most likely” contain the mean.

sigmau<-sqrt(s2p*(1/n + 1/m))

CI<-c((xbar-ybar) - qt(1-alpha/2, n+m-2)*sigmau,
      (xbar-ybar) + qt(1-alpha/2, n+m-2)*sigmau)
CI
## [1] -20.024187  -7.175813

The confidence interval for the difference of the population means (-20.0241873, -7.1758127) does not include 0. The confidence interval we just built tells us that 95% of the time such interval includes \(\mu_u = \mu_y - \mu_x\) (the difference of the 2 populations mean). 0 is not included, this allows us to say that \(\mu_y \neq \mu_x\), the population means under study are not the same.

This is in line with the results from the t-test.

worth noticing that the t-test and confidence interval were built with the same alpha level of significance.

f)

av_long = melt(av, id.vars='hafsvaedi', measure.vars='le', value.name='le')

#av_long %>% view()

get_normal_density <- function(x, binwidth) {
  grid <- seq(min(x), max(x), length=100)
  data.frame(
    le = grid,
    normal_curve = dnorm(grid, mean(x), sd(x)) * length(x) * binwidth
  )
}

BW <- 3

normaldens <-
  av %>%
  group_by(hafsvaedi) %>%
  do(get_normal_density(x=.$le, binwidth=BW))
ggplot() + geom_histogram(data=av_long, aes(x=le), binwidth = 3) +
  geom_line(data=normaldens, mapping=aes(x=le, y=normal_curve), col="red", size=1) +
  facet_wrap(~hafsvaedi) + 
  labs(x="Length")

The histograms, plotted based on the actual observations, are approximated pretty well by a normal distribution. The histograms seems symmetric around the mean. Values around the mean are the most common as they have the highest frequency, the more we move towards the tails the lower the frequency. The normal distribution were plotted with parameters the estimator of the mean and the estimator of the standard deviation.

That being said, NA ocean area shows 2 “peaks”. It can suggest that there are 2 distributions overlapping and we might need to split NA area in 2 groups.

In light of the graphs here above we can conclude that the sample means in the 4 ocean areas (hafsvaedi) are normally distributed as assumed at the beginning of the previous point when we tested the difference in the population means for NV and SA.

g)

set.seed(0908)
z<-r.av$le
t0<-tstat %>% abs()
t0
## [1] 4.201116
xyind<-c(rep(1,n),rep(2,m))
perm.test<-replicate(5000, {t.test(z[sample(1:length(z))]~xyind)$statistic})
new.pval<-sum(perm.test>=t0)/5000
new.pval
## [1] 0
max(perm.test)>=t0 # this is due to the fact that t0 was calculated manually and not with the t.test function which uses a different way to count the degree of freedom.
## [1] FALSE

The new p-value from the permutation test, built under the assumption that the null hypothesis is true (2 ocean areas have the same mean) is close to 0. The test statistics, calculated re-shuffling randomly the ocean area labels, are almost always lower than the observed test statistic. Therefore we conclude that there is an actual difference in the mean of the 2 ocean areas SA and NV. This is in line with the original test run in the previous point.

h)

r.av$hafsvaedi<-droplevels(r.av$hafsvaedi)
r.av$hafsvaedi %>% levels()
## [1] "NV" "SA"
r.tb<-table(r.av$kt2,r.av$hafsvaedi) # every obs corresponds to a single fish.

r.tb %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
NV SA
kynþroska 10 26
ókynþroska 40 24
#r.tb

t(t(r.tb)/apply(r.tb,2,sum)) %>% formatC(digits=2, format="f") %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
NV SA
kynþroska 0.20 0.52
ókynþroska 0.80 0.48
# success = kynþroska
# failure = ókynþroska
prp.t<-prop.test(t(r.tb),
          alternative = "two.sided",
          conf.level = 0.95)
# x = ....a two-dimensional table (or matrix) with 2 columns, giving the counts of successes and failures, respectively - in our case t(r.tb).

Let’s define \(p\) as the probability of the fish been kynþroska

\[H_0 : p_x = p_y \hspace{50pt} H_1: p_x \neq p_y\]

The test is testing whether the proportion of kynþroska are the same in the 2 ocean areas.

prp.t
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  t(r.tb)
## X-squared = 9.7656, df = 1, p-value = 0.001778
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.5173957 -0.1226043
## sample estimates:
## prop 1 prop 2 
##   0.20   0.52
prp.t$p.value
## [1] 0.001778051
prp.t$statistic # the value of Pearson's chi-squared test statistic.
## X-squared 
##  9.765625
prp.t$estimate
## prop 1 prop 2 
##   0.20   0.52

The p-value 0.0017781 is lower that 0.05 (our significance level), therefore we reject the null hypothesis and conclude that the proportions of kynþroska fish is different in the 2 ocean areas.

9.765625 is the value of Pearson’s chi-squared test statistic.

0.2 and 0.52 are simply the number of success (kynþroska) in each ocean area (NV and SA respectively) divided the sample size.

The sample proportion generated by the test refers to kynþroska fish.

prp.t$conf.int
## [1] -0.5173957 -0.1226043
## attr(,"conf.level")
## [1] 0.95

The 95%-confidence interval does not include the value 0. The confidence interval refers to the difference in kynþroska proportion for the 2 ocean areas.

With 95% of confidence the real difference in kynþroska proportion between the 2 ocean areas (\(p_x-p_y\)) is not 0 (the proportions of kynþroska in the 2 ocean area are not the same). This is in line with the results from the prop.test.

i)

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() + geom_smooth() +
  labs(x="Length", y="Weight")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() + geom_smooth() +
  labs(x="log(Length)", y="log(Weight)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

fit<-lm(log(osl)~log(le), data=r.av1)
summary(fit)
## 
## Call:
## lm(formula = log(osl) ~ log(le), data = r.av1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.233169 -0.085258 -0.003168  0.075786  0.299478 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.24893    0.24948  -21.04   <2e-16 ***
## log(le)      3.12566    0.06183   50.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1136 on 48 degrees of freedom
## Multiple R-squared:  0.9816, Adjusted R-squared:  0.9812 
## F-statistic:  2555 on 1 and 48 DF,  p-value: < 2.2e-16
#plot(fit)

As we can see from the first plot, the relationship between length and weight is not linear. After transforming both the predictor and the response the relationship becomes linear. Therefore when we will fit a SLR on log-transformed data, the model will yields higher values of R squared. The regression line will fit the log-transformed data much better than non-transformed data.

The model is as follow:

\[log(\hat{Y}) = \beta_0 + \beta_1logX\] Where the estimates of the coefficients are:

\(\beta_0\) = -5.2489348

\(\beta_1\) = 3.1256569

Yes, it is wise to use this model. The p-value for \(\beta_1\) is close to 0 which means that the relationship between x and y is statistically significant - I reject the underling null hypothesis \(H_0:\beta_1 = 0\). R squared and adjusted R-squared are close to 1, this indicates that the proportion of the total variation in Y explained by X is close to 1. Our model explains the variation in the response very well.

R squared suggests also that the degree of a linear relationship between log(Y) and log(X) is high.

For every unit increase of \(log(X)\), the response \(log(Y)\) increases by \(\beta_1\).

Translated in term of \(X\) and \(Y\): for every increase of \(e\) the response increases by \(e^{\hat{\beta}_1}\)

\(\beta_0\) is not very interesting as it is mainly useful to create a line that fit the data.

In the plot here below the regression line is explicitly expressed in command geom_abline. The line fits well the data.

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() +
  geom_abline(slope= fit$coefficients[2],
              intercept = fit$coefficients[1],
              col="red", size=1) +
  labs(x="log(Length)", y="log(Weight)")

j)

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() +
  stat_smooth(method='lm', se=FALSE, col="red") +
  labs(x="log(Length)", y="log(Weight)")
## `geom_smooth()` using formula 'y ~ x'

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() +
  stat_smooth(method='lm', se=FALSE, col="red") +
  labs(x="Length", y="Weight")
## `geom_smooth()` using formula 'y ~ x'

gogn_likan <-
  data.frame(
    x = exp(fit$model[[2]]),
    y = exp(predict(fit))
  )

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() +
  geom_line(data=gogn_likan, aes(x=x,y=y), col="green", size=1) +
  labs(x="Length", y="Weight")

k)

ggplot(r.av1, mapping = aes(x=as.factor(aldur), y=le)) +
  geom_boxplot() + 
  stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") + labs(x="Aldur", y="Length")

litid <- lm(le~aldur, data=r.av1)
stort <- lm(le~factor(aldur), data=r.av1)
a<-anova(litid, stort)
a
## Analysis of Variance Table
## 
## Model 1: le ~ aldur
## Model 2: le ~ factor(aldur)
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     48 1574.2                                
## 2     42 1047.6  6    526.68 3.5193 0.006535 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pval.anova<-a[2,6]

Anova function perform a one-way anova test testing whether the mean for the different level of Aldur is the same - \(H_0: \mu_1 = \mu_2=....= \mu_9\). The p-value 0.0065354 is close to 0. We reject the null hypothesis and say that at least the mean for one level of Aldur is different than the others. This can be easily seen from the boxplot graph. The boxes and the related quantiles moves upwards as Aldur level increases.

Even taking into consideration that the variability of length is quite different across the level of Aldur, we can conclude that fish length tends to increases with age.

Bónusspurning)

x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
xyr<-data.frame(x,y,reitir)
xyr$hafsvaediSN<-ifelse(r2d(xyr$reitir)$lat>65,"N","S")
xyr$hafsvaediAV<-ifelse(r2d(xyr$reitir)$lon>-19,"A","V")
xyr$hafsvaedi<-paste(xyr$hafsvaediSN, xyr$hafsvaediAV, sep="") %>% as.factor
xyr<-dplyr::select(xyr,-hafsvaediAV, -hafsvaediSN)

world<-ne_countries(scale = "medium", returnclass = "sf")
ggplot(data = world) +
  geom_sf(color = "black", fill = "blue")  + 
  xlab("Longitude") + ylab("Latitude") +
  ggtitle("Iceland") +
  coord_sf(xlim = c(min(x)-0.5, max(x)-0.5), ylim = c(min(y)-0.5, max(y)+0.5), expand = F) +
  geom_label(data=subset(xyr,hafsvaedi==r.haf[1] | hafsvaedi==r.haf[2]),
            aes(x,y,label=reitir,col=hafsvaedi),
            size=2)