Assignment description:

Desc<-browseURL("http://uc-media.rhi.hi.is/tmp/waben/tv3_verkefnalysing.html")

a)

r2d <-function(r)
{
    lat <- floor(r/100)
    lon <- (r - lat * 100) %% 50
    halfb <- (r - 100 * lat - lon)/100
    lon <-  - (lon + 0.5)
    lat <- lat + 60 + halfb + 0.25
    data.frame(lat = lat, lon = lon)
}

#av<-read_csv("http://hi.is/~gunnar/LikTol/data98.csv")
av<-read.csv("data98.csv", header = T)
reitir<-unique(av$reit) # Fjarlægjum tvítekin gildi til þess að myndin verði fallegri

#view(av)

summary(av)

##      recid             reit            smrt          tog_nr      
##  Min.   : 99606   Min.   :315.0   Min.   :1.00   Min.   : 1.000  
##  1st Qu.: 99856   1st Qu.:462.0   1st Qu.:1.00   1st Qu.: 2.000  
##  Median :100113   Median :614.0   Median :2.00   Median :11.000  
##  Mean   :100125   Mean   :558.5   Mean   :2.43   Mean   : 8.664  
##  3rd Qu.:100337   3rd Qu.:666.0   3rd Qu.:3.00   3rd Qu.:13.000  
##  Max.   :100742   Max.   :721.0   Max.   :4.00   Max.   :54.000  
##       dag             man           dyp_min         dyp_max            vf    
##  Min.   : 3.00   Min.   :3.000   Min.   : 23.0   Min.   : 20.0   Min.   :73  
##  1st Qu.: 8.00   1st Qu.:3.000   1st Qu.:143.0   1st Qu.:137.0   1st Qu.:73  
##  Median :11.00   Median :3.000   Median :182.0   Median :180.0   Median :73  
##  Mean   :11.49   Mean   :3.036   Mean   :189.4   Mean   :188.9   Mean   :73  
##  3rd Qu.:14.00   3rd Qu.:3.000   3rd Qu.:230.0   3rd Qu.:234.0   3rd Qu.:73  
##  Max.   :21.00   Max.   :4.000   Max.   :516.0   Max.   :456.0   Max.   :73  
##        nr               le               ky             kt       
##  Min.   : 1.000   Min.   : 20.00   Min.   :1.00   Min.   : 1.00  
##  1st Qu.: 3.000   1st Qu.: 50.00   1st Qu.:1.00   1st Qu.: 1.00  
##  Median : 5.000   Median : 62.00   Median :2.00   Median : 1.00  
##  Mean   : 6.961   Mean   : 61.73   Mean   :1.51   Mean   : 1.43  
##  3rd Qu.:10.000   3rd Qu.: 73.00   3rd Qu.:2.00   3rd Qu.: 2.00  
##  Max.   :30.000   Max.   :145.00   Max.   :2.00   Max.   :22.00  
##      aldur             osl              sl              li        
##  Min.   : 1.000   Min.   :   64   Min.   :   58   Min.   :   0.0  
##  1st Qu.: 4.000   1st Qu.: 1080   1st Qu.:  922   1st Qu.:  37.0  
##  Median : 5.000   Median : 2088   Median : 1748   Median : 107.0  
##  Mean   : 5.209   Mean   : 2704   Mean   : 2189   Mean   : 158.6  
##  3rd Qu.: 6.000   3rd Qu.: 3428   3rd Qu.: 2832   3rd Qu.: 215.0  
##  Max.   :15.000   Max.   :34484   Max.   :26063   Max.   :2800.0

x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
plot(x,y,type='n') # Teikna mynd af hafsvæðum
text(x,y,as.character(reitir))

is.na(av) %>% sum() # no missing values

## [1] 0

av$hafsvaediSN<-ifelse(r2d(av$reit)$lat>65,"N","S")
av$hafsvaediAV<-ifelse(r2d(av$reit)$lon>-19,"A","V")
av$hafsvaedi<-paste(av$hafsvaediSN, av$hafsvaediAV, sep="") %>% as.factor
av<-dplyr::select(av,-hafsvaediAV, -hafsvaediSN)



str(av$hafsvaedi)

##  Factor w/ 4 levels "NA","NV","SA",..: 4 4 4 4 4 4 4 4 4 4 ...

summary(av$hafsvaedi)

##   NA   NV   SA   SV 
## 1196 1328  428  729

sample_n(av,6) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

recid	reit	smrt	tog_nr	dag	man	dyp_min	dyp_max	vf	nr	le	ky	kt	aldur	osl	sl	li	hafsvaedi
100509	624	1	16	12	3	81	87	73	3	63	1	1	5	1713	1621	7	NV
99813	613	1	14	8	3	225	210	73	5	53	2	1	4	1246	1060	85	NA
100403	525	2	12	6	3	100	101	73	2	92	1	2	8	7605	6330	163	NV
100426	576	3	13	8	3	195	212	73	1	61	2	1	5	2228	1848	256	NV
100145	621	4	13	15	3	92	115	73	5	53	1	2	5	1617	1203	73	NV
100089	619	2	4	11	3	207	221	73	5	50	2	1	4	944	796	51	NV

#view(av)

summary(av$kt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    1.43    2.00   22.00

av$kt2<-ifelse(av$kt==1,"ókynþroska","kynþroska") %>% as.factor()
summary(av$kt2)

##  kynþroska ókynþroska 
##       1341       2340

av<-dplyr::select(av, -kt)

b)

tb<-table(av$kt2,av$hafsvaedi) # every obs corresponds to a single fish.
tb %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

	NA	NV	SA	SV
kynþroska	215	394	224	508
ókynþroska	981	934	204	221

t(t(tb)/apply(tb,2,sum)) %>% formatC(digits=2, format="f") %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

	NA	NV	SA	SV
kynþroska	0.18	0.30	0.52	0.70
ókynþroska	0.82	0.70	0.48	0.30

tb %>% data.frame() %>%
  dplyr::rename(Ocean_area=Var2, Kynþroski=Var1) %>% 
  ggplot(mapping=aes(x=Ocean_area,y=Freq, fill=Kynþroski)) +
  geom_bar(stat="identity",position=position_dodge()) + 
  labs(x="Ocean Area", y="Number of fish")

c)

ald.tab<-av %>% group_by(Aldur=aldur) %>%
  dplyr::summarise(Count = n(),
            AVG_length = mean(le),
            AVG_weigth.osl = mean(osl),
            STD_length = sd(le),
            STD_weight.osl = sd(osl)) %>%
  data.frame() 

ald.tab %>% 
  kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Aldur	Count	AVG_length	AVG_weigth.osl	STD_length	STD_weight.osl
1	3	22.66667	102.0000	2.081666	40.84116
2	76	26.30263	165.5395	3.723868	91.64198
3	632	38.55063	529.3180	5.277765	245.12797
4	367	51.26158	1339.2316	7.902564	1213.28725
5	1211	61.44426	2132.9744	6.819454	935.46430
6	750	71.43333	3424.9480	8.701102	1508.17421
7	262	79.19466	4767.2710	8.877858	2010.69268
8	209	85.25837	6407.2871	9.991354	2977.61492
9	135	88.51852	7266.0667	9.816109	2987.08337
10	27	93.85185	9392.5185	12.877242	5882.18933
11	6	93.66667	10665.3333	24.311863	7293.15245
13	1	132.00000	28282.0000	NaN	NaN
14	1	145.00000	34484.0000	NaN	NaN
15	1	110.00000	12750.0000	NaN	NaN

First thing we note is the fact the table shows some NAs, this is due to the fact that it is meaningless to calculate the standard deviation when we have only 1 observation. Aldur 5 is the level with the highest number of fishes. Looking at the counting in the other levels we can venture to say that the distribution of fish age looks normal. On the other hand average length and average weight increase as the age increases. It seems also that as length increases the standard deviation increases as well. This might means that if we plot length against age, higher values of ages, we would expect to see “fewer” points and more spread out compared to lower value of age. The same can be said for weight.

av %>% ggplot(mapping=aes(x=aldur,y=le)) +
  geom_point(size=2, alpha = 1/10, position = "jitter") +
  scale_x_continuous(breaks=seq(1:15)) +
  geom_point(data=ald.tab,
             mapping=aes(x=Aldur, y=AVG_length,
             col="Average Length"),
             size=3) +
  labs(col="",x="Aldur", y="Length")

#merge(x=av, y=ald.tab, by.x="aldur", by.y="Aldur", all.x = T) %>% 
#  ggplot(mapping=aes(x=aldur,y=le)) +
#  geom_point(size=2.5) + geom_point(mapping=aes(x=aldur,y=AVG_length,col="Average length"),size=1) + scale_x_continuous(breaks=seq(1:15)) +
#  labs(col="",x="Aldur", y="Length")

av %>% ggplot(mapping=aes(x=as.factor(aldur),y=le)) +
  geom_boxplot() +
  stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") +
  scale_x_discrete(breaks=seq(1:15)) +
  labs(col="",x="Aldur", y="Length")

When ‘aldur’ is treated as a continuous variable and we plot the obs as dots/points we can get an idea of the number of fishes for each level (I am using the word ‘level’ even though ‘aldur’ is not treated as a factor). As a result of this, we can see that there are no fish for aldur=12. This plot gives us an idea on how the single observations are scattered. Arguments like alpha = 1/10, position = “jitter” help us distinguishing the single observation.

Boxplot carries similar info but, in my opinion, in a better and compct format. The main advantage of a boxplot is that we can see the outliers, the median and how spread is the data for each level in terms of quartiles (1st, 2nd and 3rd). Whiskers length and IQ range add info/details about how the data is spread.

d)

set.seed(0908)
r.haf<-sample(levels(av$hafsvaedi),2)
r.haf

## [1] "NV" "SA"

r.av1<-av %>% filter(av$hafsvaedi==r.haf[1]) %>% sample_n(50)
r.av2<-av %>% filter(av$hafsvaedi==r.haf[2]) %>% sample_n(50)
r.av<-rbind(r.av1,r.av2)


sample_n(r.av, 5) %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

recid	reit	smrt	tog_nr	dag	man	dyp_min	dyp_max	vf	nr	le	ky	aldur	osl	sl	li	hafsvaedi	kt2
100487	626	4	11	10	3	275	244	73	5	77	2	6	4626	3456	428	NV	kynþroska
100318	413	4	16	19	3	160	155	73	3	60	1	4	2006	1665	148	SA	ókynþroska
100319	412	3	13	19	3	195	250	73	7	70	2	6	3454	2889	254	SA	kynþroska
100318	413	4	16	19	3	160	155	73	13	68	2	6	3008	2468	264	SA	kynþroska
100297	462	4	2	17	3	220	220	73	3	62	1	5	2454	1953	244	SA	kynþroska

e)

n<-nrow(r.av1)
m<-nrow(r.av2)
alpha<-0.05

xbar<-mean(r.av1$le) # maximum likelihood estimator for mu1 (population mean)
xbar

## [1] 57.94

ybar<-mean(r.av2$le) # maximum likelihood estimator for mu2 (population mean)
ybar

## [1] 71.54

Let’s assume:

\(X\): is random variable measuring fish length (le) in area NV-> independent and normally distributed.

\(Y\): is random variable measuring fish length (le) in area SA-> independent and normally distributed.

For both populations - fishes in NV and fishes in SA - mean and st.dev. are unknown.

The sample mean suggests that the 2 populations have different means. Sample mean for SA is higher than the ones for NV. We want to test whether the population means are equal or not.

\[H_0 : \mu_x = \mu_y \hspace{50pt} H_1: \mu_x \neq \mu_y\]

Let’s assume that the variances of the 2 populations is the same:

\[\sigma_{x}^2 = \sigma_{y}^2\] The test statistic is:

\[ T = \frac{\overline{X} - \overline{Y}}{\sqrt{S_p^2(1/n + 1/m)}}\] Under \(H_0\) \[T \sim t_{n+m-2}\]

We will reject \(H_0\) if \(|T| > t_{1-\alpha/2,n+m-2}\)

We will fail to reject \(H_0\) otherwise.

where \(S_p^2\) is the pooled estimator of the common variance \(\sigma^2\) equal to:

\[ S_p^2 = \frac{(n-1)S_X^2 + (m-1)S_Y^2}{n+m-2}\]

s2x<-var(r.av1$le)
s2y<-var(r.av2$le)

s2p<-((n-1)*s2x + (m-1)*s2y)/(n+m-2)

tstat<-(xbar-ybar)/sqrt(s2p*(1/n + 1/m))

abs(tstat)>qt(1-alpha/2, n+m-2)

## [1] TRUE

pval<-2*pt(tstat,n+m-2)

cr.rg<-c(-qt(1-alpha/2, n+m-2), +qt(1-alpha/2, n+m-2))

pval

## [1] 5.861386e-05

tt<-t.test(data = r.av,
           le ~ hafsvaedi,
           alternative = "two.sided",
           mu = 0, paired = FALSE,
           var.equal = T,
           conf.level = 0.95) #yields the same results
tt

## 
##  Two Sample t-test
## 
## data:  le by hafsvaedi
## t = -4.2011, df = 98, p-value = 5.861e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -20.024187  -7.175813
## sample estimates:
## mean in group NV mean in group SA 
##            57.94            71.54

Since the absolute value of the test statistic, built under the assumption that the populations have the same mean (\(\mu_x=\mu_y\)), is greater than the t-student quantile (leaving alpha/2 on the right) we reject the null hypothesis. In other words the observed statistic fall in the critical region based on alpha = 0.05.

P-value 5.861386^{-5} is close to 0. This means that the probability of obtaining a result (difference in the sample mean) that is at least as extreme as ours (under the assumption that the populations have same mean) is extremely low. P-value is lower than the significance level 0.05, therefore we reject the null hypothesis. This is another way to evaluate the hypothesis test.

The plot below shows that the test statistic - green line - falls in the critical region (tails outside the marks in blue).

ggplot(data=data.frame(T=c(-5,5)), aes(T)) +
  stat_function(fun = dt, args = list(df=n+m-2)) +
  geom_vline(aes(xintercept = 0), col= 'red', size = 0.5) + 
  geom_vline(aes(xintercept = tstat), col= 'green', size = 0.5) + 
  geom_vline(aes(xintercept = cr.rg), col= 'blue', size = 0.5)

Let’s define U as: \[\overline{X} - \overline{Y} = U \sim (\mu_u ; S_u)\] where \(S_u = \sqrt{S_p^2(1/n + 1/m)}\) is the standard deviation of our random variable U.

We don’t know the exact mean \(\mu_u\). However we can estimate the interval that “most likely” contain the mean.

sigmau<-sqrt(s2p*(1/n + 1/m))

CI<-c((xbar-ybar) - qt(1-alpha/2, n+m-2)*sigmau,
      (xbar-ybar) + qt(1-alpha/2, n+m-2)*sigmau)
CI

## [1] -20.024187  -7.175813

The confidence interval for the difference of the population means (-20.0241873, -7.1758127) does not include 0. The confidence interval we just built tells us that 95% of the time such interval includes \(\mu_u = \mu_y - \mu_x\) (the difference of the 2 populations mean). 0 is not included, this allows us to say that \(\mu_y \neq \mu_x\), the population means under study are not the same.

This is in line with the results from the t-test.

worth noticing that the t-test and confidence interval were built with the same alpha level of significance.

f)

av_long = melt(av, id.vars='hafsvaedi', measure.vars='le', value.name='le')

#av_long %>% view()

get_normal_density <- function(x, binwidth) {
  grid <- seq(min(x), max(x), length=100)
  data.frame(
    le = grid,
    normal_curve = dnorm(grid, mean(x), sd(x)) * length(x) * binwidth
  )
}

BW <- 3

normaldens <-
  av %>%
  group_by(hafsvaedi) %>%
  do(get_normal_density(x=.$le, binwidth=BW))

ggplot() + geom_histogram(data=av_long, aes(x=le), binwidth = 3) +
  geom_line(data=normaldens, mapping=aes(x=le, y=normal_curve), col="red", size=1) +
  facet_wrap(~hafsvaedi) + 
  labs(x="Length")

The histograms, plotted based on the actual observations, are approximated pretty well by a normal distribution. The histograms seems symmetric around the mean. Values around the mean are the most common as they have the highest frequency, the more we move towards the tails the lower the frequency. The normal distribution were plotted with parameters the estimator of the mean and the estimator of the standard deviation.

That being said, NA ocean area shows 2 “peaks”. It can suggest that there are 2 distributions overlapping and we might need to split NA area in 2 groups.

In light of the graphs here above we can conclude that the sample means in the 4 ocean areas (hafsvaedi) are normally distributed as assumed at the beginning of the previous point when we tested the difference in the population means for NV and SA.

g)

set.seed(0908)
z<-r.av$le
t0<-tstat %>% abs()
t0

## [1] 4.201116

xyind<-c(rep(1,n),rep(2,m))
perm.test<-replicate(5000, {t.test(z[sample(1:length(z))]~xyind)$statistic})
new.pval<-sum(perm.test>=t0)/5000
new.pval

## [1] 0

max(perm.test)>=t0 # this is due to the fact that t0 was calculated manually and not with the t.test function which uses a different way to count the degree of freedom.

## [1] FALSE

The new p-value from the permutation test, built under the assumption that the null hypothesis is true (2 ocean areas have the same mean) is close to 0. The test statistics, calculated re-shuffling randomly the ocean area labels, are almost always lower than the observed test statistic. Therefore we conclude that there is an actual difference in the mean of the 2 ocean areas SA and NV. This is in line with the original test run in the previous point.

h)

r.av$hafsvaedi<-droplevels(r.av$hafsvaedi)
r.av$hafsvaedi %>% levels()

## [1] "NV" "SA"

r.tb<-table(r.av$kt2,r.av$hafsvaedi) # every obs corresponds to a single fish.

r.tb %>% kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

	NV	SA
kynþroska	10	26
ókynþroska	40	24

#r.tb

t(t(r.tb)/apply(r.tb,2,sum)) %>% formatC(digits=2, format="f") %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

	NV	SA
kynþroska	0.20	0.52
ókynþroska	0.80	0.48

# success = kynþroska
# failure = ókynþroska
prp.t<-prop.test(t(r.tb),
          alternative = "two.sided",
          conf.level = 0.95)
# x = ....a two-dimensional table (or matrix) with 2 columns, giving the counts of successes and failures, respectively - in our case t(r.tb).

Let’s define \(p\) as the probability of the fish been kynþroska

\[H_0 : p_x = p_y \hspace{50pt} H_1: p_x \neq p_y\]

The test is testing whether the proportion of kynþroska are the same in the 2 ocean areas.

prp.t

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  t(r.tb)
## X-squared = 9.7656, df = 1, p-value = 0.001778
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.5173957 -0.1226043
## sample estimates:
## prop 1 prop 2 
##   0.20   0.52

prp.t$p.value

## [1] 0.001778051

prp.t$statistic # the value of Pearson's chi-squared test statistic.

## X-squared 
##  9.765625

prp.t$estimate

## prop 1 prop 2 
##   0.20   0.52

The p-value 0.0017781 is lower that 0.05 (our significance level), therefore we reject the null hypothesis and conclude that the proportions of kynþroska fish is different in the 2 ocean areas.

9.765625 is the value of Pearson’s chi-squared test statistic.

0.2 and 0.52 are simply the number of success (kynþroska) in each ocean area (NV and SA respectively) divided the sample size.

The sample proportion generated by the test refers to kynþroska fish.

prp.t$conf.int

## [1] -0.5173957 -0.1226043
## attr(,"conf.level")
## [1] 0.95

The 95%-confidence interval does not include the value 0. The confidence interval refers to the difference in kynþroska proportion for the 2 ocean areas.

With 95% of confidence the real difference in kynþroska proportion between the 2 ocean areas (\(p_x-p_y\)) is not 0 (the proportions of kynþroska in the 2 ocean area are not the same). This is in line with the results from the prop.test.

i)

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() + geom_smooth() +
  labs(x="Length", y="Weight")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() + geom_smooth() +
  labs(x="log(Length)", y="log(Weight)")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

fit<-lm(log(osl)~log(le), data=r.av1)
summary(fit)

## 
## Call:
## lm(formula = log(osl) ~ log(le), data = r.av1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.233169 -0.085258 -0.003168  0.075786  0.299478 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.24893    0.24948  -21.04   <2e-16 ***
## log(le)      3.12566    0.06183   50.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1136 on 48 degrees of freedom
## Multiple R-squared:  0.9816, Adjusted R-squared:  0.9812 
## F-statistic:  2555 on 1 and 48 DF,  p-value: < 2.2e-16

#plot(fit)

As we can see from the first plot, the relationship between length and weight is not linear. After transforming both the predictor and the response the relationship becomes linear. Therefore when we will fit a SLR on log-transformed data, the model will yields higher values of R squared. The regression line will fit the log-transformed data much better than non-transformed data.

The model is as follow:

\[log(\hat{Y}) = \beta_0 + \beta_1logX\] Where the estimates of the coefficients are:

\(\beta_0\) = -5.2489348

\(\beta_1\) = 3.1256569

Yes, it is wise to use this model. The p-value for \(\beta_1\) is close to 0 which means that the relationship between x and y is statistically significant - I reject the underling null hypothesis \(H_0:\beta_1 = 0\). R squared and adjusted R-squared are close to 1, this indicates that the proportion of the total variation in Y explained by X is close to 1. Our model explains the variation in the response very well.

R squared suggests also that the degree of a linear relationship between log(Y) and log(X) is high.

For every unit increase of \(log(X)\), the response \(log(Y)\) increases by \(\beta_1\).

Translated in term of \(X\) and \(Y\): for every increase of \(e\) the response increases by \(e^{\hat{\beta}_1}\)

\(\beta_0\) is not very interesting as it is mainly useful to create a line that fit the data.

In the plot here below the regression line is explicitly expressed in command geom_abline. The line fits well the data.

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() +
  geom_abline(slope= fit$coefficients[2],
              intercept = fit$coefficients[1],
              col="red", size=1) +
  labs(x="log(Length)", y="log(Weight)")

j)

ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
  geom_point() +
  stat_smooth(method='lm', se=FALSE, col="red") +
  labs(x="log(Length)", y="log(Weight)")

## `geom_smooth()` using formula 'y ~ x'

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() +
  stat_smooth(method='lm', se=FALSE, col="red") +
  labs(x="Length", y="Weight")

## `geom_smooth()` using formula 'y ~ x'

gogn_likan <-
  data.frame(
    x = exp(fit$model[[2]]),
    y = exp(predict(fit))
  )

ggplot(r.av1, mapping=aes(x=le, y=osl)) +
  geom_point() +
  geom_line(data=gogn_likan, aes(x=x,y=y), col="green", size=1) +
  labs(x="Length", y="Weight")

k)

ggplot(r.av1, mapping = aes(x=as.factor(aldur), y=le)) +
  geom_boxplot() + 
  stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") + labs(x="Aldur", y="Length")

litid <- lm(le~aldur, data=r.av1)
stort <- lm(le~factor(aldur), data=r.av1)
a<-anova(litid, stort)
a

## Analysis of Variance Table
## 
## Model 1: le ~ aldur
## Model 2: le ~ factor(aldur)
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     48 1574.2                                
## 2     42 1047.6  6    526.68 3.5193 0.006535 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

pval.anova<-a[2,6]

Anova function perform a one-way anova test testing whether the mean for the different level of Aldur is the same - \(H_0: \mu_1 = \mu_2=....= \mu_9\). The p-value 0.0065354 is close to 0. We reject the null hypothesis and say that at least the mean for one level of Aldur is different than the others. This can be easily seen from the boxplot graph. The boxes and the related quantiles moves upwards as Aldur level increases.

Even taking into consideration that the variability of length is quite different across the level of Aldur, we can conclude that fish length tends to increases with age.

Bónusspurning)

x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
xyr<-data.frame(x,y,reitir)
xyr$hafsvaediSN<-ifelse(r2d(xyr$reitir)$lat>65,"N","S")
xyr$hafsvaediAV<-ifelse(r2d(xyr$reitir)$lon>-19,"A","V")
xyr$hafsvaedi<-paste(xyr$hafsvaediSN, xyr$hafsvaediAV, sep="") %>% as.factor
xyr<-dplyr::select(xyr,-hafsvaediAV, -hafsvaediSN)

world<-ne_countries(scale = "medium", returnclass = "sf")
ggplot(data = world) +
  geom_sf(color = "black", fill = "blue")  + 
  xlab("Longitude") + ylab("Latitude") +
  ggtitle("Iceland") +
  coord_sf(xlim = c(min(x)-0.5, max(x)-0.5), ylim = c(min(y)-0.5, max(y)+0.5), expand = F) +
  geom_label(data=subset(xyr,hafsvaedi==r.haf[1] | hafsvaedi==r.haf[2]),
            aes(x,y,label=reitir,col=hafsvaedi),
            size=2)

Tolvuverkefni 3

Andrea Valtorta - anv13

28/3/2020

a)

b)

c)

d)

e)

f)

g)

h)

i)

j)

k)

Bónusspurning)