Assignment description:
Desc<-browseURL("http://uc-media.rhi.hi.is/tmp/waben/tv3_verkefnalysing.html")
r2d <-function(r)
{
lat <- floor(r/100)
lon <- (r - lat * 100) %% 50
halfb <- (r - 100 * lat - lon)/100
lon <- - (lon + 0.5)
lat <- lat + 60 + halfb + 0.25
data.frame(lat = lat, lon = lon)
}
#av<-read_csv("http://hi.is/~gunnar/LikTol/data98.csv")
av<-read.csv("data98.csv", header = T)
reitir<-unique(av$reit) # Fjarlægjum tvítekin gildi til þess að myndin verði fallegri
#view(av)
summary(av)
## recid reit smrt tog_nr
## Min. : 99606 Min. :315.0 Min. :1.00 Min. : 1.000
## 1st Qu.: 99856 1st Qu.:462.0 1st Qu.:1.00 1st Qu.: 2.000
## Median :100113 Median :614.0 Median :2.00 Median :11.000
## Mean :100125 Mean :558.5 Mean :2.43 Mean : 8.664
## 3rd Qu.:100337 3rd Qu.:666.0 3rd Qu.:3.00 3rd Qu.:13.000
## Max. :100742 Max. :721.0 Max. :4.00 Max. :54.000
## dag man dyp_min dyp_max vf
## Min. : 3.00 Min. :3.000 Min. : 23.0 Min. : 20.0 Min. :73
## 1st Qu.: 8.00 1st Qu.:3.000 1st Qu.:143.0 1st Qu.:137.0 1st Qu.:73
## Median :11.00 Median :3.000 Median :182.0 Median :180.0 Median :73
## Mean :11.49 Mean :3.036 Mean :189.4 Mean :188.9 Mean :73
## 3rd Qu.:14.00 3rd Qu.:3.000 3rd Qu.:230.0 3rd Qu.:234.0 3rd Qu.:73
## Max. :21.00 Max. :4.000 Max. :516.0 Max. :456.0 Max. :73
## nr le ky kt
## Min. : 1.000 Min. : 20.00 Min. :1.00 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 50.00 1st Qu.:1.00 1st Qu.: 1.00
## Median : 5.000 Median : 62.00 Median :2.00 Median : 1.00
## Mean : 6.961 Mean : 61.73 Mean :1.51 Mean : 1.43
## 3rd Qu.:10.000 3rd Qu.: 73.00 3rd Qu.:2.00 3rd Qu.: 2.00
## Max. :30.000 Max. :145.00 Max. :2.00 Max. :22.00
## aldur osl sl li
## Min. : 1.000 Min. : 64 Min. : 58 Min. : 0.0
## 1st Qu.: 4.000 1st Qu.: 1080 1st Qu.: 922 1st Qu.: 37.0
## Median : 5.000 Median : 2088 Median : 1748 Median : 107.0
## Mean : 5.209 Mean : 2704 Mean : 2189 Mean : 158.6
## 3rd Qu.: 6.000 3rd Qu.: 3428 3rd Qu.: 2832 3rd Qu.: 215.0
## Max. :15.000 Max. :34484 Max. :26063 Max. :2800.0
x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
plot(x,y,type='n') # Teikna mynd af hafsvæðum
text(x,y,as.character(reitir))
is.na(av) %>% sum() # no missing values
## [1] 0
av$hafsvaediSN<-ifelse(r2d(av$reit)$lat>65,"N","S")
av$hafsvaediAV<-ifelse(r2d(av$reit)$lon>-19,"A","V")
av$hafsvaedi<-paste(av$hafsvaediSN, av$hafsvaediAV, sep="") %>% as.factor
av<-dplyr::select(av,-hafsvaediAV, -hafsvaediSN)
str(av$hafsvaedi)
## Factor w/ 4 levels "NA","NV","SA",..: 4 4 4 4 4 4 4 4 4 4 ...
summary(av$hafsvaedi)
## NA NV SA SV
## 1196 1328 428 729
sample_n(av,6) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| recid | reit | smrt | tog_nr | dag | man | dyp_min | dyp_max | vf | nr | le | ky | kt | aldur | osl | sl | li | hafsvaedi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100509 | 624 | 1 | 16 | 12 | 3 | 81 | 87 | 73 | 3 | 63 | 1 | 1 | 5 | 1713 | 1621 | 7 | NV |
| 99813 | 613 | 1 | 14 | 8 | 3 | 225 | 210 | 73 | 5 | 53 | 2 | 1 | 4 | 1246 | 1060 | 85 | NA |
| 100403 | 525 | 2 | 12 | 6 | 3 | 100 | 101 | 73 | 2 | 92 | 1 | 2 | 8 | 7605 | 6330 | 163 | NV |
| 100426 | 576 | 3 | 13 | 8 | 3 | 195 | 212 | 73 | 1 | 61 | 2 | 1 | 5 | 2228 | 1848 | 256 | NV |
| 100145 | 621 | 4 | 13 | 15 | 3 | 92 | 115 | 73 | 5 | 53 | 1 | 2 | 5 | 1617 | 1203 | 73 | NV |
| 100089 | 619 | 2 | 4 | 11 | 3 | 207 | 221 | 73 | 5 | 50 | 2 | 1 | 4 | 944 | 796 | 51 | NV |
#view(av)
summary(av$kt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 1.43 2.00 22.00
av$kt2<-ifelse(av$kt==1,"ókynþroska","kynþroska") %>% as.factor()
summary(av$kt2)
## kynþroska ókynþroska
## 1341 2340
av<-dplyr::select(av, -kt)
tb<-table(av$kt2,av$hafsvaedi) # every obs corresponds to a single fish.
tb %>% kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| NA | NV | SA | SV | |
|---|---|---|---|---|
| kynþroska | 215 | 394 | 224 | 508 |
| ókynþroska | 981 | 934 | 204 | 221 |
t(t(tb)/apply(tb,2,sum)) %>% formatC(digits=2, format="f") %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| NA | NV | SA | SV | |
|---|---|---|---|---|
| kynþroska | 0.18 | 0.30 | 0.52 | 0.70 |
| ókynþroska | 0.82 | 0.70 | 0.48 | 0.30 |
tb %>% data.frame() %>%
dplyr::rename(Ocean_area=Var2, Kynþroski=Var1) %>%
ggplot(mapping=aes(x=Ocean_area,y=Freq, fill=Kynþroski)) +
geom_bar(stat="identity",position=position_dodge()) +
labs(x="Ocean Area", y="Number of fish")
ald.tab<-av %>% group_by(Aldur=aldur) %>%
dplyr::summarise(Count = n(),
AVG_length = mean(le),
AVG_weigth.osl = mean(osl),
STD_length = sd(le),
STD_weight.osl = sd(osl)) %>%
data.frame()
ald.tab %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Aldur | Count | AVG_length | AVG_weigth.osl | STD_length | STD_weight.osl |
|---|---|---|---|---|---|
| 1 | 3 | 22.66667 | 102.0000 | 2.081666 | 40.84116 |
| 2 | 76 | 26.30263 | 165.5395 | 3.723868 | 91.64198 |
| 3 | 632 | 38.55063 | 529.3180 | 5.277765 | 245.12797 |
| 4 | 367 | 51.26158 | 1339.2316 | 7.902564 | 1213.28725 |
| 5 | 1211 | 61.44426 | 2132.9744 | 6.819454 | 935.46430 |
| 6 | 750 | 71.43333 | 3424.9480 | 8.701102 | 1508.17421 |
| 7 | 262 | 79.19466 | 4767.2710 | 8.877858 | 2010.69268 |
| 8 | 209 | 85.25837 | 6407.2871 | 9.991354 | 2977.61492 |
| 9 | 135 | 88.51852 | 7266.0667 | 9.816109 | 2987.08337 |
| 10 | 27 | 93.85185 | 9392.5185 | 12.877242 | 5882.18933 |
| 11 | 6 | 93.66667 | 10665.3333 | 24.311863 | 7293.15245 |
| 13 | 1 | 132.00000 | 28282.0000 | NaN | NaN |
| 14 | 1 | 145.00000 | 34484.0000 | NaN | NaN |
| 15 | 1 | 110.00000 | 12750.0000 | NaN | NaN |
First thing we note is the fact the table shows some NAs, this is due to the fact that it is meaningless to calculate the standard deviation when we have only 1 observation. Aldur 5 is the level with the highest number of fishes. Looking at the counting in the other levels we can venture to say that the distribution of fish age looks normal. On the other hand average length and average weight increase as the age increases. It seems also that as length increases the standard deviation increases as well. This might means that if we plot length against age, higher values of ages, we would expect to see “fewer” points and more spread out compared to lower value of age. The same can be said for weight.
av %>% ggplot(mapping=aes(x=aldur,y=le)) +
geom_point(size=2, alpha = 1/10, position = "jitter") +
scale_x_continuous(breaks=seq(1:15)) +
geom_point(data=ald.tab,
mapping=aes(x=Aldur, y=AVG_length,
col="Average Length"),
size=3) +
labs(col="",x="Aldur", y="Length")
#merge(x=av, y=ald.tab, by.x="aldur", by.y="Aldur", all.x = T) %>%
# ggplot(mapping=aes(x=aldur,y=le)) +
# geom_point(size=2.5) + geom_point(mapping=aes(x=aldur,y=AVG_length,col="Average length"),size=1) + scale_x_continuous(breaks=seq(1:15)) +
# labs(col="",x="Aldur", y="Length")
av %>% ggplot(mapping=aes(x=as.factor(aldur),y=le)) +
geom_boxplot() +
stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") +
scale_x_discrete(breaks=seq(1:15)) +
labs(col="",x="Aldur", y="Length")
When ‘aldur’ is treated as a continuous variable and we plot the obs as dots/points we can get an idea of the number of fishes for each level (I am using the word ‘level’ even though ‘aldur’ is not treated as a factor). As a result of this, we can see that there are no fish for aldur=12. This plot gives us an idea on how the single observations are scattered. Arguments like alpha = 1/10, position = “jitter” help us distinguishing the single observation.
Boxplot carries similar info but, in my opinion, in a better and compct format. The main advantage of a boxplot is that we can see the outliers, the median and how spread is the data for each level in terms of quartiles (1st, 2nd and 3rd). Whiskers length and IQ range add info/details about how the data is spread.
set.seed(0908)
r.haf<-sample(levels(av$hafsvaedi),2)
r.haf
## [1] "NV" "SA"
r.av1<-av %>% filter(av$hafsvaedi==r.haf[1]) %>% sample_n(50)
r.av2<-av %>% filter(av$hafsvaedi==r.haf[2]) %>% sample_n(50)
r.av<-rbind(r.av1,r.av2)
sample_n(r.av, 5) %>% kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| recid | reit | smrt | tog_nr | dag | man | dyp_min | dyp_max | vf | nr | le | ky | aldur | osl | sl | li | hafsvaedi | kt2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100487 | 626 | 4 | 11 | 10 | 3 | 275 | 244 | 73 | 5 | 77 | 2 | 6 | 4626 | 3456 | 428 | NV | kynþroska |
| 100318 | 413 | 4 | 16 | 19 | 3 | 160 | 155 | 73 | 3 | 60 | 1 | 4 | 2006 | 1665 | 148 | SA | ókynþroska |
| 100319 | 412 | 3 | 13 | 19 | 3 | 195 | 250 | 73 | 7 | 70 | 2 | 6 | 3454 | 2889 | 254 | SA | kynþroska |
| 100318 | 413 | 4 | 16 | 19 | 3 | 160 | 155 | 73 | 13 | 68 | 2 | 6 | 3008 | 2468 | 264 | SA | kynþroska |
| 100297 | 462 | 4 | 2 | 17 | 3 | 220 | 220 | 73 | 3 | 62 | 1 | 5 | 2454 | 1953 | 244 | SA | kynþroska |
n<-nrow(r.av1)
m<-nrow(r.av2)
alpha<-0.05
xbar<-mean(r.av1$le) # maximum likelihood estimator for mu1 (population mean)
xbar
## [1] 57.94
ybar<-mean(r.av2$le) # maximum likelihood estimator for mu2 (population mean)
ybar
## [1] 71.54
Let’s assume:
\(X\): is random variable measuring fish length (le) in area NV-> independent and normally distributed.
\(Y\): is random variable measuring fish length (le) in area SA-> independent and normally distributed.
For both populations - fishes in NV and fishes in SA - mean and st.dev. are unknown.
The sample mean suggests that the 2 populations have different means. Sample mean for SA is higher than the ones for NV. We want to test whether the population means are equal or not.
\[H_0 : \mu_x = \mu_y \hspace{50pt} H_1: \mu_x \neq \mu_y\]
Let’s assume that the variances of the 2 populations is the same:
\[\sigma_{x}^2 = \sigma_{y}^2\] The test statistic is:
\[ T = \frac{\overline{X} - \overline{Y}}{\sqrt{S_p^2(1/n + 1/m)}}\] Under \(H_0\) \[T \sim t_{n+m-2}\]
We will reject \(H_0\) if \(|T| > t_{1-\alpha/2,n+m-2}\)
We will fail to reject \(H_0\) otherwise.
where \(S_p^2\) is the pooled estimator of the common variance \(\sigma^2\) equal to:
\[ S_p^2 = \frac{(n-1)S_X^2 + (m-1)S_Y^2}{n+m-2}\]
s2x<-var(r.av1$le)
s2y<-var(r.av2$le)
s2p<-((n-1)*s2x + (m-1)*s2y)/(n+m-2)
tstat<-(xbar-ybar)/sqrt(s2p*(1/n + 1/m))
abs(tstat)>qt(1-alpha/2, n+m-2)
## [1] TRUE
pval<-2*pt(tstat,n+m-2)
cr.rg<-c(-qt(1-alpha/2, n+m-2), +qt(1-alpha/2, n+m-2))
pval
## [1] 5.861386e-05
tt<-t.test(data = r.av,
le ~ hafsvaedi,
alternative = "two.sided",
mu = 0, paired = FALSE,
var.equal = T,
conf.level = 0.95) #yields the same results
tt
##
## Two Sample t-test
##
## data: le by hafsvaedi
## t = -4.2011, df = 98, p-value = 5.861e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -20.024187 -7.175813
## sample estimates:
## mean in group NV mean in group SA
## 57.94 71.54
Since the absolute value of the test statistic, built under the assumption that the populations have the same mean (\(\mu_x=\mu_y\)), is greater than the t-student quantile (leaving alpha/2 on the right) we reject the null hypothesis. In other words the observed statistic fall in the critical region based on alpha = 0.05.
P-value 5.861386^{-5} is close to 0. This means that the probability of obtaining a result (difference in the sample mean) that is at least as extreme as ours (under the assumption that the populations have same mean) is extremely low. P-value is lower than the significance level 0.05, therefore we reject the null hypothesis. This is another way to evaluate the hypothesis test.
The plot below shows that the test statistic - green line - falls in the critical region (tails outside the marks in blue).
ggplot(data=data.frame(T=c(-5,5)), aes(T)) +
stat_function(fun = dt, args = list(df=n+m-2)) +
geom_vline(aes(xintercept = 0), col= 'red', size = 0.5) +
geom_vline(aes(xintercept = tstat), col= 'green', size = 0.5) +
geom_vline(aes(xintercept = cr.rg), col= 'blue', size = 0.5)
Let’s define U as: \[\overline{X} - \overline{Y} = U \sim (\mu_u ; S_u)\] where \(S_u = \sqrt{S_p^2(1/n + 1/m)}\) is the standard deviation of our random variable U.
We don’t know the exact mean \(\mu_u\). However we can estimate the interval that “most likely” contain the mean.
sigmau<-sqrt(s2p*(1/n + 1/m))
CI<-c((xbar-ybar) - qt(1-alpha/2, n+m-2)*sigmau,
(xbar-ybar) + qt(1-alpha/2, n+m-2)*sigmau)
CI
## [1] -20.024187 -7.175813
The confidence interval for the difference of the population means (-20.0241873, -7.1758127) does not include 0. The confidence interval we just built tells us that 95% of the time such interval includes \(\mu_u = \mu_y - \mu_x\) (the difference of the 2 populations mean). 0 is not included, this allows us to say that \(\mu_y \neq \mu_x\), the population means under study are not the same.
This is in line with the results from the t-test.
worth noticing that the t-test and confidence interval were built with the same alpha level of significance.
av_long = melt(av, id.vars='hafsvaedi', measure.vars='le', value.name='le')
#av_long %>% view()
get_normal_density <- function(x, binwidth) {
grid <- seq(min(x), max(x), length=100)
data.frame(
le = grid,
normal_curve = dnorm(grid, mean(x), sd(x)) * length(x) * binwidth
)
}
BW <- 3
normaldens <-
av %>%
group_by(hafsvaedi) %>%
do(get_normal_density(x=.$le, binwidth=BW))
ggplot() + geom_histogram(data=av_long, aes(x=le), binwidth = 3) +
geom_line(data=normaldens, mapping=aes(x=le, y=normal_curve), col="red", size=1) +
facet_wrap(~hafsvaedi) +
labs(x="Length")
The histograms, plotted based on the actual observations, are approximated pretty well by a normal distribution. The histograms seems symmetric around the mean. Values around the mean are the most common as they have the highest frequency, the more we move towards the tails the lower the frequency. The normal distribution were plotted with parameters the estimator of the mean and the estimator of the standard deviation.
That being said, NA ocean area shows 2 “peaks”. It can suggest that there are 2 distributions overlapping and we might need to split NA area in 2 groups.
In light of the graphs here above we can conclude that the sample means in the 4 ocean areas (hafsvaedi) are normally distributed as assumed at the beginning of the previous point when we tested the difference in the population means for NV and SA.
set.seed(0908)
z<-r.av$le
t0<-tstat %>% abs()
t0
## [1] 4.201116
xyind<-c(rep(1,n),rep(2,m))
perm.test<-replicate(5000, {t.test(z[sample(1:length(z))]~xyind)$statistic})
new.pval<-sum(perm.test>=t0)/5000
new.pval
## [1] 0
max(perm.test)>=t0 # this is due to the fact that t0 was calculated manually and not with the t.test function which uses a different way to count the degree of freedom.
## [1] FALSE
The new p-value from the permutation test, built under the assumption that the null hypothesis is true (2 ocean areas have the same mean) is close to 0. The test statistics, calculated re-shuffling randomly the ocean area labels, are almost always lower than the observed test statistic. Therefore we conclude that there is an actual difference in the mean of the 2 ocean areas SA and NV. This is in line with the original test run in the previous point.
r.av$hafsvaedi<-droplevels(r.av$hafsvaedi)
r.av$hafsvaedi %>% levels()
## [1] "NV" "SA"
r.tb<-table(r.av$kt2,r.av$hafsvaedi) # every obs corresponds to a single fish.
r.tb %>% kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| NV | SA | |
|---|---|---|
| kynþroska | 10 | 26 |
| ókynþroska | 40 | 24 |
#r.tb
t(t(r.tb)/apply(r.tb,2,sum)) %>% formatC(digits=2, format="f") %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| NV | SA | |
|---|---|---|
| kynþroska | 0.20 | 0.52 |
| ókynþroska | 0.80 | 0.48 |
# success = kynþroska
# failure = ókynþroska
prp.t<-prop.test(t(r.tb),
alternative = "two.sided",
conf.level = 0.95)
# x = ....a two-dimensional table (or matrix) with 2 columns, giving the counts of successes and failures, respectively - in our case t(r.tb).
Let’s define \(p\) as the probability of the fish been kynþroska
\[H_0 : p_x = p_y \hspace{50pt} H_1: p_x \neq p_y\]
The test is testing whether the proportion of kynþroska are the same in the 2 ocean areas.
prp.t
##
## 2-sample test for equality of proportions with continuity correction
##
## data: t(r.tb)
## X-squared = 9.7656, df = 1, p-value = 0.001778
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.5173957 -0.1226043
## sample estimates:
## prop 1 prop 2
## 0.20 0.52
prp.t$p.value
## [1] 0.001778051
prp.t$statistic # the value of Pearson's chi-squared test statistic.
## X-squared
## 9.765625
prp.t$estimate
## prop 1 prop 2
## 0.20 0.52
The p-value 0.0017781 is lower that 0.05 (our significance level), therefore we reject the null hypothesis and conclude that the proportions of kynþroska fish is different in the 2 ocean areas.
9.765625 is the value of Pearson’s chi-squared test statistic.
0.2 and 0.52 are simply the number of success (kynþroska) in each ocean area (NV and SA respectively) divided the sample size.
The sample proportion generated by the test refers to kynþroska fish.
prp.t$conf.int
## [1] -0.5173957 -0.1226043
## attr(,"conf.level")
## [1] 0.95
The 95%-confidence interval does not include the value 0. The confidence interval refers to the difference in kynþroska proportion for the 2 ocean areas.
With 95% of confidence the real difference in kynþroska proportion between the 2 ocean areas (\(p_x-p_y\)) is not 0 (the proportions of kynþroska in the 2 ocean area are not the same). This is in line with the results from the prop.test.
ggplot(r.av1, mapping=aes(x=le, y=osl)) +
geom_point() + geom_smooth() +
labs(x="Length", y="Weight")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
geom_point() + geom_smooth() +
labs(x="log(Length)", y="log(Weight)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
fit<-lm(log(osl)~log(le), data=r.av1)
summary(fit)
##
## Call:
## lm(formula = log(osl) ~ log(le), data = r.av1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.233169 -0.085258 -0.003168 0.075786 0.299478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.24893 0.24948 -21.04 <2e-16 ***
## log(le) 3.12566 0.06183 50.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1136 on 48 degrees of freedom
## Multiple R-squared: 0.9816, Adjusted R-squared: 0.9812
## F-statistic: 2555 on 1 and 48 DF, p-value: < 2.2e-16
#plot(fit)
As we can see from the first plot, the relationship between length and weight is not linear. After transforming both the predictor and the response the relationship becomes linear. Therefore when we will fit a SLR on log-transformed data, the model will yields higher values of R squared. The regression line will fit the log-transformed data much better than non-transformed data.
The model is as follow:
\[log(\hat{Y}) = \beta_0 + \beta_1logX\] Where the estimates of the coefficients are:
\(\beta_0\) = -5.2489348
\(\beta_1\) = 3.1256569
Yes, it is wise to use this model. The p-value for \(\beta_1\) is close to 0 which means that the relationship between x and y is statistically significant - I reject the underling null hypothesis \(H_0:\beta_1 = 0\). R squared and adjusted R-squared are close to 1, this indicates that the proportion of the total variation in Y explained by X is close to 1. Our model explains the variation in the response very well.
R squared suggests also that the degree of a linear relationship between log(Y) and log(X) is high.
For every unit increase of \(log(X)\), the response \(log(Y)\) increases by \(\beta_1\).
Translated in term of \(X\) and \(Y\): for every increase of \(e\) the response increases by \(e^{\hat{\beta}_1}\)
\(\beta_0\) is not very interesting as it is mainly useful to create a line that fit the data.
In the plot here below the regression line is explicitly expressed in command geom_abline. The line fits well the data.
ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
geom_point() +
geom_abline(slope= fit$coefficients[2],
intercept = fit$coefficients[1],
col="red", size=1) +
labs(x="log(Length)", y="log(Weight)")
ggplot(r.av1, mapping=aes(x=log(le), y=log(osl))) +
geom_point() +
stat_smooth(method='lm', se=FALSE, col="red") +
labs(x="log(Length)", y="log(Weight)")
## `geom_smooth()` using formula 'y ~ x'
ggplot(r.av1, mapping=aes(x=le, y=osl)) +
geom_point() +
stat_smooth(method='lm', se=FALSE, col="red") +
labs(x="Length", y="Weight")
## `geom_smooth()` using formula 'y ~ x'
gogn_likan <-
data.frame(
x = exp(fit$model[[2]]),
y = exp(predict(fit))
)
ggplot(r.av1, mapping=aes(x=le, y=osl)) +
geom_point() +
geom_line(data=gogn_likan, aes(x=x,y=y), col="green", size=1) +
labs(x="Length", y="Weight")
ggplot(r.av1, mapping = aes(x=as.factor(aldur), y=le)) +
geom_boxplot() +
stat_summary(fun=mean, geom="point", shape=3, size=2, color="red", fill="red") + labs(x="Aldur", y="Length")
litid <- lm(le~aldur, data=r.av1)
stort <- lm(le~factor(aldur), data=r.av1)
a<-anova(litid, stort)
a
## Analysis of Variance Table
##
## Model 1: le ~ aldur
## Model 2: le ~ factor(aldur)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 1574.2
## 2 42 1047.6 6 526.68 3.5193 0.006535 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pval.anova<-a[2,6]
Anova function perform a one-way anova test testing whether the mean for the different level of Aldur is the same - \(H_0: \mu_1 = \mu_2=....= \mu_9\). The p-value 0.0065354 is close to 0. We reject the null hypothesis and say that at least the mean for one level of Aldur is different than the others. This can be easily seen from the boxplot graph. The boxes and the related quantiles moves upwards as Aldur level increases.
Even taking into consideration that the variability of length is quite different across the level of Aldur, we can conclude that fish length tends to increases with age.
x<-r2d(reitir)$lon
y<-r2d(reitir)$lat
xyr<-data.frame(x,y,reitir)
xyr$hafsvaediSN<-ifelse(r2d(xyr$reitir)$lat>65,"N","S")
xyr$hafsvaediAV<-ifelse(r2d(xyr$reitir)$lon>-19,"A","V")
xyr$hafsvaedi<-paste(xyr$hafsvaediSN, xyr$hafsvaediAV, sep="") %>% as.factor
xyr<-dplyr::select(xyr,-hafsvaediAV, -hafsvaediSN)
world<-ne_countries(scale = "medium", returnclass = "sf")
ggplot(data = world) +
geom_sf(color = "black", fill = "blue") +
xlab("Longitude") + ylab("Latitude") +
ggtitle("Iceland") +
coord_sf(xlim = c(min(x)-0.5, max(x)-0.5), ylim = c(min(y)-0.5, max(y)+0.5), expand = F) +
geom_label(data=subset(xyr,hafsvaedi==r.haf[1] | hafsvaedi==r.haf[2]),
aes(x,y,label=reitir,col=hafsvaedi),
size=2)