library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Keep the data location in a variable instead of calling setwd(), so the
# script does not change the session's working directory as a side effect.
data_dir <- "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b"
# Read the comma-separated copy of the squid data with readr.
squid <- read_csv(file.path(data_dir, "squid1.csv"))
## Rows: 519 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): sample.no, specimen, year, month, weight, sex, maturity.stage, DML...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact, column-by-column preview of the data just read.
dplyr::glimpse(squid)
## Rows: 519
## Columns: 13
## $ sample.no <dbl> 105128901, 105128901, 105128901, 105128901, 10512890…
## $ specimen <dbl> 1002, 1003, 1005, 1007, 1008, 1009, 1011, 1013, 1014…
## $ year <dbl> 1989, 1989, 1989, 1989, 1989, 1989, 1989, 1989, 1989…
## $ month <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1, 1…
## $ weight <dbl> 152.0, 105.9, 138.4, 140.8, 126.2, 54.3, 81.2, 182.7…
## $ sex <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ maturity.stage <dbl> 3, 1, 2, 2, 3, 1, 2, 3, 3, 4, 3, 4, 4, 5, 4, 4, 4, 5…
## $ DML <dbl> 174, 153, 169, 175, 169, 116, 135, 192, 170, 205, 19…
## $ eviscerate.weight <dbl> 87.5, 62.6, 79.4, 83.1, 72.2, 30.2, 46.6, 107.7, 72.…
## $ dig.weight <dbl> 4.648, 3.138, 0.307, 4.123, 3.605, 1.092, 2.168, 2.0…
## $ nid.length <dbl> 39.4, 24.1, 39.0, 41.4, 39.8, 20.0, 14.0, 55.0, 44.0…
## $ nid.weight <dbl> 2.460, 0.319, 1.169, 1.631, 2.030, 0.148, 0.252, 5.6…
## $ ovary.weight <dbl> 1.680, 0.103, 0.289, 0.252, 0.860, 0.016, 0.043, 5.8…
# Re-read the data from the text file into a base data.frame; the first line
# of the file supplies the column names. This replaces the current `squid`.
squid <- read.table(
  "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/squid1.txt",
  header = TRUE
)
# Show the first six rows as a quick sanity check.
head(x = squid, n = 6L)
## sample.no specimen year month weight sex maturity.stage DML eviscerate.weight
## 1 105128901 1002 1989 12 152.0 2 3 174 87.5
## 2 105128901 1003 1989 12 105.9 2 1 153 62.6
## 3 105128901 1005 1989 12 138.4 2 2 169 79.4
## 4 105128901 1007 1989 12 140.8 2 2 175 83.1
## 5 105128901 1008 1989 12 126.2 2 3 169 72.2
## 6 105128901 1009 1989 12 54.3 2 1 116 30.2
## dig.weight nid.length nid.weight ovary.weight
## 1 4.648 39.4 2.460 1.680
## 2 3.138 24.1 0.319 0.103
## 3 0.307 39.0 1.169 0.289
## 4 4.123 41.4 1.631 0.252
## 5 3.605 39.8 2.030 0.860
## 6 1.092 20.0 0.148 0.016
# Linear positions of any missing cells in the data frame.
which(is.na(squid), arr.ind = FALSE)
## integer(0)
# Column types and a preview of the leading values.
utils::str(squid)
## 'data.frame': 519 obs. of 13 variables:
## $ sample.no : int 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 ...
## $ specimen : int 1002 1003 1005 1007 1008 1009 1011 1013 1014 1017 ...
## $ year : int 1989 1989 1989 1989 1989 1989 1989 1989 1989 1989 ...
## $ month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ weight : num 152 106 138 141 126 ...
## $ sex : int 2 2 2 2 2 2 2 2 2 2 ...
## $ maturity.stage : int 3 1 2 2 3 1 2 3 3 4 ...
## $ DML : int 174 153 169 175 169 116 135 192 170 205 ...
## $ eviscerate.weight: num 87.5 62.6 79.4 83.1 72.2 ...
## $ dig.weight : num 4.648 3.138 0.307 4.123 3.605 ...
## $ nid.length : num 39.4 24.1 39 41.4 39.8 20 14 55 44 53 ...
## $ nid.weight : num 2.46 0.319 1.169 1.631 2.03 ...
## $ ovary.weight : num 1.68 0.103 0.289 0.252 0.86 ...
# Per-column summary (min, quartiles, mean, max) of the data frame.
summary(object = squid)
## sample.no specimen year month
## Min. :100039001 Min. :1001 Min. :1989 Min. : 1.000
## 1st Qu.:105079001 1st Qu.:1009 1st Qu.:1990 1st Qu.: 3.000
## Median :113099001 Median :1026 Median :1990 Median : 7.000
## Mean :112499032 Mean :1028 Mean :1990 Mean : 6.803
## 3rd Qu.:121029101 3rd Qu.:1045 3rd Qu.:1991 3rd Qu.:10.000
## Max. :130039001 Max. :1076 Max. :1991 Max. :12.000
## weight sex maturity.stage DML eviscerate.weight
## Min. : 34.0 Min. :2 Min. :1.000 Min. : 88 Min. : 16.8
## 1st Qu.:184.5 1st Qu.:2 1st Qu.:2.000 1st Qu.:187 1st Qu.: 97.0
## Median :272.0 Median :2 Median :3.000 Median :217 Median :138.0
## Mean :286.8 Mean :2 Mean :3.355 Mean :215 Mean :149.4
## 3rd Qu.:360.5 3rd Qu.:2 3rd Qu.:5.000 3rd Qu.:240 3rd Qu.:187.0
## Max. :809.0 Max. :2 Max. :5.000 Max. :323 Max. :397.0
## dig.weight nid.length nid.weight ovary.weight
## Min. : 0.307 Min. : 10.00 Min. : 0.031 Min. : 0.016
## 1st Qu.: 4.705 1st Qu.: 34.00 1st Qu.: 0.863 1st Qu.: 0.429
## Median : 7.321 Median : 65.10 Median : 7.769 Median :10.461
## Mean : 8.118 Mean : 59.65 Mean : 9.675 Mean :12.564
## 3rd Qu.: 10.028 3rd Qu.: 81.00 3rd Qu.:16.140 3rd Qu.:22.784
## Max. :100.341 Max. :430.20 Max. :39.325 Max. :50.230
#squid$year <-factor(squid$year)
#squid$month <-factor(squid$month)
#squid$maturity.stage <-factor(squid$maturity.stage)
#str(squid)
#year_factor <-as.factor(data$year)
# DML, weight, nid.length and ovary.weight
# Export a two-panel figure (weight against DML, and the nid.length boxplot)
# as both a JPEG and a PDF.
#
# Each graphics device is opened, drawn on and closed in turn. Previously
# jpeg() was opened and immediately superseded by pdf(), and neither device
# was closed with dev.off(): the JPEG never received the plots and every later
# plot in the script kept going to the still-open PDF.
plot_dir <- "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b"

# Draw the two-panel figure on whichever graphics device is currently active.
draw_overview_plots <- function() {
  par(mfrow = c(1, 2))
  plot(squid$DML, squid$weight,
       xlab = "Dorsal Mantle Length", ylab = "Weight")
  # NOTE(review): ovary.weight is a continuous measurement, so this draws one
  # box per distinct ovary weight -- confirm the intended grouping variable.
  boxplot(nid.length ~ ovary.weight, data = squid, cex.axis = 0.6)
}

# jpeg() names its file argument `filename`; spell it out rather than relying
# on partial matching of `file`.
jpeg(filename = file.path(plot_dir, "plot.jpg"))
draw_overview_plots()
invisible(dev.off())

pdf(file = file.path(plot_dir, "plot.pdf"))
draw_overview_plots()
invisible(dev.off())
# Find the rows whose nid.length exceeds 400.
which(squid[["nid.length"]] > 400)
## [1] 11
#view(squid$nid.length)
# Replace the nid.length value in row 11 with 42.3, then re-inspect the
# structure of the data frame.
squid$nid.length[11] <- 42.3
utils::str(squid)
## 'data.frame': 519 obs. of 13 variables:
## $ sample.no : int 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 ...
## $ specimen : int 1002 1003 1005 1007 1008 1009 1011 1013 1014 1017 ...
## $ year : int 1989 1989 1989 1989 1989 1989 1989 1989 1989 1989 ...
## $ month : int 12 12 12 12 12 12 12 12 12 12 ...
## $ weight : num 152 106 138 141 126 ...
## $ sex : int 2 2 2 2 2 2 2 2 2 2 ...
## $ maturity.stage : int 3 1 2 2 3 1 2 3 3 4 ...
## $ DML : int 174 153 169 175 169 116 135 192 170 205 ...
## $ eviscerate.weight: num 87.5 62.6 79.4 83.1 72.2 ...
## $ dig.weight : num 4.648 3.138 0.307 4.123 3.605 ...
## $ nid.length : num 39.4 24.1 39 41.4 39.8 20 14 55 44 53 ...
## $ nid.weight : num 2.46 0.319 1.169 1.631 2.03 ...
## $ ovary.weight : num 1.68 0.103 0.289 0.252 0.86 ...
# Open the nid.length column in the data viewer to eyeball the values.
tibble::view(squid$nid.length)
#example
#df["rowName", "columnName"] <- value
#df[df$serial.id==5, "gender"] <- 1
#Question 8
#DML, weight, eviscerate.weight and ovary.weight
# Export histograms of DML, weight, eviscerate.weight and ovary.weight as both
# a JPEG and a PDF.
#
# Each graphics device is opened, drawn on and closed in turn. Previously
# jpeg() was opened and immediately superseded by pdf(), and neither device
# was closed with dev.off(): the JPEG never received the histograms and all
# later plots in the script were appended to the still-open PDF.
hist_dir <- "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b"

# Draw the four histograms on whichever graphics device is currently active.
draw_histograms <- function() {
  # A 2 x 2 grid keeps all four histograms on one page. With a single JPEG
  # file name each new page would overwrite the previous one, leaving only the
  # last histogram in the file.
  par(mfrow = c(2, 2))
  hist(squid$DML, breaks = "Sturges", main = "Dorsal Mantle Length")
  hist(squid$weight, breaks = "Scott", main = "Weight")
  hist(squid$eviscerate.weight, breaks = 50, main = "Eviscerated.weight")
  hist(squid$ovary.weight, breaks = 15, main = "Ovary Weight")
}

# jpeg() names its file argument `filename`; spell it out rather than relying
# on partial matching of `file`.
jpeg(filename = file.path(hist_dir, "histogram.jpg"))
draw_histograms()
invisible(dev.off())

pdf(file = file.path(hist_dir, "histogram.pdf"))
draw_histograms()
invisible(dev.off())
# Scatterplot with DML on the x axis and weight on the y axis.
# A plotting formula is read as y ~ x, so weight goes on the left-hand side;
# the earlier call (DML ~ weight) drew the axes the other way round.
plot(weight ~ DML, data = squid)
# Scatterplot with a fitted linear regression line.
# Fit DML as a linear function of weight.
lmsquid <- lm(DML ~ weight, data = squid)
# Plot the raw data rather than the model object: plot() on an lm fit draws
# regression diagnostic plots, so abline() would have been added to a
# diagnostic panel instead of the DML-versus-weight scatterplot.
plot(DML ~ weight, data = squid, pch = 16, col = "blue")
# Overlay the fitted regression line on the scatterplot.
abline(lmsquid)
# Transform the weight variable with either a natural log (log()) or a square
# root (sqrt()) transformation.
# NOTE(review): despite its name, the model below is fitted on the
# untransformed weight; no log() or sqrt() is applied here -- confirm whether
# a transformed predictor was intended.
plot_log <- lm(squid$DML~squid$weight)
# Print the residual summary, coefficient table and R-squared for the fit.
summary(plot_log)
##
## Call:
## lm(formula = squid$DML ~ squid$weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.377 -6.770 1.689 8.747 29.956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.337e+02 1.209e+00 110.59 <2e-16 ***
## squid$weight 2.832e-01 3.797e-03 74.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.99 on 517 degrees of freedom
## Multiple R-squared: 0.915, Adjusted R-squared: 0.9148
## F-statistic: 5564 on 1 and 517 DF, p-value: < 2.2e-16
# Extract the fitted intercept and slope from the model.
stats::coef(plot_log)
## (Intercept) squid$weight
## 133.747913 0.283215
#### 10. When visualising differences in a continuous variable between levels of a factor (categorical variable) then a boxplot is your friend (avoid using bar plots - Google ‘bar plots are evil’ for more info). Create a boxplot to visualise the differences in DML at each maturity stage (don’t forget to use the recoded version of this variable you created in Q4). Include x and y axes labels in your plot. Make sure you understand the anatomy of a boxplot before moving on - please ask if you’re not sure (also see Section 4.2.3 of the book). An alternative to the boxplot is the violin plot. A violin plot is a combination of a boxplot and a kernel density plot and is great at visualising the distribution of a variable. To create a violin plot you will first need to install the vioplot package from CRAN and make it available with library(vioplot). You can now use the vioplot() function in pretty much the same way as you created your boxplot (again Section 4.2.3 of the book walks you through this).
#Create a boxplot to visualise the differences in DML at each maturity stage (don’t forget to use the recoded version of this variable you created in Q4)
#boxplot
# Horizontal boxplot of DML for each maturity stage. With horizontal = TRUE
# the measurement runs along the x axis, so the labels are set accordingly.
boxplot(DML ~ maturity.stage, data = squid, frame.plot = FALSE,
        horizontal = TRUE,
        xlab = "Dorsal Mantle Length", ylab = "Maturity Stage")
# Vertical version with one border colour per maturity stage (five stages).
# The title now describes this plot (it previously read "Plot of length by
# dose"), and the stray trailing comma, which passed an empty argument, is
# removed.
boxplot(DML ~ maturity.stage, data = squid, frame.plot = FALSE,
        border = c("#999999", "#E69F00", "#56B4E9", "#b042ff", "#AAFF00"),
        main = "Dorsal mantle length by maturity stage",
        xlab = "Maturity Stage", ylab = "Dorsal Mantle Length",
        col = "lightgray")
# Use the plot() function to produce a scatterplot of DML on the x axis and
# ovary weight on the y axis (you might need to apply a transformation on the
# variable ovary.weight).
# A plotting formula is read as y ~ x, so ovary.weight goes on the left-hand
# side; the earlier call (DML ~ ovary.weight) drew the axes the other way
# round and carried a title copied from an unrelated example.
# Points are coloured by the numeric maturity stage.
plot(ovary.weight ~ DML, data = squid, pch = 16, col = squid$maturity.stage,
     main = "Ovary weight by dorsal mantle length",
     xlab = "Dorsal Mantle Length", ylab = "Ovary Weight")