IC9

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

setwd("C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/")
squid <- read_csv("squid1.csv")
## Rows: 519 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): sample.no, specimen, year, month, weight, sex, maturity.stage, DML...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(squid)
## Rows: 519
## Columns: 13
## $ sample.no         <dbl> 105128901, 105128901, 105128901, 105128901, 10512890…
## $ specimen          <dbl> 1002, 1003, 1005, 1007, 1008, 1009, 1011, 1013, 1014…
## $ year              <dbl> 1989, 1989, 1989, 1989, 1989, 1989, 1989, 1989, 1989…
## $ month             <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1, 1…
## $ weight            <dbl> 152.0, 105.9, 138.4, 140.8, 126.2, 54.3, 81.2, 182.7…
## $ sex               <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ maturity.stage    <dbl> 3, 1, 2, 2, 3, 1, 2, 3, 3, 4, 3, 4, 4, 5, 4, 4, 4, 5…
## $ DML               <dbl> 174, 153, 169, 175, 169, 116, 135, 192, 170, 205, 19…
## $ eviscerate.weight <dbl> 87.5, 62.6, 79.4, 83.1, 72.2, 30.2, 46.6, 107.7, 72.…
## $ dig.weight        <dbl> 4.648, 3.138, 0.307, 4.123, 3.605, 1.092, 2.168, 2.0…
## $ nid.length        <dbl> 39.4, 24.1, 39.0, 41.4, 39.8, 20.0, 14.0, 55.0, 44.0…
## $ nid.weight        <dbl> 2.460, 0.319, 1.169, 1.631, 2.030, 0.148, 0.252, 5.6…
## $ ovary.weight      <dbl> 1.680, 0.103, 0.289, 0.252, 0.860, 0.016, 0.043, 5.8…

Question 4

Import the ‘squid1.txt’ file into R using the read.table() function and assign it to a variable named squid. Use the str() function to display the structure of the dataset and the summary() function to summarise the dataset. How many observations are in this dataset? How many variables? The year, month and maturity.stage variables were coded as integers in the original dataset. Here we would like to code them as factors. Create a new variable for each of these variables in the squid dataframe and recode them as factors. Use the str() function again to check the coding of these new variables.

How many observations are in this dataset? 519 observations and 13 variables

The year, month and maturity.stage variables were coded as integers in the original dataset. Here we would like to code them as factors.

squid <- read.table(file = "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/squid1.txt", header = TRUE)
head(squid)
##   sample.no specimen year month weight sex maturity.stage DML eviscerate.weight
## 1 105128901     1002 1989    12  152.0   2              3 174              87.5
## 2 105128901     1003 1989    12  105.9   2              1 153              62.6
## 3 105128901     1005 1989    12  138.4   2              2 169              79.4
## 4 105128901     1007 1989    12  140.8   2              2 175              83.1
## 5 105128901     1008 1989    12  126.2   2              3 169              72.2
## 6 105128901     1009 1989    12   54.3   2              1 116              30.2
##   dig.weight nid.length nid.weight ovary.weight
## 1      4.648       39.4      2.460        1.680
## 2      3.138       24.1      0.319        0.103
## 3      0.307       39.0      1.169        0.289
## 4      4.123       41.4      1.631        0.252
## 5      3.605       39.8      2.030        0.860
## 6      1.092       20.0      0.148        0.016
which(is.na(squid))
## integer(0)
str(squid)
## 'data.frame':    519 obs. of  13 variables:
##  $ sample.no        : int  105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 ...
##  $ specimen         : int  1002 1003 1005 1007 1008 1009 1011 1013 1014 1017 ...
##  $ year             : int  1989 1989 1989 1989 1989 1989 1989 1989 1989 1989 ...
##  $ month            : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ weight           : num  152 106 138 141 126 ...
##  $ sex              : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ maturity.stage   : int  3 1 2 2 3 1 2 3 3 4 ...
##  $ DML              : int  174 153 169 175 169 116 135 192 170 205 ...
##  $ eviscerate.weight: num  87.5 62.6 79.4 83.1 72.2 ...
##  $ dig.weight       : num  4.648 3.138 0.307 4.123 3.605 ...
##  $ nid.length       : num  39.4 24.1 39 41.4 39.8 20 14 55 44 53 ...
##  $ nid.weight       : num  2.46 0.319 1.169 1.631 2.03 ...
##  $ ovary.weight     : num  1.68 0.103 0.289 0.252 0.86 ...
summary(squid)
##    sample.no            specimen         year          month       
##  Min.   :100039001   Min.   :1001   Min.   :1989   Min.   : 1.000  
##  1st Qu.:105079001   1st Qu.:1009   1st Qu.:1990   1st Qu.: 3.000  
##  Median :113099001   Median :1026   Median :1990   Median : 7.000  
##  Mean   :112499032   Mean   :1028   Mean   :1990   Mean   : 6.803  
##  3rd Qu.:121029101   3rd Qu.:1045   3rd Qu.:1991   3rd Qu.:10.000  
##  Max.   :130039001   Max.   :1076   Max.   :1991   Max.   :12.000  
##      weight           sex    maturity.stage       DML      eviscerate.weight
##  Min.   : 34.0   Min.   :2   Min.   :1.000   Min.   : 88   Min.   : 16.8    
##  1st Qu.:184.5   1st Qu.:2   1st Qu.:2.000   1st Qu.:187   1st Qu.: 97.0    
##  Median :272.0   Median :2   Median :3.000   Median :217   Median :138.0    
##  Mean   :286.8   Mean   :2   Mean   :3.355   Mean   :215   Mean   :149.4    
##  3rd Qu.:360.5   3rd Qu.:2   3rd Qu.:5.000   3rd Qu.:240   3rd Qu.:187.0    
##  Max.   :809.0   Max.   :2   Max.   :5.000   Max.   :323   Max.   :397.0    
##    dig.weight        nid.length       nid.weight      ovary.weight   
##  Min.   :  0.307   Min.   : 10.00   Min.   : 0.031   Min.   : 0.016  
##  1st Qu.:  4.705   1st Qu.: 34.00   1st Qu.: 0.863   1st Qu.: 0.429  
##  Median :  7.321   Median : 65.10   Median : 7.769   Median :10.461  
##  Mean   :  8.118   Mean   : 59.65   Mean   : 9.675   Mean   :12.564  
##  3rd Qu.: 10.028   3rd Qu.: 81.00   3rd Qu.:16.140   3rd Qu.:22.784  
##  Max.   :100.341   Max.   :430.20   Max.   :39.325   Max.   :50.230
#squid$year <-factor(squid$year)
#squid$month <-factor(squid$month)
#squid$maturity.stage <-factor(squid$maturity.stage)
#str(squid)
#year_factor <-as.factor(data$year)

Question 6

6. The humble cleveland dotplot is a great way of identifying if you have potential outliers in continuous variables (See Section 4.2.4. Create dotplots (using the dotchart() function) for the following variables; DML, weight, nid.length and ovary.weight. Do these variables contain any unusually large or small observations? Don’t forget, if you prefer to create a single figure with all 4 plots you can always split your plotting device into 2 rows and 2 columns (see Section 4.4 of the book). Use the pdf() function to save a pdf version of your plot(s) in your output directory you created in Exercise 1 (see Section 4.5 of the book to see how the pdf() function works). I have also included some alternative code in the solutions for this exercise using the dotplot() function from the lattice package.

# DML, weight, nid.length and ovary.weight


jpeg(file = "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/plot.jpg")
pdf(file = "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/plot.pdf")
par(mfrow = c(1, 2))
  plot(squid$DML, squid$weight, xlab = "Dorsal Mantle Length",
      ylab = "Weight")
boxplot(nid.length ~ ovary.weight, data = squid, cex.axis = 0.6)

Question 7

7. It looks like the variable nid.length contains an unusually large value. Actually, this value is biologically implausible and clearly an error. The researchers were asked to go back and check their field notebooks and sure enough they discover a typo. It looks like a tired researcher accidentally inserted a zero by mistake when transcribing these data (mistakes in data are very common and why we always explore, check and validate any data we are working on). We can clearly see this value is over 400 so we can use the which() function to identify which observation this is which(squid\(nid.length > 400). It looks like this is the 11th observation of the squid\)nid.length variable. Use your skill with the square brackets [ ] to first confirm the this is the correct value (you should get 430.2) and then change this value to 43.2. Now redo the dotchart to visualise your correction. Caution: You can only do this because you have confirmed that this is an transcribing error. You should not remove or change values in your data just because you feel like it or they look ‘unusual’. This is scientific fraud! Also, the advantage of making this change in your R script rather than in Excel is that you now have a permanent record of the change you made and can state a clear reason for the change.

which(squid$nid.length > 400)
## [1] 11
#view(squid$nid.length)


squid[11,"nid.length"] <- 42.3
str(squid)
## 'data.frame':    519 obs. of  13 variables:
##  $ sample.no        : int  105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 105128901 ...
##  $ specimen         : int  1002 1003 1005 1007 1008 1009 1011 1013 1014 1017 ...
##  $ year             : int  1989 1989 1989 1989 1989 1989 1989 1989 1989 1989 ...
##  $ month            : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ weight           : num  152 106 138 141 126 ...
##  $ sex              : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ maturity.stage   : int  3 1 2 2 3 1 2 3 3 4 ...
##  $ DML              : int  174 153 169 175 169 116 135 192 170 205 ...
##  $ eviscerate.weight: num  87.5 62.6 79.4 83.1 72.2 ...
##  $ dig.weight       : num  4.648 3.138 0.307 4.123 3.605 ...
##  $ nid.length       : num  39.4 24.1 39 41.4 39.8 20 14 55 44 53 ...
##  $ nid.weight       : num  2.46 0.319 1.169 1.631 2.03 ...
##  $ ovary.weight     : num  1.68 0.103 0.289 0.252 0.86 ...
view(squid$nid.length)
#example
#df["rowName", "columnName"] <- value
#df[df$serial.id==5, "gender"] <- 1

#Question 8

8. When exploring your data it is often useful to visualise the distribution of continuous variables. Take a look at Section 4.2.2 and then create histograms for the variables; DML, weight, eviscerate.weight and ovary.weight. Again, its up to you if you want to plot all 4 plots separately or in the same figure. Export your plot(s) as pdf file(s) to the output directory. One potential problem with histograms is that the distribution of data can look quite different depending on the number of ‘breaks’ used. The hist() function does it’s best to create appropriate ‘breaks’ for your plots (it uses the Sturges algorithm for those that want to know) but experiment with changing the number of breaks for the DML variable to see how the shape of the distribution changes (see Section 4.2.2 of the book for further details of how to change the breaks).

#DML, weight, eviscerate.weight and ovary.weight

jpeg(file = "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/histogram.jpg")
pdf(file = "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8b/histogram.pdf")
hist(squid$DML, breaks = "Sturges", main = "Dorsal Mantle Length")
hist(squid$weight, breaks = "Scott", main = "Weight")
hist(squid$eviscerate.weight, breaks =50, main = "Eviscerated.weight")
hist(squid$ovary.weight, breaks = 15, main = "Ovary Weight")

Question 9

9. Scatterplots are great for visualising relationships between two continuous variables (Section 4.2.1). Plot the relationship between DML on the x axis and weight on the y axis. How would you describe this relationship? Is it linear? One approach to linearising relationships is to apply a transformation on one or both variables. Try transforming the weight variable with either a natural log (log()) or square root (sqrt()) transformation. I suggest you create new variables in the squid dataframe for your transformed variables and use these variables when creating your new plots (ask if you’re not sure how to do this). Which transformation best linearises this relationship? Again, save your plots as a pdf file (or try saving in your output directory as jpeg or png format using the jpeg() or png() functions (Section 4.5) if you feel the need for a change!).

How would you describe this relationship? Is it linear? One approach to linearising relationships is to apply a transformation on one or both variables.

There is a positive correlation to the the graph. As the dorsal mantle length increase the weight increases.

# DML on the x axis and weight on the y axis
plot(squid$DML ~ squid$weight)

#scatter plot with linear regression
lmsquid = lm(DML~weight, data = squid)
plot(lmsquid, pch = 16, col='blue')

abline(lmsquid)

#transforming the weight variable with either a natural log (log()) or square root (sqrt()) transformation.

plot_log <- lm(squid$DML~squid$weight)
summary(plot_log)
## 
## Call:
## lm(formula = squid$DML ~ squid$weight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.377  -6.770   1.689   8.747  29.956 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.337e+02  1.209e+00  110.59   <2e-16 ***
## squid$weight 2.832e-01  3.797e-03   74.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.99 on 517 degrees of freedom
## Multiple R-squared:  0.915,  Adjusted R-squared:  0.9148 
## F-statistic:  5564 on 1 and 517 DF,  p-value: < 2.2e-16
coef(plot_log)
##  (Intercept) squid$weight 
##   133.747913     0.283215

Question 10

####10. When visualising differences in a continuous variable between levels of a factor (categorical variable) then a boxplot is your friend (avoid using bar plots - Google ‘bar plots are evil’ for more info). Create a boxplot to visualise the differences in DML at each maturity stage (don’t forget to use the recoded version of this variable you created in Q4) . Include x and y axes labels in your plot. Make sure you understand the anatomy of a boxplot before moving on - please ask if you’re not sure (also see Section 4.2.3 of the book). An alternative to the boxplot is the violin plot. A violin plot is a combination of a boxplot and a kernel density plot and is great at visualising the distribution of a variable. To create a violin plot you will first need to install the vioplot package from CRAN and make it available library(vioplot). You can now use the vioplot() function in pretty much the same way as you created your boxplot (again Section 4.2.3 of the book walks you though this).

#Create a boxplot to visualise the differences in DML at each maturity stage (don’t forget to use the recoded version of this variable you created in Q4)


#boxplot
boxplot(squid$DML~squid$maturity.stage, frame = FALSE,
        horizontal = TRUE)

boxplot(squid$DML~squid$maturity.stage, frame = FALSE,
        border = c("#999999", "#E69F00", "#56B4E9", "#b042ff","#AAFF00" ),
        main = "Plot of length by dose",
        xlab = "Maturity Stage", ylab = "Dorsal Mantle Length",
        col = "lightgray", )

Question 12

12. To explore the relationships between multiple continuous variables it’s hard to beat a pairs plot. Create a pairs plot for the variables; DML, weight, eviscerate.weight, ovary.weight, nid.length, and nid.weight (see Section 4.2.5 of the book for more details). If it looks a little cramped in RStudio then click on the ‘zoom’ button in the plot viewer to see a larger version. One of the great things about the pairs() function is that you can customise what goes into each panel. Modify your pairs plot to include a histogram of the variables on the diagonal panel and include a correlation coefficient for each relationship on the upper triangle panels. Also include a smoother (wiggly line) in the lower triangle panels to help visualise these relationships. Take a look at the Introduction to R book to see how to do all this (or ?pairs).

## PAIRS WITH LINEAR REGRESSION

pairs(squid[, c("DML", "weight", "eviscerate.weight", "ovary.weight", 
                "nid.length", "nid.weight")],
                panel = panel.smooth)

## CORELATION
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
    usr <- par("usr")
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs(squid[, c("DML", "weight", "eviscerate.weight", "ovary.weight", 
                "nid.length", "nid.weight")],
                lower.panel = panel.cor)

## HISTOGRAM
panel.hist <- function(x, ...)
{
    usr <- par("usr")
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}

pairs(squid[, c("DML", "weight", "eviscerate.weight", "ovary.weight", 
                "nid.length", "nid.weight")],
                lower.panel = panel.cor,
                diag.panel = panel.hist,
                upper.panel = panel.smooth)

#Question 13

13. Almost every aspect of the figures you create in R is customisable. Learning how to get your plot looking just right is not only rewarding but also means that you will never have to include a plot in your paper/thesis that you’re not completely happy with. When you start learning how to use R it can sometimes seem to take a lot of work to customise your plots. Don’t worry, it gets easier with experience (most of the time anyway) and you will always have your code if you want to create a similar plot in the future. Use the plot() function to produce a scatterplot of DML on the x axis and ovary weight on the y axis (you might need to apply a transformation on the variable ovary.weight). Use a different colour to highlight points for each level of maturity stage. Produce a legend explaining the different colours and place it in a suitable position on the plot. Format the graph further to make it suitable for inclusion into your paper/thesis (i.e. add axes labels, change the axes scales etc). See Section 4.3 for more details about customising plots.

#Use the plot() function to produce a scatterplot of DML on the x axis and ovary weight on the y axis (you might need to apply a transformation on the variable ovary.weight).

plot(DML~ovary.weight, data = squid, pch = 16, col=squid$maturity.stage,
       main = "Plot of length by dose",
        xlab = "Ovary Weight", ylab = "Dorsal Mantle Length")