Markdown Author: Jessie Bell, 2023
Libraries Used: tidyverse, gt, ggplot2
Answers: Pewter Blue
View Jessie’s data wrangling here.
I spent about 2 hours trying to figure out how to host my files on GitHub and embed them in my R Markdown. The best I could come up with was the link above (😔), but I know there is a better way! Eventually I intend to embed a .txt file that automatically downloads to your computer when you click it.
# Load the cleaned trawl data. Blank cells came through from Excel as NA, so
# instead of re-editing the spreadsheet the rows are filtered here: only rows
# with a non-missing Month below 10 are kept.
fishiesData <- read.csv("fishies_cleaned.csv")
fishiesData <- fishiesData[which(fishiesData$Month < 10), , drop = FALSE]

# Add a readable month name so tables and legends show "May" rather than 5.
# Month values outside 5-9 get NA.
fishiesData <- fishiesData |>
  mutate(
    Months = as.character(
      factor(
        Month,
        levels = 5:9,
        labels = c("May", "June", "July", "August", "September")
      )
    )
  )

# Monthly means of temperature and dissolved oxygen, using only rows that have
# a dissolved-oxygen reading.
fishiesDataTable <- fishiesData |>
  filter(!is.na(DO_mgperL)) |>
  group_by(Months) |>
  summarise(
    Mean_Temp = mean(temp_c),
    Mean_DO = mean(DO_mgperL)
  )

# Render with gt. opt_table_font() ships with gt, and google_font() needs no
# extra library.
fishiesDataTable |>
  gt() |>
  opt_table_font(font = google_font("Caveat"))
| Months | Mean_Temp | Mean_DO |
|---|---|---|
| August | 7.744185 | 2.119836 |
| July | 7.516663 | 1.848069 |
| June | 8.578735 | 2.943225 |
| May | 8.069267 | 2.703870 |
| September | 7.809440 | 1.383814 |
# Monthly mean and standard deviation of temperature and slender sole count,
# restricted to rows where the slender sole count is present.
fishiesDataTable2 <- fishiesData |>
  filter(!is.na(SleNAer.Sole)) |>
  group_by(Months) |>
  summarise(
    Mean_Temp = mean(temp_c),
    SD_Temp = sd(temp_c),
    Mean_slendersole = mean(SleNAer.Sole),
    SD_slendersole = sd(SleNAer.Sole)
  )

fishiesDataTable2 |>
  gt() |>
  opt_table_font(font = google_font("Caveat"))
| Months | Mean_Temp | SD_Temp | Mean_slendersole | SD_slendersole |
|---|---|---|---|---|
| August | 7.830281 | 1.0314430 | 59.58333 | 105.4253 |
| July | 7.516663 | 0.3824471 | 64.26829 | 173.4604 |
| June | 8.578735 | 1.0351033 | 90.70588 | 176.1962 |
| May | 8.069267 | 0.1470770 | 543.33333 | 484.1656 |
| September | 7.809440 | 0.2104897 | 392.80000 | 344.8263 |
# Check the distributions. Both variables are transformed as log(x + 1); the
# + 1 is there because the count data contain zeros (temperature is treated
# the same way).
fishiesData <- fishiesData |>
  mutate(
    log_SleNAer.Sole = log(SleNAer.Sole + 1),
    log_temp_c = log(temp_c + 1)
  )

# Figure 1: histogram of log-transformed temperature.
ggplot(data = fishiesData, mapping = aes(x = log_temp_c)) +
  geom_histogram(bins = 30, fill = "#d577ff") +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Temperature Log Transformation Distribution",
    x = "Log Transformed Temperature",
    caption = "Figure 1: Displaying distribution of the log temperature + 1 for fishiesData.csv"
  )

# Figure 2: histogram of log-transformed slender sole counts.
# The author noted that a right skew remains after the transform.
ggplot(data = fishiesData, mapping = aes(x = log_SleNAer.Sole)) +
  geom_histogram(bins = 30, fill = "#aec344") +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Slender Sole Fish Count Log Transformation Distribution",
    x = "Log Transformed Slender Sole",
    caption = "Figure 2: Displaying distribution of the log slender sole fish count + 1 for fishiesData.csv"
  )

# Figure 3: raw temperature against raw slender sole count, coloured by month.
ggplot(
  data = fishiesData,
  mapping = aes(x = temp_c, y = SleNAer.Sole, color = Months)
) +
  geom_point() +
  labs(
    title = "Temperature vs. Fish Frequency",
    x = "Temperature",
    y = "Slender Sole Frequency",
    caption = "Figure 3: Displaying data to demonstrate a relationship between frequency data and numerical data."
  )
# Average catch per row for three species of interest.
# length() on a data frame returns the number of COLUMNS (and that number has
# already grown as helper columns were added above), so the divisor must be
# nrow() to get a per-row average.
# NOTE(review): assumes one row per tow -- confirm against the data source.
n_tows <- nrow(fishiesData)
skatesum <- sum(fishiesData$Big.Skate, na.rm = TRUE) / n_tows
solesum <- sum(fishiesData$SleNAer.Sole, na.rm = TRUE) / n_tows
poachersum <- sum(fishiesData$Warty.Poacher, na.rm = TRUE) / n_tows

# The same calculation for every species column (columns 15 to 28) at once.
# colSums() sums down each column; na.rm = TRUE keeps a single missing count
# from turning a whole species total into NA.
a <- colSums(fishiesData[15:28], na.rm = TRUE)

# Per-row average for each species. Warty poacher (Warty.Poacher) and slender
# sole (SleNAer.Sole) are the two species carried forward below.
b <- data.frame(a / n_tows)
# Pull out the two species of interest as plain vectors.
sole <- fishiesData[["SleNAer.Sole"]]
poacher <- fishiesData[["Warty.Poacher"]]

# Keep only the first 12 columns (Year through salinity_ppt) and append the
# two species counts as columns named `sole` and `poacher`.
newfishies <- fishiesData[1:12]
newfishies[["sole"]] <- sole
newfishies[["poacher"]] <- poacher

# Preview: first six rows for the table, plus the overall dimensions.
fortable <- head(newfishies)
dim(newfishies)
## [1] 114 14
gt(fortable) |>
  opt_table_font(font = google_font("Caveat"))
| Year | Month | Day | DOY | Date | Station | Depth | time_towed_min | distance_towed_km | DO_mgperL | temp_c | salinity_ppt | sole | poacher |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2012 | 8 | 7 | 220 | 8/7/2012 | NH15 | 123.8 | 10 | 546.0000 | 1.22415 | 7.2643 | 33.8893 | 76 | 46 |
| 2022 | 8 | 7 | 220 | 8/7/2012 | NH20 | 120.0 | 8 | 372.0000 | 1.20000 | 7.3000 | NA | 88 | 28 |
| 2013 | 8 | 1 | 213 | 8/1/2013 | NH15 | 107.9 | 10 | 626.0000 | 1.74521 | 7.3190 | 33.8481 | 6 | 44 |
| 2012 | 9 | 18 | 262 | 9/18/2012 | NH15 | 106.6 | 10 | 598.0000 | 1.14459 | 7.6287 | 33.7037 | 337 | 32 |
| 2013 | 6 | 28 | 179 | 6/28/2013 | NH15 | 105.3 | 13 | 838.0000 | 1.48007 | 7.0610 | 33.8942 | 0 | 34 |
| 2008 | 8 | 11 | 224 | 8/11/2008 | NH10 | 81.6 | 10 | 606.1757 | 0.97840 | 7.3419 | NA | 70 | 47 |
# Reshape to long format: the `sole` and `poacher` columns are stacked into a
# single `count` column, with a factor column `Fish` recording the species.
# gather() is kept on purpose: it stacks all sole rows before all poacher rows,
# which is the order the preview table below shows.
data_long <- newfishies |>
  gather(key = Fish, value = count, sole:poacher, factor_key = TRUE)
dim(data_long)
## [1] 228 14

# Preview the first six rows of the long table.
table3 <- head(data_long)
gt(table3) |>
  opt_table_font(font = google_font("Caveat"))
| Year | Month | Day | DOY | Date | Station | Depth | time_towed_min | distance_towed_km | DO_mgperL | temp_c | salinity_ppt | Fish | count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2012 | 8 | 7 | 220 | 8/7/2012 | NH15 | 123.8 | 10 | 546.0000 | 1.22415 | 7.2643 | 33.8893 | sole | 76 |
| 2022 | 8 | 7 | 220 | 8/7/2012 | NH20 | 120.0 | 8 | 372.0000 | 1.20000 | 7.3000 | NA | sole | 88 |
| 2013 | 8 | 1 | 213 | 8/1/2013 | NH15 | 107.9 | 10 | 626.0000 | 1.74521 | 7.3190 | 33.8481 | sole | 6 |
| 2012 | 9 | 18 | 262 | 9/18/2012 | NH15 | 106.6 | 10 | 598.0000 | 1.14459 | 7.6287 | 33.7037 | sole | 337 |
| 2013 | 6 | 28 | 179 | 6/28/2013 | NH15 | 105.3 | 13 | 838.0000 | 1.48007 | 7.0610 | 33.8942 | sole | 0 |
| 2008 | 8 | 11 | 224 | 8/11/2008 | NH10 | 81.6 | 10 | 606.1757 | 0.97840 | 7.3419 | NA | sole | 70 |
# Figure 4: depth against count for both species, coloured by species.
ggplot(
  data = data_long,
  mapping = aes(x = Depth, y = count, color = Fish)
) +
  geom_point() +
  labs(
    title = "Depth vs. Fish Count",
    x = "Depth (fathoms)",
    y = "Fish #",
    caption = "Figure 4: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  )
# Isolate the slender sole rows and log-transform them.
# log(x + 1) is used because the counts contain zeros.
data_long[["log_count"]] <- log(data_long[["count"]] + 1)

soleData <- data_long[which(data_long$Fish == "sole"), , drop = FALSE]
depth <- soleData[["Depth"]]
sole <- soleData[["count"]]
soleData <- soleData |>
  mutate(
    log_depth = log(Depth + 1),
    log_count = log(count + 1)
  )

# Collect raw and transformed values side by side. The column names
# soleData.log_count and soleData.log_depth are spelled out because the
# plotting code below refers to them.
log_transformations <- data.frame(
  depth = depth,
  sole = sole,
  soleData.log_count = soleData[["log_count"]],
  soleData.log_depth = soleData[["log_depth"]]
)
# Histograms of the log-transformed sole data.
# The previous suppressWarnings({ ggplot(...) }) wrappers had no effect: a
# ggplot is only rendered when it is printed, which happens after
# suppressWarnings() has already returned. na.rm = TRUE on the geom silences
# the missing-value warning at its source instead.

# Figure 5: log-transformed depth.
ggplot(log_transformations, aes(soleData.log_depth)) +
  geom_histogram(fill = "#b3c004", bins = 40, na.rm = TRUE) +
  labs(
    title = "Log transformation of depth (fathoms)",
    x = "Depth",
    y = "Count",
    caption = "Figure 5: Checking out what happens when we log transform the depth data. Lookin good!"
  ) +
  theme(plot.caption = element_text(hjust = 0))

# Figure 6: log-transformed slender sole count (axis label typo "Sender"
# corrected to "Slender").
ggplot(log_transformations, aes(soleData.log_count)) +
  geom_histogram(fill = "#f8766d", bins = 40, na.rm = TRUE) +
  labs(
    title = "Log transformation of sole fish count",
    x = "Slender Sole",
    y = "Count",
    caption = "Figure 6: Checking out what happens when we log transform just the sole count data. That 0 data is just annoying."
  ) +
  theme(plot.caption = element_text(hjust = 0))
# Drop the zero catches: log(count + 1) is 0 exactly when count is 0, so rows
# with a positive log count are flagged "cool" and kept. Rows where the flag
# is NA (missing count) are dropped by subset() as well.
hi <- ifelse(log_transformations$soleData.log_count > 0, "cool", "zeroes")
log_transformations <- cbind(log_transformations, hi)
nozero <- subset(log_transformations, hi == "cool")

# Histograms without the zeros.
# The previous suppressWarnings({ ggplot(...) }) wrappers had no effect: a
# ggplot is only rendered when it is printed, which happens after
# suppressWarnings() has already returned. na.rm = TRUE on the geom silences
# the missing-value warning at its source instead.

# Figure 7: log-transformed sole count, zeros removed.
ggplot(nozero, aes(soleData.log_count)) +
  geom_histogram(fill = "#f8766d", bins = 20, na.rm = TRUE) +
  labs(
    title = "Log transformation of sole fish count NO ZEROS",
    x = "Slender Sole",
    y = "Count",
    caption = "Figure 7: Checking out what happens when we log transform just the sole count data. Without the zeros this time!"
  ) +
  theme(plot.caption = element_text(hjust = 0))

# Figure 8: log-transformed depth for the same zero-free rows.
ggplot(nozero, aes(soleData.log_depth)) +
  geom_histogram(fill = "#b3c004", bins = 20, na.rm = TRUE) +
  labs(
    title = "Log transformation of depth data (fathoms) NO ZEROS",
    x = "Depth (Fathoms)",
    y = "Count",
    caption = "Figure 8: Checking out what happens when we log transform the depth data. Without the zeros this time!"
  ) +
  theme(plot.caption = element_text(hjust = 0))

# Figure 9: log depth against log sole count, zeros removed.
ggplot(nozero, aes(soleData.log_depth, soleData.log_count)) +
  geom_point(color = "#f8766d") +
  labs(
    title = "Depth vs. Slender Sole Count (LOG TRANSFORMATION)",
    x = "log(Depth (fathoms))",
    y = "log(Slender Sole Count)",
    caption = "Figure 9: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  ) +
  theme(plot.caption = element_text(hjust = 0))
# Warty poacher: keep only rows with a positive poacher count (rows with a
# missing count are dropped too).
poacheData <- newfishies[which(newfishies$poacher > 0), , drop = FALSE]

# Figure 10: distribution of the raw poacher counts.
ggplot(data = poacheData, mapping = aes(x = poacher)) +
  geom_histogram(bins = 30, fill = "#00bfc4") +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Distribution of Warty Poacher",
    x = "Warty Poacher",
    y = "Count",
    caption = "Figure 10: Displaying distribution of warty poacher fish count."
  )

# Figure 11: depth against raw (not log-transformed) poacher count.
# The author found this the most promising relationship so far.
ggplot(data = poacheData, mapping = aes(x = Depth, y = poacher)) +
  geom_point(color = "#00bfc4") +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Depth vs. Warty Poacher",
    x = "Depth (fathoms)",
    y = "Poacher #",
    caption = "Figure 11: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  )

# Figure 12: depth against raw sole count; judged less interesting.
# NOTE(review): this plots sole counts from poacheData, i.e. only rows where
# poacher > 0. If the intent was all sole observations, the data argument
# should probably be newfishies -- confirm with the author.
ggplot(data = poacheData, mapping = aes(x = Depth, y = sole)) +
  geom_point(color = "#f8766d") +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Depth vs. number of sole fishies",
    x = "Depth (fathoms)",
    y = "Slender Sole Count",
    caption = "Figure 12: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  )
# One last attempt: log-transformed poacher count against log-transformed
# depth.
# Previously the `poachies` subset was created but never used, and the plot
# was drawn from all of data_long, so sole counts were plotted under a
# "Warty Poacher" label. The transform and the plot now use the poacher rows
# only. (The old data_long$log_poachies column is no longer created; it held
# the same values as data_long$log_count.)
data_long$log_depth <- log(data_long$Depth + 1)
poachies <- subset(data_long, Fish == "poacher")
poachies$log_poachies <- log(poachies$count + 1) # + 1 because counts include 0

# Figure 13
ggplot(poachies, aes(log_depth, log_poachies)) +
  geom_point(color = "#00bfc4") +
  labs(
    title = "Depth vs. Warty Poacher (LOG TRANSFORMATION)",
    x = "log(Depth (fathoms))",
    y = "log(Warty Poacher Count)",
    caption = "Figure 13: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  ) +
  theme(plot.caption = element_text(hjust = 0))
# Figure 14: raw depth against log sole count (zeros removed), with the points
# also coloured by depth.
ggplot(
  data = nozero,
  mapping = aes(x = depth, y = soleData.log_count, color = depth)
) +
  geom_point() +
  theme(plot.caption = element_text(hjust = 0)) +
  labs(
    title = "Depth vs. Slender Sole",
    x = "Depth (fathoms)",
    y = "log(Slender Sole Count)",
    caption = "Figure 14: Displaying data to demonstrate a relationship between fish count (# fish) and water depth (fathoms)."
  )
# Share of the total catch contributed by each species (columns 15 to 28).
# colSums() sums down each column. na.rm = TRUE matters here: without it any
# species with a single missing count gets a total of NA, shows up as NA in
# the table, and is silently left out of the grand total, which inflates every
# other species' percentage.
a <- colSums(fishiesData[15:28], na.rm = TRUE)
anew <- data.frame(a)
fishsum <- sum(anew$a, na.rm = TRUE)

# Percentage of the total catch per species. Warty poacher (Warty.Poacher) and
# slender sole (SleNAer.Sole) dominate.
b <- data.frame(a / fishsum) * 100
# NOTE: any printed output recorded below predates the na.rm fix (it still
# shows NA rows); re-knit the document to refresh the numbers.
b
## a.fishsum
## Alligatorfish.spp. NA
## Arrowtooth.FlouNAer 0.089639202
## Big.Skate NA
## Big.skate NA
## Bigeye.Poacher NA
## Black.Rockfish NA
## Blackbelly.Eelpout 0.014939867
## Blackfin.Poacher NA
## SleNAer.Sole 79.412863226
## Warty.Poacher 19.765444088
## Wattled.Eelpout 0.336147008
## Whitebait.Smelt 0.358556809
## Whitebarred.Prickleback 0.007469934
## Yellowtail.Rockfish 0.014939867
penguinData <- palmerpenguins::penguins

# Quick one-row summary: mean, minimum and maximum bill length, ignoring rows
# where bill length is missing.
penguinData |>
  filter(!is.na(bill_length_mm)) |>
  summarise(
    bill_length_MEAN = mean(bill_length_mm),
    bill_length_MIN = min(bill_length_mm),
    bill_length_MAX = max(bill_length_mm)
  )
## # A tibble: 1 × 3
## bill_length_MEAN bill_length_MIN bill_length_MAX
## <dbl> <dbl> <dbl>
## 1 43.9 32.1 59.6

# Summarise a single column: mean body mass in grams.
penguinData |>
  filter(!is.na(body_mass_g)) |>
  summarise(body_mass_MEAN = mean(body_mass_g))
## # A tibble: 1 × 1
## body_mass_MEAN
## <dbl>
## 1 4202.
# What if we want the summary in different units? Convert inside summarise():
#   body mass:      1000 g = 1 kg
#   flipper length:   10 mm = 1 cm
# The flipper columns are named *_cm_* to match their unit (they were
# previously mislabelled *_kg_*). Rows missing either measurement are dropped
# so neither mean can come out as NA.
penguinData |>
  drop_na(body_mass_g, flipper_length_mm) |>
  summarise(
    body_mass_kg_MEAN = mean(body_mass_g / 1000),
    body_mass_kg_SD = sd(body_mass_g / 1000),
    flipper_length_cm_MEAN = mean(flipper_length_mm / 10),
    flipper_length_cm_SD = sd(flipper_length_mm / 10)
  )
## # A tibble: 1 × 4
## body_mass_kg_MEAN body_mass_kg_SD flipper_length_cm_MEAN flipper_length_cm_SD
## <dbl> <dbl> <dbl> <dbl>
## 1 4.20 0.802 20.1 1.41
# Summaries can also be split by category. Here the data are grouped by
# species before averaging, much like base R's tapply().
penguinData |>
  filter(!is.na(body_mass_g)) |>
  group_by(species) |> # one output row per species
  summarise(body_mass_g_MEAN = mean(body_mass_g))
## # A tibble: 3 × 2
## species body_mass_g_MEAN
## <fct> <dbl>
## 1 Adelie 3701.
## 2 Chinstrap 3733.
## 3 Gentoo 5076.
# Frequency table: n() returns the number of rows in the current group.
summarise(group_by(penguinData, species), n = n())
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124

# If a frequency table is all you need, count() or tally() is a shorter route
# than summarise(n = n()).
tally(group_by(penguinData, species))
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124