# Load the assessment dataset into a data frame.
# NOTE(review): this is an absolute, user-specific path; adjust it (or switch
# to a project-relative path) before running on another machine.
final_assessment_dataset <- read.csv(
  "C:/Users/ruth/Downloads/final_assessment_dataset.csv"
)
# View() opens a data viewer window, so only call it when someone is
# actually at the console (not when the script is run in batch or knitted).
if (interactive()) {
  View(final_assessment_dataset)
}
# Column types and a preview of the first values of each column.
str(final_assessment_dataset)
## 'data.frame': 607 obs. of 10 variables:
## $ farm : chr "Benson" "Benson" "Benson" "Benson" ...
## $ latitude : chr "N51:35:53" "N51:35:53" "N51:35:53" "N51:35:53" ...
## $ longitude : chr "W1:05:37" "W1:05:37" "W1:05:37" "W1:05:37" ...
## $ pct_flower : int 10 10 10 10 25 30 30 10 10 50 ...
## $ temp : int 27 27 22 22 25 25 25 25 25 12 ...
## $ variety : chr "Variety_1" "Variety_1" "Variety_1" "Variety_1" ...
## $ type : chr "Hybrid" "Hybrid" "Hybrid" "Hybrid" ...
## $ species : chr "Andrena_carantonica" "Andrena_nigroaenea" "Andrena_pubescens" "Apis_mellifera" ...
## $ group : chr "Solitary bee" "Solitary bee" "Solitary bee" "Honeybee" ...
## $ relative_abundance: num 0.5 0.5 0.5 0.5 0.5 ...
# Number of rows and columns.
dim(final_assessment_dataset)
## [1] 607 10
class(final_assessment_dataset)
## [1] "data.frame"
# Count of missing values per column.
colSums(is.na(final_assessment_dataset))
## farm latitude longitude pct_flower
## 0 0 0 1
## temp variety type species
## 0 0 0 0
## group relative_abundance
## 0 0
There is one missing value, in the `pct_flower` column. We'll impute it with the mean of the non-missing values in that column (simple imputation).
# Simple imputation: fill the missing pct_flower entry with the mean of the
# observed values. Because the mean is a double, the column changes from
# integer to numeric.
final_assessment_dataset$pct_flower <- replace(
  final_assessment_dataset$pct_flower,
  is.na(final_assessment_dataset$pct_flower),
  mean(final_assessment_dataset$pct_flower, na.rm = TRUE)
)
# Re-check the per-column missing counts: all should now be zero.
colSums(is.na(final_assessment_dataset))
## farm latitude longitude pct_flower
## 0 0 0 0
## temp variety type species
## 0 0 0 0
## group relative_abundance
## 0 0
# Per-column summary of the full data frame after imputation.
summary(final_assessment_dataset)
## farm latitude longitude pct_flower
## Length:607 Length:607 Length:607 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 5.00
## Mode :character Mode :character Mode :character Median :10.00
## Mean :16.88
## 3rd Qu.:25.00
## Max. :65.00
## temp variety type species
## Min. :12.00 Length:607 Length:607 Length:607
## 1st Qu.:16.00 Class :character Class :character Class :character
## Median :22.00 Mode :character Mode :character Mode :character
## Mean :20.14
## 3rd Qu.:25.00
## Max. :27.00
## group relative_abundance
## Length:607 Min. :0.09091
## Class :character 1st Qu.:0.25000
## Mode :character Median :0.50000
## Mean :0.46674
## 3rd Qu.:0.65152
## Max. :1.00000
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.2 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Univariate look at the numeric columns.
# Columns are selected by name rather than by position so the selection keeps
# working if the column order of the input file ever changes.
numerical_adv <- final_assessment_dataset %>%
  select(pct_flower, temp, relative_abundance)
summary(numerical_adv)
## pct_flower temp relative_abundance
## Min. : 0.00 Min. :12.00 Min. :0.09091
## 1st Qu.: 5.00 1st Qu.:16.00 1st Qu.:0.25000
## Median :10.00 Median :22.00 Median :0.50000
## Mean :16.88 Mean :20.14 Mean :0.46674
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.:0.65152
## Max. :65.00 Max. :27.00 Max. :1.00000
str(numerical_adv)
## 'data.frame': 607 obs. of 3 variables:
## $ pct_flower : num 10 10 10 10 25 30 30 10 10 50 ...
## $ temp : int 27 27 22 22 25 25 25 25 25 12 ...
## $ relative_abundance: num 0.5 0.5 0.5 0.5 0.5 ...
# For each numeric column: count how often every distinct value occurs and
# draw the counts as bars, most frequent value first.
pct_flower_frequency <- table(final_assessment_dataset$pct_flower)
barplot(sort(pct_flower_frequency, decreasing = TRUE))
temp_frequency <- table(final_assessment_dataset$temp)
barplot(sort(temp_frequency, decreasing = TRUE))
relative_abundance_frequency <- table(final_assessment_dataset$relative_abundance)
barplot(sort(relative_abundance_frequency, decreasing = TRUE))
# Univariate look at the categorical columns.
# Columns are selected by name rather than by position so the selection keeps
# working if the column order of the input file ever changes.
cat_col <- final_assessment_dataset %>%
  select(farm, variety, type, species, group)
summary(cat_col)
## farm variety type species
## Length:607 Length:607 Length:607 Length:607
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## group
## Length:607
## Class :character
## Mode :character
library(ggplot2)
library(forcats)
# One bar chart of row counts per categorical column. fct_infreq() turns the
# column into a factor whose levels are ordered by frequency, so the bars are
# drawn from most to least common.
cat_col %>%
  mutate(farm = fct_infreq(farm)) %>%
  ggplot() +
  geom_bar(aes(x = farm))
cat_col %>%
  mutate(variety = fct_infreq(variety)) %>%
  ggplot() +
  geom_bar(aes(x = variety))
cat_col %>%
  mutate(type = fct_infreq(type)) %>%
  ggplot() +
  geom_bar(aes(x = type))
cat_col %>%
  mutate(species = fct_infreq(species)) %>%
  ggplot() +
  geom_bar(aes(x = species))
cat_col %>%
  mutate(group = fct_infreq(group)) %>%
  ggplot() +
  geom_bar(aes(x = group))
# BIVARIATE ANALYSIS
# Mean temperature per pollinator group, sorted from highest to lowest.
# NOTE(review): despite its name, flowermeans holds the mean of `temp`, not of
# `pct_flower`; the plot labels below describe what is actually plotted. If
# the mean flower percentage per group was intended, replace `temp` with
# `pct_flower` here and relabel accordingly.
flowermeans <- with(
  final_assessment_dataset,
  sort(by(temp, group, mean), decreasing = TRUE)
)
barplot(
  flowermeans,
  main = "Mean temperature per pollinator group",
  xlab = "Pollinator group",
  ylab = "Mean temperature"
)
# Honeybee rows only: one bar per observation, showing its pct_flower value.
HB <- filter(final_assessment_dataset, group == "Honeybee")
barplot(
  HB$pct_flower,
  col = heat.colors(5),
  main = "Flower percentage for honeybee observations",
  xlab = "Honeybee observation",
  ylab = "Flower percentage (pct_flower)"
)
# Jittered point plot of temperature for each pollinator group, with one
# colour per group.
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# The legend is hidden because colour encodes the same variable as the y axis.
ggplot(
  final_assessment_dataset,
  aes(x = temp, y = group, color = group)
) +
  geom_jitter(size = 1.5, alpha = 0.7) +
  scale_x_continuous() +
  labs(x = "Temperature", y = "", title = "") +
  theme_minimal() +
  theme(legend.position = "none")
# Keep only the numeric columns. select(where(...)) is the current
# replacement for the superseded select_if().
df <- dplyr::select(final_assessment_dataset, where(is.numeric))
# Calculate the pairwise correlations, using only rows with no missing values.
r <- cor(df, use = "complete.obs")
round(r, 2)
## pct_flower temp relative_abundance
## pct_flower 1.00 -0.20 0.05
## temp -0.20 1.00 0.07
## relative_abundance 0.05 0.07 1.00
library(ggplot2)
library(ggcorrplot)
# Draw the full correlation matrix with the coefficients shown in the cells.
ggcorrplot(
  r,
  hc.order = TRUE,
  type = "full",
  lab = TRUE,
  outline.col = "white",
  ggtheme = ggplot2::theme_gray
)
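None of the numeric variables are strongly correlated with one another; the largest coefficient in magnitude is -0.20, between pct_flower and temp.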