# Load the assessment dataset from a CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific; anyone running this
# on another computer has to point it at their own copy of the file.
final_assessment_dataset <- read.csv(
  "C:/Users/ruth/Downloads/final_assessment_dataset.csv"
)

# View() opens a spreadsheet-style viewer, which only makes sense when someone
# is sitting at the console; skip it in batch / knitted runs.
if (interactive()) {
  View(final_assessment_dataset)
}

# Compact overview: column names, types and the first few values.
str(final_assessment_dataset)

## 'data.frame':    607 obs. of  10 variables:
##  $ farm              : chr  "Benson" "Benson" "Benson" "Benson" ...
##  $ latitude          : chr  "N51:35:53" "N51:35:53" "N51:35:53" "N51:35:53" ...
##  $ longitude         : chr  "W1:05:37" "W1:05:37" "W1:05:37" "W1:05:37" ...
##  $ pct_flower        : int  10 10 10 10 25 30 30 10 10 50 ...
##  $ temp              : int  27 27 22 22 25 25 25 25 25 12 ...
##  $ variety           : chr  "Variety_1" "Variety_1" "Variety_1" "Variety_1" ...
##  $ type              : chr  "Hybrid" "Hybrid" "Hybrid" "Hybrid" ...
##  $ species           : chr  "Andrena_carantonica" "Andrena_nigroaenea" "Andrena_pubescens" "Apis_mellifera" ...
##  $ group             : chr  "Solitary bee" "Solitary bee" "Solitary bee" "Honeybee" ...
##  $ relative_abundance: num  0.5 0.5 0.5 0.5 0.5 ...

# Number of rows and columns in the loaded data.
c(nrow(final_assessment_dataset), ncol(final_assessment_dataset))

## [1] 607  10

# Class of the loaded object.
class(x = final_assessment_dataset)

## [1] "data.frame"

# Count of missing values in each column.
vapply(
  final_assessment_dataset,
  function(column) sum(is.na(column)),
  FUN.VALUE = numeric(1)
)

##               farm           latitude          longitude         pct_flower 
##                  0                  0                  0                  1 
##               temp            variety               type            species 
##                  0                  0                  0                  0 
##              group relative_abundance 
##                  0                  0

There is one missing value, in the pct_flower column. We'll impute it with the mean of the observed values in that column (simple imputation).

# Mean imputation: every missing pct_flower entry is replaced by the mean of
# the non-missing entries of that same column.
final_assessment_dataset$pct_flower <- with(
  final_assessment_dataset,
  replace(pct_flower, is.na(pct_flower), mean(pct_flower, na.rm = TRUE))
)

# Re-count missing values per column to confirm the imputation.
colSums(is.na(final_assessment_dataset))

##               farm           latitude          longitude         pct_flower 
##                  0                  0                  0                  0 
##               temp            variety               type            species 
##                  0                  0                  0                  0 
##              group relative_abundance 
##                  0                  0

UNIVARIATE ANALYSIS

# Per-column summary: quantiles/mean for numeric columns, length/class/mode
# for character columns.
summary(object = final_assessment_dataset)

##      farm             latitude          longitude           pct_flower   
##  Length:607         Length:607         Length:607         Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 5.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :10.00  
##                                                           Mean   :16.88  
##                                                           3rd Qu.:25.00  
##                                                           Max.   :65.00  
##       temp         variety              type             species         
##  Min.   :12.00   Length:607         Length:607         Length:607        
##  1st Qu.:16.00   Class :character   Class :character   Class :character  
##  Median :22.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :20.14                                                           
##  3rd Qu.:25.00                                                           
##  Max.   :27.00                                                           
##     group           relative_abundance
##  Length:607         Min.   :0.09091   
##  Class :character   1st Qu.:0.25000   
##  Mode  :character   Median :0.50000   
##                     Mean   :0.46674   
##                     3rd Qu.:0.65152   
##                     Max.   :1.00000

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.2     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Numeric columns only. Selected by name rather than by position so the
# selection does not silently change if the column order of the CSV changes.
numerical_adv <- final_assessment_dataset %>%
  select(pct_flower, temp, relative_abundance)

# Five-number summary plus mean for each numeric column.
summary(numerical_adv)

##    pct_flower         temp       relative_abundance
##  Min.   : 0.00   Min.   :12.00   Min.   :0.09091   
##  1st Qu.: 5.00   1st Qu.:16.00   1st Qu.:0.25000   
##  Median :10.00   Median :22.00   Median :0.50000   
##  Mean   :16.88   Mean   :20.14   Mean   :0.46674   
##  3rd Qu.:25.00   3rd Qu.:25.00   3rd Qu.:0.65152   
##  Max.   :65.00   Max.   :27.00   Max.   :1.00000

# Column types of the numeric subset.
str(object = numerical_adv)

## 'data.frame':    607 obs. of  3 variables:
##  $ pct_flower        : num  10 10 10 10 25 30 30 10 10 50 ...
##  $ temp              : int  27 27 22 22 25 25 25 25 25 12 ...
##  $ relative_abundance: num  0.5 0.5 0.5 0.5 0.5 ...

Graphical analysis

Barplots

# For each numeric column: tabulate how often each distinct value occurs,
# then draw the counts as bars ordered from most to least frequent.

# pct_flower
pct_flower_frequency <- table(final_assessment_dataset[["pct_flower"]])
barplot(height = sort(pct_flower_frequency, decreasing = TRUE))

# temp
temp_frequency <- table(final_assessment_dataset[["temp"]])
barplot(height = sort(temp_frequency, decreasing = TRUE))

# relative_abundance
relative_abundance_frequency <- table(
  final_assessment_dataset[["relative_abundance"]]
)
barplot(height = sort(relative_abundance_frequency, decreasing = TRUE))

# Categorical (character) columns only. Selected by name rather than by
# position so the selection does not depend on the column order of the CSV.
cat_col <- final_assessment_dataset %>%
  select(farm, variety, type, species, group)

# For character columns summary() reports only length, class and mode.
summary(cat_col)

##      farm             variety              type             species         
##  Length:607         Length:607         Length:607         Length:607        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##     group          
##  Length:607        
##  Class :character  
##  Mode  :character

library(ggplot2)
library(forcats)
# One count bar chart per categorical column. fct_infreq() turns the column
# into a factor whose levels are ordered by frequency, so the bars appear from
# the most common level to the least common one.

# farm
cat_col %>%
  mutate(farm = fct_infreq(farm)) %>%
  ggplot(aes(x = farm)) +
  geom_bar()

# variety
cat_col %>%
  mutate(variety = fct_infreq(variety)) %>%
  ggplot(aes(x = variety)) +
  geom_bar()

# type
cat_col %>%
  mutate(type = fct_infreq(type)) %>%
  ggplot(aes(x = type)) +
  geom_bar()

# species
cat_col %>%
  mutate(species = fct_infreq(species)) %>%
  ggplot(aes(x = species)) +
  geom_bar()

# group
cat_col %>%
  mutate(group = fct_infreq(group)) %>%
  ggplot(aes(x = group)) +
  geom_bar()

#BIVARIATE ANALYSIS

# Mean temperature per pollinator group, largest first.
# NOTE(review): despite the name, these are temperature means, not flower
# means; the name is kept in case code outside this section refers to it.
flowermeans <- with(
  final_assessment_dataset,
  sort(by(temp, group, mean), decreasing = TRUE)
)
library(dplyr)

# Per-observation (group, temp) pairs: one row per record, no aggregation.
plotdata <- final_assessment_dataset %>%
  select(group, temp)

# One row per group holding its mean temperature. The bar chart needs this
# aggregated table: feeding it the per-observation rows would stack every
# record and show the *sum* of temperatures per group instead of the mean.
mean_temp_by_group <- final_assessment_dataset %>%
  group_by(group) %>%
  summarize(mean_temp = mean(temp), .groups = "drop")

# plot mean temperature per pollinator group
ggplot(mean_temp_by_group,
       aes(x = group,
           y = mean_temp)) +
  geom_bar(stat = "identity")

# Keep only the Honeybee records. dplyr::filter is named explicitly because
# stats::filter has the same name and would be picked up if dplyr were not
# attached.
HB <- dplyr::filter(final_assessment_dataset, group == "Honeybee")

# One bar per Honeybee record; bar height is that record's pct_flower value.
# heat.colors(5) supplies five colours that are recycled across the bars.
barplot(
  HB$pct_flower,
  col = heat.colors(5),
  main = "Flower percentage across Honeybee observations",
  xlab = "Honeybee observations",
  ylab = "Flower percentage (pct_flower)"
)

# plot the distribution of pollinators 
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Jittered strip plot of temperature by pollinator group: temperature on the
# x axis, one horizontal band per group, points coloured by group. The legend
# is hidden because the y axis already names the groups.
plotdata %>%
  ggplot(aes(x = temp, y = group, color = group)) +
  geom_jitter(size = 1.5, alpha = 0.7) +
  scale_x_continuous() +
  labs(x = "Temperature", y = "", title = "") +
  theme_minimal() +
  theme(legend.position = "none")

select numeric variables

# Keep only the columns for which is.numeric() is TRUE.
df <- Filter(is.numeric, final_assessment_dataset)

# calculate the pairwise correlations, using only rows without missing values
r <- cor(df, use = "complete.obs")

# show the correlation matrix rounded to two decimals
round(r, digits = 2)

##                    pct_flower  temp relative_abundance
## pct_flower               1.00 -0.20               0.05
## temp                    -0.20  1.00               0.07
## relative_abundance       0.05  0.07               1.00

library(ggplot2)
library(ggcorrplot)
# Heat map of the full correlation matrix with the coefficients printed in the
# cells; hc.order = TRUE reorders the variables by hierarchical clustering.
# The outline argument is spelled out in full (outline.color) instead of
# relying on partial matching of an abbreviated name.
ggcorrplot(
  r,
  hc.order = TRUE,
  type = "full",
  lab = TRUE,
  outline.color = "white",
  ggtheme = ggplot2::theme_gray
)

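None of the numeric variables is strongly correlated with another: the largest coefficient is a weak negative correlation between pct_flower and temp (r = -0.20), and the other two pairs are close to zero (0.05 and 0.07).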

job

Ruth Muriithi

1/12/2021

UNIVARIATE ANALYSIS

Graphical analysis

Barplots

select numeric variables