# Load the assessment data from a CSV file on the local machine.
final_assessment_dataset <- read.csv(
  file = "C:/Users/ruth/Downloads/final_assessment_dataset.csv"
)

# Interactive inspection helpers, left disabled:
# View(final_assessment_dataset)
# str(final_assessment_dataset)
# dim(final_assessment_dataset)
# class(final_assessment_dataset)

# Number of missing values in each column.
colSums(x = is.na(final_assessment_dataset))
##               farm           latitude          longitude         pct_flower 
##                  0                  0                  0                  1 
##               temp            variety               type            species 
##                  0                  0                  0                  0 
##              group relative_abundance 
##                  0                  0

There is one missing value, in the `pct_flower` column. We will impute it with the mean of the observed (non-missing) values in that column (simple mean imputation).

# Simple mean imputation: replace each missing pct_flower value with the mean
# of the observed values. The mean is computed with na.rm = TRUE, so the
# missing entries themselves do not influence the fill value.
final_assessment_dataset$pct_flower[is.na(final_assessment_dataset$pct_flower)] <-
  mean(final_assessment_dataset$pct_flower, na.rm = TRUE)

# Re-count missing values per column to confirm none remain.
colSums(is.na(final_assessment_dataset))
##               farm           latitude          longitude         pct_flower 
##                  0                  0                  0                  0 
##               temp            variety               type            species 
##                  0                  0                  0                  0 
##              group relative_abundance 
##                  0                  0

UNIVARIATE ANALYSIS

# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
# NOTE(review): the output reports latitude and longitude as character rather
# than numeric -- inspect the raw values before treating them as coordinates.
summary(final_assessment_dataset)
##      farm             latitude          longitude           pct_flower   
##  Length:607         Length:607         Length:607         Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 5.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :10.00  
##                                                           Mean   :16.88  
##                                                           3rd Qu.:25.00  
##                                                           Max.   :65.00  
##       temp         variety              type             species         
##  Min.   :12.00   Length:607         Length:607         Length:607        
##  1st Qu.:16.00   Class :character   Class :character   Class :character  
##  Median :22.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :20.14                                                           
##  3rd Qu.:25.00                                                           
##  Max.   :27.00                                                           
##     group           relative_abundance
##  Length:607         Min.   :0.09091   
##  Class :character   1st Qu.:0.25000   
##  Mode  :character   Median :0.50000   
##                     Mean   :0.46674   
##                     3rd Qu.:0.65152   
##                     Max.   :1.00000
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.2     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only the numeric variables for the univariate summaries.
# Columns are selected by name rather than by position so the subset stays
# correct if the column order of the CSV changes. The call is qualified with
# dplyr:: because epiDisplay (attached further down) loads MASS, whose select()
# masks dplyr's and would break this line when the chunk is re-run.
numerical_adv <- final_assessment_dataset %>%
  dplyr::select(pct_flower, temp, relative_abundance)

# Five-number summary plus mean for each numeric variable.
summary(numerical_adv)
##    pct_flower         temp       relative_abundance
##  Min.   : 0.00   Min.   :12.00   Min.   :0.09091   
##  1st Qu.: 5.00   1st Qu.:16.00   1st Qu.:0.25000   
##  Median :10.00   Median :22.00   Median :0.50000   
##  Mean   :16.88   Mean   :20.14   Mean   :0.46674   
##  3rd Qu.:25.00   3rd Qu.:25.00   3rd Qu.:0.65152   
##  Max.   :65.00   Max.   :27.00   Max.   :1.00000
# Show the structure of the numeric subset: its dimensions and the storage
# type of each column.
str(numerical_adv)
## 'data.frame':    607 obs. of  3 variables:
##  $ pct_flower        : num  10 10 10 10 25 30 30 10 10 50 ...
##  $ temp              : int  27 27 22 22 25 25 25 25 25 12 ...
##  $ relative_abundance: num  0.5 0.5 0.5 0.5 0.5 ...

Frequency tables for group and type

library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Frequency table of the `group` column, ordered from most to least frequent,
# with cumulative percentages.
# NOTE(review): the output lists both "Solitary bee" (233) and "Solitary" (2);
# these look like one category spelled two ways -- confirm against the source
# data and recode if so.
tab1(final_assessment_dataset$group, sort.group = "decreasing", cum.percent = TRUE)

## final_assessment_dataset$group : 
##              Frequency Percent Cum. percent
## Solitary bee       233    38.4         38.4
## Honeybee           185    30.5         68.9
## Bumblebee           94    15.5         84.3
## Fly                 93    15.3         99.7
## Solitary             2     0.3        100.0
##   Total            607   100.0        100.0
# Alternative frequency table via summarytools, left disabled:
#library(summarytools)
#freq(final_assessment_dataset$type, order = "freq")
# Frequency table of the `type` column, ordered from most to least frequent,
# with cumulative percentages.
tab1(final_assessment_dataset$type, sort.group = "decreasing", cum.percent = TRUE)

## final_assessment_dataset$type : 
##              Frequency Percent Cum. percent
## Hybrid             309    50.9         50.9
## conventional       207    34.1         85.0
## SD_Hybrid           91    15.0        100.0
##   Total            607   100.0        100.0