# Load the assessment data from the local CSV export.
final_assessment_dataset <- read.csv(
  file = "C:/Users/ruth/Downloads/final_assessment_dataset.csv"
)
# Optional interactive inspection helpers (left disabled):
# View(final_assessment_dataset)
# str(final_assessment_dataset)
# dim(final_assessment_dataset)
# class(final_assessment_dataset)

# Number of missing values in each column.
colSums(x = is.na(final_assessment_dataset))
## farm latitude longitude pct_flower
## 0 0 0 1
## temp variety type species
## 0 0 0 0
## group relative_abundance
## 0 0
There is one missing value, in the `pct_flower` column. We'll impute it with the mean of the observed values in that column. #Simple Imputation
# Simple (mean) imputation: every missing pct_flower entry is replaced by the
# mean of the non-missing pct_flower values. The logical mask from is.na() is
# used directly as the index, so no which() conversion is needed.
final_assessment_dataset$pct_flower[is.na(final_assessment_dataset$pct_flower)] <-
  mean(final_assessment_dataset$pct_flower, na.rm = TRUE)
# Re-count missing values per column to confirm the imputation left none.
colSums(is.na(final_assessment_dataset))
## farm latitude longitude pct_flower
## 0 0 0 0
## temp variety type species
## 0 0 0 0
## group relative_abundance
## 0 0
# Descriptive summary of every column of the imputed data set.
# NOTE(review): the printed summary reports latitude and longitude as
# character columns, so they were presumably not parsed as numbers -- confirm
# whether that is intended before using them numerically.
summary(final_assessment_dataset)
## farm latitude longitude pct_flower
## Length:607 Length:607 Length:607 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 5.00
## Mode :character Mode :character Mode :character Median :10.00
## Mean :16.88
## 3rd Qu.:25.00
## Max. :65.00
## temp variety type species
## Min. :12.00 Length:607 Length:607 Length:607
## 1st Qu.:16.00 Class :character Class :character Class :character
## Median :22.00 Mode :character Mode :character Mode :character
## Mean :20.14
## 3rd Qu.:25.00
## Max. :27.00
## group relative_abundance
## Length:607 Min. :0.09091
## Class :character 1st Qu.:0.25000
## Mode :character Median :0.50000
## Mean :0.46674
## 3rd Qu.:0.65152
## Max. :1.00000
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.2 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep only the three numeric variables for the numeric summaries.
# Columns are chosen by name (they sit at positions 4, 5 and 10) so the
# selection does not silently change if the column order does. `select` is
# namespace-qualified because MASS, attached later as a dependency of
# epiDisplay, masks dplyr::select(); an unqualified call would fail if this
# line were re-run in the same session.
numerical_adv <- final_assessment_dataset %>%
  dplyr::select(pct_flower, temp, relative_abundance)
# Five-number summary plus mean for each retained column.
summary(numerical_adv)
## pct_flower temp relative_abundance
## Min. : 0.00 Min. :12.00 Min. :0.09091
## 1st Qu.: 5.00 1st Qu.:16.00 1st Qu.:0.25000
## Median :10.00 Median :22.00 Median :0.50000
## Mean :16.88 Mean :20.14 Mean :0.46674
## 3rd Qu.:25.00 3rd Qu.:25.00 3rd Qu.:0.65152
## Max. :65.00 Max. :27.00 Max. :1.00000
# Show the structure (dimensions and column types) of the numeric subset.
str(numerical_adv)
## 'data.frame': 607 obs. of 3 variables:
## $ pct_flower : num 10 10 10 10 25 30 30 10 10 50 ...
## $ temp : int 27 27 22 22 25 25 25 25 25 12 ...
## $ relative_abundance: num 0.5 0.5 0.5 0.5 0.5 ...
Frequency tables for group and type
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
# Frequency table of pollinator group, most frequent first, with a cumulative
# percentage column. The argument expression is kept as-is because the printed
# header echoes it.
# NOTE(review): the resulting table lists both "Solitary bee" and "Solitary"
# (2 rows); these look like the same category with inconsistent labelling --
# confirm and recode upstream if so.
tab1(final_assessment_dataset$group, sort.group = "decreasing", cum.percent = TRUE)
## final_assessment_dataset$group :
## Frequency Percent Cum. percent
## Solitary bee 233 38.4 38.4
## Honeybee 185 30.5 68.9
## Bumblebee 94 15.5 84.3
## Fly 93 15.3 99.7
## Solitary 2 0.3 100.0
## Total 607 100.0 100.0
# Alternative frequency table via summarytools (left disabled):
#library(summarytools)
#freq(final_assessment_dataset$type, order = "freq")
# Frequency table of `type`, most frequent first, with a cumulative percentage
# column. The argument expression is kept as-is because the printed header
# echoes it.
tab1(final_assessment_dataset$type, sort.group = "decreasing", cum.percent = TRUE)
## final_assessment_dataset$type :
## Frequency Percent Cum. percent
## Hybrid 309 50.9 50.9
## conventional 207 34.1 85.0
## SD_Hybrid 91 15.0 100.0
## Total 607 100.0 100.0