library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load both built-in example datasets in a single call, then keep working
# copies under the names the rest of the script refers to.
data("InsectSprays", "iris")
Buggies <- InsectSprays
purpleeyeball <- iris
# this should be invisible
#Chunk 2 Data Exploration
# Open the help page for the iris dataset.
help("iris")
## starting httpd help server ... done
# The three species explored in this dataset are Iris setosa, Iris versicolor, and Iris virginica.
# Open the help page for the InsectSprays dataset.
help("InsectSprays")
#there are 6 levels in the variable "Spray" that represent the different insecticides used
# Preview the first six rows of the insect-spray data.
head(Buggies, n = 6L)
## count spray
## 1 10 A
## 2 7 A
## 3 20 A
## 4 14 A
## 5 14 A
## 6 12 A
# Show the column types and overall structure.
utils::str(Buggies)
## 'data.frame': 72 obs. of 2 variables:
## $ count: num 10 7 20 14 14 12 10 23 17 20 ...
## $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles for count, tallies per level for spray.
summary(object = Buggies)
## count spray
## Min. : 0.00 A:12
## 1st Qu.: 3.00 B:12
## Median : 7.00 C:12
## Mean : 9.50 D:12
## 3rd Qu.:14.25 E:12
## Max. :26.00 F:12
# Preview the first six rows of the iris data.
head(purpleeyeball, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Show the column types and overall structure.
utils::str(purpleeyeball)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles for the measurements, tallies per species.
summary(object = purpleeyeball)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Import the leaf measurements CSV; the path is given relative to the
# user's home directory (~).
LeafData_corrected <- utils::read.csv(
  file = "~/Biostats 2024/Data/LeafData_corrected.csv"
)
# These functions help you understand what the data contain. head() shows the first few rows of the data, str() describes the structure of the data (the type of each variable), and summary() reports essential statistics, such as the median, which help you build box plots and graphs later on in your code.
#Chunk 3 My First Boxplot
#ggplot(data=Buggies, aes(x="spray", y="count")) + geom_boxplot()
#ggplot(Buggies, aes(x=spray, y=count)) + geom_boxplot()
# Boxplot of insect counts, one box per spray type.
ggplot(Buggies, mapping = aes(x = spray, y = count)) +
  geom_boxplot()
# The black line inside each box is the median, and the box spans the middle 50% of the data (first to third quartile). The whiskers cover the remaining data on either side, with their ends marking the highest and lowest values recorded that are not technical outliers.
# Violin plot of sepal length per iris species, one fill colour per species.
# The iris measurements are recorded in centimetres (see ?iris), so the
# axis label uses cm rather than mm.
ggplot(data = purpleeyeball, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_violin() +
  ylab("Sepal Length (cm)")
# Chunk 4 Histograms
# Histograms of iris sepal width with different bin counts and colours.
# The iris measurements are recorded in centimetres (see ?iris), so the
# axis labels use cm rather than mm.
ggplot(data = purpleeyeball, aes(x = Sepal.Width)) +
  geom_histogram(bins = 12, color = "black", fill = "white") +
  xlab("Sepal Width (cm)")

# Same variable with fewer, coarser bins.
differentOne <- ggplot(data = purpleeyeball, aes(x = Sepal.Width)) +
  geom_histogram(bins = 10, color = "blue", fill = "green") +
  xlab("Sepal Width (cm)")
differentOne

# Same variable with more, finer bins. The stray apostrophe that was
# inside the axis label has been removed.
differentTwo <- ggplot(data = purpleeyeball, aes(x = Sepal.Width)) +
  geom_histogram(bins = 20, color = "purple", fill = "pink") +
  xlab("Sepal Width (cm)")
differentTwo

# Base-graphics histogram of sepal length, for comparison with ggplot2.
hist(purpleeyeball$Sepal.Length)
#Chunk 5 Dplyr to manipulate data
library(dplyr)
# Keep only the rows whose Species is "pine", then display them.
pine <- dplyr::filter(.data = LeafData_corrected, Species == "pine")
print(pine)
## Leaf.ID Name Species Deciduous Length Width Damage
## 1 1 MG pine no 8.2 0.1 2
## 2 2 MG pine no 8.4 0.1 2
## 3 3 MG pine no 6.0 0.1 1
## 4 4 MG pine no 6.5 0.1 1
## 5 5 MG pine no 5.9 0.1 2
## 6 1 SA pine no 8.8 0.1 0
## 7 2 SA pine no 8.8 0.1 1
## 8 3 SA pine no 8.5 0.1 0
## 9 4 SA pine no 8.6 0.1 0
## 10 5 SA pine no 8.8 0.1 0
## 11 1 MH pine no 8.0 0.1 0
## 12 2 MH pine no 9.0 0.1 0
## 13 3 MH pine no 7.3 0.1 0
## 14 4 MH pine no 8.0 0.1 0
## 15 5 MH pine no 9.4 0.1 0
## 16 1 IS pine no 6.2 0.1 0
## 17 2 IS pine no 5.2 0.1 0
## 18 3 IS pine no 5.5 0.1 0
## 19 4 IS pine no 6.1 0.1 0
## 20 5 IS pine no 4.2 0.1 0
## 21 1 AR pine no 6.2 0.1 0
## 22 2 AR pine no 6.7 0.1 0
## 23 3 AR pine no 7.5 0.1 0
## 24 4 AR pine no 6.8 0.1 0
## 25 5 AR pine no 6.9 0.1 0
## 26 6 JW pine no 6.0 0.1 0
## 27 7 JW pine no 6.0 0.1 0
## 28 8 JW pine no 5.8 0.1 0
## 29 9 JW pine no 5.6 0.1 0
## 30 10 JW pine no 5.9 0.1 0
## 31 1 AVB pine no 7.6 0.1 1
## 32 2 AVB pine no 8.0 0.1 1
## 33 3 AVB pine no 8.5 0.1 0
## 34 4 AVB pine no 7.4 0.1 1
## 35 5 AVB pine no 7.6 0.1 1
## 36 1 EO pine no 8.9 0.1 0
## 37 2 EO pine no 8.9 0.1 0
## 38 3 EO pine no 8.9 0.1 0
## 39 4 EO pine no 8.8 0.1 0
## 40 5 EO pine no 8.8 0.1 0
## 41 6 STA pine no 7.8 0.1 0
## 42 7 STA pine no 7.9 0.1 0
## 43 8 STA pine no 7.9 0.1 0
## 44 9 STA pine no 7.7 0.1 0
## 45 10 STA pine no 7.8 0.1 0
## 46 6 BS pine no 8.4 0.1 0
## 47 7 BS pine no 8.1 0.1 0
## 48 8 BS pine no 7.3 0.1 0
## 49 9 BS pine no 5.2 0.1 0
## 50 10 BS pine no 5.5 0.1 0
## 51 1 AEW pine no 8.5 0.1 0
## 52 2 AEW pine no 8.7 0.1 0
## 53 3 AEW pine no 8.6 0.1 0
## 54 4 AEW pine no 8.6 0.1 0
## 55 5 AEW pine no 8.8 0.1 0
# Keep only the rows whose Species is "buckeye", then display them.
notPine <- dplyr::filter(.data = LeafData_corrected, Species == "buckeye")
print(notPine)
## Leaf.ID Name Species Deciduous Length Width Damage
## 1 6 MH buckeye yes 19.2 6.1 2
## 2 7 MH buckeye yes 16.0 5.5 2
## 3 8 MH buckeye yes 13.7 4.3 1
## 4 9 MH buckeye yes 21.4 7.3 2
## 5 10 MH buckeye yes 19.6 6.7 1
## 6 6 IS buckeye yes 13.7 4.8 3
## 7 7 IS buckeye yes 26.4 9.9 4
## 8 8 IS buckeye yes 19.1 6.5 1
## 9 9 IS buckeye yes 18.2 6.8 2
## 10 10 IS buckeye yes 22.8 7.8 3
## 11 6 AR buckeye yes 22.6 8.4 1
## 12 7 AR buckeye yes 20.5 7.6 3
## 13 8 AR buckeye yes 16.0 5.9 1
## 14 9 AR buckeye yes 21.5 7.9 1
## 15 10 AR buckeye yes 16.7 5.8 1
## 16 1 JW buckeye yes 14.7 5.8 2
## 17 2 JW buckeye yes 10.9 3.9 0
## 18 3 JW buckeye yes 11.5 4.2 2
## 19 4 JW buckeye yes 27.5 9.0 1
## 20 5 JW buckeye yes 24.0 8.5 1
## 21 6 AVB buckeye yes 24.0 9.5 1
## 22 7 AVB buckeye yes 26.5 10.5 3
## 23 8 AVB buckeye yes 25.0 9.0 4
## 24 9 AVB buckeye yes 14.0 6.0 3
## 25 10 AVB buckeye yes 15.3 5.5 2
## 26 6 EO buckeye yes 19.5 6.5 2
## 27 7 EO buckeye yes 18.5 6.8 1
## 28 8 EO buckeye yes 23.0 8.0 0
## 29 9 EO buckeye yes 26.0 8.5 0
## 30 10 EO buckeye yes 25.5 8.0 1
## 31 1 STA buckeye yes 27.7 8.8 5
## 32 2 STC buckeye yes 26.5 8.0 1
## 33 3 STA buckeye yes 28.3 8.5 1
## 34 4 STA buckeye yes 26.6 8.2 2
## 35 5 STA buckeye yes 24.3 7.3 1
## 36 1 BS buckeye yes 24.9 7.6 1
## 37 2 BS buckeye yes 16.6 6.7 2
## 38 3 BS buckeye yes 18.0 6.7 0
## 39 4 BS buckeye yes 16.2 5.6 2
## 40 5 BS buckeye yes 25.2 8.3 0
## 41 6 AEW buckeye yes 24.9 8.2 1
## 42 7 AEW buckeye yes 18.0 6.5 1
## 43 8 AEW buckeye yes 25.1 8.1 1
## 44 9 AEW buckeye yes 23.2 8.5 1
## 45 10 AEW buckeye yes 21.7 8.8 3
# Histogram of leaf length for the pine subset.
ggplot(data = pine, mapping = aes(x = Length)) +
  geom_histogram(bins = 12, colour = "black", fill = "white") +
  xlab("Pine Leaf Length (cm)")

# Histogram of leaf length for the buckeye subset, same binning and styling.
ggplot(data = notPine, mapping = aes(x = Length)) +
  geom_histogram(bins = 12, colour = "black", fill = "white") +
  xlab("Buckeye Leaf Length (cm)")
# Buckeye is more variable because its values are less consistent than the pine values. Its graph also has a less consistent pattern compared to the pine graph.
#Chunk 6 Summary Stats
# Centre and spread of iris sepal width: mean, standard deviation,
# median and interquartile range.
with(purpleeyeball, mean(Sepal.Width))
## [1] 3.057333
with(purpleeyeball, sd(Sepal.Width))
## [1] 0.4358663
with(purpleeyeball, median(Sepal.Width))
## [1] 3
with(purpleeyeball, IQR(Sepal.Width))
## [1] 0.5
# Centre and spread of pine leaf length: mean, standard deviation,
# median and interquartile range.
with(pine, mean(Length))
## [1] 7.472727
with(pine, sd(Length))
## [1] 1.287037
with(pine, median(Length))
## [1] 7.8
with(pine, IQR(Length))
## [1] 2.4
# Centre and spread of buckeye leaf length: mean, standard deviation,
# median and interquartile range.
with(notPine, mean(Length))
## [1] 20.9
with(notPine, sd(Length))
## [1] 4.781261
with(notPine, median(Length))
## [1] 21.5
with(notPine, IQR(Length))
## [1] 8.3
#Bonus!
# Mean of each iris measurement column, each returned as a one-row
# data frame with a single column named avg.
dplyr::summarise(
  .data = purpleeyeball,
  avg = mean(x = Sepal.Length)
)
## avg
## 1 5.843333
dplyr::summarise(
  .data = purpleeyeball,
  avg = mean(x = Sepal.Width)
)
## avg
## 1 3.057333
dplyr::summarise(
  .data = purpleeyeball,
  avg = mean(x = Petal.Length)
)
## avg
## 1 3.758
dplyr::summarise(
  .data = purpleeyeball,
  avg = mean(x = Petal.Width)
)
## avg
## 1 1.199333