This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.
# NOTE(review): hard-coded, machine-specific working directory. Every read.csv()
# call below uses a bare file name, so the CSV files are looked up in this folder.
setwd("C:/Users/Cedar Scholars/Downloads")
You can also embed plots, for example:
# Load the guppy data and plot each son's attractiveness against his father's
# ornamentation.
guppies <- read.csv("guppies.csv")
plot(
  x = guppies$father.ornament,
  y = guppies$son.attract,
  xlab = "Father's ornamentation",
  ylab = "Son's Attractiveness",
  col = "red",
  pch = 19,
  ylim = c(-0.5, 1.5)
)
##Example 2.2b
# Read the desert bird counts, then draw one random normal value per
# observation, using the sample mean and standard deviation of the counts.
birds <- read.csv("desert_birds.csv")
norm_data <- rnorm(
  n = length(birds$Count),
  mean = mean(birds$Count),
  sd = sd(birds$Count)
)
print(norm_data)
## [1] 134.060562 -75.462296 4.926699 317.948249 207.650742
## [6] 59.666325 61.806546 118.547895 193.249451 -111.772227
## [11] -11.869759 -295.748956 -60.361403 157.059088 -43.915702
## [16] -43.750329 87.588390 43.378441 326.154171 153.555332
## [21] 97.006705 -1.503800 9.920281 -11.480879 90.349444
## [26] 21.022780 22.495949 48.044757 54.785220 40.143450
## [31] 193.550026 275.792611 -56.196751 54.962842 71.918823
## [36] 47.989742 218.528776 17.802285 -149.358823 -35.200040
## [41] 231.613360 104.471017 225.794542
# Empirical CDFs of the observed counts and of the simulated normal sample.
emp_cdf <- ecdf(birds$Count)
norm_cdf <- ecdf(norm_data)

# Draw the bird CDF in red, then overlay the CDF of the simulated sample.
plot(
  emp_cdf,
  col = "red",
  main = "CDF of Bird Counts Compared to a Normal Distribution",
  xlab = "Count",
  ylab = "Cumulative relative frequency"
)
lines(norm_cdf)
legend(
  x = 400,
  y = .4,
  legend = c("Birds", "Normal Distribution"),
  col = c("red", "black"),
  pch = c(19, 19)
)

# Save the plot
dev.print(pdf, file = "bird_plot.pdf", width = 7, height = 7, pointsize = 12)
## png
## 2
lynx <- read.csv("lynx_data.csv")

# Line plot method 1: draw the points first (the pch argument changes the
# point type), then connect them with lines().
plot(
  no.pelts ~ date,
  data = lynx,
  pch = 16,
  col = "blue",
  xlab = "Date",
  ylab = "Number of pelts"
)
lines(no.pelts ~ date, data = lynx)

# Line plot method 2: a single plot() call with type = "b".
plot(
  no.pelts ~ date,
  data = lynx,
  pch = 16,
  type = "b",
  xlab = "Date",
  ylab = "Number of pelts"
)
# Read in the data
plant <- read.csv("plant_data.csv")

# View and explore your data
View(plant)  # view your dataset
names(plant) # look at the variables
# NOTE(review): the summary()/str() output for this data lists day_of_week
# levels such as "Monday" and "Monday " separately - looks like trailing
# whitespace in the CSV; consider read.csv(..., strip.white = TRUE). TODO confirm.
## [1] "tran_number" "sp_richness"
## [3] "percent_cover" "dist_from_edge_m"
## [5] "dist_from_tree_m" "day_of_week"
## [7] "num_flowers" "num_dand_flowers"
## [9] "num_leaves_in_rosette" "dand_rosette_diam_cm"
summary(plant) # basic summary statistics per column: quartiles and mean for numeric columns, level counts for factors
## tran_number sp_richness percent_cover dist_from_edge_m
## Min. : 0.00 Min. : 1.000 Min. : 0.50 0.0m: 91
## 1st Qu.: 9.00 1st Qu.: 4.000 1st Qu.: 62.50 3.0m:101
## Median :14.00 Median : 5.000 Median : 75.00 6.0m: 67
## Mean :14.58 Mean : 5.039 Mean : 73.87
## 3rd Qu.:20.50 3rd Qu.: 6.000 3rd Qu.: 90.00
## Max. :30.00 Max. :10.000 Max. :100.00
##
## dist_from_tree_m day_of_week num_flowers num_dand_flowers
## Min. : 1.00 Monday :73 Min. : 0.00 Min. : 0.000
## 1st Qu.: 6.00 Monday : 3 1st Qu.: 2.00 1st Qu.: 1.000
## Median : 8.00 Thursday :71 Median : 6.00 Median : 3.000
## Mean :10.72 Tuesday :71 Mean : 11.78 Mean : 6.652
## 3rd Qu.:13.00 Tuesday : 1 3rd Qu.: 13.00 3rd Qu.: 8.000
## Max. :40.00 Wednesday :33 Max. :145.00 Max. :36.000
## Wednesday : 7 NA's :3
## num_leaves_in_rosette dand_rosette_diam_cm
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.: 7.000
## Median : 8.00 Median : 9.000
## Mean :10.08 Mean : 8.682
## 3rd Qu.:12.00 3rd Qu.:11.000
## Max. :39.00 Max. :18.000
## NA's :42 NA's :42
str(plant) # look at the data structure: the type of each variable and its first few values
## 'data.frame': 259 obs. of 10 variables:
## $ tran_number : int 0 0 0 1 1 1 1 1 1 1 ...
## $ sp_richness : num 4 2 3 4 5 5 5 5 5 5 ...
## $ percent_cover : num 95 95 98 90 75 75 75 75 75 75 ...
## $ dist_from_edge_m : Factor w/ 3 levels "0.0m","3.0m",..: 1 2 3 1 2 2 2 2 2 2 ...
## $ dist_from_tree_m : num 20 17 15 9 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 7 levels "Monday","Monday ",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ num_flowers : num 140 0 0 0 8 8 8 8 8 8 ...
## $ num_dand_flowers : num 0 0 0 0 7 7 7 7 7 7 ...
## $ num_leaves_in_rosette: num NA NA NA NA 11 5 6 15 16 8 ...
## $ dand_rosette_diam_cm : num NA NA NA NA 9 8.8 9.3 11.9 12 9.2 ...
dim(plant) # dimensions of the data: number of rows (observations), then number of columns (variables)
## [1] 259 10
# Make a histogram of num_leaves_in_rosette [practice]
# Explore what you can do with the hist() function
hist(plant$num_leaves_in_rosette)

# Changing the number of 'breaks' gives you more/fewer bins - play around with this
hist(plant$num_leaves_in_rosette, breaks = 20)

# Add axis labels and a title!
# hist() plots counts by default (freq = TRUE for equally spaced bins), so the
# y-axis is labelled "Frequency" rather than "Density".
hist(
  plant$num_leaves_in_rosette,
  breaks = 20,
  col = "darkgray",
  xlab = "Number of leaves in rosette",
  ylab = "Frequency",
  main = "Histogram of number of leaves in rosette"
)
# Note that the distribution is right skewed
# Now we will compare the number of leaves between 0m, 3m, and 6m from the
# habitat edge.

# Draw the leaf-count histogram for one distance-from-edge group.
#   edge_level: value of plant$dist_from_edge_m to keep, e.g. "0.0m"
#   edge_label: short label appended to the plot title, e.g. "0m"
# Every plot uses xlim = c(0, 40) so the three groups share one x-axis scale.
# hist() plots counts by default, so the y-axis is labelled "Frequency"
# rather than "Density".
plot_leaf_hist <- function(edge_level, edge_label) {
  in_group <- plant$dist_from_edge_m == edge_level
  hist(
    plant$num_leaves_in_rosette[in_group],
    col = "darkgray",
    xlab = "Number of leaves in rosette",
    ylab = "Frequency",
    main = paste("Histogram of number of leaves in rosette -", edge_label),
    xlim = c(0, 40)
  )
}

# 0m
plot_leaf_hist("0.0m", "0m")
# 3m
plot_leaf_hist("3.0m", "3m")
# 6m
plot_leaf_hist("6.0m", "6m")

# To make all three histograms into one cool plot
old_par <- par(mfrow = c(3, 1)) # sets up your plot (3 x 1) and keeps the previous settings
# then re-run the three plots
plot_leaf_hist("0.0m", "0m")
plot_leaf_hist("3.0m", "3m")
plot_leaf_hist("6.0m", "6m")
# to switch back to looking at one plot at a time, restore the previous settings
par(old_par)
# But what if we instead want to create a boxplot?
# How does distance from edge of habitat (your explanatory variable) affect
# number of leaves per rosette (your response variable; what we measured)?
boxplot(
  num_leaves_in_rosette ~ dist_from_edge_m,
  data = plant,
  xlab = "Distance from edge (m)",
  ylab = "Number of leaves per rosette",
  col = "darkgray"
)

# OK, now it's your turn: do the same thing for dand_rosette_diam_cm.
# The response here is the rosette diameter, so the y-axis label names that
# variable instead of the leaf count.
boxplot(
  dand_rosette_diam_cm ~ dist_from_edge_m,
  data = plant,
  xlab = "Distance from edge (m)",
  ylab = "Dandelion rosette diameter (cm)",
  col = "darkgray"
)
##Exercise 2
Log-transforming decreased the right skew, while squaring the data greatly increased the right skew.
The log-transformed data looked most normal.
log(0) is undefined because no power of a positive base can ever equal zero. To avoid errors or non-real values, 1 needs to be added to the variable before taking the log. Because a log would be involved and dand_rosette_diam_cm contains zeros, 1 would need to be added to the variable dand_rosette_diam_cm.
# LOG TRANSFORMATION: create a new variable holding the natural log of
# (num_flowers + 1)
log_num_flowers <- log(plant$num_flowers + 1)
print(head(log_num_flowers)) # what do the numbers look like
## [1] 4.948760 0.000000 0.000000 0.000000 2.197225 2.197225
# If you want to create a new variable AND add it to your data frame
plant$log_num_flowers <- log(plant$num_flowers + 1)
View(plant) # double check that it's there

# To square num_flowers
plant$num_flowers_sq <- plant$num_flowers^2

# Now, make a histogram of each variable and answer our three questions.
# par() returns the previous settings, so they can be restored afterwards;
# otherwise every later plot would still be drawn in the 3 x 1 layout.
old_par <- par(mfrow = c(3, 1))
hist(plant$num_flowers)
hist(log_num_flowers)
hist(plant$num_flowers_sq)
par(old_par)
You see an upwards sloping trend. The linear trend line seems to be accurate as the data points continue to accumulate upwards.
The explanatory variable is the distance from the edge while the response variable is the number of leaves per rosette.
# We want to see whether there is some association between the size of a
# dandelion rosette and the number of leaves on a dandelion rosette.
# Both of these variables are numerical --> scatterplot (the formula is y ~ x)
plot(
  num_leaves_in_rosette ~ dand_rosette_diam_cm,
  data = plant,
  xlab = "Dandelion rosette diameter (cm)",
  ylab = "Number of leaves in rosette"
)

# Alternate code - same result (without the custom axis labels)
plot(plant$num_leaves_in_rosette ~ plant$dand_rosette_diam_cm)

# Now overlay a least squares line to get an idea of the trend:
# run the linear regression, then draw its line on the existing plot
fit <- lm(num_leaves_in_rosette ~ dand_rosette_diam_cm, data = plant)
abline(fit, col = "green")
# Now, make a scatterplot to explore the relationship between dist_from_tree_m
# and percent_cover, i.e. how does percent cover change with distance from trees?
# The axis labels name the variables actually plotted: x is the distance from
# the tree, y is the percent cover.
plot(
  percent_cover ~ dist_from_tree_m,
  data = plant,
  xlab = "Distance from tree (m)",
  ylab = "Percent cover",
  col = "darkgray"
)

# Fit the least squares line and overlay it on the scatterplot
fit <- lm(percent_cover ~ dist_from_tree_m, data = plant)
abline(fit, col = "green")
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.