Problem 1

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.

# Set the working directory so the relative CSV paths used below resolve.
# NOTE(review): this absolute Windows path is machine-specific; setwd() errors
# if the folder does not exist, so confirm or replace it before running elsewhere.
setwd("C:/Users/Cedar Scholars/Downloads")

Example 2.2a

You can also embed plots, for example:

# Load the guppy data, then plot father ornamentation (x) against
# son attractiveness (y) as filled red points.
guppies <- read.csv("guppies.csv")
plot(
  x = guppies$father.ornament,
  y = guppies$son.attract,
  pch = 19,
  col = "red",
  ylim = c(-0.5, 1.5),
  xlab = "Father's ornamentation",
  ylab = "Son's Attractiveness"
)

Example 2.2b

# Load the desert bird data.
birds <- read.csv("desert_birds.csv")

# Simulate a normal sample with one draw per observed count, using the
# observed mean and standard deviation of Count, then print the draws.
norm_data <- rnorm(
  n = length(birds$Count),
  mean = mean(birds$Count),
  sd = sd(birds$Count)
)
norm_data
##  [1]  134.060562  -75.462296    4.926699  317.948249  207.650742
##  [6]   59.666325   61.806546  118.547895  193.249451 -111.772227
## [11]  -11.869759 -295.748956  -60.361403  157.059088  -43.915702
## [16]  -43.750329   87.588390   43.378441  326.154171  153.555332
## [21]   97.006705   -1.503800    9.920281  -11.480879   90.349444
## [26]   21.022780   22.495949   48.044757   54.785220   40.143450
## [31]  193.550026  275.792611  -56.196751   54.962842   71.918823
## [36]   47.989742  218.528776   17.802285 -149.358823  -35.200040
## [41]  231.613360  104.471017  225.794542
# Empirical CDFs of the observed counts and of the simulated normal sample.
emp_cdf <- ecdf(birds$Count)
norm_cdf <- ecdf(norm_data)

# Draw the observed-count CDF in red, then overlay the simulated sample.
plot(
  emp_cdf,
  col = "red",
  xlab = "Count",
  ylab = "Cumulative relative frequency",
  main = "CDF of Bird Counts Compared to a Normal Distribution"
)
lines(norm_cdf)

# Legend placed at data coordinates (400, 0.4).
legend(
  x = 400,
  y = 0.4,
  legend = c("Birds", "Normal Distribution"),
  col = c("red", "black"),
  pch = c(19, 19)
)

# Save the plot
dev.print(pdf, file = "bird_plot.pdf", width = 7, height = 7, pointsize = 12)
## png 
##   2

Example 3

lynx <- read.csv("lynx_data.csv")

# Line plot method 1: draw the points first, then join them with lines().
# Change the point type with the argument pch.
plot(
  no.pelts ~ date,
  data = lynx,
  pch = 16,
  col = "blue",
  xlab = "Date",
  ylab = "Number of pelts"
)
lines(no.pelts ~ date, data = lynx)

# Line plot method 2: a single call with type = "b" (points and lines together).
plot(
  no.pelts ~ date,
  data = lynx,
  type = "b",
  pch = 16,
  xlab = "Date",
  ylab = "Number of pelts"
)

Exercises

# Read in the data
plant <- read.csv("plant_data.csv")

# View and explore your data
# Open the dataset in the data viewer.
View(plant)
# Look at the variable (column) names.
names(plant)
##  [1] "tran_number"           "sp_richness"          
##  [3] "percent_cover"         "dist_from_edge_m"     
##  [5] "dist_from_tree_m"      "day_of_week"          
##  [7] "num_flowers"           "num_dand_flowers"     
##  [9] "num_leaves_in_rosette" "dand_rosette_diam_cm"
# Basic summary statistics for every column of plant.
# NOTE(review): the day_of_week counts printed below list "Monday", "Tuesday"
# and "Wednesday" twice each, which looks like stray whitespace in the raw
# values; confirm and clean before grouping by day.
summary(plant) #run some basic summary statistics
##   tran_number     sp_richness     percent_cover    dist_from_edge_m
##  Min.   : 0.00   Min.   : 1.000   Min.   :  0.50   0.0m: 91        
##  1st Qu.: 9.00   1st Qu.: 4.000   1st Qu.: 62.50   3.0m:101        
##  Median :14.00   Median : 5.000   Median : 75.00   6.0m: 67        
##  Mean   :14.58   Mean   : 5.039   Mean   : 73.87                   
##  3rd Qu.:20.50   3rd Qu.: 6.000   3rd Qu.: 90.00                   
##  Max.   :30.00   Max.   :10.000   Max.   :100.00                   
##                                                                    
##  dist_from_tree_m     day_of_week  num_flowers     num_dand_flowers
##  Min.   : 1.00    Monday    :73   Min.   :  0.00   Min.   : 0.000  
##  1st Qu.: 6.00    Monday    : 3   1st Qu.:  2.00   1st Qu.: 1.000  
##  Median : 8.00    Thursday  :71   Median :  6.00   Median : 3.000  
##  Mean   :10.72    Tuesday   :71   Mean   : 11.78   Mean   : 6.652  
##  3rd Qu.:13.00    Tuesday   : 1   3rd Qu.: 13.00   3rd Qu.: 8.000  
##  Max.   :40.00    Wednesday :33   Max.   :145.00   Max.   :36.000  
##                   Wednesday : 7                    NA's   :3       
##  num_leaves_in_rosette dand_rosette_diam_cm
##  Min.   : 0.00         Min.   : 0.000      
##  1st Qu.: 6.00         1st Qu.: 7.000      
##  Median : 8.00         Median : 9.000      
##  Mean   :10.08         Mean   : 8.682      
##  3rd Qu.:12.00         3rd Qu.:11.000      
##  Max.   :39.00         Max.   :18.000      
##  NA's   :42            NA's   :42
# Show each column's type together with its first few values.
str(plant) #look at data structure, what types of variables you are dealing with
## 'data.frame':    259 obs. of  10 variables:
##  $ tran_number          : int  0 0 0 1 1 1 1 1 1 1 ...
##  $ sp_richness          : num  4 2 3 4 5 5 5 5 5 5 ...
##  $ percent_cover        : num  95 95 98 90 75 75 75 75 75 75 ...
##  $ dist_from_edge_m     : Factor w/ 3 levels "0.0m","3.0m",..: 1 2 3 1 2 2 2 2 2 2 ...
##  $ dist_from_tree_m     : num  20 17 15 9 7 7 7 7 7 7 ...
##  $ day_of_week          : Factor w/ 7 levels "Monday","Monday ",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ num_flowers          : num  140 0 0 0 8 8 8 8 8 8 ...
##  $ num_dand_flowers     : num  0 0 0 0 7 7 7 7 7 7 ...
##  $ num_leaves_in_rosette: num  NA NA NA NA 11 5 6 15 16 8 ...
##  $ dand_rosette_diam_cm : num  NA NA NA NA 9 8.8 9.3 11.9 12 9.2 ...
# Dimensions of the data frame: number of rows, then number of columns.
dim(plant) #what are the dimensions of the data
## [1] 259  10

Exercise 1

# Make a histogram of num_leaves_in_rosette [practice]
# Explore what you can do with the hist() function.

# Default histogram.
hist(plant$num_leaves_in_rosette)

# Changing the number of 'breaks' will give you more/fewer bins - play around
# with this.
hist(plant$num_leaves_in_rosette, breaks = 20)

# Add axes labels and title!
# hist() plots counts by default, so the y-axis is labelled "Frequency"
# rather than "Density".
hist(
  plant$num_leaves_in_rosette,
  breaks = 20,
  col = "darkgray",
  xlab = "Number of leaves in rosette",
  ylab = "Frequency",
  main = "Histogram of number of leaves in rosette"
)

# Note that the distribution is right skewed.

# Now we will compare the number of leaves between 0m, 3m, and 6m from the
# habitat edge.

# Draw the leaf-count histogram for one distance-from-edge level.
#   edge_level: value of plant$dist_from_edge_m to keep (e.g. "0.0m")
#   edge_label: short label appended to the plot title (e.g. "0m")
# hist() plots counts by default, so the y-axis is labelled "Frequency".
# xlim is fixed at 0-40 so every panel uses the same x-axis range.
plot_leaf_hist <- function(edge_level, edge_label) {
  hist(
    plant$num_leaves_in_rosette[plant$dist_from_edge_m == edge_level],
    col = "darkgray",
    xlab = "Number of leaves in rosette",
    ylab = "Frequency",
    main = paste("Histogram of number of leaves in rosette -", edge_label),
    xlim = c(0, 40)
  )
}

# 0m
plot_leaf_hist("0.0m", "0m")

# 3m
plot_leaf_hist("3.0m", "3m")

# 6m
plot_leaf_hist("6.0m", "6m")

# To make all three histograms into one cool plot
par(mfrow = c(3, 1)) # sets up your plot (3 x 1)
# then re-run the three plots
plot_leaf_hist("0.0m", "0m")
plot_leaf_hist("3.0m", "3m")
plot_leaf_hist("6.0m", "6m")

# To switch back to looking at one plot at a time
par(mfrow = c(1, 1))

# But what if we instead want to create a boxplot?
# How does distance from edge of habitat (your explanatory variable) affect
# number of leaves per rosette (your response variable; what we measured)?
boxplot(
  num_leaves_in_rosette ~ dist_from_edge_m,
  data = plant,
  xlab = "Distance from edge (m)",
  ylab = "Number of leaves per rosette",
  col = "darkgray"
)

# Ok, now it's your turn: do the same thing for dand_rosette_diam_cm.
# The y-axis label names the variable actually plotted (rosette diameter).
boxplot(
  dand_rosette_diam_cm ~ dist_from_edge_m,
  data = plant,
  xlab = "Distance from edge (m)",
  ylab = "Dandelion rosette diameter (cm)",
  col = "darkgray"
)

Exercise 2

  1. Log-transforming decreased the right skew, while squaring the data greatly increased the right skew.

  2. The log-transformed data looked most normal.

  3. log(0) is undefined because no power of a positive base equals zero. To avoid errors or non-real values, 1 must be added to the variable before taking the log. Because a log transformation would be applied, 1 would need to be added to the variable dand_rosette_diam_cm, which contains zeros.

# LOG TRANSFORMATION: new variable holding the natural log of (num_flowers + 1).
log_num_flowers <- log(plant$num_flowers + 1)

# What do the numbers look like? Print the first few values.
head(log_num_flowers)
## [1] 4.948760 0.000000 0.000000 0.000000 2.197225 2.197225
# If you want to create a new variable AND add it to your data frame.
plant$log_num_flowers <- log(plant$num_flowers + 1)

# Double check that it's there.
View(plant)

# To square num_flowers
plant$num_flowers_sq <- plant$num_flowers^2

# Now, make a histogram of the variables and answer our questions three.
# Stack the raw, log-transformed and squared histograms in a 3 x 1 layout.
par(mfrow = c(3, 1))

hist(plant$num_flowers)

hist(log_num_flowers)

hist(plant$num_flowers_sq)

# Switch back to one plot at a time so later plots are not drawn into the
# 3 x 1 grid.
par(mfrow = c(1, 1))

Exercise 3

  1. There is an upward-sloping trend. The linear trend line appears to fit well, since the data points continue to rise along it.

  2. The explanatory variable is the distance from the edge while the response variable is the number of leaves per rosette.

# We want to see whether there is some association between the size of a
# dandelion rosette and the number of leaves on a dandelion rosette.
# Both of these variables are numerical --> scatterplot.

# Formula interface: insert y ~ x.
plot(
  num_leaves_in_rosette ~ dand_rosette_diam_cm,
  data = plant,
  xlab = "Dandelion rosette diameter (cm)",
  ylab = "Number of leaves in rosette"
)

# Alternate code - same result
plot(plant$num_leaves_in_rosette ~ plant$dand_rosette_diam_cm)

# Now, you want to overlay a least squares line to get an idea of the trend.
# Run linear regression, then overlay the line on the existing plot.
fit <- lm(num_leaves_in_rosette ~ dand_rosette_diam_cm, data = plant)
abline(fit, col = "green")

# Now, make a scatterplot to explore the relationship between
# dist_from_tree_m and percent_cover,
# i.e. how does percent cover change with distance from trees?
# The axis labels name the variables actually plotted.
plot(
  percent_cover ~ dist_from_tree_m,
  data = plant,
  xlab = "Distance from tree (m)",
  ylab = "Percent cover",
  col = "darkgray"
)

# Fit the least squares line for this relationship and overlay it.
fit <- lm(percent_cover ~ dist_from_tree_m, data = plant)
abline(fit, col = "green")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.