library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
### Orchid Code ###

## Before reading the data into RStudio ----
# Check the Col names -- modification 

Orchid <- read.csv("Orchid.csv")
names(Orchid)
##  [1] "plant"         "plot.No"       "plant.No"      "date"         
##  [5] "time"          "flower.No"     "condition"     "spur.length"  
##  [9] "nectar.level"  "sugar.percent" "scent"         "pollinia.L"   
## [13] "pollinia.R"    "latitude"      "longitude"
head(Orchid)
##   plant plot.No plant.No   date  time flower.No condition spur.length
## 1     1       1        1 09-Jul 12:29         1         3          36
## 2     1       1        1 09-Jul 12:29         2         3          40
## 3     1       1        1 09-Jul 12:29         3         3          37
## 4     1       1        1 09-Jul 12:29         4         3          NA
## 5     1       1        1 09-Jul 12:29         5         3          NA
## 6     2       1        2 09-Jul 12:34         1         3          44
##   nectar.level sugar.percent scent pollinia.L pollinia.R latitude
## 1            2            28     2          P          P    7.143
## 2           12            28     2          P          P    7.143
## 3            4            28     2          P          P    7.143
## 4           NA            NA     2          P          P    7.143
## 5           NA            NA     2          P          P    7.143
## 6            7            27     2          P          P    7.141
##   longitude
## 1    38.457
## 2    38.457
## 3    38.457
## 4    38.457
## 5    38.457
## 6    38.450
## Question1 ## ----
## How does the sugar concentration compare between individual flowers on a single plant?
## Another way to put the question is: how variable is the sugar concentration between flowers in the same plant?

# Since Q1 we are focusing on sugar concentration, using 'Index.Q1' marking the non-NA data 
# the index of data that sugar concentration is not NA
Index.Q1 <-!is.na(Orchid$sugar.percent)

# the dataset used for Q1
# it is a subdataset with full record of sugar concentration
Orchid.Q1 <- Orchid[Index.Q1,]

# the number of unique plants that has sugar concentration data
# the total number of plants is 172
length(unique(Orchid.Q1$plant)) 
## [1] 168
# use a loop to record the number of flowers for each plant 
# n.flower <- NULL
# for(i in 1:172){
#   n.flower <- c(n.flower,length(unique(Orchid.Q1$flower.No[Orchid.Q1$plant==i])))
#     # if the number of flower is 0, print the plant number
#   if(length(unique(Orchid.Q1$flower.No[Orchid.Q1$plant==i]))==0) print(i)
# }
# # 8, 31, 111, 164
# table(n.flower)

Plant.Index <- unique(Orchid.Q1$plant)

SD.vec <- NULL
RANGE.vec <- NULL
N.flower.vec <- NULL
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  Plant.Sugar <- Orchid.Q1$sugar.percent[INDEX]
  SD.vec <- c(SD.vec,sd(Plant.Sugar))
  RANGE.vec <- c(RANGE.vec,diff(range(Plant.Sugar)))
  N.flower.vec <- c(N.flower.vec,max(Orchid.Q1$flower.No[INDEX]))
}

SD.vec[48] <- 0

# Histogram of 168 Sd and Range:
# hist(SD.vec,ylim=c(0,110),main="Histogram of Standard Deviation of \n Percentage 
#      Sugar Content in Each Plant",xlab="Standard Deviation",breaks=seq(0,10.5,by=1.5))
# hist(RANGE.vec,ylim=c(0,100),main="Histogram of the Ranges of the \n Percentage Sugar Content in Each Plant",
#      xlab="Range",breaks=seq(0,20,by=2))
# 
# plot(SD.vec~N.flower.vec,xlab="Number of Flowers",ylab="Standard Deviation",
#      main="Scatterplot of Standard Deviations in Each Plant \n Against the Number of Flowers Sampled from Each Plant")
# plot(RANGE.vec~N.flower.vec,xlab="Number of Flowers",ylab="Range",
#      main="Scatterplot of Range in Each Plant \n Against the Number of Flowers Sampled from Each Plant")

df.Q1 <- data.frame(SD=SD.vec,N.flower=N.flower.vec,Range=RANGE.vec)

# Q1 Histograms ------
ggplot(df.Q1,aes(df.Q1$SD)) + geom_histogram(binwidth = 1.5,fill=I("turquoise2"),col=I("lightblue3"),alpha=0.5) +
  labs(x="Standard Deviation",title="Histogram of Standard Deviation of Percentage 
    Sugar Content in Each Plant") + theme(plot.title = element_text(hjust = 0.5))

ggplot(df.Q1,aes(df.Q1$Range)) + geom_histogram(binwidth =2,fill=I("plum2"),col=I("khaki1"),alpha=0.5) +
  labs(x="Range",title="Histogram of the Ranges of the Percentage Sugar Content in Each Plant") + theme(plot.title = element_text(hjust = 0.5))

# Q1 Scatter plots ----
ggplot(df.Q1,aes(x=N.flower,y=SD)) + geom_point(alpha=0.5,size=3,col="chartreuse4") +
  labs(x="Number of Flowers",y="Standard Deviation",title="Scatterplot of Standard Deviations in Each Plant \n Against the Number of Flowers Sampled from Each Plant")+
  theme(plot.title = element_text(hjust = 0.5))

# not very much variation for the most part

ggplot(df.Q1,aes(x=N.flower,y=Range)) + geom_point(alpha=0.5,size=3,col="purple") + 
  labs(x="Number of Flowers",y="Range",title="Scatterplot of Range in Each Plant Against the Number of Flowers Sampled from Each Plant")

# The spread is small for many of the plants


## Question2 ## ----
## How does the sugar concentration compare between individual plants?
Day.vec <- as.numeric(substr(Orchid.Q1$date,1,2)) # for later, Q5

Day.num.vec <- NULL
Mean.Sugar.vec <- NULL
Mean.Condition.vec <- NULL
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  Plant.Sugar <- Orchid.Q1$sugar.percent[INDEX]
  Condition <- Orchid.Q1$condition[INDEX]
  Mean.Sugar.vec <- c(Mean.Sugar.vec,mean(Plant.Sugar))
  Mean.Condition.vec <- c(Mean.Condition.vec,mean(Condition))
  Day.num.vec <- c(Day.num.vec, as.numeric(unique(Day.vec[INDEX])[1])) 
}

df.Q2 <- data.frame(MeanSugar=Mean.Sugar.vec,MeanCondition=Mean.Condition.vec)

# Q2 histogram ----
#hist(Mean.Sugar.vec,xlab="Average Sugar Concentration",main = "Histogram of the Averages of the \n Percentage Sugar Content in Each Plant")
ggplot(df.Q2,aes(x=df.Q2$MeanSugar)) + geom_histogram(binwidth = 4, fill="goldenrod1",col=I("darkorange2"),alpha=0.5) +
  labs(x="Average Sugar Concentration",title="Histogram of the Averages of the Percentage Sugar Content in Each Plant") + 
  theme(plot.title = element_text(hjust = 0.5))

# Q2 boxplot ----
# boxplot(Mean.Sugar.vec,main="Boxplot of the Average Sugar Concentrations")
ggplot(df.Q2,aes(x="",y=df.Q2$MeanSugar)) + geom_boxplot(width=0.35) + 
  labs(x="",y="Average Sugar Concentration",title="Boxplot of the Average Sugar Concentrations") +
  theme(plot.title = element_text(hjust = 0.5)) 

# Q2 scatterplot ----
#plot(Mean.Condition.vec,Mean.Sugar.vec,xlab="Condition",ylab="Sugar Concentration (%)",main="Scatterplot of Average Sugar Concentration in the Plant \n Against Average Condition of the Flowers on the Plant")
ggplot(df.Q2,aes(x=df.Q2$MeanCondition,y=df.Q2$MeanSugar)) + geom_point(alpha=0.5,size=3,col="firebrick1") +
  labs(x="Condition",y="Sugar Concentration (%)",title="Scatterplot of Average Sugar Concentration in the Plant Against Average Condition of the Flowers on the Plant") + theme(plot.title = element_text(hjust = 0.5))

## Question3 ## ----
## Does the sugar concentration change within a twenty-four-hour period?

HourMin <- as.POSIXct(Orchid.Q1$time, format="%H:%M")
df.Q3 <- data.frame(time = HourMin,sugar.percent=Orchid.Q1$sugar.percent)

# Q3 Scatterplot 1 ----
#plot(sugar.percent~time,data=Orchid.Q1,xlab="Time of Day",ylab="Percent Sugar Content",main="Variation in Sugar Concentration \n in the Flowers Over the Day")
ggplot(df.Q3,aes(x=time,y=sugar.percent)) + geom_point(alpha=0.25,size=3,col="limegreen") +
  labs(x="Time of Day",y="Percent Sugar Content",title ="Variation in Sugar Concentration in the Flowers Over the Day") + theme(plot.title = element_text(hjust = 0.5))

# Q3 Scatterplot 2 ----
Hour.fraction <- integer(length(HourMin)) 
Hour.fraction[minute(HourMin)>30] <- 1
Hour.integer <- hour(HourMin)
Hour <- Hour.fraction + Hour.integer
Mean.Half.Hour <- aggregate(Orchid.Q1$sugar.percent,list(Hour),mean)
ggplot(Mean.Half.Hour,aes(x=Group.1,y=x)) + geom_point(size=5,alpha=0.5,shape=18,col="maroon1") + 
  labs(x="Time of Day",y="Percent Sugar Concentration",title="Average Variation in Sugar Concentration in the Flowers Over the Day by Half Hour Periods")+
  theme(plot.title = element_text(hjust = 0.5))

## Question 4 ## ----
## Is there a difference in the sugar concentration between plants that flowered early
## in the season compared to plants that folowered late in the season?

Date <- as.Date(paste("17",Orchid.Q1$date,sep="-"),"%y-%d-%b")
Date1 <- format(Date, format="%m-%d")

Uniquedate.Mean.Sugar <- NULL
Uniquedate.Date.vec <- NULL
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  # Unique.date <- unique(Orchid.Q1$date[INDEX])
   Unique.date <- unique(Date1[INDEX])
  if(length(Unique.date)==1){
    Uniquedate.Mean.Sugar <- c(Uniquedate.Mean.Sugar, mean(Orchid.Q1$sugar.percent[INDEX]))
    Uniquedate.Date.vec <- c(Uniquedate.Date.vec,as.character(Unique.date))
  }
}

# Q4 Scatterplot ----
df.Q4 <- data.frame(Date = Uniquedate.Date.vec, Mean.Sugar=Uniquedate.Mean.Sugar)
ggplot(data=df.Q4, aes(Date,Mean.Sugar)) + geom_point(size=3,col=I("deepskyblue"),alpha=0.5)+
  labs(x="Date",y="Sugar Concentration",title="Scatterplot of the Average Sugar Concentration per Plant by Date of Measurement")+
  theme(plot.title = element_text(hjust = 0.5))

## Question 5 ## ----
## Is there a difference between the sugar concentration in the early opened flowers and lately
## opened flowers on an individual plant, i.e. bottom versus top of the plant?
# My process of the dataset may have some glitch!

#  Q5 -- Prat I -----
Bi.Mean.Sugar <- NULL
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  Unique.date <- unique(Date[INDEX])
  Sugar.plant <- Orchid.Q1$sugar.percent[INDEX]
  Date.plant <- Date[INDEX]
  if(length(unique(Date.plant))==2){
    Mean.Sugar1 <- mean(Sugar.plant[Date.plant==Unique.date[1]])
    Mean.Sugar2 <- mean(Sugar.plant[Date.plant==Unique.date[2]])
    Bi.Mean.Sugar <- rbind(Bi.Mean.Sugar,c(Mean.Sugar1,Mean.Sugar2))
  }
}
Diff.diff.time <- Bi.Mean.Sugar[,1]-Bi.Mean.Sugar[,2]
Q5.table0 <- rbind(cbind(apply(Bi.Mean.Sugar,2,mean),apply(Bi.Mean.Sugar,2,sd)),
                  c(mean(Diff.diff.time),sd(Diff.diff.time)))
Q5.table <- cbind(dim(Bi.Mean.Sugar)[1],Q5.table0,Q5.table0[,2]/sqrt(dim(Bi.Mean.Sugar)[1]))

row.names(Q5.table) <- c("Early date","Later date","Difference")
colnames(Q5.table) <- c("N","Mean","StDev","SE Mean")
round(Q5.table,3)
##             N   Mean StDev SE Mean
## Early date 39 24.916 2.635   0.422
## Later date 39 22.534 5.549   0.889
## Difference 39  2.383 5.434   0.870
# Q5 Boxplot I ----
# boxplot(Diff.diff.time)
df.Q5.1 <- data.frame(Diff=Diff.diff.time)
ggplot(df.Q5.1,aes(x="",y=Diff)) + geom_boxplot(width=0.35) + 
  labs(x="",y="Difference",title="Boxplot of Differences") +theme(plot.title = element_text(hjust = 0.5))

t.test(Bi.Mean.Sugar[,1],Bi.Mean.Sugar[,2],paired = T)
## 
##  Paired t-test
## 
## data:  Bi.Mean.Sugar[, 1] and Bi.Mean.Sugar[, 2]
## t = 2.7383, df = 38, p-value = 0.009348
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.6211556 4.1440154
## sample estimates:
## mean of the differences 
##                2.382585
# Q5 -- Part II ----
Bottom.vec <- NULL
Top.vec <- NULL
count = 0
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  Sugar.plant <- Orchid.Q1$sugar.percent[INDEX]
  n <- length(Sugar.plant)
  if(length(unique(Day.vec[INDEX]))==1 & length(Sugar.plant)>=6){
    Top.vec <- c(Top.vec,mean(Sugar.plant[1:3]))
    Bottom.vec <- c(Bottom.vec,mean(Sugar.plant[(n-2):n]))
    count=count+1
  }
}
Q5.Mat <- cbind(count, rbind(c(mean(Top.vec),sd(Top.vec)),c(mean(Bottom.vec),sd(Bottom.vec)),c(mean(Top.vec-Bottom.vec),sd(Top.vec-Bottom.vec))))
Q5.Mat <- cbind(Q5.Mat,Q5.Mat[,3]/sqrt(Q5.Mat[,1]))
rownames(Q5.Mat) <- c("Bottom flowers","Top flowers", "Difference")
colnames(Q5.Mat) <- c("N","Mean","StDev","SE Mean")
Q5.Mat
##                 N       Mean    StDev   SE Mean
## Bottom flowers 67 24.2462687 5.374162 0.6565584
## Top flowers    67 23.6019900 2.944629 0.3597436
## Difference     67  0.6442786 4.562685 0.5574207
# Q5 Boxplot II ----
df.Q5.2 <- data.frame(Diff=Top.vec-Bottom.vec)
ggplot(df.Q5.2,aes(x="",y=Diff)) + geom_boxplot(width=0.35) + 
  labs(x="",y="Difference",title="Boxplot of Differences") + 
  theme(plot.title = element_text(hjust = 0.5))

t.test(Top.vec,Bottom.vec,paired = T)
## 
##  Paired t-test
## 
## data:  Top.vec and Bottom.vec
## t = 1.1558, df = 66, p-value = 0.2519
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4686477  1.7572049
## sample estimates:
## mean of the differences 
##               0.6442786
# Question 6 ## ----
## Is there a difference in the sugar concentration between plants with a large
## inflourescene and a plant with a small inflourescence?
flower.no.total <- NULL
for(j in Plant.Index){
  INDEX <- Orchid.Q1$plant==j
  Plant.Flower.vec <- Orchid.Q1$flower.No[INDEX]
  flower.no.total <- c(flower.no.total,Plant.Flower.vec[length(Plant.Flower.vec)])
}

# Q6 Scatterplot ----
df.Q6 <- data.frame(Mean.Sugar=Mean.Sugar.vec,Flower.Total=flower.no.total,Date=Day.num.vec) # Mean.Sugar.vec is from Q2.
ggplot(df.Q6,aes(Flower.Total,Mean.Sugar)) + geom_point(alpha=0.7,size=3,col="gold2") +
  labs(x="Number of Flowers",y="Sugar Concentration",title="Scatterplot of the Average Sugar Concentration per Plant by the Number of Flowers on the Plant") + 
  theme(plot.title = element_text(hjust = 0.5))

Mod.Q6 <- lm(Mean.Sugar~Flower.Total,data=df.Q6)
summary(Mod.Q6)
## 
## Call:
## lm(formula = Mean.Sugar ~ Flower.Total, data = df.Q6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.0069  -1.8828   0.3507   2.6062  11.4803 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.33589    0.75162   31.05   <2e-16 ***
## Flower.Total  0.05699    0.08504    0.67    0.504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.114 on 166 degrees of freedom
## Multiple R-squared:  0.002698,   Adjusted R-squared:  -0.00331 
## F-statistic: 0.4491 on 1 and 166 DF,  p-value: 0.5037
anova(Mod.Q6)
## Analysis of Variance Table
## 
## Response: Mean.Sugar
##               Df Sum Sq Mean Sq F value Pr(>F)
## Flower.Total   1    7.6  7.5994  0.4491 0.5037
## Residuals    166 2809.1 16.9220
Mod1.Q6 <- lm(Mean.Sugar~Flower.Total+Date,data=df.Q6)
summary(Mod1.Q6)
## 
## Call:
## lm(formula = Mean.Sugar ~ Flower.Total + Date, data = df.Q6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8914  -2.1183   0.1565   2.7022  11.8499 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.57044    1.41641  18.053   <2e-16 ***
## Flower.Total  0.04591    0.08463   0.543   0.5882    
## Date         -0.15445    0.08322  -1.856   0.0652 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.084 on 165 degrees of freedom
## Multiple R-squared:  0.02309,    Adjusted R-squared:  0.01125 
## F-statistic:  1.95 on 2 and 165 DF,  p-value: 0.1455
anova(Mod1.Q6)
## Analysis of Variance Table
## 
## Response: Mean.Sugar
##               Df  Sum Sq Mean Sq F value  Pr(>F)  
## Flower.Total   1    7.60   7.599  0.4557 0.50059  
## Date           1   57.45  57.447  3.4448 0.06523 .
## Residuals    165 2751.60  16.676                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Question 7 ## ----
## How does the scent fluctuate within a twenty-four-hour period?
Orchid.Q7 <- Orchid[complete.cases(Orchid[,c("time","scent")]),]
Orchid.Q7$time <- as.POSIXct(Orchid.Q7$time, format="%H:%M")

# Q7 - Scatterplot
ggplot(Orchid.Q7,aes(time,scent)) + geom_point(alpha=0.2,size=3,col="slateblue1") +
  labs(x="Time of Day",y="Strenth of Scent", title="Strength of the Scent of Flowers Varying by Time of Day")