Markdown Author: Jessie Bell, 2023

Libraries Used: MASS, ggplot, emmeans

Fake Halo Data Analysis

View the data

fakedata <- read.csv("fakeHALOdata.csv")

gt(fakedata)|>
  opt_table_font(google_font("Corbel"))|>
  opt_stylize(style=4, color="pink")
date time ambientweather.F high.tide.ft low.tide.ft tide.range.ft windspeedanddirection.mph cell_count onsite_notes picking_notes lunar_illumination_percent next_full_days lunar_distance_mi nitrate_mg nitrate_volts act_cond_us_cm spc_cond_um_cm salinity_pu resistivity_ohm_cm density total_diss_solids_ppt ph ph_mv orp_mv chl_a_flu_rfu water_temp_c baro_pressure_mmhg pressure_psi depth_ft lat long
12/1/2023 5:30 PM 41 6.6 -1.300000 7.90000 S6 19 FAKEDATA FAKEDATA 83 25 247101.0 28.26 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 48.73 -122.51
12/2/2023 10:02 AM 45 9.2 -0.700000 9.90000 S5 22 FAKEDATA FAKEDATA 72 24 249605.0 0.91 NA 29514.89 43101.90 27.22 34.04 1.02 28.02 7.68 -45.38 275.24 0.00661993 8.50 743.4 14.58 33.46 48.73 -122.51
12/3/2023 10:43 AM 44 9.0 0.100000 8.90000 S20 30 FAKEDATA FAKEDATA 64 23 250758.0 42.84 NA 32863.91 47810.86 30.53 30.43 1.02 31.08 8.16 -43.40 219.66 0.00372587 8.70 743.4 14.66 33.65 48.73 -122.51
12/4/2023 10:56 AM 44 8.8 0.900000 7.90000 N7 10 FAKEDATA FAKEDATA 54 22 251148.0 21.45 61.72 30027.95 44527.21 28.14 33.30 1.02 28.94 8.22 -46.80 190.62 0.00326642 7.95 743.4 14.82 34.02 48.73 -122.51
12/5/2023 11:46 AM 51 8.7 3.900000 4.80000 S3 15 FAKEDATA FAKEDATA 45 21 250730.0 30.08 53.98 33779.56 48485.04 31.07 29.60 1.02 31.52 8.25 -48.70 206.56 0.00143601 9.16 743.4 14.76 33.87 48.73 -122.51
12/6/2023 11:38 AM 48 8.6 3.000000 5.60000 Calm 29 FAKEDATA FAKEDATA 39 20 249520.0 16.11 68.64 19552.55 28587.31 17.35 51.15 1.01 18.58 8.32 -52.01 216.59 0.00346331 8.49 743.4 14.65 33.62 48.73 -122.51
12/7/2023 12:29 PM 42 8.6 2.000000 6.60000 W6 23 FAKEDATA FAKEDATA 27 19 247601.0 0.77 140.12 11830.73 17635.78 10.24 84.62 1.01 11.46 8.15 -42.58 204.16 0.00523515 7.79 743.4 14.68 33.69 48.73 -122.51
12/8/2023 12:51 PM 42 8.6 1.000000 7.60000 S3 0 FAKEDATA FAKEDATA 19 18 245113.0 1.53 123.78 8340.95 12609.22 7.12 119.91 1.01 8.20 8.19 -44.66 188.39 0.00342754 7.29 743.4 14.94 34.28 48.73 -122.51
12/9/2023 1:13 PM 37 8.6 0.000000 8.60000 SE13 4 FAKEDATA FAKEDATA 11 17 241993.0 18.12 65.75 31840.40 46991.37 29.89 31.41 1.02 30.54 7.74 -19.92 251.90 0.00114000 8.08 743.4 14.86 34.09 48.73 -122.51
12/10/2023 1:36 PM 43 8.7 -1.000000 9.70000 NE5 3 FAKEDATA FAKEDATA 5 16 238965.0 3.37 105.96 33343.61 47760.38 30.57 29.99 1.02 31.04 8.34 -53.26 223.71 0.00037800 9.28 743.4 14.78 33.92 48.73 -122.51
12/11/2023 2:00 PM 47 8.7 -1.800000 10.50000 N8 25 FAKEDATA FAKEDATA 2 15 236034.0 11.93 75.78 31983.05 46475.11 29.60 31.28 1.02 30.21 8.36 -54.27 210.61 0.00046883 8.66 743.4 14.88 34.14 48.73 -122.51
12/12/2023 1:10 PM 45 8.4 -2.400000 10.80000 NE6 55 FAKEDATA FAKEDATA 0 14 233623.0 14.22 71.80 33274.26 47539.65 30.43 30.05 1.02 30.90 8.35 -54.06 200.72 0.00000000 9.38 743.4 14.87 34.13 48.73 -122.51
12/13/2023 2:52 PM 45 8.8 -2.700000 11.50000 S6 20 FAKEDATA FAKEDATA 83 13 235694.1 28.26 61.72 30027.95 47810.86 27.22 29.60 1.02 28.02 7.68 -48.17 203.63 -0.00037195 8.74 743.4 14.92 34.23 48.73 -122.51
12/14/2023 3:25 PM 45 8.7 -1.461538 10.16154 S5 4 FAKEDATA FAKEDATA 72 12 234234.3 0.91 53.98 33779.56 44527.21 30.53 51.15 1.02 31.08 8.16 -48.54 201.39 -0.00087578 8.79 743.4 14.94 34.29 48.73 -122.51
12/15/2023 4:09 PM 45 8.4 -1.681319 10.08132 S20 2 FAKEDATA FAKEDATA 64 11 232774.6 42.84 68.64 19552.55 48485.04 28.14 84.62 1.02 28.94 8.22 -48.92 199.14 -0.00137961 8.83 743.4 14.97 34.34 48.73 -122.51
12/16/2023 8:52 AM 45 9.5 -1.901099 11.40110 N8 10 FAKEDATA FAKEDATA 54 10 231314.8 21.45 140.12 11830.73 28587.31 31.07 119.91 1.02 31.52 8.25 -49.30 196.90 -0.00188343 8.87 743.4 14.99 34.40 48.73 -122.51
12/17/2023 9:33 AM 45 9.4 -2.120879 11.52088 S4 13 FAKEDATA FAKEDATA 45 9 229855.0 30.08 123.78 8340.95 17635.78 17.35 31.41 1.01 18.58 8.32 -49.67 194.65 -0.00238726 8.92 743.4 15.02 34.45 48.73 -122.51
12/18/2023 10:11 AM 45 9.3 -2.340659 11.64066 Calm 6 FAKEDATA FAKEDATA 39 8 228395.3 16.11 65.75 31840.40 12609.22 10.24 29.99 1.01 11.46 8.15 -50.05 192.41 -0.00289109 8.96 743.4 15.04 34.51 48.73 -122.51
12/19/2023 10:47 AM 45 9.3 -2.560440 11.86044 W7 23 FAKEDATA FAKEDATA 27 7 226935.5 0.77 105.96 33343.61 46991.37 7.12 31.28 1.01 8.20 8.19 -50.42 190.16 -0.00339491 9.01 743.4 15.07 34.56 48.73 -122.51
12/20/2023 11:21 AM 45 9.2 -2.780220 11.98022 S4 0 FAKEDATA FAKEDATA 19 6 225475.7 1.53 75.78 31983.05 47760.38 29.89 30.05 1.02 30.54 7.74 -50.80 187.91 -0.00389874 9.05 743.4 15.09 34.62 48.73 -122.51
12/21/2023 11:52 AM 45 9.2 -3.000000 12.20000 SE14 4 FAKEDATA FAKEDATA 11 5 224016.0 18.12 71.80 33274.26 46475.11 30.57 29.60 1.02 31.04 8.34 -51.17 185.67 -0.00440256 9.09 743.4 15.12 34.67 48.73 -122.51
12/22/2023 12:23 PM 45 9.1 -3.219780 12.31978 NE6 3 FAKEDATA FAKEDATA 5 4 222556.2 3.37 61.72 30027.95 47539.65 29.60 51.15 1.02 30.21 8.36 -51.55 183.42 -0.00490639 9.14 743.4 15.14 34.73 48.73 -122.51
12/23/2023 12:53 PM 45 9.0 -3.439560 12.43956 N9 25 FAKEDATA FAKEDATA 2 3 221096.4 11.93 53.98 33779.56 47810.86 30.43 84.62 1.02 30.90 8.35 -51.93 181.18 -0.00541022 9.18 743.4 15.17 34.79 48.73 -122.51
12/24/2023 1:24 PM 45 8.9 -3.659341 12.55934 NE7 55 FAKEDATA FAKEDATA 0 2 219636.7 14.22 68.64 19552.55 44527.21 27.22 119.91 1.02 28.02 7.68 -52.30 178.93 -0.00591404 9.23 743.4 15.19 34.84 48.73 -122.51
12/25/2023 1:56 PM 45 8.6 -3.879121 12.47912 S6 25 FAKEDATA FAKEDATA 83 1 218176.9 0.77 140.12 11830.73 48485.04 30.53 31.41 1.02 31.08 8.16 -52.68 176.69 -0.00641787 9.27 743.4 15.22 34.90 48.73 -122.51
12/26/2023 2:31 PM 45 8.4 -4.098901 12.49890 S5 4 FAKEDATA FAKEDATA 72 0 216717.2 1.53 123.78 8340.95 28587.31 28.14 29.99 1.02 28.94 8.22 -53.05 174.44 -0.00692170 9.31 743.4 15.24 34.95 48.73 -122.51
12/27/2023 3:09 PM 45 8.2 -4.318681 12.51868 S20 2 FAKEDATA FAKEDATA 64 -1 215257.4 18.12 65.75 31840.40 17635.78 31.07 31.28 1.02 31.52 8.25 -53.43 172.20 -0.00742552 9.36 743.4 15.27 35.01 48.73 -122.51
12/28/2023 3:51 PM 45 7.8 -4.538462 12.33846 N9 10 FAKEDATA FAKEDATA 54 -2 213797.6 3.37 105.96 33343.61 12609.22 17.35 30.05 1.01 18.58 8.32 -53.81 169.95 -0.00792935 9.40 743.4 15.29 35.06 48.73 -122.51
12/29/2023 8:04 AM 45 9.6 -4.758242 14.35824 S5 13 FAKEDATA FAKEDATA 45 -3 212337.9 11.93 75.78 31983.05 46991.37 10.24 31.00 1.01 11.46 8.15 -54.18 167.70 -0.00843318 9.45 743.4 15.32 35.12 48.73 -122.51
12/30/2023 8:37 AM 45 9.4 -4.978022 14.37802 Calm 6 FAKEDATA FAKEDATA 39 -4 210878.1 14.22 71.80 33274.26 47760.38 7.12 31.00 1.01 8.20 8.19 -54.56 165.46 -0.00893700 9.49 743.4 15.34 35.17 48.73 -122.51

Poisson

#data exploration


#separating data into what I feel like looking at
one <- fakedata[6]
five <- fakedata[8]
seven <- fakedata[13:14]
twelve <- fakedata[16]
seventeen <- fakedata[18]
twenty <- fakedata[21:22]
twentyone <- fakedata[25]

wantedData <- data.frame(c(five, seven, twelve, seventeen, twenty, twentyone))


#got rid of water temp and tide_range because of high collinearity between tidal range, water temp, and chl_a
ggpairs(wantedData)

#rid of lunar dist
one <- fakedata[6]
five <- fakedata[8]
seven <- fakedata[14]
twelve <- fakedata[16]
seventeen <- fakedata[18]
twenty <- fakedata[22]
twentyone <- fakedata[25]

wantedData2 <- data.frame(c(five, seven, twelve, seventeen, twenty, twentyone))

#got rid of total dissolevd solids because high collinearity with salinity
#got rid of lunar_dist because of high correlation with chl a and dist. 
ggpairs(wantedData2)

#strongest relationship is salinity and nitrate
#strongest with cell count is chl A, then pH, then nitrate

#checking some graphs
ggplot(wantedData2, aes(chl_a_flu_rfu, cell_count))+
  geom_point()

ggplot(wantedData2, aes(ph, cell_count))+
  geom_point()

ggplot(wantedData2, aes(nitrate_mg, cell_count))+
  geom_point()

ggplot(wantedData2, aes(act_cond_us_cm, cell_count))+
  geom_point()

ggplot(wantedData2, aes(salinity_pu, cell_count))+
  geom_point()

ggplot(wantedData2, aes(cell_count))+
  geom_histogram(binwidth = 5, color="pink", fill="white")

#trying to create a model
#checking variance and mean of response variable: 
sd(fakedata$cell_count)
## [1] 14.3126
mean(fakedata$cell_count) #pretty much equal! Poisson assumes this, great! Moving on. 
## [1] 15.33333
#create first model
model1 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, data = wantedData2, family = poisson(link = "log"))
summary(model1) #null deviance is 366.4 (28 df), residual deviance is 349.88
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, family = poisson(link = "log"), 
##     data = wantedData2)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     6.072e+00  1.724e+00   3.523 0.000427 ***
## chl_a_flu_rfu   2.862e+01  1.147e+01   2.495 0.012587 *  
## ph             -4.237e-01  2.089e-01  -2.029 0.042491 *  
## nitrate_mg      3.929e-03  3.936e-03   0.998 0.318214    
## act_cond_us_cm -2.387e-06  5.450e-06  -0.438 0.661433    
## salinity_pu     6.221e-03  6.164e-03   1.009 0.312831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 349.88  on 23  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 476.41
## 
## Number of Fisher Scoring iterations: 5
#using single term deletions
drop1(model1, test="Chi") #drop highest p 
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance    AIC    LRT Pr(>Chi)  
## <none>              349.88 476.41                  
## chl_a_flu_rfu   1   356.09 480.62 6.2098  0.01270 *
## ph              1   353.87 478.40 3.9944  0.04565 *
## nitrate_mg      1   350.87 475.40 0.9880  0.32022  
## act_cond_us_cm  1   350.07 474.60 0.1906  0.66238  
## salinity_pu     1   350.91 475.44 1.0337  0.30930  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#dropping actual conductivity
model2 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu, data = wantedData2, family = poisson)
summary(model2)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, family = poisson, data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    5.980213   1.708772   3.500 0.000466 ***
## chl_a_flu_rfu 28.964830  11.449965   2.530 0.011416 *  
## ph            -0.418068   0.208290  -2.007 0.044735 *  
## nitrate_mg     0.003892   0.003942   0.987 0.323427    
## salinity_pu    0.005554   0.005960   0.932 0.351383    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.07  on 24  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 474.6
## 
## Number of Fisher Scoring iterations: 5
drop1(model2, test="Chi") #all are significant (except salinity and nitrate, will drop highest p, salinity. 
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
##               Df Deviance    AIC    LRT Pr(>Chi)  
## <none>             350.07 474.60                  
## chl_a_flu_rfu  1   356.45 478.98 6.3755  0.01157 *
## ph             1   353.98 476.51 3.9091  0.04803 *
## nitrate_mg     1   351.04 473.57 0.9672  0.32537  
## salinity_pu    1   350.95 473.48 0.8795  0.34834  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#drop salinity

model3 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, data = wantedData2, family = poisson)
summary(model3)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = poisson, 
##     data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    6.313762   1.682046   3.754 0.000174 ***
## chl_a_flu_rfu 28.885225  11.354740   2.544 0.010963 *  
## ph            -0.444180   0.207662  -2.139 0.032439 *  
## nitrate_mg     0.004875   0.003802   1.282 0.199766    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.95  on 25  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 473.48
## 
## Number of Fisher Scoring iterations: 5
#nitrate not significant still. 

#drop nitrate
model4 <- glm(cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, family = poisson)
summary(model4) #all are significant here. #has the lowest AIC value. GREAT! Model 4 looks good. 
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = poisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     6.1960     1.6813   3.685 0.000228 ***
## chl_a_flu_rfu  30.6709    11.1631   2.748 0.006005 ** 
## ph             -0.4208     0.2071  -2.032 0.042126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 352.57  on 26  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 473.1
## 
## Number of Fisher Scoring iterations: 5
#All slopes are sig dif. from 0 and coefficients have small effect on halo cell counts (more below)



#I think poison model 4 is best. 
#plot assumptions 
par(mfrow=c(2,2))
plot(model4)

#look at AIC too

allmodels <- c(model1$aic, model2$aic, model3$aic, model4$aic)
labels <- c("M1", "M2", "M3", "M4")
AIC <- data.frame(rbind(labels, allmodels)) #model 4 is best

gt(AIC)|>
  tab_options(
    column_labels.hidden = T)|>
  opt_table_font(google_font("Caveat"))|>
  opt_stylize(color="cyan", style = 3)|>
  fmt_number(decimals = 2)
M1 M2 M3 M4
476.410289798928 474.600932934541 473.480429008004 473.10482469859
#model 4 has lowest AIC, so might be best?



#checking overdispersion (when variance is larger than the mean)
residual_deviance <- deviance(model4)
df <- df.residual(model4)


# Calculate the ratio of residual deviance to degrees of freedom
residual_deviance / df #*greater than 1
## [1] 13.56055
#...so we can assume these data are overdispersed and negaitve binomial might be the way to go


#lets check model 1
residual_deviance <- deviance(model1)
df <- df.residual(model1)

# Calculate the ratio of residual deviance to degrees of freedom
residual_deviance / df #*greater than 1
## [1] 15.21216
#another way to calculate over dispersion (from Sob)
pp<- sum(resid(model4, type="pearson")^2)
1-pchisq(pp, model4$df.resid)
## [1] 0
#this says no overdispersion? WEIRD. The method above from zuur says there is overdispersion. 


#Approx./Psuedo R^2
1-(model4$deviance/model4$null)
## [1] 0.03774392
#hot dang that is low. 

Added Variable Plots

avPlots(model4, pch=1, col="black", col.lines = "#00bfc4")

Model with observed and predicted points

#get estimated marginal means with emmeans (this means visualizing the effects of one variable given the others--like an added variable plot but on the scale of the response)

#Create new dataframe over which to make predictions
# Remove rows with NA in the 'ph' column
wantedData2 <- wantedData2[complete.cases(wantedData2$ph), ]

# Create new dataframe over which to make predictions
#Look up emmeans (Estimated Marginal Means) to understand arguments
new_df <- emmeans(model4, ~ chl_a_flu_rfu + ph, 
                           at = list(ph = seq(min(wantedData2$ph), 
                                             max(wantedData2$ph), 
                                             length.out = 100)))



ggplot(wantedData, aes(x = chl_a_flu_rfu, y = cell_count, col = ph)) +
  geom_point() +
  geom_line(data = as.data.frame(new_df), aes(y = emmean))
## Warning: Removed 1 rows containing missing values (`geom_point()`).

plot(wantedData2$chl_a_flu_rfu, wantedData2$cell_count, ylab="Halo Cell Count", xlab="Chlorophyll a (RFU)", pch=16, col="tomato")
lines(wantedData2$chl_a_flu_rfu, predict(model4), col="red", lwd=3)

Quasi-poisson

BOTH QUASI AND NEG BINOM WERE WORKED OUT FULLY BUT NONE WERE SIGNIFICANT

#moving onto quasi poisson to correct for over-dispersion
# begin with the quasi model

model5 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, data = wantedData2, family = quasipoisson)
summary(model5)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, family = quasipoisson, data = wantedData2)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)     6.072e+00  6.899e+00   0.880    0.388
## chl_a_flu_rfu   2.862e+01  4.591e+01   0.623    0.539
## ph             -4.237e-01  8.361e-01  -0.507    0.617
## nitrate_mg      3.929e-03  1.576e-02   0.249    0.805
## act_cond_us_cm -2.387e-06  2.182e-05  -0.109    0.914
## salinity_pu     6.221e-03  2.467e-02   0.252    0.803
## 
## (Dispersion parameter for quasipoisson family taken to be 16.02372)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 349.88  on 23  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
#You can see the only difference is specifying the family option as quasipoisson instead of poisson. This gives the impression that there is a quasi-Poisson distribution, but there is no such thing! All we do here is specify the mean and variance relationship and an exponential link between the expected values and explanatory variables. It is a software issue to call this ‘quasipoisson’. Do not write in your report or paper that you used a quasi-Poisson distribution. Just say that you did a Poisson GLM, detected overdispersion, and corrected the standard errors using a quasi-GLM model where the variance is given by φ × µ, where µ is the mean and φ the dispersion parameter.
sqrt(16.0237) #the dispersion parameter for quasipoisson
## [1] 4.002961
#To get the numerical output for this model, use summary(model4), which gives the output below. Note that the ratio of the residual deviance and the degrees of freedom is still larger than 1, but that is no longer a problem as we now allow for overdispersion. The dispersion parameter φ is estimated as 16.0237. This means that all standard errors have been multiplied by 4.002961 (the square root of 16.0237), and as a result, most parameters are no longer significant! We can move onto model selection


drop1(model5,test="F") #drop actual conductivity first from model
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance F value Pr(>F)
## <none>              349.88               
## chl_a_flu_rfu   1   356.09  0.4082 0.5292
## ph              1   353.87  0.2626 0.6132
## nitrate_mg      1   350.87  0.0650 0.8011
## act_cond_us_cm  1   350.07  0.0125 0.9118
## salinity_pu     1   350.91  0.0679 0.7967
model6 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu, data = wantedData2, family = quasipoisson)
summary(model6)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, family = quasipoisson, data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.980213   6.703116   0.892    0.381
## chl_a_flu_rfu 28.964830  44.915550   0.645    0.525
## ph            -0.418068   0.817074  -0.512    0.614
## nitrate_mg     0.003892   0.015463   0.252    0.803
## salinity_pu    0.005554   0.023381   0.238    0.814
## 
## (Dispersion parameter for quasipoisson family taken to be 15.3881)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.07  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
#dropping salinity
model7 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, data = wantedData2, family = quasipoisson)
summary(model7)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    6.313762   6.521931   0.968    0.342
## chl_a_flu_rfu 28.885225  44.026646   0.656    0.518
## ph            -0.444180   0.805186  -0.552    0.586
## nitrate_mg     0.004875   0.014741   0.331    0.744
## 
## (Dispersion parameter for quasipoisson family taken to be 15.03408)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.95  on 25  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
#drop nitrate
model8 <- glm(cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, family = quasipoisson)
summary(model8) #drop ph
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)     6.1960     6.3963   0.969    0.342
## chl_a_flu_rfu  30.6709    42.4697   0.722    0.477
## ph             -0.4208     0.7878  -0.534    0.598
## 
## (Dispersion parameter for quasipoisson family taken to be 14.47405)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 352.57  on 26  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
model9 <- glm(cell_count ~ chl_a_flu_rfu, data = wantedData2, family = quasipoisson)
summary(model9)
## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.7760     0.1891  14.678 2.17e-14 ***
## chl_a_flu_rfu  34.7170    42.2255   0.822    0.418    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 14.5625)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 356.56  on 27  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
#not significant :/

#what about negative binomial?

Negative Binomial

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Fit negative binomial model
model_nb <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, link="log", data = wantedData2)
summary(model_nb) #none of the parameters are significant still :/ wah wah
## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, data = wantedData2, link = "log", 
##     init.theta = 1.121166406)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)     6.422e+00  6.970e+00   0.921    0.357
## chl_a_flu_rfu   3.339e+01  4.388e+01   0.761    0.447
## ph             -4.694e-01  8.470e-01  -0.554    0.579
## nitrate_mg      2.692e-03  1.565e-02   0.172    0.863
## act_cond_us_cm -7.689e-07  2.021e-05  -0.038    0.970
## salinity_pu     6.400e-03  2.193e-02   0.292    0.770
## 
## (Dispersion parameter for Negative Binomial(1.1212) family taken to be 1)
## 
##     Null deviance: 34.711  on 28  degrees of freedom
## Residual deviance: 33.493  on 23  degrees of freedom
## AIC: 230.47
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.121 
##           Std. Err.:  0.312 
## 
##  2 x log-likelihood:  -216.466
#there is a very small amount of overdispersion (1.12 is not high enough) so we can keep going with it. 
#model selection process
drop1(model_nb, test = "Chi")
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance    AIC     LRT Pr(>Chi)
## <none>              33.493 228.47                 
## chl_a_flu_rfu   1   34.088 227.06 0.59502   0.4405
## ph              1   33.859 226.83 0.36587   0.5453
## nitrate_mg      1   33.516 226.49 0.02241   0.8810
## act_cond_us_cm  1   33.495 226.47 0.00129   0.9713
## salinity_pu     1   33.572 226.54 0.07884   0.7789
#drop actual conductivity

model_nb2 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg+ salinity_pu, link="log", data = wantedData2)
summary(model_nb2)
## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, data = wantedData2, link = "log", init.theta = 1.1210926)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    6.454859   6.934536   0.931    0.352
## chl_a_flu_rfu 33.427615  43.597136   0.767    0.443
## ph            -0.475646   0.846258  -0.562    0.574
## nitrate_mg     0.002615   0.015641   0.167    0.867
## salinity_pu    0.006351   0.021554   0.295    0.768
## 
## (Dispersion parameter for Negative Binomial(1.1211) family taken to be 1)
## 
##     Null deviance: 34.710  on 28  degrees of freedom
## Residual deviance: 33.493  on 24  degrees of freedom
## AIC: 228.47
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.121 
##           Std. Err.:  0.312 
## 
##  2 x log-likelihood:  -216.467
#nothing significant. Drop1 again

drop1(model_nb2, test="Chi")
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
##               Df Deviance    AIC     LRT Pr(>Chi)
## <none>             33.493 226.47                 
## chl_a_flu_rfu  1   34.090 225.06 0.59693   0.4398
## ph             1   33.888 224.86 0.39517   0.5296
## nitrate_mg     1   33.514 224.49 0.02146   0.8835
## salinity_pu    1   33.571 224.54 0.07790   0.7802
#dropping nitrate

model_nb3 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + salinity_pu, link="log", data = wantedData2)
summary(model_nb3)
## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + salinity_pu, 
##     data = wantedData2, link = "log", init.theta = 1.120099512)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    6.485986   6.918367   0.938    0.349
## chl_a_flu_rfu 35.031927  43.066362   0.813    0.416
## ph            -0.477472   0.843166  -0.566    0.571
## salinity_pu    0.007321   0.020990   0.349    0.727
## 
## (Dispersion parameter for Negative Binomial(1.1201) family taken to be 1)
## 
##     Null deviance: 34.684  on 28  degrees of freedom
## Residual deviance: 33.490  on 25  degrees of freedom
## AIC: 226.49
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.120 
##           Std. Err.:  0.311 
## 
##  2 x log-likelihood:  -216.489
#still nada. :/

drop1(model_nb3, test="Chi")
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + salinity_pu
##               Df Deviance    AIC     LRT Pr(>Chi)
## <none>             33.490 224.49                 
## chl_a_flu_rfu  1   34.175 223.17 0.68515   0.4078
## ph             1   33.887 222.88 0.39665   0.5288
## salinity_pu    1   33.602 222.60 0.11195   0.7379
#drop salilinity

model_nb4 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph, link="log", data = wantedData2)
summary(model_nb4)
## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, 
##     link = "log", init.theta = 1.115608648)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)     6.7669     6.8660   0.986    0.324
## chl_a_flu_rfu  36.4182    43.0353   0.846    0.397
## ph             -0.4898     0.8433  -0.581    0.561
## 
## (Dispersion parameter for Negative Binomial(1.1156) family taken to be 1)
## 
##     Null deviance: 34.571  on 28  degrees of freedom
## Residual deviance: 33.492  on 26  degrees of freedom
## AIC: 224.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.116 
##           Std. Err.:  0.310 
## 
##  2 x log-likelihood:  -216.600
drop1(model_nb4)
## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph
##               Df Deviance    AIC
## <none>             33.492 222.60
## chl_a_flu_rfu  1   34.223 221.33
## ph             1   33.917 221.03
#bleh!

model_nb5 <- glm.nb(cell_count ~ chl_a_flu_rfu, link = "log", data = wantedData2)
summary(model_nb5)      
## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu, data = wantedData2, 
##     link = "log", init.theta = 1.099374933)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.7747     0.1997  13.894   <2e-16 ***
## chl_a_flu_rfu  33.8733    42.7832   0.792    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0994) family taken to be 1)
## 
##     Null deviance: 34.159  on 28  degrees of freedom
## Residual deviance: 33.514  on 27  degrees of freedom
## AIC: 223.02
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.099 
##           Std. Err.:  0.304 
## 
##  2 x log-likelihood:  -217.023
#it seems again like there is no significance in the model. Is regular poisson still the best?