Markdown Author: Jessie Bell, 2023
Libraries Used: MASS, ggplot, emmeans
fakedata <- read.csv("fakeHALOdata.csv")
gt(fakedata)|>
opt_table_font(google_font("Corbel"))|>
opt_stylize(style=4, color="pink")
| date | time | ambientweather.F | high.tide.ft | low.tide.ft | tide.range.ft | windspeedanddirection.mph | cell_count | onsite_notes | picking_notes | lunar_illumination_percent | next_full_days | lunar_distance_mi | nitrate_mg | nitrate_volts | act_cond_us_cm | spc_cond_um_cm | salinity_pu | resistivity_ohm_cm | density | total_diss_solids_ppt | ph | ph_mv | orp_mv | chl_a_flu_rfu | water_temp_c | baro_pressure_mmhg | pressure_psi | depth_ft | lat | long |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12/1/2023 | 5:30 PM | 41 | 6.6 | -1.300000 | 7.90000 | S6 | 19 | FAKEDATA | FAKEDATA | 83 | 25 | 247101.0 | 28.26 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 48.73 | -122.51 |
| 12/2/2023 | 10:02 AM | 45 | 9.2 | -0.700000 | 9.90000 | S5 | 22 | FAKEDATA | FAKEDATA | 72 | 24 | 249605.0 | 0.91 | NA | 29514.89 | 43101.90 | 27.22 | 34.04 | 1.02 | 28.02 | 7.68 | -45.38 | 275.24 | 0.00661993 | 8.50 | 743.4 | 14.58 | 33.46 | 48.73 | -122.51 |
| 12/3/2023 | 10:43 AM | 44 | 9.0 | 0.100000 | 8.90000 | S20 | 30 | FAKEDATA | FAKEDATA | 64 | 23 | 250758.0 | 42.84 | NA | 32863.91 | 47810.86 | 30.53 | 30.43 | 1.02 | 31.08 | 8.16 | -43.40 | 219.66 | 0.00372587 | 8.70 | 743.4 | 14.66 | 33.65 | 48.73 | -122.51 |
| 12/4/2023 | 10:56 AM | 44 | 8.8 | 0.900000 | 7.90000 | N7 | 10 | FAKEDATA | FAKEDATA | 54 | 22 | 251148.0 | 21.45 | 61.72 | 30027.95 | 44527.21 | 28.14 | 33.30 | 1.02 | 28.94 | 8.22 | -46.80 | 190.62 | 0.00326642 | 7.95 | 743.4 | 14.82 | 34.02 | 48.73 | -122.51 |
| 12/5/2023 | 11:46 AM | 51 | 8.7 | 3.900000 | 4.80000 | S3 | 15 | FAKEDATA | FAKEDATA | 45 | 21 | 250730.0 | 30.08 | 53.98 | 33779.56 | 48485.04 | 31.07 | 29.60 | 1.02 | 31.52 | 8.25 | -48.70 | 206.56 | 0.00143601 | 9.16 | 743.4 | 14.76 | 33.87 | 48.73 | -122.51 |
| 12/6/2023 | 11:38 AM | 48 | 8.6 | 3.000000 | 5.60000 | Calm | 29 | FAKEDATA | FAKEDATA | 39 | 20 | 249520.0 | 16.11 | 68.64 | 19552.55 | 28587.31 | 17.35 | 51.15 | 1.01 | 18.58 | 8.32 | -52.01 | 216.59 | 0.00346331 | 8.49 | 743.4 | 14.65 | 33.62 | 48.73 | -122.51 |
| 12/7/2023 | 12:29 PM | 42 | 8.6 | 2.000000 | 6.60000 | W6 | 23 | FAKEDATA | FAKEDATA | 27 | 19 | 247601.0 | 0.77 | 140.12 | 11830.73 | 17635.78 | 10.24 | 84.62 | 1.01 | 11.46 | 8.15 | -42.58 | 204.16 | 0.00523515 | 7.79 | 743.4 | 14.68 | 33.69 | 48.73 | -122.51 |
| 12/8/2023 | 12:51 PM | 42 | 8.6 | 1.000000 | 7.60000 | S3 | 0 | FAKEDATA | FAKEDATA | 19 | 18 | 245113.0 | 1.53 | 123.78 | 8340.95 | 12609.22 | 7.12 | 119.91 | 1.01 | 8.20 | 8.19 | -44.66 | 188.39 | 0.00342754 | 7.29 | 743.4 | 14.94 | 34.28 | 48.73 | -122.51 |
| 12/9/2023 | 1:13 PM | 37 | 8.6 | 0.000000 | 8.60000 | SE13 | 4 | FAKEDATA | FAKEDATA | 11 | 17 | 241993.0 | 18.12 | 65.75 | 31840.40 | 46991.37 | 29.89 | 31.41 | 1.02 | 30.54 | 7.74 | -19.92 | 251.90 | 0.00114000 | 8.08 | 743.4 | 14.86 | 34.09 | 48.73 | -122.51 |
| 12/10/2023 | 1:36 PM | 43 | 8.7 | -1.000000 | 9.70000 | NE5 | 3 | FAKEDATA | FAKEDATA | 5 | 16 | 238965.0 | 3.37 | 105.96 | 33343.61 | 47760.38 | 30.57 | 29.99 | 1.02 | 31.04 | 8.34 | -53.26 | 223.71 | 0.00037800 | 9.28 | 743.4 | 14.78 | 33.92 | 48.73 | -122.51 |
| 12/11/2023 | 2:00 PM | 47 | 8.7 | -1.800000 | 10.50000 | N8 | 25 | FAKEDATA | FAKEDATA | 2 | 15 | 236034.0 | 11.93 | 75.78 | 31983.05 | 46475.11 | 29.60 | 31.28 | 1.02 | 30.21 | 8.36 | -54.27 | 210.61 | 0.00046883 | 8.66 | 743.4 | 14.88 | 34.14 | 48.73 | -122.51 |
| 12/12/2023 | 1:10 PM | 45 | 8.4 | -2.400000 | 10.80000 | NE6 | 55 | FAKEDATA | FAKEDATA | 0 | 14 | 233623.0 | 14.22 | 71.80 | 33274.26 | 47539.65 | 30.43 | 30.05 | 1.02 | 30.90 | 8.35 | -54.06 | 200.72 | 0.00000000 | 9.38 | 743.4 | 14.87 | 34.13 | 48.73 | -122.51 |
| 12/13/2023 | 2:52 PM | 45 | 8.8 | -2.700000 | 11.50000 | S6 | 20 | FAKEDATA | FAKEDATA | 83 | 13 | 235694.1 | 28.26 | 61.72 | 30027.95 | 47810.86 | 27.22 | 29.60 | 1.02 | 28.02 | 7.68 | -48.17 | 203.63 | -0.00037195 | 8.74 | 743.4 | 14.92 | 34.23 | 48.73 | -122.51 |
| 12/14/2023 | 3:25 PM | 45 | 8.7 | -1.461538 | 10.16154 | S5 | 4 | FAKEDATA | FAKEDATA | 72 | 12 | 234234.3 | 0.91 | 53.98 | 33779.56 | 44527.21 | 30.53 | 51.15 | 1.02 | 31.08 | 8.16 | -48.54 | 201.39 | -0.00087578 | 8.79 | 743.4 | 14.94 | 34.29 | 48.73 | -122.51 |
| 12/15/2023 | 4:09 PM | 45 | 8.4 | -1.681319 | 10.08132 | S20 | 2 | FAKEDATA | FAKEDATA | 64 | 11 | 232774.6 | 42.84 | 68.64 | 19552.55 | 48485.04 | 28.14 | 84.62 | 1.02 | 28.94 | 8.22 | -48.92 | 199.14 | -0.00137961 | 8.83 | 743.4 | 14.97 | 34.34 | 48.73 | -122.51 |
| 12/16/2023 | 8:52 AM | 45 | 9.5 | -1.901099 | 11.40110 | N8 | 10 | FAKEDATA | FAKEDATA | 54 | 10 | 231314.8 | 21.45 | 140.12 | 11830.73 | 28587.31 | 31.07 | 119.91 | 1.02 | 31.52 | 8.25 | -49.30 | 196.90 | -0.00188343 | 8.87 | 743.4 | 14.99 | 34.40 | 48.73 | -122.51 |
| 12/17/2023 | 9:33 AM | 45 | 9.4 | -2.120879 | 11.52088 | S4 | 13 | FAKEDATA | FAKEDATA | 45 | 9 | 229855.0 | 30.08 | 123.78 | 8340.95 | 17635.78 | 17.35 | 31.41 | 1.01 | 18.58 | 8.32 | -49.67 | 194.65 | -0.00238726 | 8.92 | 743.4 | 15.02 | 34.45 | 48.73 | -122.51 |
| 12/18/2023 | 10:11 AM | 45 | 9.3 | -2.340659 | 11.64066 | Calm | 6 | FAKEDATA | FAKEDATA | 39 | 8 | 228395.3 | 16.11 | 65.75 | 31840.40 | 12609.22 | 10.24 | 29.99 | 1.01 | 11.46 | 8.15 | -50.05 | 192.41 | -0.00289109 | 8.96 | 743.4 | 15.04 | 34.51 | 48.73 | -122.51 |
| 12/19/2023 | 10:47 AM | 45 | 9.3 | -2.560440 | 11.86044 | W7 | 23 | FAKEDATA | FAKEDATA | 27 | 7 | 226935.5 | 0.77 | 105.96 | 33343.61 | 46991.37 | 7.12 | 31.28 | 1.01 | 8.20 | 8.19 | -50.42 | 190.16 | -0.00339491 | 9.01 | 743.4 | 15.07 | 34.56 | 48.73 | -122.51 |
| 12/20/2023 | 11:21 AM | 45 | 9.2 | -2.780220 | 11.98022 | S4 | 0 | FAKEDATA | FAKEDATA | 19 | 6 | 225475.7 | 1.53 | 75.78 | 31983.05 | 47760.38 | 29.89 | 30.05 | 1.02 | 30.54 | 7.74 | -50.80 | 187.91 | -0.00389874 | 9.05 | 743.4 | 15.09 | 34.62 | 48.73 | -122.51 |
| 12/21/2023 | 11:52 AM | 45 | 9.2 | -3.000000 | 12.20000 | SE14 | 4 | FAKEDATA | FAKEDATA | 11 | 5 | 224016.0 | 18.12 | 71.80 | 33274.26 | 46475.11 | 30.57 | 29.60 | 1.02 | 31.04 | 8.34 | -51.17 | 185.67 | -0.00440256 | 9.09 | 743.4 | 15.12 | 34.67 | 48.73 | -122.51 |
| 12/22/2023 | 12:23 PM | 45 | 9.1 | -3.219780 | 12.31978 | NE6 | 3 | FAKEDATA | FAKEDATA | 5 | 4 | 222556.2 | 3.37 | 61.72 | 30027.95 | 47539.65 | 29.60 | 51.15 | 1.02 | 30.21 | 8.36 | -51.55 | 183.42 | -0.00490639 | 9.14 | 743.4 | 15.14 | 34.73 | 48.73 | -122.51 |
| 12/23/2023 | 12:53 PM | 45 | 9.0 | -3.439560 | 12.43956 | N9 | 25 | FAKEDATA | FAKEDATA | 2 | 3 | 221096.4 | 11.93 | 53.98 | 33779.56 | 47810.86 | 30.43 | 84.62 | 1.02 | 30.90 | 8.35 | -51.93 | 181.18 | -0.00541022 | 9.18 | 743.4 | 15.17 | 34.79 | 48.73 | -122.51 |
| 12/24/2023 | 1:24 PM | 45 | 8.9 | -3.659341 | 12.55934 | NE7 | 55 | FAKEDATA | FAKEDATA | 0 | 2 | 219636.7 | 14.22 | 68.64 | 19552.55 | 44527.21 | 27.22 | 119.91 | 1.02 | 28.02 | 7.68 | -52.30 | 178.93 | -0.00591404 | 9.23 | 743.4 | 15.19 | 34.84 | 48.73 | -122.51 |
| 12/25/2023 | 1:56 PM | 45 | 8.6 | -3.879121 | 12.47912 | S6 | 25 | FAKEDATA | FAKEDATA | 83 | 1 | 218176.9 | 0.77 | 140.12 | 11830.73 | 48485.04 | 30.53 | 31.41 | 1.02 | 31.08 | 8.16 | -52.68 | 176.69 | -0.00641787 | 9.27 | 743.4 | 15.22 | 34.90 | 48.73 | -122.51 |
| 12/26/2023 | 2:31 PM | 45 | 8.4 | -4.098901 | 12.49890 | S5 | 4 | FAKEDATA | FAKEDATA | 72 | 0 | 216717.2 | 1.53 | 123.78 | 8340.95 | 28587.31 | 28.14 | 29.99 | 1.02 | 28.94 | 8.22 | -53.05 | 174.44 | -0.00692170 | 9.31 | 743.4 | 15.24 | 34.95 | 48.73 | -122.51 |
| 12/27/2023 | 3:09 PM | 45 | 8.2 | -4.318681 | 12.51868 | S20 | 2 | FAKEDATA | FAKEDATA | 64 | -1 | 215257.4 | 18.12 | 65.75 | 31840.40 | 17635.78 | 31.07 | 31.28 | 1.02 | 31.52 | 8.25 | -53.43 | 172.20 | -0.00742552 | 9.36 | 743.4 | 15.27 | 35.01 | 48.73 | -122.51 |
| 12/28/2023 | 3:51 PM | 45 | 7.8 | -4.538462 | 12.33846 | N9 | 10 | FAKEDATA | FAKEDATA | 54 | -2 | 213797.6 | 3.37 | 105.96 | 33343.61 | 12609.22 | 17.35 | 30.05 | 1.01 | 18.58 | 8.32 | -53.81 | 169.95 | -0.00792935 | 9.40 | 743.4 | 15.29 | 35.06 | 48.73 | -122.51 |
| 12/29/2023 | 8:04 AM | 45 | 9.6 | -4.758242 | 14.35824 | S5 | 13 | FAKEDATA | FAKEDATA | 45 | -3 | 212337.9 | 11.93 | 75.78 | 31983.05 | 46991.37 | 10.24 | 31.00 | 1.01 | 11.46 | 8.15 | -54.18 | 167.70 | -0.00843318 | 9.45 | 743.4 | 15.32 | 35.12 | 48.73 | -122.51 |
| 12/30/2023 | 8:37 AM | 45 | 9.4 | -4.978022 | 14.37802 | Calm | 6 | FAKEDATA | FAKEDATA | 39 | -4 | 210878.1 | 14.22 | 71.80 | 33274.26 | 47760.38 | 7.12 | 31.00 | 1.01 | 8.20 | 8.19 | -54.56 | 165.46 | -0.00893700 | 9.49 | 743.4 | 15.34 | 35.17 | 48.73 | -122.51 |
#data exploration
#separating data into what I feel like looking at
one <- fakedata[6]
five <- fakedata[8]
seven <- fakedata[13:14]
twelve <- fakedata[16]
seventeen <- fakedata[18]
twenty <- fakedata[21:22]
twentyone <- fakedata[25]
wantedData <- data.frame(c(five, seven, twelve, seventeen, twenty, twentyone))
#got rid of water temp and tide_range because of high collinearity between tidal range, water temp, and chl_a
ggpairs(wantedData)
#rid of lunar dist
one <- fakedata[6]
five <- fakedata[8]
seven <- fakedata[14]
twelve <- fakedata[16]
seventeen <- fakedata[18]
twenty <- fakedata[22]
twentyone <- fakedata[25]
wantedData2 <- data.frame(c(five, seven, twelve, seventeen, twenty, twentyone))
#got rid of total dissolevd solids because high collinearity with salinity
#got rid of lunar_dist because of high correlation with chl a and dist.
ggpairs(wantedData2)
#strongest relationship is salinity and nitrate
#strongest with cell count is chl A, then pH, then nitrate
#checking some graphs
ggplot(wantedData2, aes(chl_a_flu_rfu, cell_count))+
geom_point()
ggplot(wantedData2, aes(ph, cell_count))+
geom_point()
ggplot(wantedData2, aes(nitrate_mg, cell_count))+
geom_point()
ggplot(wantedData2, aes(act_cond_us_cm, cell_count))+
geom_point()
ggplot(wantedData2, aes(salinity_pu, cell_count))+
geom_point()
ggplot(wantedData2, aes(cell_count))+
geom_histogram(binwidth = 5, color="pink", fill="white")
#trying to create a model
#checking variance and mean of response variable:
sd(fakedata$cell_count)
## [1] 14.3126
mean(fakedata$cell_count) #pretty much equal! Poisson assumes this, great! Moving on.
## [1] 15.33333
#create first model
model1 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, data = wantedData2, family = poisson(link = "log"))
summary(model1) #null deviance is 366.4 (28 df), residual deviance is 349.88
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## act_cond_us_cm + salinity_pu, family = poisson(link = "log"),
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.072e+00 1.724e+00 3.523 0.000427 ***
## chl_a_flu_rfu 2.862e+01 1.147e+01 2.495 0.012587 *
## ph -4.237e-01 2.089e-01 -2.029 0.042491 *
## nitrate_mg 3.929e-03 3.936e-03 0.998 0.318214
## act_cond_us_cm -2.387e-06 5.450e-06 -0.438 0.661433
## salinity_pu 6.221e-03 6.164e-03 1.009 0.312831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 349.88 on 23 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 476.41
##
## Number of Fisher Scoring iterations: 5
#using single term deletions
drop1(model1, test="Chi") #drop highest p
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm +
## salinity_pu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 349.88 476.41
## chl_a_flu_rfu 1 356.09 480.62 6.2098 0.01270 *
## ph 1 353.87 478.40 3.9944 0.04565 *
## nitrate_mg 1 350.87 475.40 0.9880 0.32022
## act_cond_us_cm 1 350.07 474.60 0.1906 0.66238
## salinity_pu 1 350.91 475.44 1.0337 0.30930
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#dropping actual conductivity
model2 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu, data = wantedData2, family = poisson)
summary(model2)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## salinity_pu, family = poisson, data = wantedData2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.980213 1.708772 3.500 0.000466 ***
## chl_a_flu_rfu 28.964830 11.449965 2.530 0.011416 *
## ph -0.418068 0.208290 -2.007 0.044735 *
## nitrate_mg 0.003892 0.003942 0.987 0.323427
## salinity_pu 0.005554 0.005960 0.932 0.351383
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 350.07 on 24 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 474.6
##
## Number of Fisher Scoring iterations: 5
drop1(model2, test="Chi") #all are significant (except salinity and nitrate, will drop highest p, salinity.
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 350.07 474.60
## chl_a_flu_rfu 1 356.45 478.98 6.3755 0.01157 *
## ph 1 353.98 476.51 3.9091 0.04803 *
## nitrate_mg 1 351.04 473.57 0.9672 0.32537
## salinity_pu 1 350.95 473.48 0.8795 0.34834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#drop salinity
model3 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, data = wantedData2, family = poisson)
summary(model3)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = poisson,
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.313762 1.682046 3.754 0.000174 ***
## chl_a_flu_rfu 28.885225 11.354740 2.544 0.010963 *
## ph -0.444180 0.207662 -2.139 0.032439 *
## nitrate_mg 0.004875 0.003802 1.282 0.199766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 350.95 on 25 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 473.48
##
## Number of Fisher Scoring iterations: 5
#nitrate not significant still.
#drop nitrate
model4 <- glm(cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, family = poisson)
summary(model4) #all are significant here. #has the lowest AIC value. GREAT! Model 4 looks good.
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = poisson,
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.1960 1.6813 3.685 0.000228 ***
## chl_a_flu_rfu 30.6709 11.1631 2.748 0.006005 **
## ph -0.4208 0.2071 -2.032 0.042126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 352.57 on 26 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 473.1
##
## Number of Fisher Scoring iterations: 5
#All slopes are sig dif. from 0 and coefficients have small effect on halo cell counts (more below)
#I think poison model 4 is best.
#plot assumptions
par(mfrow=c(2,2))
plot(model4)
#look at AIC too
allmodels <- c(model1$aic, model2$aic, model3$aic, model4$aic)
labels <- c("M1", "M2", "M3", "M4")
AIC <- data.frame(rbind(labels, allmodels)) #model 4 is best
gt(AIC)|>
tab_options(
column_labels.hidden = T)|>
opt_table_font(google_font("Caveat"))|>
opt_stylize(color="cyan", style = 3)|>
fmt_number(decimals = 2)
| M1 | M2 | M3 | M4 |
| 476.410289798928 | 474.600932934541 | 473.480429008004 | 473.10482469859 |
#model 4 has lowest AIC, so might be best?
#checking overdispersion (when variance is larger than the mean)
residual_deviance <- deviance(model4)
df <- df.residual(model4)
# Calculate the ratio of residual deviance to degrees of freedom
residual_deviance / df #*greater than 1
## [1] 13.56055
#...so we can assume these data are overdispersed and negaitve binomial might be the way to go
#lets check model 1
residual_deviance <- deviance(model1)
df <- df.residual(model1)
# Calculate the ratio of residual deviance to degrees of freedom
residual_deviance / df #*greater than 1
## [1] 15.21216
#another way to calculate over dispersion (from Sob)
pp<- sum(resid(model4, type="pearson")^2)
1-pchisq(pp, model4$df.resid)
## [1] 0
#this says no overdispersion? WEIRD. The method above from zuur says there is overdispersion.
#Approx./Psuedo R^2
1-(model4$deviance/model4$null)
## [1] 0.03774392
#hot dang that is low.
avPlots(model4, pch=1, col="black", col.lines = "#00bfc4")
#get estimated marginal means with emmeans (this means visualizing the effects of one variable given the others--like an added variable plot but on the scale of the response)
#Create new dataframe over which to make predictions
# Remove rows with NA in the 'ph' column
wantedData2 <- wantedData2[complete.cases(wantedData2$ph), ]
# Create new dataframe over which to make predictions
#Look up emmeans (Estimated Marginal Means) to understand arguments
new_df <- emmeans(model4, ~ chl_a_flu_rfu + ph,
at = list(ph = seq(min(wantedData2$ph),
max(wantedData2$ph),
length.out = 100)))
ggplot(wantedData, aes(x = chl_a_flu_rfu, y = cell_count, col = ph)) +
geom_point() +
geom_line(data = as.data.frame(new_df), aes(y = emmean))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
plot(wantedData2$chl_a_flu_rfu, wantedData2$cell_count, ylab="Halo Cell Count", xlab="Chlorophyll a (RFU)", pch=16, col="tomato")
lines(wantedData2$chl_a_flu_rfu, predict(model4), col="red", lwd=3)
BOTH QUASI AND NEG BINOM WERE WORKED OUT FULLY BUT NONE WERE SIGNIFICANT
#moving onto quasi poisson to correct for over-dispersion
# begin with the quasi model
model5 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, data = wantedData2, family = quasipoisson)
summary(model5)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## act_cond_us_cm + salinity_pu, family = quasipoisson, data = wantedData2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.072e+00 6.899e+00 0.880 0.388
## chl_a_flu_rfu 2.862e+01 4.591e+01 0.623 0.539
## ph -4.237e-01 8.361e-01 -0.507 0.617
## nitrate_mg 3.929e-03 1.576e-02 0.249 0.805
## act_cond_us_cm -2.387e-06 2.182e-05 -0.109 0.914
## salinity_pu 6.221e-03 2.467e-02 0.252 0.803
##
## (Dispersion parameter for quasipoisson family taken to be 16.02372)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 349.88 on 23 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
#You can see the only difference is specifying the family option as quasipoisson instead of poisson. This gives the impression that there is a quasi-Poisson distribution, but there is no such thing! All we do here is specify the mean and variance relationship and an exponential link between the expected values and explanatory variables. It is a software issue to call this ‘quasipoisson’. Do not write in your report or paper that you used a quasi-Poisson distribution. Just say that you did a Poisson GLM, detected overdispersion, and corrected the standard errors using a quasi-GLM model where the variance is given by φ × µ, where µ is the mean and φ the dispersion parameter.
sqrt(16.0237) #the dispersion parameter for quasipoisson
## [1] 4.002961
#To get the numerical output for this model, use summary(model4), which gives the output below. Note that the ratio of the residual deviance and the degrees of freedom is still larger than 1, but that is no longer a problem as we now allow for overdispersion. The dispersion parameter φ is estimated as 16.0237. This means that all standard errors have been multiplied by 4.002961 (the square root of 16.0237), and as a result, most parameters are no longer significant! We can move onto model selection
drop1(model5,test="F") #drop actual conductivity first from model
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm +
## salinity_pu
## Df Deviance F value Pr(>F)
## <none> 349.88
## chl_a_flu_rfu 1 356.09 0.4082 0.5292
## ph 1 353.87 0.2626 0.6132
## nitrate_mg 1 350.87 0.0650 0.8011
## act_cond_us_cm 1 350.07 0.0125 0.9118
## salinity_pu 1 350.91 0.0679 0.7967
model6 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu, data = wantedData2, family = quasipoisson)
summary(model6)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## salinity_pu, family = quasipoisson, data = wantedData2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.980213 6.703116 0.892 0.381
## chl_a_flu_rfu 28.964830 44.915550 0.645 0.525
## ph -0.418068 0.817074 -0.512 0.614
## nitrate_mg 0.003892 0.015463 0.252 0.803
## salinity_pu 0.005554 0.023381 0.238 0.814
##
## (Dispersion parameter for quasipoisson family taken to be 15.3881)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 350.07 on 24 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
#dropping salinity
model7 <- glm(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, data = wantedData2, family = quasipoisson)
summary(model7)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = quasipoisson,
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.313762 6.521931 0.968 0.342
## chl_a_flu_rfu 28.885225 44.026646 0.656 0.518
## ph -0.444180 0.805186 -0.552 0.586
## nitrate_mg 0.004875 0.014741 0.331 0.744
##
## (Dispersion parameter for quasipoisson family taken to be 15.03408)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 350.95 on 25 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
#drop nitrate
model8 <- glm(cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, family = quasipoisson)
summary(model8) #drop ph
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = quasipoisson,
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.1960 6.3963 0.969 0.342
## chl_a_flu_rfu 30.6709 42.4697 0.722 0.477
## ph -0.4208 0.7878 -0.534 0.598
##
## (Dispersion parameter for quasipoisson family taken to be 14.47405)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 352.57 on 26 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
model9 <- glm(cell_count ~ chl_a_flu_rfu, data = wantedData2, family = quasipoisson)
summary(model9)
##
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu, family = quasipoisson,
## data = wantedData2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7760 0.1891 14.678 2.17e-14 ***
## chl_a_flu_rfu 34.7170 42.2255 0.822 0.418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 14.5625)
##
## Null deviance: 366.40 on 28 degrees of freedom
## Residual deviance: 356.56 on 27 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
#not significant :/
#what about negative binomial?
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit negative binomial model
model_nb <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu, link="log", data = wantedData2)
summary(model_nb) #none of the parameters are significant still :/ wah wah
##
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## act_cond_us_cm + salinity_pu, data = wantedData2, link = "log",
## init.theta = 1.121166406)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.422e+00 6.970e+00 0.921 0.357
## chl_a_flu_rfu 3.339e+01 4.388e+01 0.761 0.447
## ph -4.694e-01 8.470e-01 -0.554 0.579
## nitrate_mg 2.692e-03 1.565e-02 0.172 0.863
## act_cond_us_cm -7.689e-07 2.021e-05 -0.038 0.970
## salinity_pu 6.400e-03 2.193e-02 0.292 0.770
##
## (Dispersion parameter for Negative Binomial(1.1212) family taken to be 1)
##
## Null deviance: 34.711 on 28 degrees of freedom
## Residual deviance: 33.493 on 23 degrees of freedom
## AIC: 230.47
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.121
## Std. Err.: 0.312
##
## 2 x log-likelihood: -216.466
#there is a very small amount of overdispersion (1.12 is not high enough) so we can keep going with it.
#model selection process
drop1(model_nb, test = "Chi")
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm +
## salinity_pu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 33.493 228.47
## chl_a_flu_rfu 1 34.088 227.06 0.59502 0.4405
## ph 1 33.859 226.83 0.36587 0.5453
## nitrate_mg 1 33.516 226.49 0.02241 0.8810
## act_cond_us_cm 1 33.495 226.47 0.00129 0.9713
## salinity_pu 1 33.572 226.54 0.07884 0.7789
#drop actual conductivity
model_nb2 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + nitrate_mg+ salinity_pu, link="log", data = wantedData2)
summary(model_nb2)
##
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg +
## salinity_pu, data = wantedData2, link = "log", init.theta = 1.1210926)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.454859 6.934536 0.931 0.352
## chl_a_flu_rfu 33.427615 43.597136 0.767 0.443
## ph -0.475646 0.846258 -0.562 0.574
## nitrate_mg 0.002615 0.015641 0.167 0.867
## salinity_pu 0.006351 0.021554 0.295 0.768
##
## (Dispersion parameter for Negative Binomial(1.1211) family taken to be 1)
##
## Null deviance: 34.710 on 28 degrees of freedom
## Residual deviance: 33.493 on 24 degrees of freedom
## AIC: 228.47
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.121
## Std. Err.: 0.312
##
## 2 x log-likelihood: -216.467
#nothing significant. Drop1 again
drop1(model_nb2, test="Chi")
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 33.493 226.47
## chl_a_flu_rfu 1 34.090 225.06 0.59693 0.4398
## ph 1 33.888 224.86 0.39517 0.5296
## nitrate_mg 1 33.514 224.49 0.02146 0.8835
## salinity_pu 1 33.571 224.54 0.07790 0.7802
#dropping nitrate
model_nb3 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph + salinity_pu, link="log", data = wantedData2)
summary(model_nb3)
##
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + salinity_pu,
## data = wantedData2, link = "log", init.theta = 1.120099512)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.485986 6.918367 0.938 0.349
## chl_a_flu_rfu 35.031927 43.066362 0.813 0.416
## ph -0.477472 0.843166 -0.566 0.571
## salinity_pu 0.007321 0.020990 0.349 0.727
##
## (Dispersion parameter for Negative Binomial(1.1201) family taken to be 1)
##
## Null deviance: 34.684 on 28 degrees of freedom
## Residual deviance: 33.490 on 25 degrees of freedom
## AIC: 226.49
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.120
## Std. Err.: 0.311
##
## 2 x log-likelihood: -216.489
#still nada. :/
drop1(model_nb3, test="Chi")
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph + salinity_pu
## Df Deviance AIC LRT Pr(>Chi)
## <none> 33.490 224.49
## chl_a_flu_rfu 1 34.175 223.17 0.68515 0.4078
## ph 1 33.887 222.88 0.39665 0.5288
## salinity_pu 1 33.602 222.60 0.11195 0.7379
#drop salilinity
model_nb4 <- glm.nb(cell_count ~ chl_a_flu_rfu + ph, link="log", data = wantedData2)
summary(model_nb4)
##
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph, data = wantedData2,
## link = "log", init.theta = 1.115608648)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.7669 6.8660 0.986 0.324
## chl_a_flu_rfu 36.4182 43.0353 0.846 0.397
## ph -0.4898 0.8433 -0.581 0.561
##
## (Dispersion parameter for Negative Binomial(1.1156) family taken to be 1)
##
## Null deviance: 34.571 on 28 degrees of freedom
## Residual deviance: 33.492 on 26 degrees of freedom
## AIC: 224.6
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.116
## Std. Err.: 0.310
##
## 2 x log-likelihood: -216.600
drop1(model_nb4)
## Single term deletions
##
## Model:
## cell_count ~ chl_a_flu_rfu + ph
## Df Deviance AIC
## <none> 33.492 222.60
## chl_a_flu_rfu 1 34.223 221.33
## ph 1 33.917 221.03
#bleh!
model_nb5 <- glm.nb(cell_count ~ chl_a_flu_rfu, link = "log", data = wantedData2)
summary(model_nb5)
##
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu, data = wantedData2,
## link = "log", init.theta = 1.099374933)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7747 0.1997 13.894 <2e-16 ***
## chl_a_flu_rfu 33.8733 42.7832 0.792 0.429
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0994) family taken to be 1)
##
## Null deviance: 34.159 on 28 degrees of freedom
## Residual deviance: 33.514 on 27 degrees of freedom
## AIC: 223.02
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.099
## Std. Err.: 0.304
##
## 2 x log-likelihood: -217.023
#it seems again like there is no significance in the model. Is regular poisson still the best?