Fake Halo Data Analysis

View the data

# Load the (fake) HALO monitoring data from the working directory.
fakedata <- read.csv(file = "fakeHALOdata.csv")

# Render the full data set as a styled gt table.
fakedata |>
  gt() |>
  opt_table_font(font = google_font(name = "Corbel")) |>
  opt_stylize(style = 4, color = "pink")

date	time	ambientweather.F	high.tide.ft	low.tide.ft	tide.range.ft	windspeedanddirection.mph	cell_count	onsite_notes	picking_notes	lunar_illumination_percent	next_full_days	lunar_distance_mi	nitrate_mg	nitrate_volts	act_cond_us_cm	spc_cond_um_cm	salinity_pu	resistivity_ohm_cm	density	total_diss_solids_ppt	ph	ph_mv	orp_mv	chl_a_flu_rfu	water_temp_c	baro_pressure_mmhg	pressure_psi	depth_ft	lat	long
12/1/2023	5:30 PM	41	6.6	-1.300000	7.90000	S6	19	FAKEDATA	FAKEDATA	83	25	247101.0	28.26	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	48.73	-122.51
12/2/2023	10:02 AM	45	9.2	-0.700000	9.90000	S5	22	FAKEDATA	FAKEDATA	72	24	249605.0	0.91	NA	29514.89	43101.90	27.22	34.04	1.02	28.02	7.68	-45.38	275.24	0.00661993	8.50	743.4	14.58	33.46	48.73	-122.51
12/3/2023	10:43 AM	44	9.0	0.100000	8.90000	S20	30	FAKEDATA	FAKEDATA	64	23	250758.0	42.84	NA	32863.91	47810.86	30.53	30.43	1.02	31.08	8.16	-43.40	219.66	0.00372587	8.70	743.4	14.66	33.65	48.73	-122.51
12/4/2023	10:56 AM	44	8.8	0.900000	7.90000	N7	10	FAKEDATA	FAKEDATA	54	22	251148.0	21.45	61.72	30027.95	44527.21	28.14	33.30	1.02	28.94	8.22	-46.80	190.62	0.00326642	7.95	743.4	14.82	34.02	48.73	-122.51
12/5/2023	11:46 AM	51	8.7	3.900000	4.80000	S3	15	FAKEDATA	FAKEDATA	45	21	250730.0	30.08	53.98	33779.56	48485.04	31.07	29.60	1.02	31.52	8.25	-48.70	206.56	0.00143601	9.16	743.4	14.76	33.87	48.73	-122.51
12/6/2023	11:38 AM	48	8.6	3.000000	5.60000	Calm	29	FAKEDATA	FAKEDATA	39	20	249520.0	16.11	68.64	19552.55	28587.31	17.35	51.15	1.01	18.58	8.32	-52.01	216.59	0.00346331	8.49	743.4	14.65	33.62	48.73	-122.51
12/7/2023	12:29 PM	42	8.6	2.000000	6.60000	W6	23	FAKEDATA	FAKEDATA	27	19	247601.0	0.77	140.12	11830.73	17635.78	10.24	84.62	1.01	11.46	8.15	-42.58	204.16	0.00523515	7.79	743.4	14.68	33.69	48.73	-122.51
12/8/2023	12:51 PM	42	8.6	1.000000	7.60000	S3	0	FAKEDATA	FAKEDATA	19	18	245113.0	1.53	123.78	8340.95	12609.22	7.12	119.91	1.01	8.20	8.19	-44.66	188.39	0.00342754	7.29	743.4	14.94	34.28	48.73	-122.51
12/9/2023	1:13 PM	37	8.6	0.000000	8.60000	SE13	4	FAKEDATA	FAKEDATA	11	17	241993.0	18.12	65.75	31840.40	46991.37	29.89	31.41	1.02	30.54	7.74	-19.92	251.90	0.00114000	8.08	743.4	14.86	34.09	48.73	-122.51
12/10/2023	1:36 PM	43	8.7	-1.000000	9.70000	NE5	3	FAKEDATA	FAKEDATA	5	16	238965.0	3.37	105.96	33343.61	47760.38	30.57	29.99	1.02	31.04	8.34	-53.26	223.71	0.00037800	9.28	743.4	14.78	33.92	48.73	-122.51
12/11/2023	2:00 PM	47	8.7	-1.800000	10.50000	N8	25	FAKEDATA	FAKEDATA	2	15	236034.0	11.93	75.78	31983.05	46475.11	29.60	31.28	1.02	30.21	8.36	-54.27	210.61	0.00046883	8.66	743.4	14.88	34.14	48.73	-122.51
12/12/2023	1:10 PM	45	8.4	-2.400000	10.80000	NE6	55	FAKEDATA	FAKEDATA	0	14	233623.0	14.22	71.80	33274.26	47539.65	30.43	30.05	1.02	30.90	8.35	-54.06	200.72	0.00000000	9.38	743.4	14.87	34.13	48.73	-122.51
12/13/2023	2:52 PM	45	8.8	-2.700000	11.50000	S6	20	FAKEDATA	FAKEDATA	83	13	235694.1	28.26	61.72	30027.95	47810.86	27.22	29.60	1.02	28.02	7.68	-48.17	203.63	-0.00037195	8.74	743.4	14.92	34.23	48.73	-122.51
12/14/2023	3:25 PM	45	8.7	-1.461538	10.16154	S5	4	FAKEDATA	FAKEDATA	72	12	234234.3	0.91	53.98	33779.56	44527.21	30.53	51.15	1.02	31.08	8.16	-48.54	201.39	-0.00087578	8.79	743.4	14.94	34.29	48.73	-122.51
12/15/2023	4:09 PM	45	8.4	-1.681319	10.08132	S20	2	FAKEDATA	FAKEDATA	64	11	232774.6	42.84	68.64	19552.55	48485.04	28.14	84.62	1.02	28.94	8.22	-48.92	199.14	-0.00137961	8.83	743.4	14.97	34.34	48.73	-122.51
12/16/2023	8:52 AM	45	9.5	-1.901099	11.40110	N8	10	FAKEDATA	FAKEDATA	54	10	231314.8	21.45	140.12	11830.73	28587.31	31.07	119.91	1.02	31.52	8.25	-49.30	196.90	-0.00188343	8.87	743.4	14.99	34.40	48.73	-122.51
12/17/2023	9:33 AM	45	9.4	-2.120879	11.52088	S4	13	FAKEDATA	FAKEDATA	45	9	229855.0	30.08	123.78	8340.95	17635.78	17.35	31.41	1.01	18.58	8.32	-49.67	194.65	-0.00238726	8.92	743.4	15.02	34.45	48.73	-122.51
12/18/2023	10:11 AM	45	9.3	-2.340659	11.64066	Calm	6	FAKEDATA	FAKEDATA	39	8	228395.3	16.11	65.75	31840.40	12609.22	10.24	29.99	1.01	11.46	8.15	-50.05	192.41	-0.00289109	8.96	743.4	15.04	34.51	48.73	-122.51
12/19/2023	10:47 AM	45	9.3	-2.560440	11.86044	W7	23	FAKEDATA	FAKEDATA	27	7	226935.5	0.77	105.96	33343.61	46991.37	7.12	31.28	1.01	8.20	8.19	-50.42	190.16	-0.00339491	9.01	743.4	15.07	34.56	48.73	-122.51
12/20/2023	11:21 AM	45	9.2	-2.780220	11.98022	S4	0	FAKEDATA	FAKEDATA	19	6	225475.7	1.53	75.78	31983.05	47760.38	29.89	30.05	1.02	30.54	7.74	-50.80	187.91	-0.00389874	9.05	743.4	15.09	34.62	48.73	-122.51
12/21/2023	11:52 AM	45	9.2	-3.000000	12.20000	SE14	4	FAKEDATA	FAKEDATA	11	5	224016.0	18.12	71.80	33274.26	46475.11	30.57	29.60	1.02	31.04	8.34	-51.17	185.67	-0.00440256	9.09	743.4	15.12	34.67	48.73	-122.51
12/22/2023	12:23 PM	45	9.1	-3.219780	12.31978	NE6	3	FAKEDATA	FAKEDATA	5	4	222556.2	3.37	61.72	30027.95	47539.65	29.60	51.15	1.02	30.21	8.36	-51.55	183.42	-0.00490639	9.14	743.4	15.14	34.73	48.73	-122.51
12/23/2023	12:53 PM	45	9.0	-3.439560	12.43956	N9	25	FAKEDATA	FAKEDATA	2	3	221096.4	11.93	53.98	33779.56	47810.86	30.43	84.62	1.02	30.90	8.35	-51.93	181.18	-0.00541022	9.18	743.4	15.17	34.79	48.73	-122.51
12/24/2023	1:24 PM	45	8.9	-3.659341	12.55934	NE7	55	FAKEDATA	FAKEDATA	0	2	219636.7	14.22	68.64	19552.55	44527.21	27.22	119.91	1.02	28.02	7.68	-52.30	178.93	-0.00591404	9.23	743.4	15.19	34.84	48.73	-122.51
12/25/2023	1:56 PM	45	8.6	-3.879121	12.47912	S6	25	FAKEDATA	FAKEDATA	83	1	218176.9	0.77	140.12	11830.73	48485.04	30.53	31.41	1.02	31.08	8.16	-52.68	176.69	-0.00641787	9.27	743.4	15.22	34.90	48.73	-122.51
12/26/2023	2:31 PM	45	8.4	-4.098901	12.49890	S5	4	FAKEDATA	FAKEDATA	72	0	216717.2	1.53	123.78	8340.95	28587.31	28.14	29.99	1.02	28.94	8.22	-53.05	174.44	-0.00692170	9.31	743.4	15.24	34.95	48.73	-122.51
12/27/2023	3:09 PM	45	8.2	-4.318681	12.51868	S20	2	FAKEDATA	FAKEDATA	64	-1	215257.4	18.12	65.75	31840.40	17635.78	31.07	31.28	1.02	31.52	8.25	-53.43	172.20	-0.00742552	9.36	743.4	15.27	35.01	48.73	-122.51
12/28/2023	3:51 PM	45	7.8	-4.538462	12.33846	N9	10	FAKEDATA	FAKEDATA	54	-2	213797.6	3.37	105.96	33343.61	12609.22	17.35	30.05	1.01	18.58	8.32	-53.81	169.95	-0.00792935	9.40	743.4	15.29	35.06	48.73	-122.51
12/29/2023	8:04 AM	45	9.6	-4.758242	14.35824	S5	13	FAKEDATA	FAKEDATA	45	-3	212337.9	11.93	75.78	31983.05	46991.37	10.24	31.00	1.01	11.46	8.15	-54.18	167.70	-0.00843318	9.45	743.4	15.32	35.12	48.73	-122.51
12/30/2023	8:37 AM	45	9.4	-4.978022	14.37802	Calm	6	FAKEDATA	FAKEDATA	39	-4	210878.1	14.22	71.80	33274.26	47760.38	7.12	31.00	1.01	8.20	8.19	-54.56	165.46	-0.00893700	9.49	743.4	15.34	35.17	48.73	-122.51

Poisson

#data exploration


# Keep the response (cell_count) plus the candidate predictors to explore.
# Columns are selected by name instead of by position, so the selection does
# not silently change if the CSV column order changes.
# Water temperature and tidal range are left out because of high collinearity
# between tidal range, water temperature, and chl a.
wantedData <- fakedata[c(
  "cell_count",
  "lunar_distance_mi",
  "nitrate_mg",
  "act_cond_us_cm",
  "salinity_pu",
  "total_diss_solids_ppt",
  "ph",
  "chl_a_flu_rfu"
)]

# Pairwise scatterplots and correlations for the selected variables.
ggpairs(data = wantedData)

# Second, reduced selection (by column name, not position).
# Dropped relative to wantedData:
#   - total dissolved solids: high collinearity with salinity
#   - lunar distance: high correlation with chl a
wantedData2 <- fakedata[c(
  "cell_count",
  "nitrate_mg",
  "act_cond_us_cm",
  "salinity_pu",
  "ph",
  "chl_a_flu_rfu"
)]

# Pairwise scatterplots and correlations for the reduced set.
ggpairs(data = wantedData2)

#strongest relationship is salinity and nitrate
#strongest with cell count is chl A, then pH, then nitrate

# Quick visual checks: cell count against each candidate predictor.
ggplot(data = wantedData2, mapping = aes(x = chl_a_flu_rfu, y = cell_count)) +
  geom_point()

ggplot(data = wantedData2, mapping = aes(x = ph, y = cell_count)) +
  geom_point()

ggplot(data = wantedData2, mapping = aes(x = nitrate_mg, y = cell_count)) +
  geom_point()

ggplot(data = wantedData2, mapping = aes(x = act_cond_us_cm, y = cell_count)) +
  geom_point()

ggplot(data = wantedData2, mapping = aes(x = salinity_pu, y = cell_count)) +
  geom_point()

# Distribution of the response itself, in bins of 5 cells.
ggplot(data = wantedData2, mapping = aes(x = cell_count)) +
  geom_histogram(binwidth = 5, color = "pink", fill = "white")

# Trying to create a model.
# A Poisson model assumes the VARIANCE of the response equals its mean, so
# compare var() with mean() (the standard deviation is not the right quantity).
var(fakedata$cell_count)

## [1] 204.8506

mean(fakedata$cell_count) # variance is ~13x the mean, so the raw counts look overdispersed; re-check dispersion after fitting

## [1] 15.33333

# First (full) Poisson GLM with a log link, using all five candidate predictors.
model1 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu,
  family = poisson(link = "log"),
  data = wantedData2
)
# Null deviance is 366.4 (28 df); residual deviance is 349.88.
summary(model1)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, family = poisson(link = "log"), 
##     data = wantedData2)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     6.072e+00  1.724e+00   3.523 0.000427 ***
## chl_a_flu_rfu   2.862e+01  1.147e+01   2.495 0.012587 *  
## ph             -4.237e-01  2.089e-01  -2.029 0.042491 *  
## nitrate_mg      3.929e-03  3.936e-03   0.998 0.318214    
## act_cond_us_cm -2.387e-06  5.450e-06  -0.438 0.661433    
## salinity_pu     6.221e-03  6.164e-03   1.009 0.312831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 349.88  on 23  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 476.41
## 
## Number of Fisher Scoring iterations: 5

# Single-term deletions with chi-squared likelihood-ratio tests;
# the term with the highest p-value is the one to drop next.
drop1(model1, test = "Chisq")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance    AIC    LRT Pr(>Chi)  
## <none>              349.88 476.41                  
## chl_a_flu_rfu   1   356.09 480.62 6.2098  0.01270 *
## ph              1   353.87 478.40 3.9944  0.04565 *
## nitrate_mg      1   350.87 475.40 0.9880  0.32022  
## act_cond_us_cm  1   350.07 474.60 0.1906  0.66238  
## salinity_pu     1   350.91 475.44 1.0337  0.30930  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Refit without actual conductivity (highest p-value in the previous step).
model2 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu,
  family = poisson,
  data = wantedData2
)
summary(model2)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, family = poisson, data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    5.980213   1.708772   3.500 0.000466 ***
## chl_a_flu_rfu 28.964830  11.449965   2.530 0.011416 *  
## ph            -0.418068   0.208290  -2.007 0.044735 *  
## nitrate_mg     0.003892   0.003942   0.987 0.323427    
## salinity_pu    0.005554   0.005960   0.932 0.351383    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.07  on 24  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 474.6
## 
## Number of Fisher Scoring iterations: 5

# chl a and pH are significant; salinity and nitrate are not, so drop the one
# with the highest p-value (salinity).
drop1(model2, test = "Chisq")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
##               Df Deviance    AIC    LRT Pr(>Chi)  
## <none>             350.07 474.60                  
## chl_a_flu_rfu  1   356.45 478.98 6.3755  0.01157 *
## ph             1   353.98 476.51 3.9091  0.04803 *
## nitrate_mg     1   351.04 473.57 0.9672  0.32537  
## salinity_pu    1   350.95 473.48 0.8795  0.34834  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Refit without salinity.
model3 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg,
  family = poisson,
  data = wantedData2
)
summary(model3)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = poisson, 
##     data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    6.313762   1.682046   3.754 0.000174 ***
## chl_a_flu_rfu 28.885225  11.354740   2.544 0.010963 *  
## ph            -0.444180   0.207662  -2.139 0.032439 *  
## nitrate_mg     0.004875   0.003802   1.282 0.199766    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.95  on 25  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 473.48
## 
## Number of Fisher Scoring iterations: 5

# Nitrate is still not significant, so refit without it.
model4 <- glm(
  cell_count ~ chl_a_flu_rfu + ph,
  family = poisson,
  data = wantedData2
)
# All remaining terms are significant and this model has the lowest AIC.
# GREAT! Model 4 looks good.
summary(model4)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = poisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     6.1960     1.6813   3.685 0.000228 ***
## chl_a_flu_rfu  30.6709    11.1631   2.748 0.006005 ** 
## ph             -0.4208     0.2071  -2.032 0.042126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 352.57  on 26  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 473.1
## 
## Number of Fisher Scoring iterations: 5

#All slopes are significantly different from 0, and the coefficients have a small effect on halo cell counts (more below)



# I think Poisson model 4 is best.
# Plot the standard GLM diagnostic panels in a 2 x 2 grid, then restore the
# previous graphics settings so later base plots are not squeezed into one
# cell of that grid.
old_par <- par(mfrow = c(2, 2))
plot(model4)
par(old_par)

# Compare the four Poisson models by AIC (lower is better).
allmodels <- c(model1$aic, model2$aic, model3$aic, model4$aic)
labels <- c("M1", "M2", "M3", "M4")

# One numeric column per model. Binding the labels and the values into the same
# rows would coerce the AICs to character, and fmt_number() would then leave
# them unrounded. The table is also no longer called `AIC`, which masked
# stats::AIC().
aic_table <- as.data.frame(as.list(setNames(allmodels, labels))) # model 4 is best

gt(aic_table) |>
  opt_table_font(google_font("Caveat")) |>
  opt_stylize(color = "cyan", style = 3) |>
  fmt_number(decimals = 2)

M1	M2	M3	M4
476.410289798928	474.600932934541	473.480429008004	473.10482469859

#model 4 has lowest AIC, so might be best?



# Checking overdispersion (when the variance is larger than the mean):
# the ratio of residual deviance to residual degrees of freedom is compared
# with 1. The ratio is computed directly, without temporaries, so nothing
# named `df` masks stats::df() and no value is silently reused between models.
deviance(model4) / df.residual(model4) # much greater than 1

## [1] 13.56055

# ...so we can assume these data are overdispersed and negative binomial might be the way to go


# Let's check model 1 the same way.
deviance(model1) / df.residual(model1) # also much greater than 1

## [1] 15.21216

# Another way to assess overdispersion (from Sob): compare the sum of squared
# Pearson residuals with a chi-squared distribution on the residual df.
pearson_chisq <- sum(residuals(model4, type = "pearson")^2)
1 - pchisq(pearson_chisq, df = df.residual(model4))

## [1] 0

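#A p-value of (essentially) 0 means the Pearson statistic is far larger than expected under a Poisson model, i.e. strong evidence OF overdispersion. This agrees with the deviance/df method from Zuur above.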


# Approximate (pseudo) R^2: share of the null deviance explained by model 4.
1 - (model4$deviance / model4$null.deviance)

## [1] 0.03774392

#hot dang that is low.

Added Variable Plots

# Added-variable plots for each term in model 4.
avPlots(
  model4,
  pch = 1,
  col = "black",
  col.lines = "#00bfc4"
)

Model with observed and predicted points

# Get estimated marginal means with emmeans: the effect of one variable given
# the others -- like an added variable plot, but on the scale of the response.

# Remove rows with NA in the 'ph' column.
wantedData2 <- wantedData2[!is.na(wantedData2$ph), ]

# Create a new data frame over which to make predictions.
# (Look up emmeans, Estimated Marginal Means, to understand the arguments.)
# The grid varies chlorophyll a, because that is the x axis of the plot below;
# ph is not listed in `at`, so emmeans keeps it at its default reference value.
# type = "response" back-transforms the predictions from the log link to the
# count scale; for a Poisson model the estimate column is then named `rate`.
chl_range <- range(wantedData2$chl_a_flu_rfu, na.rm = TRUE)
new_df <- emmeans(
  model4,
  ~ chl_a_flu_rfu + ph,
  at = list(
    chl_a_flu_rfu = seq(chl_range[1], chl_range[2], length.out = 100)
  ),
  type = "response"
)

# Observed points come from the NA-free data (wantedData2), matching the rows
# the model was fitted on.
ggplot(wantedData2, aes(x = chl_a_flu_rfu, y = cell_count, col = ph)) +
  geom_point() +
  geom_line(data = as.data.frame(new_df), aes(y = rate))

## Warning: Removed 1 rows containing missing values (`geom_point()`).

# Observed counts against chlorophyll a.
plot(wantedData2$chl_a_flu_rfu, wantedData2$cell_count, ylab="Halo Cell Count", xlab="Chlorophyll a (RFU)", pch=16, col="tomato")

# Fitted curve from model 4 on the count scale. predict() defaults to the link
# (log) scale, so type = "response" is needed to overlay it on raw counts.
# Predictions are made over an ordered chl a grid with ph held at its mean, so
# lines() draws one smooth curve rather than a zigzag through unsorted rows.
chl_grid <- seq(
  min(wantedData2$chl_a_flu_rfu, na.rm = TRUE),
  max(wantedData2$chl_a_flu_rfu, na.rm = TRUE),
  length.out = 100
)
curve_data <- data.frame(
  chl_a_flu_rfu = chl_grid,
  ph = mean(wantedData2$ph, na.rm = TRUE)
)
lines(
  chl_grid,
  predict(model4, newdata = curve_data, type = "response"),
  col = "red",
  lwd = 3
)

Quasi-poisson

Both the quasi-Poisson and the negative binomial models were worked through fully, but none of the predictors were significant in either.

# Moving on to quasi-Poisson to correct for overdispersion.
# Start again from the full set of predictors.
model5 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu,
  family = quasipoisson,
  data = wantedData2
)
summary(model5)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, family = quasipoisson, data = wantedData2)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)     6.072e+00  6.899e+00   0.880    0.388
## chl_a_flu_rfu   2.862e+01  4.591e+01   0.623    0.539
## ph             -4.237e-01  8.361e-01  -0.507    0.617
## nitrate_mg      3.929e-03  1.576e-02   0.249    0.805
## act_cond_us_cm -2.387e-06  2.182e-05  -0.109    0.914
## salinity_pu     6.221e-03  2.467e-02   0.252    0.803
## 
## (Dispersion parameter for quasipoisson family taken to be 16.02372)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 349.88  on 23  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

#You can see the only difference is specifying the family option as quasipoisson instead of poisson. This gives the impression that there is a quasi-Poisson distribution, but there is no such thing! All we do here is specify the mean and variance relationship and an exponential link between the expected values and explanatory variables. It is a software issue to call this ‘quasipoisson’. Do not write in your report or paper that you used a quasi-Poisson distribution. Just say that you did a Poisson GLM, detected overdispersion, and corrected the standard errors using a quasi-GLM model where the variance is given by φ × µ, where µ is the mean and φ the dispersion parameter.
# Square root of the quasi-Poisson dispersion parameter (16.0237, as reported
# by summary(model5)).
sqrt(16.0237)

## [1] 4.002961

#To get the numerical output for this model, use summary(model5), which gives the output above. Note that the ratio of the residual deviance and the degrees of freedom is still larger than 1, but that is no longer a problem as we now allow for overdispersion. The dispersion parameter φ is estimated as 16.0237. This means that all standard errors have been multiplied by 4.002961 (the square root of 16.0237), and as a result, most parameters are no longer significant! We can move onto model selection


# Single-term deletions with F tests; actual conductivity has the highest
# p-value, so it is dropped first.
drop1(model5, test = "F")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance F value Pr(>F)
## <none>              349.88               
## chl_a_flu_rfu   1   356.09  0.4082 0.5292
## ph              1   353.87  0.2626 0.6132
## nitrate_mg      1   350.87  0.0650 0.8011
## act_cond_us_cm  1   350.07  0.0125 0.9118
## salinity_pu     1   350.91  0.0679 0.7967

# Quasi-Poisson refit without actual conductivity.
model6 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu,
  family = quasipoisson,
  data = wantedData2
)
summary(model6)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, family = quasipoisson, data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.980213   6.703116   0.892    0.381
## chl_a_flu_rfu 28.964830  44.915550   0.645    0.525
## ph            -0.418068   0.817074  -0.512    0.614
## nitrate_mg     0.003892   0.015463   0.252    0.803
## salinity_pu    0.005554   0.023381   0.238    0.814
## 
## (Dispersion parameter for quasipoisson family taken to be 15.3881)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.07  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

# Quasi-Poisson refit without salinity.
model7 <- glm(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg,
  family = quasipoisson,
  data = wantedData2
)
summary(model7)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)    6.313762   6.521931   0.968    0.342
## chl_a_flu_rfu 28.885225  44.026646   0.656    0.518
## ph            -0.444180   0.805186  -0.552    0.586
## nitrate_mg     0.004875   0.014741   0.331    0.744
## 
## (Dispersion parameter for quasipoisson family taken to be 15.03408)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 350.95  on 25  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

# Quasi-Poisson refit without nitrate.
model8 <- glm(
  cell_count ~ chl_a_flu_rfu + ph,
  family = quasipoisson,
  data = wantedData2
)
# Next step: drop ph.
summary(model8)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu + ph, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)     6.1960     6.3963   0.969    0.342
## chl_a_flu_rfu  30.6709    42.4697   0.722    0.477
## ph             -0.4208     0.7878  -0.534    0.598
## 
## (Dispersion parameter for quasipoisson family taken to be 14.47405)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 352.57  on 26  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

# Quasi-Poisson model with chlorophyll a as the only predictor.
model9 <- glm(
  cell_count ~ chl_a_flu_rfu,
  family = quasipoisson,
  data = wantedData2
)
summary(model9)

## 
## Call:
## glm(formula = cell_count ~ chl_a_flu_rfu, family = quasipoisson, 
##     data = wantedData2)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.7760     0.1891  14.678 2.17e-14 ***
## chl_a_flu_rfu  34.7170    42.2255   0.822    0.418    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 14.5625)
## 
##     Null deviance: 366.40  on 28  degrees of freedom
## Residual deviance: 356.56  on 27  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

#not significant :/

#what about negative binomial?

Negative Binomial

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Negative binomial GLM (log link) with the full set of predictors.
model_nb <- glm.nb(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + salinity_pu,
  data = wantedData2,
  link = "log"
)
# None of the parameters are significant still :/ wah wah
summary(model_nb)

## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     act_cond_us_cm + salinity_pu, data = wantedData2, link = "log", 
##     init.theta = 1.121166406)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)     6.422e+00  6.970e+00   0.921    0.357
## chl_a_flu_rfu   3.339e+01  4.388e+01   0.761    0.447
## ph             -4.694e-01  8.470e-01  -0.554    0.579
## nitrate_mg      2.692e-03  1.565e-02   0.172    0.863
## act_cond_us_cm -7.689e-07  2.021e-05  -0.038    0.970
## salinity_pu     6.400e-03  2.193e-02   0.292    0.770
## 
## (Dispersion parameter for Negative Binomial(1.1212) family taken to be 1)
## 
##     Null deviance: 34.711  on 28  degrees of freedom
## Residual deviance: 33.493  on 23  degrees of freedom
## AIC: 230.47
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.121 
##           Std. Err.:  0.312 
## 
##  2 x log-likelihood:  -216.466

# Residual deviance / df for this model is 33.493 / 23 (about 1.46), far closer
# to 1 than for the Poisson fits, so we can keep going with it. Note that
# theta (1.12) is the estimated negative binomial parameter reported by
# glm.nb, not an overdispersion ratio.
# Model selection process:
drop1(model_nb, test = "Chisq")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + act_cond_us_cm + 
##     salinity_pu
##                Df Deviance    AIC     LRT Pr(>Chi)
## <none>              33.493 228.47                 
## chl_a_flu_rfu   1   34.088 227.06 0.59502   0.4405
## ph              1   33.859 226.83 0.36587   0.5453
## nitrate_mg      1   33.516 226.49 0.02241   0.8810
## act_cond_us_cm  1   33.495 226.47 0.00129   0.9713
## salinity_pu     1   33.572 226.54 0.07884   0.7789

# Negative binomial refit without actual conductivity.
model_nb2 <- glm.nb(
  cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu,
  data = wantedData2,
  link = "log"
)
summary(model_nb2)

## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + 
##     salinity_pu, data = wantedData2, link = "log", init.theta = 1.1210926)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    6.454859   6.934536   0.931    0.352
## chl_a_flu_rfu 33.427615  43.597136   0.767    0.443
## ph            -0.475646   0.846258  -0.562    0.574
## nitrate_mg     0.002615   0.015641   0.167    0.867
## salinity_pu    0.006351   0.021554   0.295    0.768
## 
## (Dispersion parameter for Negative Binomial(1.1211) family taken to be 1)
## 
##     Null deviance: 34.710  on 28  degrees of freedom
## Residual deviance: 33.493  on 24  degrees of freedom
## AIC: 228.47
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.121 
##           Std. Err.:  0.312 
## 
##  2 x log-likelihood:  -216.467

# Nothing significant, so run the single-term deletions again.
drop1(model_nb2, test = "Chisq")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + nitrate_mg + salinity_pu
##               Df Deviance    AIC     LRT Pr(>Chi)
## <none>             33.493 226.47                 
## chl_a_flu_rfu  1   34.090 225.06 0.59693   0.4398
## ph             1   33.888 224.86 0.39517   0.5296
## nitrate_mg     1   33.514 224.49 0.02146   0.8835
## salinity_pu    1   33.571 224.54 0.07790   0.7802

# Negative binomial refit without nitrate.
model_nb3 <- glm.nb(
  cell_count ~ chl_a_flu_rfu + ph + salinity_pu,
  data = wantedData2,
  link = "log"
)
summary(model_nb3)

## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph + salinity_pu, 
##     data = wantedData2, link = "log", init.theta = 1.120099512)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    6.485986   6.918367   0.938    0.349
## chl_a_flu_rfu 35.031927  43.066362   0.813    0.416
## ph            -0.477472   0.843166  -0.566    0.571
## salinity_pu    0.007321   0.020990   0.349    0.727
## 
## (Dispersion parameter for Negative Binomial(1.1201) family taken to be 1)
## 
##     Null deviance: 34.684  on 28  degrees of freedom
## Residual deviance: 33.490  on 25  degrees of freedom
## AIC: 226.49
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.120 
##           Std. Err.:  0.311 
## 
##  2 x log-likelihood:  -216.489

# Still nothing significant :/ -- check which term to drop next.
drop1(model_nb3, test = "Chisq")

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph + salinity_pu
##               Df Deviance    AIC     LRT Pr(>Chi)
## <none>             33.490 224.49                 
## chl_a_flu_rfu  1   34.175 223.17 0.68515   0.4078
## ph             1   33.887 222.88 0.39665   0.5288
## salinity_pu    1   33.602 222.60 0.11195   0.7379

# Negative binomial refit without salinity.
model_nb4 <- glm.nb(
  cell_count ~ chl_a_flu_rfu + ph,
  data = wantedData2,
  link = "log"
)
summary(model_nb4)

## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu + ph, data = wantedData2, 
##     link = "log", init.theta = 1.115608648)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)     6.7669     6.8660   0.986    0.324
## chl_a_flu_rfu  36.4182    43.0353   0.846    0.397
## ph             -0.4898     0.8433  -0.581    0.561
## 
## (Dispersion parameter for Negative Binomial(1.1156) family taken to be 1)
## 
##     Null deviance: 34.571  on 28  degrees of freedom
## Residual deviance: 33.492  on 26  degrees of freedom
## AIC: 224.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.116 
##           Std. Err.:  0.310 
## 
##  2 x log-likelihood:  -216.600

# Single-term deletions without a significance test: only the deviance and AIC
# of each reduced model are reported.
drop1(object = model_nb4)

## Single term deletions
## 
## Model:
## cell_count ~ chl_a_flu_rfu + ph
##               Df Deviance    AIC
## <none>             33.492 222.60
## chl_a_flu_rfu  1   34.223 221.33
## ph             1   33.917 221.03

# Bleh! Last attempt: negative binomial model with chlorophyll a only.
model_nb5 <- glm.nb(
  cell_count ~ chl_a_flu_rfu,
  data = wantedData2,
  link = "log"
)
summary(model_nb5)

## 
## Call:
## glm.nb(formula = cell_count ~ chl_a_flu_rfu, data = wantedData2, 
##     link = "log", init.theta = 1.099374933)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.7747     0.1997  13.894   <2e-16 ***
## chl_a_flu_rfu  33.8733    42.7832   0.792    0.429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0994) family taken to be 1)
## 
##     Null deviance: 34.159  on 28  degrees of freedom
## Residual deviance: 33.514  on 27  degrees of freedom
## AIC: 223.02
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.099 
##           Std. Err.:  0.304 
## 
##  2 x log-likelihood:  -217.023

#it seems again like there is no significance in the model. Is regular poisson still the best?

FAKE Halo Data

Fake Halo Data Analysis

View the data

Poisson

Added Variable Plots

Model with observed and predicted points

Quasi-poisson

Negative Binomial