In an effort to promote reproducibility of my analysis, I will try to document most of my analytical process in this R notebook. Where I have conducted steps in ArcGIS, I will include some screenshots and description.
The aims of my analysis are to:
Model of the above mentioned relationships AND assess for spatial structure in the results by taking into account spatial autocorrelation between the residuals
Predict the spatial pattern of aerosolized ARG concentrations (during the summer) using interpolation
Aesthetics note: I will appropriately label and style key figures in the future, but want to go ahead and share the information for feedback first.
Load R libraries
library(ggplot2)
library(rio)
library(tidyverse)
library(sf)
library(tmap)
Import data files
ddpcr <-import('C:/Users/dvern/Desktop/Fresh Thesis Analysis/LAP_ddpcr_14JAN.csv') #file with ARG concentrations
#ddpcr1 <- ddpcr %>% select(Site) # this doesn't work because I have a bunch of duplicate column names
ddpcr1 <- ddpcr[, !duplicated(colnames(ddpcr))] #creates dataframe without duplicte column names
ddpcr2 <- ddpcr1 %>% select(Sample, Site, `GPS X`, `GPS Y`, `Pos for 1`, intl1_conc, qnrB_conc,
tetA_conc, blaTEM_conc, FloR_conc)
dist <- import('C:/Users/dvern/Desktop/Fresh Thesis Analysis/samp_dist.csv') #file with sample locations distance from nearest river
dist1 <- dist %>% select(Sample_ID, Date, Time_Day, river_distance)
Join and clean files
arg <- left_join(ddpcr2, dist1, by = c('Sample' = 'Sample_ID')) #create new dataframe with all desired values
arg$`GPS X`[14] <- -68.08400 # fix 2 incorrectly signed GPS values
arg$`GPS X`[15] <- -68.08400
arg1 <- arg %>% filter(`GPS X` < 0) #removes the Chacaltaya site since it's GPS value was NA
#ggplot(blat_model6, aes(x=Distance_adj)) + geom_histogram(binwidth = 10, boundary=0, color='black', fill= 'white', pad=T) + xlab('Distance to nearest river (m)')
arg2 <- arg1 %>% filter(river_distance < 200) #removies outlier site that's ~450 m from river
#argjoin.wide.adj$diff <- argjoin.wide.adj$Distance_adj - argjoin.wide.adj$Distance
#summary(argjoin.wide.adj$diff)
This first bit of code shows my process for making simple single ARG vs. distance plots. Next is the code for making a dataset that allows for plotting all ARGs at once.
# The first two code chunks below plot just one ARG vs. River Distance
#ggplot(arg2, aes(x=river_distance, y= blaTEM_conc)) + geom_point() + geom_smooth(method = lm)
#ggplot(arg2, aes(x=river_distance, y= intl1_conc)) + geom_point() + geom_smooth(method = lm)
## Code below makes a 'wide' dataset to enable facet plotting
arg3 <- arg1 %>% select(Sample, Site, intl1_conc, qnrB_conc, tetA_conc, blaTEM_conc)
arg3.wide <- arg3 %>% gather(key, value, -Sample, -Site) %>% #creates an observation for each ARG concentration
separate(key, c("ARG", "Concentration"), "\\_") %>%
spread(Concentration, value)
argjoin.wide <- left_join(arg3.wide, dist1, by = c('Sample' = 'Sample_ID')) %>% filter(river_distance < 200)#join river distance values back to each observation
This plot shows the relationship between river distance and concentration for all ARGs (except FloR, which was zeroes across the board).
ggplot(argjoin.wide, aes(x=river_distance, y= conc #, col= Time_Day
)
) + geom_point() + geom_smooth(method = lm, se =F) + facet_grid(. ~ ARG) +
ggtitle("Detected ARG concentrations \n by distance to nearest river") + theme(plot.title = element_text(hjust = 0.5)) +
xlab("River Distance (m)") +
ylab("Concentration\n(Gene copies per m^3 air)") +
ylim(0,4500) + theme(axis.text = element_text(size=10))
Same as above but shows the difference when time of day is included.
ggplot(argjoin.wide, aes(x=river_distance, y= conc , col= Time_Day
)
) + geom_point() + geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG)+ ggtitle("Detected ARG concentrations by distance to nearest river,\n by time of day") + theme(plot.title = element_text(hjust = 0.5)) +
xlab("River Distance (m)") +
ylab("Concentration\n(Gene copies per m^3 air)") +
theme(axis.text = element_text(size=10))
These plots show just beta-lactamase-TEM and Integrase-1 (intl1).
argjoin.wide.sub <- argjoin.wide %>% filter(ARG== 'blaTEM' | ARG == 'intl1')
ggplot(argjoin.wide.sub, aes(x=river_distance, y= conc #,col= Time_Day
)) + geom_point() + geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG) #facets by ARG type
ggplot(argjoin.wide.sub, aes(x=river_distance, y= conc ,col= Time_Day
)) + geom_point() + geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG) #facets by ARG type
Data import, joining, and Cleaning
### Bring in sample site to hospital distance file and clean
hosp <-st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/samp_to_hosp_dis.shp')
Reading layer `samp_to_hosp_dis' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\samp_to_hosp_dis.shp' using driver `ESRI Shapefile'
Simple feature collection with 48 features and 58 fields
geometry type: POINT
dimension: XY
bbox: xmin: 594005.2 ymin: 8170936 xmax: 599332.7 ymax: 8178327
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
## Upon examination of the dataset, I've noticed two errors: the column names for location ID and sample ID are switched
## Also the samples are misnumbered as they were going off a previous ordering with the control included
hosp.t <- hosp %>% st_drop_geometry() %>% arrange(Location_I) #arranges table in ascending order based on Location ID (which is actually sample ID)
hosp.t$Sample <- 1:48 #create new correctly ordered Sample ID variable
hosp.table <- hosp.t %>% select(Sample, name, Distance, Distance_1, GPSX, GPSY) %>% rename(latitude = GPSY, longitude = GPSX) #select variables of concern
hosp.table.1 <- hosp.table %>% rename(Samp_to_Hosp = Distance_1, Hosp_to_river = Distance) #rename variables appropriately
## Join hosp distance to main ARG dataframe
arg.hosp.wide <- left_join(argjoin.wide, hosp.table.1, by = c('Sample'= 'Sample'))
The first plot shows the relationship between distance of the sample site from the nearest hospital and ARG concentration.
## Visualize ARG conc vs. distance from hosp
arg.hosp.wide1 <- arg.hosp.wide %>% filter(ARG== 'blaTEM' | ARG == 'intl1')
ggplot(arg.hosp.wide1, aes(x=Samp_to_Hosp, y= conc)) + geom_point() + geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG) + ggtitle("Detected ARG concentrations by distance to nearest hospital") + theme(plot.title = element_text(hjust = 0.5)) +
xlab("Hospital Distance (m)") +
ylab("Concentration\n(Gene copies per m^3 air)") +
theme(axis.text = element_text(size=10))
With all sample sites considered, it looks like there is no relationship. However, many sample sites are along rivers without any hospitals nearby.
This code converts our table with ARG concentrations to a georeferenced object and saves it as an ESRI shapefile for use in ArcMap.
#arg2_r <- arg2 %>% rename(X= `GPS X`, Y= `GPS Y`)
#arg2_sf <-st_as_sf(arg2_r, coords = c('X', 'Y'), crs=4326) #speciify long (x) and lat (y) columns and EPSG code of projection
#st_write(arg2_sf,'C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/arg_conc_river_dis4.shp')
#plot(arg2_sf)
The below map was created in ArcGIS. It demonstrates that hospitals in La Paz mostly adjoin the Choqueyapu and Orkojahuira rivers. Furthermore, it shows how higher blaTEM concentrations tend be found in samples taken near these rivers.
This code brings in a file with the name of the nearest river associated with each sample site, joins it to the main data frame, and selects only sample sites near rivers with adjoining hospitals.
riv_name <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/name_join.shp')
Reading layer `name_join' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\name_join.shp' using driver `ESRI Shapefile'
Simple feature collection with 48 features and 16 fields
geometry type: POINT
dimension: XY
bbox: xmin: 594005.2 ymin: 8170936 xmax: 599332.7 ymax: 8178327
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
riv.name.tab <- riv_name %>% st_drop_geometry() %>% arrange(Location_I) %>% #performs same correction as above in one command
mutate(Sample = 1:48) %>% select(Sample, Sample_ID, Name) %>% #verify sample id against location ID (misnamed as Sample_ID)
select(Sample, Name)
## join names to main dataframe
arg.hosp.nam.wid <- left_join(arg.hosp.wide, riv.name.tab, by = c('Sample' = 'Sample')) %>%
rename(riv_name = Name, hosp_name = name) # had two name variables, renamed to avoid confusion
## Select rivers that have many hospitals upstream
arg.hosp.nam.wid$riv_name <- as.character(arg.hosp.nam.wid$riv_name)
arg.hos.name.filt <- arg.hosp.nam.wid %>% filter(riv_name == 'Choqueyapu' | riv_name == 'Orkojahuira' #|riv_name == 'Irpavi'
)
This plot shows the distance from sample site to nearest hospital vs. ARG concentration when considering only the Choqueyapu and Orkojahuira rivers.
ggplot(arg.hos.name.filt, aes(x=Samp_to_Hosp, y= conc)) + geom_point() + geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG)
This plot shows the relationship between hospital distance and blaTEM concentration for each river individually.
If hospital discharge into the environment contributes to sampled ARG concentrations, then for some of these rivers the relationship is the opposite of what might be expected. That is, as euclidean distance from hospitals to sample sites increases, sampled ARG concentrations increase for sites adjoining the Achumani and Choqueyapu Rivers. As the previous map makes clear, there are no hospitals adjoining the Achumani, so the relationsip observed is likely spurious.
Regarding the Choqueyapu sites, a possible explanation is that the effect of hospitals proximate to the river is cumulative as the river flows southward. However, this is still an unsatisfactory explanation as most of the hospitals are already located upstream of our sample sites.
On the other hand, the map below provokes another conjecture: could the influence of population density next to rivers be cumulative? That is, do the shit units (if you will) add up as the river flows downstream, and does this accumulationn affect our measured aersolized ARG concentration?
In the future, I’d like to come up with a more nuanced measure of the cumulative population density influence on a given portion of river and by relation the ARG concentration measured at a particular site. One way I might approach this is cut each river into many segments and join each segment with the pop density polygons within a certain distance. Then I would add up the pop density influence for each segment. Next I would have to figure out how to assign each segment the sum of the density values for all up river segments.
Another idea I have is try and use ideas from hydrological models to do this. I’ll go into this more at another time.
For now, I will simply use distance traveled downriver as a proxy. And sadly I will have to start with a proxy for that by plotting the relationship between change in geographical position (north to south, as indicated by change in absolute value of latitude) and ARG concentration.
When all rivers are considered together, there seems to be little relationship.
## what if I show the relationship of south vs north
south.samp.dis <- arg.hosp.nam.wid %>% filter(ARG == 'blaTEM' | ARG == 'intl1')
ggplot(south.samp.dis, aes(x=abs(latitude), y= conc)) + geom_point() +
geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG)
However, when considering just the sites next to the Choqueyapu and Orkojahuira, which have much higher bordering population densities, there does appear to be a weak relationship between southwards travel and ARG concentrations.
south.samp.dis1 <- arg.hosp.nam.wid %>% filter(riv_name == "Choqueyapu" |
riv_name == "Orkojahuira"# | riv_name == 'Irpavi'
) %>% filter(ARG == 'blaTEM' | ARG == 'intl1')
ggplot(south.samp.dis1, aes(x=abs(latitude), y= conc)) + geom_point() + #arg concentration vs. latitude for Choqueyapu
geom_smooth(method = lm, se =T) + facet_grid(. ~ ARG) + ggtitle("Detected ARG concentrations \n by latitude for the Choqueyapu and Orkojahuira Rivers") + theme(plot.title = element_text(hjust = 0.5)) +
xlab("abs(Latitude)") +
ylab("Concentration\n(Gene copies per m^3 air)") +
ylim(-1000,4500) + theme(axis.text = element_text(size=10))
I want to try and show plots with model coefficient values and significance on top.
The code below defines a function I found online which pulls values from the linear model output and plots them with ggplot2.
argjoin.wide.blat <- argjoin.wide %>% filter(ARG== 'blaTEM') #first subset to blaTEM
#I found this function online at https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
#first step is to define the function which pulls values from the results of a linear model function and then plots them with ggplot2
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
#next define the model, concentration = intercept + B1(river_distance)
bla_dis <- lm(conc ~ river_distance, data= argjoin.wide.blat)
#now try the function
ggplotRegression(bla_dis)
Approaching significance…
There doesn’t seem to be a statistical relationship here. Also, the numbers given are for 1 degree change in latitude so dividing by 100 makes this more interpretable. Additionally, the function wouldn’t accept the absolute value of latitude. so keep in mind that a more negative latitude is more SOUTH.
Thus we have the slope as a 104.7 unit increase in concentration per 0.01 degree travel southwards with a significance of
south_blat <- south.samp.dis1 %>% filter(ARG == 'blaTEM')
south_dis <- lm(conc ~ latitude, data= south_blat)
south_dis
Call:
lm(formula = conc ~ latitude, data = south_blat)
Coefficients:
(Intercept) latitude
-172070 -10467
ggplotRegression(south_dis)
I want to adjust my sample site to river distance measurement to account potential differences in elevation. The current distance measurements assume a uniformly flat plane and may be innacurate for some sample sites.
The following code imports that shapefile, converts it to a table, and cleans it in preparation for joining to the main dataset.
Planned Method:
What actually happened:
First I tried this with the ASTER GDEM V2.
Then I tried it with the 1 arc-second (30 m) SRTM Global DEM.
#read in shapefile with elevation data
elev <-st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/elev_join3.shp')
Reading layer `elev_join3' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\elev_join3.shp' using driver `ESRI Shapefile'
Simple feature collection with 48 features and 28 fields
geometry type: POINT
dimension: XY
bbox: xmin: 594005.2 ymin: 8170936 xmax: 599332.7 ymax: 8178327
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
#code below renames raster elevation variables appropriately, fixes naming mixup with location and sample IDs, fixes numbering of samples to match other datasets in this project, removes the two observations that were about 450m from the river
elev_tab <- elev %>% st_drop_geometry() %>% select(Location_I, Sample_ID, RASTERVALU, RASTERVA_1, Distance) %>% rename(Sample = Location_I, samp_elev = RASTERVALU, river_elev = RASTERVA_1, Site=Sample_ID) %>% arrange(Sample) %>% mutate(SampleID = 1:48)%>% filter(Distance < 200)
elev_tab$Sample <- elev_tab$SampleID
elev_tab_clean <- elev_tab %>% select(Sample, Site, samp_elev, river_elev, Distance)
The next step is to calculate the adjusted distances using trigonometry.
Where theta = arctan(abs(samp_elev-river_elev)/Distance)
and Distance_adjusted = Distance / cos(theta)
#adjustment calculation
dist_adj <- elev_tab_clean %>% mutate(theta = atan(abs(samp_elev-river_elev)/Distance), Distance_adj = Distance/ cos(theta))
#now join the adjusted distance to the main dataset
argjoin.wide.adj <- left_join(argjoin.wide, dist_adj, by = c('Sample' = 'Sample')) #join river distance values back to each observation
Now re-run model and graph of adjusted river distance vs. blaTEM concentration.
argjoin.wide.blat.adj <- argjoin.wide.adj %>% filter(ARG== 'blaTEM') #first subset to blaTEM
#I found this function online at https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
#first step is to define the function which pulls values from the results of a linear model function and then plots them with ggplot2
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
#next define the model, concentration = intercept + B1(river_distance)
bla_dis_adj <- lm(conc ~ Distance_adj, data= argjoin.wide.blat.adj)
#now try the function
ggplotRegression(bla_dis_adj)
Well it’s only ever so slightly better.
The below values show the result when time of day is incorporated into the model. One observation was coded as AM/PM and removed. This observation is shifted an hour later than the latest morning observation but two hours earlier than the earliest afternoon observation. Perhaps I should recode it as morning?
#filter out the one observation coded as AM/PM
argjoin.wide.blat.adj.time1 <- argjoin.wide.blat.adj %>% filter(Time_Day != 'AM/PM') #version with one obs removed
argjoin.wide.blat.adj.time2 <- argjoin.wide.blat.adj
argjoin.wide.blat.adj.time2$Time_Day[33] <- 'AM' #version with obs recoded as 'AM'
# Go back and do this for the version that includes all ARGs
arg.time.dis.adj <- argjoin.wide.adj
arg.time.dis.adj$Time_Day[129:132] <- 'AM' #version with 'AM/PM' obs recoded as 'AM'
#run model with Time_Day included
bla_dis_time <- lm(conc ~ Distance_adj + Time_Day, data= argjoin.wide.blat.adj.time1)
#now try the function
bla_dis_time
Call:
lm(formula = conc ~ Distance_adj + Time_Day, data = argjoin.wide.blat.adj.time1)
Coefficients:
(Intercept) Distance_adj Time_DayPM
729.199 -7.047 171.782
ggplotRegression(bla_dis_time)
First, read in new srtm data file.
#read in shapefile with elevation data
elev_srtm <-st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/elev_join_srtm.shp')
Reading layer `elev_join_srtm' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\elev_join_srtm.shp' using driver `ESRI Shapefile'
Simple feature collection with 48 features and 28 fields
geometry type: POINT
dimension: XY
bbox: xmin: 594005.2 ymin: 8170936 xmax: 599332.7 ymax: 8178327
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
#code below renames raster elevation variables appropriately, fixes naming mixup with location and sample IDs, fixes numbering of samples to match other datasets in this project, removes the two observations that were about 450m from the river
elev_tab_srtm <- elev_srtm %>% st_drop_geometry() %>% select(Location_I, Sample_ID, RASTERVALU, RASTERVA_1, Distance) %>% rename(Sample = Location_I, samp_elev = RASTERVALU, river_elev = RASTERVA_1, Site=Sample_ID) %>% arrange(Sample) %>% mutate(SampleID = 1:48)%>% filter(Distance < 200)
elev_tab_srtm$Sample <- elev_tab_srtm$SampleID
elev_tab_clean_srtm <- elev_tab %>% select(Sample, Site, samp_elev, river_elev, Distance)
The next step is to calculate the adjusted distances using trigonometry.
#adjustment calculation
dist_adj_srtm <- elev_tab_clean %>% mutate(theta = atan(abs(samp_elev-river_elev)/Distance), Distance_adj = Distance/ cos(theta))
#now join the adjusted distance to the main dataset
argjoin.wide.adj.srtm <- left_join(argjoin.wide, dist_adj_srtm, by = c('Sample' = 'Sample')) #join river distance values back to each observation
Now re-run model and graph of adjusted river distance vs. blaTEM concentration.
argjoin.wide.blat.adj.srtm <- argjoin.wide.adj.srtm %>% filter(ARG== 'blaTEM') #first subset to blaTEM
#I found this function online at https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
#first step is to define the function which pulls values from the results of a linear model function and then plots them with ggplot2
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
#next define the model, concentration = intercept + B1(river_distance)
bla_dis_adj_srtm <- lm(conc ~ Distance_adj, data= argjoin.wide.blat.adj.srtm)
#now try the function
ggplotRegression(bla_dis_adj_srtm)
The results seem identical, and the code below shows that the measurements from the two different DEMs are identical.
elev_tab_clean == elev_tab_clean_srtm
Sample Site samp_elev river_elev Distance
[1,] TRUE TRUE TRUE TRUE TRUE
[2,] TRUE TRUE TRUE TRUE TRUE
[3,] TRUE TRUE TRUE TRUE TRUE
[4,] TRUE TRUE TRUE TRUE TRUE
[5,] TRUE TRUE TRUE TRUE TRUE
[6,] TRUE TRUE TRUE TRUE TRUE
[7,] TRUE TRUE TRUE TRUE TRUE
[8,] TRUE TRUE TRUE TRUE TRUE
[9,] TRUE TRUE TRUE TRUE TRUE
[10,] TRUE TRUE TRUE TRUE TRUE
[11,] TRUE TRUE TRUE TRUE TRUE
[12,] TRUE TRUE TRUE TRUE TRUE
[13,] TRUE TRUE TRUE TRUE TRUE
[14,] TRUE TRUE TRUE TRUE TRUE
[15,] TRUE TRUE TRUE TRUE TRUE
[16,] TRUE TRUE TRUE TRUE TRUE
[17,] TRUE TRUE TRUE TRUE TRUE
[18,] TRUE TRUE TRUE TRUE TRUE
[19,] TRUE TRUE TRUE TRUE TRUE
[20,] TRUE TRUE TRUE TRUE TRUE
[21,] TRUE TRUE TRUE TRUE TRUE
[22,] TRUE TRUE TRUE TRUE TRUE
[23,] TRUE TRUE TRUE TRUE TRUE
[24,] TRUE TRUE TRUE TRUE TRUE
[25,] TRUE TRUE TRUE TRUE TRUE
[26,] TRUE TRUE TRUE TRUE TRUE
[27,] TRUE TRUE TRUE TRUE TRUE
[28,] TRUE TRUE TRUE TRUE TRUE
[29,] TRUE TRUE TRUE TRUE TRUE
[30,] TRUE TRUE TRUE TRUE TRUE
[31,] TRUE TRUE TRUE TRUE TRUE
[32,] TRUE TRUE TRUE TRUE TRUE
[33,] TRUE TRUE TRUE TRUE TRUE
[34,] TRUE TRUE TRUE TRUE TRUE
[35,] TRUE TRUE TRUE TRUE TRUE
[36,] TRUE TRUE TRUE TRUE TRUE
[37,] TRUE TRUE TRUE TRUE TRUE
[38,] TRUE TRUE TRUE TRUE TRUE
[39,] TRUE TRUE TRUE TRUE TRUE
[40,] TRUE TRUE TRUE TRUE TRUE
[41,] TRUE TRUE TRUE TRUE TRUE
[42,] TRUE TRUE TRUE TRUE TRUE
[43,] TRUE TRUE TRUE TRUE TRUE
[44,] TRUE TRUE TRUE TRUE TRUE
[45,] TRUE TRUE TRUE TRUE TRUE
[46,] TRUE TRUE TRUE TRUE TRUE
I have a 2013 population density polygons shapefile that I downloaded from the Bolivian Government’s GIS repository. The denominator for the density calculation isn’t clear but I believe it’s in people / km^2.
I’m going to calculate based on that assumption and see if the total population I get makes sense given public numbers I can I find on La Paz population.
pop_dens_tab <- st_read('C:/Users/dvern/Desktop/Bolivia Shapefiles/Population/population/densidad_poblacion_projected.shp') %>% st_drop_geometry()
Reading layer `densidad_poblacion_projected' from data source `C:\Users\dvern\Desktop\Bolivia Shapefiles\Population\population\densidad_poblacion_projected.shp' using driver `ESRI Shapefile'
replacing null geometries with empty geometries
Simple feature collection with 611 features and 15 fields (with 2 geometries empty)
geometry type: POLYGON
dimension: XY
bbox: xmin: 588410 ymin: 8162580 xmax: 604597.3 ymax: 8183893
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
Upon examination, it looks like I performed the first steps of the calculation in this shapefile by making an area (km2) column and finding the population for each polygon by multiplying area by density.
sum_pop <- sum(pop_dens_tab$poblacion)
sum_pop
[1] 877501.8
This figure of about 877 K (2013) is a bit off from the 789 K reported by Wikipedia. However, the two numbers could be due to slightly different definitions of the city’s extent. I am confident that km^2 is the appropriate denominator.
The goal of this step was to get a more smoothed out and hopefully more realistic map of population density in La Paz (as of 2013). Then I extracted the newly calculated population density to my sample points. The reason for doing this step is that with the original polygon map of population, there can be sudden large changes in density at the borders of individual polygons. The real life variation in population density across the city is probably more smooth than this scenario.
The below screenshot illustrates what the new smoothed population raster looks like using a 750 m window in the KDE calculation.
pop_den_smth <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/ARG_pop_dens.shp')
Reading layer `ARG_pop_dens' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\ARG_pop_dens.shp' using driver `ESRI Shapefile'
Simple feature collection with 46 features and 12 fields
geometry type: POINT
dimension: XY
bbox: xmin: -68.11912 ymin: -16.54161 xmax: -68.06914 ymax: -16.47489
epsg (SRID): 4326
proj4string: +proj=longlat +datum=WGS84 +no_defs
pop_dens_smth_tab <- pop_den_smth %>% st_drop_geometry() %>% rename( pop_dens_km2 = RASTERVALU)
pop_den_chart <- ggplot(pop_dens_smth_tab, aes(pop_dens_km2, blTEM_c))
pop_den_chart + geom_smooth(method = lm) + geom_point() + ylab('concentration (gene copies per m^3 air)') + xlab('Persons per km^2') +ggtitle('blaTEM concentrations by population density at sample site')
A quick look at the above chart shows that there seems to be no relationship between sampled blaTEM concentrations and the pop density of the sample site.
This is good news because we might expect population density at the site to be a confounder. That is, given the hypothesis that rivers are the major source of ARG transport in La Paz, there is a concern about the influence of the environment immediately surrounding the bobcat sampler on our readings. For example, since human activity is expected to be a major source of ARGs into the environment, areas with higher densities of humans might have higher concentrations of aerosolized ARGs. There is the additional concern about animal contributions to this ARG load through their feces; this could also be covered under the pop. dens. proxy since areas with more people and thus more food could host more domestic and feral animals. Since we don’t see this relationship in the data, our initial hypothesis that rivers (sewage flows) in La Paz are an intermediary in ARG transport and aerosolization is more supported.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
#next define the model, concentration = intercept + B1(river_distance)
bla_pop<- lm(blTEM_c ~ pop_dens_km2, data= pop_dens_smth_tab)
#now try the function
ggplotRegression(bla_pop)
Above shows the stats for the simple linear model of blaTEM_conc ~ population density.
Now I need to join this pop. dens. data to my main dataset so a full model can be run.
#first clean up the main dataset some
arg.time.dis.adj.clean <- arg.time.dis.adj %>% select(Sample, Site.x, ARG, conc, Date, Time_Day, samp_elev, Distance_adj)
#next clean up pop dens table
pdens.clean <- pop_dens_smth_tab %>% select(Sample, Site, pop_dens_km2)
#join to make main dataset
arg.time.dis.pop <- left_join(arg.time.dis.adj.clean, pdens.clean, by = c('Sample' = 'Sample'))
#rejoin GPS coords to dataset
arg_gps <- arg2 %>% select(Sample, `GPS X`, `GPS Y`) %>% rename(X = `GPS X`, Y = `GPS Y` )
arg.time.dis.pop.coords <- left_join(arg.time.dis.pop, arg_gps, by = c('Sample' = 'Sample'))
In this section I will check for the correlation of the distance downstream of a sample site along the nearest river and aerosolized ARG concentration.
I will consider the start points of the various rivers as their furthest upstream point defined by the boundaries of the La Paz municipality shapefile. Furthermore, I will assign a distance to each sample site based on the total meters of urban river upstream of that site.
library(rgdal)
package 㤼㸱rgdal㤼㸲 was built under R version 3.5.3Loading required package: sp
package 㤼㸱sp㤼㸲 was built under R version 3.5.3rgdal: version: 1.4-8, (SVN revision 845)
Geospatial Data Abstraction Library extensions to R successfully loaded
Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
Path to GDAL shared files: C:/Users/dvern/Documents/R/win-library/3.5/rgdal/gdal
GDAL binary built with GEOS: TRUE
Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
Path to PROJ.4 shared files: C:/Users/dvern/Documents/R/win-library/3.5/rgdal/proj
Linking to sp version: 1.3-2
library(spatial)
library(rgeos)
package 㤼㸱rgeos㤼㸲 was built under R version 3.5.3rgeos version: 0.5-2, (SVN revision 621)
GEOS runtime version: 3.6.1-CAPI-1.10.1
Linking to sp version: 1.3-2
Polygon checking: TRUE
#The following code reads in all river segments, gets rid of z-dimensions where neeeded, and converts to sp
achu <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/achumani.shp') %>% as('Spatial')
Reading layer `achumani' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\achumani.shp' using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 7 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 597095.6 ymin: 8171221 xmax: 602023.7 ymax: 8175847
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
irpavi_branch <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/irpavi_branch.shp') %>% as('Spatial')
Reading layer `irpavi_branch' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\irpavi_branch.shp' using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 7 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 597690 ymin: 8175087 xmax: 599365.7 ymax: 8175883
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
low_choque <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/low_choque.shp')%>% st_zm() %>%as('Spatial')
Reading layer `low_choque' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\low_choque.shp' using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 16 fields
geometry type: LINESTRING
dimension: XYZ
bbox: xmin: 594406.1 ymin: 8170648 xmax: 596729.4 ymax: 8173027
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
main_irpavi <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/main_irpavi.shp')%>% st_zm() %>% as('Spatial')
Reading layer `main_irpavi' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\main_irpavi.shp' using driver `ESRI Shapefile'
Simple feature collection with 2 features and 7 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 596716.9 ymin: 8170648 xmax: 597937.9 ymax: 8176883
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
whole_choq <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/whole_choq_2.shp')%>% as('Spatial')
Reading layer `whole_choq_2' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\whole_choq_2.shp' using driver `ESRI Shapefile'
Simple feature collection with 2 features and 7 fields
geometry type: LINESTRING
dimension: XY
bbox: xmin: 590404.8 ymin: 8170648 xmax: 596729.4 ymax: 8183562
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
whole_orko <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/river_segs/whole_orko.shp')%>% st_zm %>% as('Spatial')
Reading layer `whole_orko' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\river_segs\whole_orko.shp' using driver `ESRI Shapefile'
Simple feature collection with 2 features and 18 fields
geometry type: LINESTRING
dimension: XYZ
bbox: xmin: 593989.1 ymin: 8170648 xmax: 599268.1 ymax: 8183150
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
x <-st_crs(whole_choq)
#The following code joins river names to the main dataset
#I had to use this points file from the hospitals section since it's projected in meters and we want to measure in meters
#the hospital shapefile has to be cleaned up as before
hosp4 <-st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/samp_to_hosp_dis.shp')
Reading layer `samp_to_hosp_dis' from data source `C:\Users\dvern\Desktop\Fresh Thesis Analysis\shp\samp_to_hosp_dis.shp' using driver `ESRI Shapefile'
Simple feature collection with 48 features and 58 fields
geometry type: POINT
dimension: XY
bbox: xmin: 594005.2 ymin: 8170936 xmax: 599332.7 ymax: 8178327
epsg (SRID): NA
proj4string: +proj=utm +zone=19 +south +ellps=GRS80 +units=m +no_defs
hosp.t4 <- hosp4 %>% arrange(Location_I) #arranges table in ascending order based on Location ID (which is actually sample ID)
hosp.t4$Sample <- 1:48 #create new correctly ordered Sample ID variable
hosp.sel <- hosp.t4 %>% select(Sample)
#river names joined in
main_riv_names <- left_join(hosp.sel, riv.name.tab, by=c('Sample' = 'Sample'))
#subset by each river, then convert to sp points file
choq_p <- main_riv_names %>% filter(Name == 'Choqueyapu') %>% as('Spatial')
orko_p <- main_riv_names%>% filter(Name == 'Orkojahuira') %>% as('Spatial')
irpav_p <- main_riv_names %>% filter(Name == 'Irpavi') %>% as('Spatial')
achu_p <- main_riv_names %>% filter(Name == 'Achumani') %>% as('Spatial')
## Find distances downriver for each set of points
## achumani is simple since no other sections affect its points
d <- gProject(achu, achu_p, normalized=FALSE)
achu_p$dist_along <- d
achu_sf <-st_as_sf(achu_p) #this seems to have been successful
## Choqueyapu Points
##For the Choqueyapu points,one is upstream of the choqueyapu-orkojahuira junction so we'll deal differently with it vs. the others
#First run distance operation on all Choque points with the Choque River File
d_choq <- gProject(whole_choq, choq_p, normalized = F)
choq_p$dist_along1 <- d_choq #we see that samples 1 and 2 are furthest upstream
choq_sf <-st_as_sf(choq_p)
#Next run distance operation on all Choque points with the Orko River File
d_choq_2 <- gProject(whole_orko, choq_p, normalized = F)
choq_sf$dist_along2 <- d_choq_2
#Then run distance operation on all Choque points with the lower Choque File
d_choq_3 <- gProject(low_choque, choq_p, normalized = F)
choq_sf$dist_along3 <- d_choq_3
#Now calculate final upstream distances and account for double counting of lower choque
choq_sf2 <- choq_sf %>% mutate(dist_along= dist_along1+dist_along2 - dist_along3) %>% select(dist_along, Sample, Name)
choq_sf2$dist_along[1:2] <-d_choq[1:2]
#Orkojahuira is another simple one
d_ork <- gProject(whole_orko, orko_p, normalized=FALSE)
orko_p$dist_along <- d_ork
orko_sf <-st_as_sf(orko_p)
#Irpavi is mixed, some points need to incorporate achu distance as well
d_irp <- gProject(main_irpavi, irpav_p, normalized=FALSE)
irpav_p$dist_along <- d_irp
irp_sf <-st_as_sf(irpav_p) #irpavi samples 7 and 8 need to include Achu distance
d_irp2 <- gProject(achu, irpav_p, normalized=FALSE)
irp_sf$dist_along[1:2] <- d_irp[1:2] + d_irp2[1:2]
#Now I need to join all then rivers back together
#This table can now be joined with the main dataset
comb_dist <- rbind(choq_sf2, achu_sf, irp_sf, orko_sf) %>% st_drop_geometry() %>% filter(Sample != 27 & Sample != 28)
We have direct monthly flow measurements for 4 sites along the rivers in La Paz from 2000 to 2017. I will bring in this data, clean it, and join it with the GPS data for each site then see if I can integrate it with my previous work.
Note as of 2/19 I think that if I am to use this data, I need to figure out how to extrapolate these few flow measurements along the lenght of the Choqueyapu and I’m not sure how to do this yet. My initial investigation leads me to believe this will be pretty complicated. Some articles mention the need for measurements of the rivers cross-section and information about the watersheds along its route. This might be beyond the scope of my thesis but I’ll investigate further after my other analysis steps are done.
flow_tab <-import('C:/Users/dvern/Desktop/Fresh Thesis Analysis/lap_river_flow.csv')
#code for converting to wide dataset
flow_tab_wide <- flow_tab %>% gather(key, value, -Date) %>% #creates an observation for each flow measurement
separate(key, c("Site", "Flow"), "\\_") %>%
spread(Flow, value)
I think it would be interestng to contrast how the different flow station measurements have varied over time.
flow_line <- ggplot(flow_tab_wide, aes(Date, flow, group = Site) )
flow_line + geom_line(aes(color = Site)) #+geom_point(aes(color= Site))
The line plot is pretty messy. But it does seem to indicate that when calculating a mean flow rate for each station, it might be better to only consider the last 8 years or so of data.
I think I should probably break down my date by month and year, not just combined date. For now, I’d like to see what the side-by-side box plots by Site look like.
ggplot(flow_tab_wide, aes(Site, flow)) + geom_boxplot() + ylab('Flow (m^3/s)')
I need to map the Site coords to be sure, but I believe Choqueyapu1 is the most downstream site.
Now I’ll make my dataset even wider by separating out year and month into their own columns.
flow_tab_wide2 <- flow_tab_wide %>% separate(Date, c('Month','Year'), sep = "-")
Next I’ll graph how flow varies by month, and then by year.
ggplot(flow_tab_wide2, aes(Month, flow)) + geom_boxplot()
Experiment with changing the month abrreviations into numbers to make them plot in the correct order.
flow_tab_wide2$month_numeric <- match(flow_tab_wide2$Month, month.abb)
flow_tab_wide2$month_numeric <- as.character(flow_tab_wide2$month_numeric) #have to convert to character for it to be read as a categorical
Now retry plot.
ggplot(flow_tab_wide2, aes(month_numeric, flow)) + geom_boxplot()
That didn’t work as expected. Here’s another solution using the base R function ‘Month’:
flow_date <- flow_tab_wide2 %>% unite(Date, Year, Month, sep ='-')
#install.packages('anytime') #I tried several ways of converting my Date variable to actual Date format before finding this package which works
library(anytime)
package 㤼㸱anytime㤼㸲 was built under R version 3.5.3
flow_date$Date <-anydate(flow_date$Date)
flow_date$Month <- months(flow_date$Date)
flow_date$Month <- factor(flow_date$Month, levels= month.name)
ggplot(flow_date, aes(Month, flow)) + geom_boxplot()
I finally got it to plot in order but I’m still not happy with the month names running into each other. Will fix later.
The error bars are quite wide since all the flow sensor sites are plotted together. Lets see what the Choqueyapu1 site looks like by itself.
flow_date_Choq <- flow_date %>% filter(Site == 'Choqueyapu1')
ggplot(flow_date_Choq, aes(Month, flow)) + geom_boxplot() + ylab('Flow (m3/s)') +ggtitle(label ='Rio Choqueyapu Monthly Flow', subtitle = 'Aranjuez Station, 2000-2017') +theme(
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)
Now we can see that there’s much less flow variability during the dry season as compared to the wet season, which makes sense. Based on this, I feel comfortable with using the full 17 years of flow data for an average June-July flow estimate.
stations <- import('C:/Users/dvern/Desktop/Fresh Thesis Analysis/lap_river_flow_coords.csv')
stations_sf <- st_as_sf(stations, coords= c('Longitud', 'Latitud'), crs =4326)
class(stations_sf)
[1] "sf" "data.frame"
#st_write(stations_sf, 'C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/flow_stations.shp', driver = 'ESRI Shapefile')
I would like to add in some weather related covariates to my model. I’ve started combing through the Bolivian weather service website for applicable data.
As a first step, I’ve located a table of weather station locations for La Paz. I’ll map these and see which stations are appropriate to use a references for my sample points.
#weather_tab <-read.csv('C:/Users/dvern/Desktop/Fresh Thesis Analysis/Tables/LAP_weather_station_locations.csv')
#weather_sf <-st_as_sf(weather_tab, coords = c('x','y'), crs= 3426)
#st_write(weather_sf, 'C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/weather_stations.shp', driver = 'ESRI Shapefile')
#river <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/river_merge.shp')
#samp_points <- st_read('C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/ARG_pop_dens.shp')
#map1 <-tm_shape(river) + tm_lines()
#map2 <-tm_shape(weather_sf) + tm_dots(size = 3)
The following data is imported from the Servicio Nacional de Meteorologia y Hydrologia (SENAMHI) for Bolivia. http://senamhi.gob.bo/index.php/boletines
Daily min and max temperature data were available for the “La Paz - Centro” station.
temp <- read.csv('C:/Users/dvern/Desktop/Fresh Thesis Analysis/Tables/temp_table.csv')
temp$Date <- as.character(temp$Date)
date <- read.csv('C:/Users/dvern/Desktop/Fresh Thesis Analysis/Tables/date_table.csv')
date$Date <- as.character(date$Date)
temp_join <-left_join(date, temp, by = c('Date' = 'Date'))
In this section I will run linear regression models of blaTEM arg concentrations and analyze model residuals for spatial structure by testing for spatial autocorrelation.
I will run several models as I acquire more covariates.
First Step: Filter to blaTEM
blat_model1 <- arg.time.dis.pop.coords %>% filter(ARG == 'blaTEM')
mod_shp_1 <- st_as_sf(blat_model1, coords = c('X','Y'), crs = 4326)
#st_write(mod_shp_1,'C:/Users/dvern/Desktop/Fresh Thesis Analysis/shp/model1.shp', driver = 'ESRI Shapefile')
#empty model
m0 <- lm(conc ~ 1, data = blat_model1)
#conditional model
m1 <- lm(conc ~ Distance_adj + Time_Day , data = blat_model1)
summary(m0)
Call:
lm(formula = conc ~ 1, data = blat_model1)
Residuals:
Min 1Q Median 3Q Max
-532.4 -476.1 -361.5 -88.8 3685.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 532.4 140.7 3.785 0.000453 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 954 on 45 degrees of freedom
m1 <- lm(conc ~ Distance_adj + Time_Day + pop_dens_km2 , data = blat_model1)
summary(m1)
Call:
lm(formula = conc ~ Distance_adj + Time_Day + pop_dens_km2, data = blat_model1)
Residuals:
Min 1Q Median 3Q Max
-844.6 -483.3 -275.1 106.8 3418.8
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 711.597811 330.474061 2.153 0.0371 *
Distance_adj -7.101607 4.120487 -1.723 0.0922 .
Time_DayPM 183.097387 279.814963 0.654 0.5165
pop_dens_km2 0.001191 0.026344 0.045 0.9642
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 948.9 on 42 degrees of freedom
Multiple R-squared: 0.0767, Adjusted R-squared: 0.01075
F-statistic: 1.163 on 3 and 42 DF, p-value: 0.3352
This shows that population density has a pretty miniscule effect as a covariate and reduces our R2 by a factor of 3. We may want to drop it as more covariates become available.
Plot new model:
#ggplotRegression(m2)
Now I will test my model residuals for spatial autocorrelation using the global moran’s I test. This will allow me to see if there is a spatial structure to how well the model performs (or doesn’t) which can indicate if there are more useful covariates yet to be added.
First, I have to decide on a spatial weights matrix for running this test. Since I am working with points I have 2 choices: -Define neighbors as distance-based and set a distance band such as 500m,1km,1.5 km, etc. This could be informed by some research on how far aerosolized ARGs could travel for instance
-Set the minimum number of neighbors each point must have, sometimes this results in sites not mutually being considered neighbors.
I’ll start off with the minimum number of neighbors approach: k-nearest neighbors of 3.
library(spdep)
##Creating nb object
#since we're using proximity based nb object, we need a sp points file first
arg_sp <- as(mod_shp_1, 'Spatial')
arg_points <- coordinates(arg_sp)
arg_k3 <- knearneigh(arg_points, k =3) #creates set of 3 nearest neighbors
knearneigh: identical points found
arg_k3_nb <- knn2nb(arg_k3, row.names = arg_sp$Sample, sym = T) #creates a symmetrical nb object
arg_listw <- arg_k3_nb %>% nb2listw() #creates listw object for Moran's I test
##performing lm.morantest()
#empty model residuals
lm.morantest(m0, listw = arg_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ 1, data = blat_model1)
weights: arg_listw
Moran I statistic standard deviate = -1.0211, p-value = 0.8464
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.12632425 -0.02222222 0.01039367
#conditional model residuals
#lm.morantest(m2, listw = arg_listw)
This reveals two weird things:
I’ll separate the sampes by time of day to see if this was messing up the moran’s I test.
blat_model1_PM <- blat_model1 %>% filter(Time_Day== 'PM')
PM_shp <- st_as_sf(blat_model1_PM, coords = c('X','Y'), crs = 4326)
PM_sp <- as(PM_shp, 'Spatial')
PM_points <- coordinates(PM_sp)
PM_k3 <- knearneigh(PM_points, k =3) #creates set of 3 nearest neighbors
PM_k3_nb <- knn2nb(PM_k3, row.names = PM_sp$Sample, sym = T) #creates a symmetrical nb object
PM_listw <- PM_k3_nb %>% nb2listw() #creates listw object for Moran's I test
#run new models
m0_PM <- lm(conc ~1, data = blat_model1_PM)
m1_PM <- lm(conc ~ Distance_adj + pop_dens_km2, data = blat_model1_PM)
summary(m1_PM)
Call:
lm(formula = conc ~ Distance_adj + pop_dens_km2, data = blat_model1_PM)
Residuals:
Min 1Q Median 3Q Max
-1214.8 -623.3 -75.2 117.0 3172.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1447.03110 479.46916 3.018 0.00679 **
Distance_adj -12.27461 6.60750 -1.858 0.07800 .
pop_dens_km2 -0.04808 0.04223 -1.139 0.26833
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1076 on 20 degrees of freedom
Multiple R-squared: 0.1743, Adjusted R-squared: 0.09171
F-statistic: 2.111 on 2 and 20 DF, p-value: 0.1473
##performing lm.morantest()
#empty model residuals
lm.morantest(m0_PM, listw = PM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ 1, data = blat_model1_PM)
weights: PM_listw
Moran I statistic standard deviate = -0.99093, p-value = 0.8391
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.19242524 -0.04545455 0.02199767
#conditional model residuals
lm.morantest(m1_PM, listw = PM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + pop_dens_km2, data = blat_model1_PM)
weights: PM_listw
Moran I statistic standard deviate = -0.72635, p-value = 0.7662
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.17567343 -0.07247721 0.02018548
When the PM samples are separated, the slope for river distance is more impressive. Let’s see how the scatter plot looks now and with a log scale.
pm_plot_data <- blat_model1_PM %>% mutate(conc_1 = conc + 1) #necessary to remove zeros in order plot log scale
dis_plot_pm <- ggplot(pm_plot_data, aes(x = Distance_adj, y= conc_1))
dis_plot_pm + geom_point() + geom_smooth(method = 'lm') + scale_y_log10() + scale_x_log10() + ggtitle('Aerosolized blaTEM concentration by Distance from Nearest River:\n Afternoon Samples') + ylab('Gene copies per m^3 air') + xlab('River Distance (m)')
lm.morantest(m1_AM, listw = AM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + pop_dens_km2, data = blat_model1_AM)
weights: AM_listw
Moran I statistic standard deviate = -0.3256, p-value = 0.6276
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.11867909 -0.07242208 0.02018334
lm.morantest(m1_AM, listw = AM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + pop_dens_km2, data = blat_model1_AM)
weights: AM_listw
Moran I statistic standard deviate = -0.3256, p-value = 0.6276
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.11867909 -0.07242208 0.02018334
am_plot_data <- blat_model1_AM %>% mutate(conc_1 = conc + 1) #necessary to remove zeros in order plot log scale
pop_plot <- ggplot(am_plot_data, aes(x = pop_dens_km2, y= conc_1))
pop_plot + geom_point() + geom_smooth(method = 'lm') + scale_y_log10() + scale_x_log10() + ggtitle('Aerosolized blaTEM concentration by Population Density \nfor Morning Samples') + ylab('Gene copies per m^3 air') + xlab('Persons per km^2')
I want to check the distribution of my dependent variable to see if it’s reasonable to log transform it. If it is right skewed, then it could be a good idea to transform it to better meet parametric model assumptions.
hist(argjoin.wide.blat$conc)
So our dependent variable is very right skewed and I will log transform it.
argjoin.wide.blat$log_conc <- log10(argjoin.wide.blat$conc + 1)
hist(argjoin.wide.blat$log_conc)
#hist(blat_model5$Distance_adj)
#hist(blat_model5$pop_dens_km2)
That’s a much nicer distribution. I wonder how it will effect our stats.
blat_model2 <-arg.time.dis.pop.coords %>% filter(ARG == 'blaTEM') %>% mutate(log_conc = log10(conc + 1)) #added 1 to avoid log transforming 0s
m1 <- lm(log_conc ~ Distance_adj , data = blat_model2)
summary(m1)
Call:
lm(formula = log_conc ~ Distance_adj, data = blat_model2)
Residuals:
Min 1Q Median 3Q Max
-2.3478 -0.4480 0.0855 0.5840 1.3212
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.405128 0.192093 12.521 4.22e-16 ***
Distance_adj -0.007114 0.003680 -1.933 0.0597 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8568 on 44 degrees of freedom
Multiple R-squared: 0.07827, Adjusted R-squared: 0.05732
F-statistic: 3.736 on 1 and 44 DF, p-value: 0.05969
m2 <- lm(log_conc ~ Distance_adj + Time_Day + pop_dens_km2, data = blat_model2)
summary(m2)
Call:
lm(formula = log_conc ~ Distance_adj + Time_Day + pop_dens_km2,
data = blat_model2)
Residuals:
Min 1Q Median 3Q Max
-2.24368 -0.44437 0.05452 0.59067 1.45158
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.288e+00 3.014e-01 7.591 2.1e-09 ***
Distance_adj -6.595e-03 3.757e-03 -1.755 0.0865 .
Time_DayPM -1.272e-01 2.552e-01 -0.499 0.6207
pop_dens_km2 2.267e-05 2.402e-05 0.944 0.3507
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8653 on 42 degrees of freedom
Multiple R-squared: 0.1026, Adjusted R-squared: 0.03854
F-statistic: 1.601 on 3 and 42 DF, p-value: 0.2034
PM
blat_model2_PM <- blat_model2 %>% filter(Time_Day == 'PM')
m1 <- lm(log_conc ~ Distance_adj , data = blat_model2_PM)
summary(m1)
Call:
lm(formula = log_conc ~ Distance_adj, data = blat_model2_PM)
Residuals:
Min 1Q Median 3Q Max
-2.2124 -0.3975 0.2031 0.6293 1.4424
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.251341 0.326311 6.899 8.11e-07 ***
Distance_adj -0.004824 0.006252 -0.772 0.449
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.029 on 21 degrees of freedom
Multiple R-squared: 0.02757, Adjusted R-squared: -0.01873
F-statistic: 0.5954 on 1 and 21 DF, p-value: 0.4489
m2 <- lm(log_conc ~ Distance_adj + pop_dens_km2, data = blat_model2_PM)
summary(m2)
Call:
lm(formula = log_conc ~ Distance_adj + pop_dens_km2, data = blat_model2_PM)
Residuals:
Min 1Q Median 3Q Max
-2.2127 -0.3983 0.2042 0.6297 1.4420
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.252e+00 4.699e-01 4.794 0.000111 ***
Distance_adj -4.827e-03 6.475e-03 -0.746 0.464638
pop_dens_km2 -1.359e-07 4.139e-05 -0.003 0.997412
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.055 on 20 degrees of freedom
Multiple R-squared: 0.02757, Adjusted R-squared: -0.06967
F-statistic: 0.2835 on 2 and 20 DF, p-value: 0.7561
Well that’s weird.
AM
blat_model2_AM <- blat_model2 %>% filter(Time_Day == 'AM')
m1 <- lm(log_conc ~ Distance_adj , data = blat_model2_AM)
summary(m1)
Call:
lm(formula = log_conc ~ Distance_adj, data = blat_model2_AM)
Residuals:
Min 1Q Median 3Q Max
-1.23027 -0.48328 -0.06966 0.48004 1.17268
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.558967 0.214201 11.947 7.91e-11 ***
Distance_adj -0.009405 0.004104 -2.292 0.0324 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6755 on 21 degrees of freedom
Multiple R-squared: 0.2001, Adjusted R-squared: 0.162
F-statistic: 5.252 on 1 and 21 DF, p-value: 0.03236
m2 <- lm(log_conc ~ Distance_adj + pop_dens_km2, data = blat_model2_AM)
summary(m2)
Call:
lm(formula = log_conc ~ Distance_adj + pop_dens_km2, data = blat_model2_AM)
Residuals:
Min 1Q Median 3Q Max
-1.18328 -0.34611 0.01701 0.40538 1.18999
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.195e+00 2.866e-01 7.658 2.27e-07 ***
Distance_adj -8.357e-03 3.943e-03 -2.120 0.0467 *
pop_dens_km2 4.550e-05 2.522e-05 1.804 0.0863 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6419 on 20 degrees of freedom
Multiple R-squared: 0.312, Adjusted R-squared: 0.2432
F-statistic: 4.536 on 2 and 20 DF, p-value: 0.02375
Now I am very confused.
ggplot(blat_model2_AM, aes(x=Distance_adj, y=log_conc)) + geom_point() +geom_smooth(method=lm)
ggplot(blat_model2_AM, aes(x=Distance_adj, y=log_conc)) + geom_point() +geom_smooth(method=lm)
blat_model4 <- left_join(arg.time.dis.pop.coords, comb_dist, by=c('Sample'='Sample'))%>%
left_join(temp_join, by=c('Sample'='Sample.ID')) %>% filter(ARG == 'blaTEM') %>%
select(Sample, Site, ARG, conc, Time_Day, samp_elev, Distance_adj, pop_dens_km2, Date.x, dist_along, Name, temp_min, temp_max) %>% mutate(temp_avg = (temp_max + temp_min)/2)
#install.packages('sjPlot')
#install.packages('sjmisc')
#install.packages('sjlabelled')
library(sjPlot)
library(sjmisc)
library(sjlabelled)
full <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+Time_Day + pop_dens_km2 + samp_elev, data = blat_model4)
full_int <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min+Time_Day +pop_dens_km2 + samp_elev , data = blat_model4) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
b6 <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min+Time_Day +pop_dens_km2 , data = blat_model4)
b5 <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min +Time_Day , data = blat_model4)
b4<- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min, data = blat_model4)
b4_no_int<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min, data = blat_model4)
b3 <- lm(conc ~ Distance_adj + dist_along + temp_avg, data = blat_model4)
b2 <- lm(conc ~ Distance_adj + dist_along , data = blat_model4)
tab_model(full, full_int, b6, b5, b4, b4_no_int, b2, auto.label = T, show.ci = F)
blat_model4_PM <- blat_model4 %>% filter(Time_Day == 'PM')
full_p <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev, data = blat_model4_PM)
full_int_p <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev , data = blat_model4_PM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
b6_p <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 , data = blat_model4_PM)
b4_p<- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min, data = blat_model4_PM)
b4_no_int_p<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min, data = blat_model4_PM)
b3_p <- lm(conc ~ Distance_adj + dist_along + temp_avg, data = blat_model4_PM)
b3_p_1 <- lm(conc ~ Distance_adj + dist_along +pop_dens_km2 , data = blat_model4_PM)
b2_p <- lm(conc ~ Distance_adj + dist_along, data = blat_model4_PM)
tab_model(full_p, full_int_p, b6_p, b4_p, b4_no_int_p, b3_p, b3_p_1, b2_p, auto.label = T, show.ci = F)
blat_model4_AM <- blat_model4 %>% filter(Time_Day == 'AM')
full_a <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev, data = blat_model4_AM)
full_int_a <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev , data = blat_model4_AM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
b6_a <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 , data = blat_model4_AM)
b4_a <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min, data = blat_model4_AM)
b4_no_int_a<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min, data = blat_model4_AM)
b3_a <- lm(conc ~ Distance_adj + dist_along + temp_avg, data = blat_model4_AM)
b3_a2 <- lm(conc ~ samp_elev + dist_along +pop_dens_km2 , data = blat_model4_AM)
b2_a <- lm(conc ~ dist_along + pop_dens_km2, data = blat_model4_AM)
tab_model(full_a, full_int_a, b6_a, b4_a, b4_no_int_a, b3_a, b3_a2, b2_a, auto.label = T, show.ci = F)
First join hospital distance data to the main dataset.
hosp.tab2 <- hosp.table.1 %>% select(Sample, name, Samp_to_Hosp, Hosp_to_river) %>% rename(nearest_hosp = name)
all_model5 <- left_join(arg.time.dis.pop.coords, comb_dist, by=c('Sample'='Sample'))%>%
left_join(temp_join, by=c('Sample'='Sample.ID')) %>%
select(Sample, Site, ARG, conc, Time_Day, samp_elev, Distance_adj, pop_dens_km2, Date.x, dist_along, Name, temp_min, temp_max) %>% mutate(temp_avg = (temp_max + temp_min)/2) %>%
left_join(hosp.tab2, by=c('Sample'='Sample')) %>% rename(nearest_river = Name)
blat_model5 <- all_model5 %>% filter(ARG == 'blaTEM')
blat_model5$Samp_to_Hosp <- as.numeric(blat_model5$Samp_to_Hosp)
Tables for whole dataset together:
full_h <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+Time_Day + pop_dens_km2 + samp_elev + Samp_to_Hosp,
data = blat_model5)
full_int_h <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min+Time_Day +pop_dens_km2 + samp_elev + Samp_to_Hosp, data = blat_model5) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6 <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min+Time_Day +pop_dens_km2 + Samp_to_Hosp, data = blat_model5)
h5 <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min +Time_Day + Samp_to_Hosp, data = blat_model5)
h4<- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp, data = blat_model5)
h4_no_int<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min + Samp_to_Hosp, data = blat_model5)
h3 <- lm(conc ~ Distance_adj + dist_along + temp_avg + Samp_to_Hosp, data = blat_model5)
h2 <- lm(conc ~ Distance_adj + dist_along + Samp_to_Hosp, data = blat_model5)
h1 <- lm(conc ~ Distance_adj + dist_along, data = blat_model5)
tab_model(full_h, full_int_h, h6, h5, h4, h4_no_int, h3,h2, h1, auto.label = T, show.ci = F)
Autocorrelation
arg_sp <- as(mod_shp_1, 'Spatial')
arg_points <- coordinates(arg_sp)
arg_k3 <- knearneigh(arg_points, k =3) #creates set of 3 nearest neighbors
knearneigh: identical points found
arg_k3_nb <- knn2nb(arg_k3, row.names = arg_sp$Sample, sym = T) #creates a symmetrical nb object
arg_listw3 <- arg_k3_nb %>% nb2listw() #creates listw object for Moran's I test
##performing lm.morantest()
#empty model residuals
lm.morantest(h1, listw = arg_listw3)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + dist_along, data = blat_model5)
weights: arg_listw3
Moran I statistic standard deviate = -1.3443, p-value = 0.9106
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.179780666 -0.046663151 0.009805254
PM
blat_model5_PM <- blat_model5 %>% filter(Time_Day == 'PM')
full_hp <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp,
data = blat_model5_PM)
full_int_hp <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev + Samp_to_Hosp, data = blat_model5_PM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6p <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min+pop_dens_km2 + Samp_to_Hosp, data = blat_model5_PM)
h5p <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp, data = blat_model5_PM)
h4p<- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp, data = blat_model5_PM)
h4_no_intp<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min + Samp_to_Hosp, data = blat_model5_PM)
h3p <- lm(conc ~ Distance_adj + dist_along + temp_avg + Samp_to_Hosp, data = blat_model5_PM)
h2p <- lm(conc ~ Distance_adj + dist_along + Samp_to_Hosp, data = blat_model5_PM)
tab_model(full_hp, full_int_hp, h6p, h5p, h4p, h4_no_intp, h3p,h2p, auto.label = T, show.ci = F)
Autocorrelation
lm.morantest(h2p, listw = PM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + dist_along + Samp_to_Hosp, data = blat_model5_PM)
weights: PM_listw
Moran I statistic standard deviate = -0.75629, p-value = 0.7753
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.22407934 -0.12563622 0.01694326
AM
blat_model5_AM <- blat_model5 %>% filter(Time_Day == 'AM')
full_ha <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp, data = blat_model5_AM)
full_int_ha <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev+ Samp_to_Hosp , data = blat_model5_AM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6_a <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2+ Samp_to_Hosp , data = blat_model5_AM)
h4_a <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min+ Samp_to_Hosp, data = blat_model5_AM)
h4_no_int_a<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ Samp_to_Hosp, data = blat_model5_AM)
h3_a <- lm(conc ~ Distance_adj + dist_along + temp_avg+ Samp_to_Hosp, data = blat_model5_AM)
h3_a2 <- lm(conc ~ samp_elev + dist_along +pop_dens_km2+ Samp_to_Hosp , data = blat_model5_AM)
h2_a <- lm(conc ~ dist_along + pop_dens_km2+ Samp_to_Hosp, data = blat_model5_AM)
h2_b <- lm(conc ~ dist_along + pop_dens_km2, data = blat_model5_AM)
tab_model(full_ha, full_int_ha, h6_a, h4_a, h4_no_int_a, h3_a, h3_a2, h2_a,h2_b, auto.label = T, show.ci = F)
AM autocorrelation
lm.morantest(h2_b, listw = AM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ dist_along + pop_dens_km2, data = blat_model5_AM)
weights: AM_listw
Moran I statistic standard deviate = -1.0914, p-value = 0.8624
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.26662157 -0.12716058 0.01632902
hosp_up <- import('C:/Users/dvern/Desktop/Fresh Thesis Analysis/hosp_up.csv')
#below code fixes wrongly entered values and changes to factor
hosp_up$Site <- as.numeric(hosp_up$Site)
hosp_up$Site[5] <- 21
hosp_up$Site <- as.factor(hosp_up$Site)
hosp_up$hosp_up[5] <- 3
blat_model6 <- left_join(blat_model5, hosp_up, by = c('Site' = 'Site'))
Column `Site` joining factors with different levels, coercing to character vector
full_h <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+Time_Day + pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6)
full_int_h <- lm(conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + samp_elev + Samp_to_Hosp+hosp_up, data = blat_model6) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6 <- lm(conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + Samp_to_Hosp +hosp_up, data = blat_model6)
h5 <- lm(conc ~ Distance_adj + dist_along +Time_Day + Samp_to_Hosp+hosp_up, data = blat_model6)
h4_no_int<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min + Samp_to_Hosp +hosp_up, data = blat_model6)
h3 <- lm(conc ~ Distance_adj + dist_along + temp_avg + Samp_to_Hosp+hosp_up, data = blat_model6)
h2 <- lm(conc ~ Distance_adj + dist_along, data = blat_model6)
h1 <- lm(conc ~ Distance_adj + dist_along +hosp_up +Samp_to_Hosp, data = blat_model6)
tab_model(full_h, full_int_h, h6, h5, h4_no_int, h3,h2, h1, auto.label = T, show.ci = F)
summary(h1)
Call:
lm(formula = conc ~ Distance_adj + dist_along + hosp_up + Samp_to_Hosp,
data = blat_model6)
Residuals:
Min 1Q Median 3Q Max
-1112.0 -589.9 -260.8 155.9 2836.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 339.90811 398.67206 0.853 0.3988
Distance_adj -7.68203 4.01335 -1.914 0.0626 .
dist_along -0.03594 0.05755 -0.625 0.5357
hosp_up 179.60999 151.33179 1.187 0.2421
Samp_to_Hosp 0.21004 0.15703 1.338 0.1884
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 903.8 on 41 degrees of freedom
Multiple R-squared: 0.1823, Adjusted R-squared: 0.1025
F-statistic: 2.285 on 4 and 41 DF, p-value: 0.07657
lm.morantest(h1, listw = arg_listw3)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + dist_along + hosp_up + Samp_to_Hosp, data = blat_model6)
weights: arg_listw3
Moran I statistic standard deviate = -0.95358, p-value = 0.8299
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.171801056 -0.084261098 0.008427454
PM
blat_model6_PM <- blat_model6 %>% filter(Time_Day == 'PM')
full_hp <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6_PM)
full_int_hp <- lm(conc ~ Distance_adj + dist_along +pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6p <- lm(conc ~ Distance_adj + dist_along+pop_dens_km2 + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h5p <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4p<- lm(conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4_no_intp<- lm(conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up +samp_elev + temp_max, data = blat_model6_PM)
h3p <- lm(conc ~ Distance_adj + dist_along + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h2p <- lm(conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
tab_model(full_hp, full_int_hp, h6p, h5p, h4p, h4_no_intp, h3p,h2p, auto.label = T, show.ci = F)
summary(h4_no_intp)
Call:
lm(formula = conc ~ Distance_adj + dist_along + Samp_to_Hosp +
hosp_up + samp_elev + temp_max, data = blat_model6_PM)
Residuals:
Min 1Q Median 3Q Max
-1717.53 -598.80 48.74 484.78 1947.39
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.735e+04 9.603e+03 -1.807 0.0896 .
Distance_adj -1.418e+01 6.104e+00 -2.322 0.0337 *
dist_along -1.229e-01 9.596e-02 -1.281 0.2185
Samp_to_Hosp 8.827e-01 3.492e-01 2.528 0.0224 *
hosp_up 5.424e+02 2.639e+02 2.055 0.0566 .
samp_elev 3.641e+00 2.387e+00 1.525 0.1467
temp_max 2.170e+02 2.109e+02 1.029 0.3187
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 955 on 16 degrees of freedom
Multiple R-squared: 0.4797, Adjusted R-squared: 0.2846
F-statistic: 2.459 on 6 and 16 DF, p-value: 0.07043
lm.morantest(h4_no_intp, listw = PM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up + samp_elev + temp_max, data = blat_model6_PM)
weights: PM_listw
Moran I statistic standard deviate = -0.40137, p-value = 0.6559
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.23589195 -0.19372762 0.01103552
AM
blat_model6_AM <- blat_model6 %>% filter(Time_Day == 'AM')
full_ha <- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp+hosp_up, data = blat_model6_AM)
full_int_ha <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev+ Samp_to_Hosp + hosp_up , data = blat_model6_AM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6_a <- lm(conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2+ Samp_to_Hosp +hosp_up , data = blat_model6_AM)
h4_a <- lm(conc ~ Distance_adj + dist_along +temp_max*temp_min+ Samp_to_Hosp +hosp_up, data = blat_model6_AM)
h4_no_int_a<- lm(conc ~ Distance_adj + dist_along +temp_max + temp_min+ Samp_to_Hosp+hosp_up, data = blat_model6_AM)
h3_a <- lm(conc ~ Distance_adj + dist_along + temp_avg+ Samp_to_Hosp+ hosp_up, data = blat_model6_AM)
h3_a2 <- lm(conc ~ samp_elev + dist_along +pop_dens_km2+ Samp_to_Hosp +hosp_up , data = blat_model6_AM)
h2_a <- lm(conc ~ dist_along + pop_dens_km2, data = blat_model6_AM)
h2_b <- lm(conc ~ dist_along + Distance_adj + pop_dens_km2 + hosp_up, data = blat_model6_AM)
tab_model(full_ha, full_int_ha, h6_a, h4_a, h4_no_int_a, h3_a, h3_a2, h2_a,h2_b, auto.label = T, show.ci = F)
summary(h2_a)
Call:
lm(formula = conc ~ dist_along + pop_dens_km2, data = blat_model6_AM)
Residuals:
Min 1Q Median 3Q Max
-763.5 -300.9 -111.0 119.2 2373.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -210.86040 296.10902 -0.712 0.4846
dist_along 0.02321 0.01459 1.591 0.1273
pop_dens_km2 0.05214 0.02678 1.947 0.0657 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 689.2 on 20 degrees of freedom
Multiple R-squared: 0.2413, Adjusted R-squared: 0.1655
F-statistic: 3.181 on 2 and 20 DF, p-value: 0.06318
lm.morantest(h2_a, listw = AM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = conc ~ dist_along + pop_dens_km2, data = blat_model6_AM)
weights: AM_listw
Moran I statistic standard deviate = -1.0914, p-value = 0.8624
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.26662157 -0.12716058 0.01632902
run models with log_conc
Combined
install.packages('ggpubr')
Error in install.packages : Updating loaded packages
library(ggpubr)
blat_model6$log_conc <- log10(blat_model6$conc + 1)
ggqqplot(blat_model6$log_conc)
shapiro.test(blat_model6$log_conc)
Shapiro-Wilk normality test
data: blat_model6$log_conc
W = 0.91927, p-value = 0.003546
hist(blat_model6$log_conc)
full_h <- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+Time_Day + pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6)
full_int_h <- lm(log_conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + samp_elev + Samp_to_Hosp+hosp_up, data = blat_model6) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6 <- lm(log_conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + Samp_to_Hosp +hosp_up, data = blat_model6)
h5 <- lm(log_conc ~ Distance_adj + dist_along +Time_Day + Samp_to_Hosp+hosp_up, data = blat_model6)
h4_no_int<- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min + Samp_to_Hosp +hosp_up, data = blat_model6)
h3 <- lm(log_conc ~ Distance_adj + dist_along + temp_avg + Samp_to_Hosp+hosp_up, data = blat_model6)
h2 <- lm(log_conc ~ Distance_adj + dist_along + hosp_up + pop_dens_km2, data = blat_model6)
h1 <- lm(log_conc ~ Distance_adj + dist_along +hosp_up, data = blat_model6)
tab_model(full_h, full_int_h, h6, h5, h4_no_int, h3,h2, h1, auto.label = T, show.ci = F)
summary(h1)
Call:
lm(formula = log_conc ~ Distance_adj + dist_along + hosp_up,
data = blat_model6)
Residuals:
Min 1Q Median 3Q Max
-2.0395 -0.3895 0.1757 0.4879 1.3898
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.679e+00 3.083e-01 8.690 6.23e-11 ***
Distance_adj -7.384e-03 3.489e-03 -2.117 0.0403 *
dist_along -9.985e-05 4.740e-05 -2.107 0.0412 *
hosp_up 2.987e-01 1.188e-01 2.514 0.0159 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8043 on 42 degrees of freedom
Multiple R-squared: 0.2246, Adjusted R-squared: 0.1692
F-statistic: 4.055 on 3 and 42 DF, p-value: 0.01281
lm.morantest(h1, listw = arg_listw3)
Global Moran I for regression residuals
data:
model: lm(formula = log_conc ~ Distance_adj + dist_along + hosp_up, data = blat_model6)
weights: arg_listw3
Moran I statistic standard deviate = -0.53004, p-value = 0.702
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.114360694 -0.063656731 0.009151042
PM
blat_model6_PM <- blat_model6 %>% filter(Time_Day == 'PM')
full_hp <- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6_PM)
full_int_hp <- lm(log_conc ~ Distance_adj + dist_along +pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6p <- lm(log_conc ~ Distance_adj + dist_along+pop_dens_km2 + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h5p <- lm(log_conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4p<- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4_no_intp<- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up +samp_elev + temp_max, data = blat_model6_PM)
h3p <- lm(log_conc ~ Distance_adj + dist_along + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h2p <- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
tab_model(full_hp, full_int_hp, h6p, h5p, h4p, h4_no_intp, h3p,h2p, auto.label = T, show.ci = F)
summary(h3p)
Call:
lm(formula = log_conc ~ Distance_adj + dist_along + samp_elev +
Samp_to_Hosp + hosp_up, data = blat_model6_PM)
Residuals:
Min 1Q Median 3Q Max
-2.30094 -0.39164 0.04844 0.55472 1.75512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.087e+01 8.537e+00 -1.274 0.2199
Distance_adj -5.654e-03 5.865e-03 -0.964 0.3486
dist_along -1.806e-04 8.417e-05 -2.146 0.0466 *
samp_elev 3.699e-03 2.333e-03 1.586 0.1312
Samp_to_Hosp 4.995e-04 3.372e-04 1.481 0.1568
hosp_up 6.111e-01 2.327e-01 2.626 0.0177 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9333 on 17 degrees of freedom
Multiple R-squared: 0.3527, Adjusted R-squared: 0.1623
F-statistic: 1.853 on 5 and 17 DF, p-value: 0.1561
lm.morantest(h3p, listw = PM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = log_conc ~ Distance_adj + dist_along + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
weights: PM_listw
Moran I statistic standard deviate = -0.90671, p-value = 0.8177
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.28780893 -0.19184774 0.01120089
AM
blat_model6_AM <- blat_model6 %>% filter(Time_Day == 'AM')
full_ha <- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp+hosp_up, data = blat_model6_AM)
full_int_ha <- lm(log_conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2 + samp_elev+ Samp_to_Hosp + hosp_up , data = blat_model6_AM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6_a <- lm(log_conc ~ Distance_adj + dist_along+temp_max*temp_min +pop_dens_km2+ Samp_to_Hosp +hosp_up , data = blat_model6_AM)
h4_a <- lm(log_conc ~ Distance_adj + dist_along +temp_max*temp_min+ Samp_to_Hosp +hosp_up, data = blat_model6_AM)
h4_no_int_a<- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+ Samp_to_Hosp+hosp_up, data = blat_model6_AM)
h3_a <- lm(log_conc ~ Distance_adj + dist_along + temp_avg+ Samp_to_Hosp+ hosp_up, data = blat_model6_AM)
h3_a2 <- lm(log_conc ~ samp_elev + dist_along +pop_dens_km2+ Samp_to_Hosp +hosp_up , data = blat_model6_AM)
h2_a <- lm(log_conc ~ dist_along + pop_dens_km2+ Samp_to_Hosp + hosp_up, data = blat_model6_AM)
h2_b <- lm(log_conc ~ pop_dens_km2 + hosp_up + Distance_adj + temp_min, data = blat_model6_AM)
tab_model(full_ha, full_int_ha, h6_a, h4_a, h4_no_int_a, h3_a, h3_a2, h2_a,h2_b, auto.label = T, show.ci = F)
summary(h2_b)
Call:
lm(formula = log_conc ~ pop_dens_km2 + hosp_up + Distance_adj +
temp_min, data = blat_model6_AM)
Residuals:
Min 1Q Median 3Q Max
-0.93931 -0.41030 0.03349 0.21515 1.33752
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.648e-01 1.120e+00 0.326 0.7485
pop_dens_km2 4.728e-05 2.376e-05 1.989 0.0621 .
hosp_up 3.543e-02 3.329e-02 1.064 0.3013
Distance_adj -4.332e-03 4.389e-03 -0.987 0.3367
temp_min 3.421e-01 2.229e-01 1.535 0.1422
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6042 on 18 degrees of freedom
Multiple R-squared: 0.4514, Adjusted R-squared: 0.3295
F-statistic: 3.703 on 4 and 18 DF, p-value: 0.02277
lm.morantest(h2_b, listw = AM_listw)
Global Moran I for regression residuals
data:
model: lm(formula = log_conc ~ pop_dens_km2 + hosp_up + Distance_adj + temp_min, data = blat_model6_AM)
weights: AM_listw
Moran I statistic standard deviate = -0.31383, p-value = 0.6232
alternative hypothesis: greater
sample estimates:
Observed Moran I Expectation Variance
-0.15400710 -0.11222225 0.01772739
ggplot(blat_model6, aes(x=Time_Day, y=conc)) + geom_boxplot() + scale_y_log10()
Combined
blat_model6$log_conc <- log10(blat_model6$conc + 1)
blat_model6 <- blat_model6 %>% filter(log_conc != 0)
ggqqplot(blat_model6$log_conc)
shapiro.test(blat_model6$log_conc)
Shapiro-Wilk normality test
data: blat_model6$log_conc
W = 0.95945, p-value = 0.1411
hist(blat_model6$log_conc)
full_h <- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+Time_Day + pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6)
full_int_h <- lm(log_conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + samp_elev + Samp_to_Hosp+hosp_up, data = blat_model6) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6 <- lm(log_conc ~ Distance_adj + dist_along+Time_Day +pop_dens_km2 + Samp_to_Hosp +hosp_up, data = blat_model6)
h5 <- lm(log_conc ~ Distance_adj + dist_along +Time_Day + Samp_to_Hosp+hosp_up, data = blat_model6)
h4_no_int<- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min + Samp_to_Hosp +hosp_up, data = blat_model6)
h3 <- lm(log_conc ~ Distance_adj + dist_along + temp_avg + Samp_to_Hosp+hosp_up, data = blat_model6)
h2 <- lm(log_conc ~ Distance_adj + dist_along + hosp_up + pop_dens_km2, data = blat_model6)
h1 <- lm(log_conc ~ Distance_adj + dist_along +hosp_up, data = blat_model6)
tab_model(full_h, full_int_h, h6, h5, h4_no_int, h3,h2, h1, auto.label = T, show.ci = F)
summary(h1)
Call:
lm(formula = log_conc ~ Distance_adj + dist_along + hosp_up,
data = blat_model6)
Residuals:
Min 1Q Median 3Q Max
-1.02228 -0.44774 -0.01741 0.32575 1.16587
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.455e+00 2.478e-01 9.908 4.41e-12 ***
Distance_adj -3.752e-03 3.140e-03 -1.195 0.239
dist_along -1.761e-05 4.021e-05 -0.438 0.664
hosp_up 6.718e-02 1.022e-01 0.658 0.515
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6133 on 38 degrees of freedom
Multiple R-squared: 0.06285, Adjusted R-squared: -0.01114
F-statistic: 0.8495 on 3 and 38 DF, p-value: 0.4755
PM
blat_model6_PM <- blat_model6 %>% filter(Time_Day == 'PM')
full_hp <- lm(log_conc ~ Distance_adj + dist_along +temp_max + temp_min+ pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up,
data = blat_model6_PM)
full_int_hp <- lm(log_conc ~ Distance_adj + dist_along +pop_dens_km2 + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM) #when I put in both measurements of temperature with an interaction term, the p-values improve but not adj model r2
h6p <- lm(log_conc ~ Distance_adj + dist_along+pop_dens_km2 + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h5p <- lm(log_conc ~ Distance_adj + dist_along +temp_max*temp_min + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4p<- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h4_no_intp<- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up +samp_elev + temp_max, data = blat_model6_PM)
h3p <- lm(log_conc ~ Distance_adj + dist_along + samp_elev + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
h2p <- lm(log_conc ~ Distance_adj + dist_along + Samp_to_Hosp + hosp_up, data = blat_model6_PM)
tab_model(full_hp, full_int_hp, h6p, h5p, h4p, h4_no_intp, h3p,h2p, auto.label = T, show.ci = F)
AM
ggplot(blat_model6, aes(x=hosp_up, y=log_conc)) + geom_point() + geom_smooth(method = lm)
ggplot(blat_model6, aes(x=conc)) + geom_histogram(binwidth = 200, boundary = 0, color= 'black', fill= 'white') + xlab('Concentration') + ggtitle('Histogram of blaTEM concentration') +theme(plot.title = element_text(hjust = 0.5))
ggplot(blat_model6, aes(x=log_conc)) + geom_histogram(binwidth= 0.25,boundary = 0, color= 'black', fill= 'white') + xlab('Log-Concentration') + ggtitle('Histogram of log-transformed blaTEM concentration') +theme(plot.title = element_text(hjust = 0.5))
#install.packages('pastecs')
library(pastecs)
res <- stat.desc(blat_model6[,-5])
res1 <-round(res1, 1)
res1 <- res %>% select(conc, samp_elev, Distance_adj, pop_dens_km2, dist_along, temp_min, temp_max, Samp_to_Hosp, hosp_up, log_conc)