Purpose
The purpose of this project is to examine the spatial structure of DEM data using semivariograms. The semivariograms were created in Python using simulation (see the writeup for more detail). Semivariograms of a filtered DEM will be compared to those of a coarser-resolution DEM. This script's purpose is to compare the two sets of variograms.
Step 0: Read the txt files as tables
# Read in the text files as tables. Brady South, TX (commented out):
# bm<-read.table('E:/Quant/FinalProject/Results/Variograms/BradySouth_TX_10mBM.txt',
# sep=',', header=T)
# filter<-read.table('E:/Quant/FinalProject/Results/Variograms/BradySouth_TX_5.txt',
# sep=',', header=T)
# Strawberry Lake, CO
bm <- read.table("E:/Quant/FinalProject/Results/Variograms/StrawberryLake_CO_10mBM.txt",
sep = ",", header = T)
filter <- read.table("E:/Quant/FinalProject/Results/Variograms/StrawberryLake_CO_0.txt",
sep = ",", header = T)
Filtering iterations relate to a cascading Gaussian filter (data-processing notes); the implied window widths are sketched in code just after this list:
0 = 0px
1 = 3x3px
4 = 6x6px
9 = 9x9px
16 = 12x12px
25 = 15x15px
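The listed pairs appear to follow window width = 3 * sqrt(iterations) pixels; a quick check of that assumed pattern:
# Check the assumed iteration-to-window relationship (window = 3*sqrt(iterations))
iters <- c(0, 1, 4, 9, 16, 25)
data.frame(iterations = iters, window_px = 3 * sqrt(iters))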
Step 1: Interpolate semivariance values. The variograms were calculated with different lag distances because of the difference in spatial resolution. This step linearly interpolates a semivariance value for every lagged meter.
# Interpolate BM semivariance (linear interpolation between successive lags)
bmList = list()
for (j in 1:(nrow(bm) - 1)) {  # stop one row early so bm[j + 1] always exists
    start = bm$lag_dist[j]
    nxt = bm$lag_dist[j + 1]
    s1 = bm$Var_ave[j]
    s2 = bm$Var_ave[j + 1]
    step = 0
    for (i in start:nxt) {
        semivar = s1 + ((s2 - s1) * step/50)  #original lag distance of 50m
        bmList[i] = semivar
        step = step + 1
    }
}
bmList = bmList[c(50:8100)]  #remove excess null values
# Interpolate filter semivariance (same scheme, 48m lag spacing)
fList = list()
for (j in 1:(nrow(filter) - 1)) {  # stop one row early so filter[j + 1] always exists
    start = filter$lag_dist[j]
    nxt = filter$lag_dist[j + 1]
    s1 = filter$Var_ave[j]
    s2 = filter$Var_ave[j + 1]
    step = 0
    for (i in start:nxt) {
        semivar = s1 + ((s2 - s1) * step/48)  #original lag distance of 48m
        fList[i] = semivar
        step = step + 1
    }
}
fList = fList[c(50:8100)]  #remove excess null values
# Convert the lists to numeric vectors:
bmList = unlist(bmList)  #unlist flattens the list into a plain numeric vector
fList = unlist(fList)
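As an aside, base R's approx() can do the same per-meter linear interpolation in one call; a minimal sketch, assuming the same lag_dist/Var_ave columns and the 50-8100 m range used above:
# One-call alternative to the loops above (sketch; not used below)
bmInterp <- approx(x = bm$lag_dist, y = bm$Var_ave, xout = 50:8100)$y
fInterp <- approx(x = filter$lag_dist, y = filter$Var_ave, xout = 50:8100)$y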
# Plot the semivariograms (Note: these are the average semivariance values
# of the simulated empirical semivariograms)
plot(bmList, col = 3, main = "Empirical Semivariogram", xlab = "lag in meters",
    ylab = "semivariance")
points(fList, col = 4)
## NOTE: Green points show the benchmark variogram; blue points show the
## filtered variogram.
## Because the filtered variogram has a larger semivariance than the
## benchmark, the filtered DEM is undergeneralized.
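A legend would make the figure self-describing; a small sketch (the labels are illustrative):
# Add a legend matching the colour codes used above
legend("topleft", legend = c("benchmark (10 m)", "filtered"), col = c(3, 4), pch = 1)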
Objective 2: Subtract the semivariance values to get the residual deviance between the two lists. If the residuals equal 0, we will assume the variograms are identical and thus that the filtering has effectively changed the resolution.
diffList = fList - bmList
# Plot the Residuals: #Filter Window 3px
plot(diffList, main = "Residual Semivariance \n un-generalized", ylab = "semivariance",
xlab = "lag in meters", xaxt = "n", yaxt = "n", ylim = c(-8000, 8000))
axis(1, c(0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000))
axis(2, c(-5000, -2500, 0, 2500, 5000))
# Negative residuals indicate overgeneralization; positive residuals
# indicate undergeneralization.
Objective 3: Statistically test whether the slope of the residual line is 0.
# Because all iterations show a large jump in semivariance at about 7000m,
# that part of the data is truncated in order to test whether the slope of
# the remaining line equals 0.
diffList = diffList[c(1:6500)]
# Convert the data into a dataframe so we can run a simple regression.
# dist is simply the lag index in meters (1, 2, ..., length(diffList)):
diffList_DF <- data.frame(semivariance = diffList, dist = seq_along(diffList))
# Create some regression models to explain the semivariance. Fit a
# quadratic model first, because that is what the data looks like:
quadModel <- lm(semivariance ~ dist + I(dist^2), data = diffList_DF)
summary(quadModel)
##
## Call:
## lm(formula = semivariance ~ dist + I(dist^2), data = diffList_DF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150.73 -64.77 -0.87 61.26 148.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.07e+01 2.87e+00 -14.2 <2e-16 ***
## dist 6.34e-01 2.04e-03 310.6 <2e-16 ***
## I(dist^2) -5.85e-05 3.04e-07 -192.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77.2 on 6497 degrees of freedom
## Multiple R-squared: 0.978, Adjusted R-squared: 0.978
## F-statistic: 1.42e+05 on 2 and 6497 DF, p-value: <2e-16
# Plot the quadratic model over the points
plot(diffList, main = "Residual Semivariance \n un-generalized", ylab = "semivariance",
xlab = "lag in meters", xaxt = "n", yaxt = "n", ylim = c(-2000, 2000))
axis(1, c(0, 1000, 2000, 3000, 4000, 5000, 6000))
axis(2, c(-2000, -1000, 0, 1000, 2000))
lines(predict(quadModel))
# Though the data looks like a polynomial, we don't want it to be! Run a
# linear model...
linearModel <- lm(semivariance ~ dist, data = diffList_DF)
# Check the t-test of the model parameters. The null hypothesis is that
# the coefficient (slope) equals 0. For the variograms to be considered
# equivalent, we must fail to reject this null hypothesis.
summary(linearModel)
##
## Call:
## lm(formula = semivariance ~ dist, data = diffList_DF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -359 -150 -66 195 314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.71e+02 4.96e+00 75 <2e-16 ***
## dist 2.54e-01 1.32e-03 192 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200 on 6498 degrees of freedom
## Multiple R-squared: 0.85, Adjusted R-squared: 0.85
## F-statistic: 3.69e+04 on 1 and 6498 DF, p-value: <2e-16
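The slope's p-value can also be pulled out programmatically instead of read off the printout; a short sketch:
# Extract the slope p-value from the coefficient table
slope_p <- coef(summary(linearModel))["dist", "Pr(>|t|)"]
slope_p >= 0.05  #TRUE would mean we fail to reject a zero slope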
plot(diffList, main = "Residual Semivariance \n un-generalized", ylab = "semivariance",
xlab = "lag in meters", xaxt = "n", yaxt = "n", ylim = c(-2000, 2000))
axis(1, c(0, 1000, 2000, 3000, 4000, 5000, 6000))
axis(2, c(-2000, -1000, 0, 1000, 2000))
lines(predict(linearModel))
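Because the linear model is nested in the quadratic one, an F-test can compare them directly; a sketch:
# Nested-model comparison: does the quadratic term add explanatory power?
anova(linearModel, quadModel)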
Objective 4: Are the residual points normally distributed around 0? Even if the slope of the line formed by the points is 0, the residuals must still be distributed around 0. If they sit above 0, the DEM is systematically undergeneralized; if they sit below 0, it is systematically overgeneralized.
This isn't strictly required or meaningful. However, I found it interesting how you can step through a dataset and select every nth observation.
simp_diffList = diffList[seq(1, length(diffList), 25)]  #Reduce the data: select every 25th value. shapiro.test() accepts at most 5000 observations.
plot(simp_diffList)
shapiro.test(simp_diffList) #null hypothesis = the data is normally distributed
##
## Shapiro-Wilk normality test
##
## data: simp_diffList
## W = 0.8451, p-value = 1.973e-15
# If we cannot reject normality (p >= 0.05), print the mean value the
# residuals are distributed around:
normTest = shapiro.test(simp_diffList)
if (normTest$p.value >= 0.05) {
mean(simp_diffList)
}
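Had the residuals passed the normality test, a one-sample t-test against 0 would address Objective 4 directly; a sketch:
# Test whether the residuals are centered on 0 (meaningful only if roughly normal)
t.test(simp_diffList, mu = 0)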
That was fun.