library(cartography)
library(maps)
library(dplyr)
library(geoR)
library(readr)
library(ggplot2)
library(leaflet)
library(gstat)
library(mapview)
library(sf)
library(sp)
library(stars)
library(terra)
budapest_weekdays <- read_csv("/Users/jprol/OneDrive/Desktop/Spatial Statistics/Project/Europe Airbnb prices/budapest_weekdays.csv")
## New names:
## • `` -> `...1`
## Rows: 2074 Columns: 20
## ── Column specification ────────────────────────────────────────
## Delimiter: ","
## chr (1): room_type
## dbl (16): ...1, realSum, person_capacity, multi, biz, cleanliness_rating, gu...
## lgl (3): room_shared, room_private, host_is_superhost
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
buda_wk <- as.data.frame(budapest_weekdays[,-1])
# Drop extreme listings so the price distribution is visible
rows_to_keep <- buda_wk$realSum <= 900
buda <- buda_wk[rows_to_keep, ]     # copy to be converted to a SpatialPointsDataFrame
buda_wk <- buda_wk[rows_to_keep, ]
hist(buda_wk$realSum, main = "Rental price histogram", xlab = "Price")
From the above plot, we can see that the distribution of the prices is extremely right-skewed. This is expected with most economic data; salary data behaves much the same way. It is mainly due to the fact that extremely large values are observed as part of the distribution.
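Since log prices are used later when estimating the method-of-moments variogram, a quick sketch (not part of the original output) shows that the log transform pulls in the long right tail:

# Hypothetical check: log prices are far closer to symmetric
hist(log(buda_wk$realSum), main = "Log rental price histogram", xlab = "log(Price)")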
# This code chunk draws boxplots of the rental price for each room type
ggplot(buda_wk, aes(x=room_type,y=realSum,colour=room_type)) +
geom_boxplot(varwidth=TRUE,show.legend=FALSE) +
scale_colour_manual(values=c("firebrick3","navy","gold"),name="Room Type")+
labs(title="Price of Room by Room type") +
xlab("Type") +
ylab("Price")
From the above boxplots, we can see that the typical price of each room type is broadly comparable, though each category shows a long upper tail, so we can expect to see extravagant homes/apartments on the market.
# Promote to a SpatialPointsDataFrame for mapping
coordinates(buda) <- c("lng", "lat")
mapview(buda['realSum'])
# Interactive city map; colour the markers by price
pal <- colorNumeric("YlOrRd", domain = buda$realSum)
map <- leaflet() %>%
  addTiles() %>%
  addCircleMarkers(data = buda,
                   radius = 4,
                   color = ~pal(realSum),
                   popup = ~as.character(realSum))
# Display the map
map
# Remove categorical variables in preparation for aggregation
clust <- buda_wk[,-c(2:4,6:8)]
# needed later
buda_sf <- st_as_sf(buda)
# get coordinates
coords <- st_coordinates(buda_sf)
# k-means clustering of the listing locations into 110 spatial clusters
kn.bnb <- kmeans(coords, 110, nstart = 50, iter.max = 50)  # iter.max raised to avoid convergence warnings
clust$cluster <- kn.bnb$cluster
# average the numeric columns within each cluster
result <- aggregate(cbind(realSum, person_capacity, cleanliness_rating,
                          guest_satisfaction_overall, bedrooms, dist, metro_dist,
                          attr_index, attr_index_norm, rest_index, rest_index_norm,
                          lng, lat) ~ clust$cluster, data = clust, FUN = mean)
# cluster-centroid coordinates (lng, lat)
coords <- result[, 13:14]
# Method of Moments (classical) estimator on log prices, with a 2nd-order trend
semivar.mom <- variog(coords = coords, data = log(result[,2]), estimator.type = "classical", trend = "2nd")
## variog: computing omnidirectional variogram
# Cressie–Hawkins (robust/modulus) estimator on raw prices, with a 1st-order trend
semivar.rob <- variog(coords = coords, data = result[,2], estimator.type = "modulus", trend = "1st")
## variog: computing omnidirectional variogram
plot(semivar.mom,main="MOM Empirical Variogram")
lines(semivar.mom$u,semivar.mom$v)
plot(semivar.rob,main="CH Empirical Variogram")
lines(semivar.rob$u,semivar.rob$v)
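For reference, the two empirical estimators used above are the classical Matheron (method-of-moments) estimator and the Cressie–Hawkins robust ("modulus") estimator, applied to the (detrended) values. Writing $N(h)$ for the set of location pairs separated by lag $h$,

$$\hat\gamma(h) = \frac{1}{2|N(h)|}\sum_{(i,j)\in N(h)}\{Z(s_i)-Z(s_j)\}^2, \qquad \bar\gamma(h) = \frac{\left\{\frac{1}{|N(h)|}\sum_{(i,j)\in N(h)}|Z(s_i)-Z(s_j)|^{1/2}\right\}^4}{2\left(0.457 + 0.494/|N(h)|\right)}.$$

The square-root differences and the usual bias correction in the denominator make the Cressie–Hawkins estimator less sensitive to outliers, which matters here given how skewed the prices are.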
var(semivar.mom$v)
## [1] 0.0003285392
var(semivar.rob$v)
## [1] 1825363
e.fit1<- variofit(vario = semivar.rob , cov.model = "exponential", weights = "cressie")
## variofit: covariance model used is exponential
## variofit: weights used: cressie
## variofit: minimisation function used: optim
## Warning in variofit(vario = semivar.rob, cov.model = "exponential", weights =
## "cressie"): initial values not provided - running the default search
## variofit: searching for best initial value ... selected values:
## sigmasq phi tausq kappa
## initial.value "2417.22" "0.12" "483.44" "0.5"
## status "est" "est" "est" "fix"
## loss value: 135.124147898333
m.fit1<- variofit(vario = semivar.rob , cov.model = "matern", weights = "cressie" , kappa = 1.5)
## variofit: covariance model used is matern
## variofit: weights used: cressie
## variofit: minimisation function used: optim
## Warning in variofit(vario = semivar.rob, cov.model = "matern", weights =
## "cressie", : initial values not provided - running the default search
## variofit: searching for best initial value ... selected values:
## sigmasq phi tausq kappa
## initial.value "3625.83" "0.2" "1208.61" "1.5"
## status "est" "est" "est" "fix"
## loss value: 414.181320346213
# Plot the empirical semivariogram
plot(semivar.rob, main = "Fitted Semivariograms",
xlab = "Distance", ylab = "Semivariance", type = "b")
lines(e.fit1, col = "purple", lwd = 2, lty = "dashed")
lines(m.fit1, col = "yellow", lwd = 2, lty = "dashed")
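The two parametric models being fitted follow the standard geoR parameterization, with nugget $\tau^2$, partial sill $\sigma^2$, and range parameter $\phi$:

$$\gamma_{\text{exp}}(h) = \tau^2 + \sigma^2\left(1 - e^{-h/\phi}\right), \qquad \gamma_{\text{mat}}(h) = \tau^2 + \sigma^2\left(1 - \frac{(h/\phi)^{\kappa}}{2^{\kappa-1}\Gamma(\kappa)}\,K_{\kappa}(h/\phi)\right),$$

where $K_\kappa$ is the modified Bessel function of the second kind and the smoothness $\kappa$ is held fixed ($\kappa = 0.5$ recovers the exponential model; we fixed $\kappa = 1.5$ in the Matérn fit).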
m.fit1.cv <- xvalid(coords=coords, data=result[,2], model=m.fit1)
## xvalid: number of data locations = 110
## xvalid: number of validation locations = 110
## xvalid: performing cross-validation at location ... 1, 2, ..., 110,
## xvalid: end of cross-validation
e.fit1.cv <- xvalid(coords=coords, data=result[,2], model=e.fit1)
## xvalid: number of data locations = 110
## xvalid: number of validation locations = 110
## xvalid: performing cross-validation at location ... 1, 2, ..., 110,
## xvalid: end of cross-validation
# Matérn CV: mean standardized error (want ≈ 0), RMS standardized error (want ≈ 1), RMSE
mean(m.fit1.cv$std.error) ; sqrt(mean(m.fit1.cv$std.error^2)) ; sqrt(mean(m.fit1.cv$error^2))
## [1] -0.000708778
## [1] 1.007691
## [1] 29.95154
# Exponential CV: mean standardized error, RMS standardized error, RMSE
mean(e.fit1.cv$std.error) ; sqrt(mean(e.fit1.cv$std.error^2)) ; sqrt(mean(e.fit1.cv$error^2))
## [1] -0.003451509
## [1] 0.9113396
## [1] 26.81183
Between the two fits, the exponential model attains the lower cross-validation RMSE (26.8 vs. 30.0), so we use it for ordinary kriging over a regular grid covering Budapest.
dat <- data.frame(x = coords[,1], y = coords[,2], realSum = result$realSum)
buda.grid <- expand.grid(seq(18.9, 19.5, by = 0.01), seq(47.4, 47.6, by = 0.01))
buda.krig <- krige.conv(as.geodata(dat), locations = buda.grid,
                        krige = krige.control(type.krige = "ok",
                                              cov.model = "exponential",
                                              cov.pars = c(2200, 0.12)),  # fitted partial sill (~2200.09) and range (0.12)
                        output = output.control(signal = TRUE))
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
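Ordinary kriging predicts at a new location $s_0$ with a weighted average of the observations,

$$\hat Z(s_0) = \sum_{i=1}^{n}\lambda_i Z(s_i), \qquad \sum_{i=1}^{n}\lambda_i = 1,$$

where the weights solve the kriging system built from the fitted semivariogram and minimize the prediction variance. That minimized variance is returned in `krige.var`, whose square root we map in panel (b) below.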
#"2200.09" "0.12"
image(buda.krig, xlab = "Longitude", ylab = "Latitude",
      values = buda.krig$predict, xlim = c(18.6, 19.6), ylim = c(47.3, 47.8), col = heat.colors(15),
      x.leg = c(18.58, 18.8), y.leg = c(47.6, 47.7), main = "(a) Kriging predictions")
image(buda.krig, xlab = "Longitude", ylab = "Latitude",
      values = sqrt(buda.krig$krige.var), xlim = c(18.5, 19.6), ylim = c(47.2, 47.9), col = heat.colors(15),
      x.leg = c(18.5, 18.8), y.leg = c(47.6, 47.7), main = "(b) Kriging standard errors")
From the prediction heat map we can see a circular area where rental prices are particularly high, and a larger circular area, shaded red for low prices, that presumably corresponds to a cheaper neighborhood. The Airbnb prices do appear to follow a nonlinear trend, since they trace circular patterns depending on the area they are in. The predictions also look reliable given how low the cross-validation error is. Finally, the data appear denser in the expensive areas, suggesting other entrepreneurs know where the expensive area is and are trying to break into this market.
vienna_weekdays <- read_csv("/Users/jprol/OneDrive/Desktop/Spatial Statistics/Project/Europe Airbnb prices/vienna_weekdays.csv")
## New names:
## • `` -> `...1`
## Rows: 1738 Columns: 20
## ── Column specification ────────────────────────────────────────
## Delimiter: ","
## chr (1): room_type
## dbl (16): ...1, realSum, person_capacity, multi, biz, cleanliness_rating, gu...
## lgl (3): room_shared, room_private, host_is_superhost
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
vienna_wk<- as.data.frame(vienna_weekdays[,-1])
The Vienna data contain 1738 observations on 19 variables (after dropping the row index): a mix of numeric, character, and boolean fields describing characteristics of the individual Airbnb listings. Our response variable, labeled 'realSum', represents the price of a night at the time of sampling. The last two columns of our dataset contain the geolocation coordinates of the observations.
# Initial histogram of the response; a handful of extreme prices dominate the scale
hist(vienna_wk$realSum)
rows_to_keep <- vienna_wk$realSum <= 900
# two sets are created for later analysis
v_wk <- vienna_wk[rows_to_keep, ]
vienna_wk <- vienna_wk[rows_to_keep, ] #To be turned into SPD
# Re-check the histogram
hist(vienna_wk$realSum, main = "Rental price histogram", xlab = "Price")
Because several outlying points prevent us from seeing the shape of the distribution of our response variable, we delete those observations: Airbnbs whose prices differ drastically from the rest. Similar to Budapest, we observe that the remaining prices are heavily right-skewed.
# This code chunk draws boxplots of the rental price for each room type
ggplot(vienna_wk, aes(x=room_type,y=realSum,colour=room_type)) +
geom_boxplot(varwidth=TRUE,show.legend=FALSE) +
scale_colour_manual(values=c("firebrick3","pink","lightblue"),name="Room Type")+
labs(title="Price of Room by Room type") +
xlab("Type") +
ylab("Price")
# Average nightly price by room type: Budapest vs. Vienna
mean_values <- aggregate(realSum ~ room_type, data = buda_wk, FUN = mean)
avg_values <- aggregate(realSum ~ room_type, data = vienna_wk, FUN = mean)
print(mean_values)
## room_type realSum
## 1 Entire home/apt 171.5704
## 2 Private room 102.3721
## 3 Shared room 124.5141
print(avg_values)
## room_type realSum
## 1 Entire home/apt 245.1376
## 2 Private room 156.2920
## 3 Shared room 169.8271
As in Budapest, booking an entire home in Vienna is more expensive than booking a private room or a shared room. In addition, booking an entire home is on average more expensive per night in Vienna than in Budapest: the mean is about $245/night for Vienna versus roughly $172/night for Budapest, a sizable difference.
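As a quick, hypothetical check of that gap (not part of the original analysis), a Welch two-sample t-test on the entire-home prices of the two cities could be run as follows:

# Hypothetical significance check for the entire-home price gap
buda_home   <- buda_wk$realSum[buda_wk$room_type == "Entire home/apt"]
vienna_home <- vienna_wk$realSum[vienna_wk$room_type == "Entire home/apt"]
t.test(vienna_home, buda_home)   # Welch two-sample t-test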
# create spatialpointsdataframe to plot
coordinates(vienna_wk) <- c("lng", "lat")
# Mapped points
mapview(vienna_wk['realSum'])
# City map; colour the markers by price
pal <- colorNumeric("YlOrRd", domain = vienna_wk$realSum)
mapa <- leaflet() %>%
  addTiles() %>%
  addCircleMarkers(data = vienna_wk,
                   radius = 4,
                   color = ~pal(realSum),
                   popup = ~as.character(realSum))
# Display the map
mapa
As seen in the map, many of our observations have coordinate points that are either extremely close to each other or overlapping one another. We knew that when computing the distance matrix and ultimately attempting kriging predictions this would pose an issue: near-duplicate locations yield a near-singular covariance matrix, leading to numerical instability and inaccurate results. To mitigate this, we set a cutoff distance within the pairwise distance matrix and identified the points falling below that threshold (a sketch of this check appears below). After many iterations of attempting to remove the minimum-distance points, we were not successful.
We then sampled 100 observations from our data to see if this would alleviate the issue. It helped; however, we still had points overlapping. Since this smaller data set could be inspected visually, we identified and removed the pairs that were very close together or overlapping, and ended up with a clean map. Upon attempting cross-validation, we still encountered errors, so we concluded that the locations really were that tightly clustered.
To overcome this challenge, we applied the k-means clustering algorithm and aggregated the listings within each cluster. This ensured numerical stability in the cross-validation stage as well as in the kriging predictions, and it improved both performance and the interpretability of our model.
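A minimal sketch of the distance-cutoff check described above (illustrative cutoff; not the exact code we ran):

# Flag locations that have a near-duplicate neighbour
pts <- coordinates(vienna_wk)      # lng/lat matrix from the SpatialPointsDataFrame
d <- as.matrix(dist(pts))          # pairwise distances (in degrees)
diag(d) <- Inf                     # ignore self-distances
too_close <- which(apply(d, 1, min) < 1e-4)   # hypothetical cutoff of 1e-4 degrees
length(too_close)                  # number of problematic locations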
# Remove categorical variables in preparation for analysis
new_data <- v_wk[,-c(2:4,6:8)]
vienna_sf <- st_as_sf(vienna_wk)
# get coordinates
vcoords <- st_coordinates(vienna_sf)
# k-means clustering of the listing locations into 110 spatial clusters
kn.bnb <- kmeans(vcoords, 110, nstart = 50)
new_data$cluster <- kn.bnb$cluster
# average the numeric columns within each cluster
result <- aggregate(cbind(realSum, person_capacity, cleanliness_rating,
                          guest_satisfaction_overall, bedrooms, dist, metro_dist,
                          attr_index, attr_index_norm, rest_index, rest_index_norm,
                          lng, lat) ~ new_data$cluster, data = new_data, FUN = mean)
head(result)
## new_data$cluster realSum person_capacity cleanliness_rating
## 1 1 190.5118 3.400000 9.466667
## 2 2 222.4014 3.375000 9.458333
## 3 3 207.0165 2.928571 9.571429
## 4 4 230.4420 3.950000 9.400000
## 5 5 228.3755 3.352941 8.764706
## 6 6 193.6387 3.222222 9.666667
## guest_satisfaction_overall bedrooms dist metro_dist attr_index
## 1 93.06667 1.000000 4.195142 0.3841191 103.30166
## 2 93.37500 1.166667 2.510854 0.4311151 115.64913
## 3 92.78571 1.071429 3.809899 0.4873938 104.36159
## 4 94.80000 1.250000 3.097549 0.2576374 107.46473
## 5 90.47059 1.117647 3.248358 0.6384084 92.19357
## 6 92.55556 1.111111 6.066658 0.4833603 67.50407
## attr_index_norm rest_index rest_index_norm lng lat
## 1 7.398023 105.98238 2.533585 16.32082 48.19532
## 2 8.282296 158.72710 3.794486 16.36871 48.18622
## 3 7.473931 121.03319 2.893386 16.33051 48.19012
## 4 7.696164 157.36629 3.761955 16.34140 48.19097
## 5 6.602509 129.59863 3.098149 16.33000 48.20946
## 6 4.834353 66.83315 1.597695 16.32192 48.16642
coords <- result[, 13:14]
# Method of Moments (classical) estimator on log prices, with a 2nd-order trend
semivar.mom <- variog(coords = coords, data = log(result[,2]), estimator.type = "classical", trend = "2nd",
                      max.dist = 0.225)
## variog: computing omnidirectional variogram
# Cressie–Hawkins (robust/modulus) estimator on raw prices, with a 1st-order trend
semivar.rob <- variog(coords = coords, data = result[,2], estimator.type = "modulus", trend = "1st",
                      max.dist = 0.225)
## variog: computing omnidirectional variogram
plot(semivar.mom,main="MOM Empirical Variogram")
plot(semivar.rob,main="CH Empirical Variogram")
var(semivar.mom$v)
## [1] 0.00620582
var(semivar.rob$v)
## [1] 40088332
e.fit1<- variofit(vario = semivar.rob , cov.model = "exponential", weights = "cressie")
## variofit: covariance model used is exponential
## variofit: weights used: cressie
## variofit: minimisation function used: optim
## Warning in variofit(vario = semivar.rob, cov.model = "exponential", weights =
## "cressie"): initial values not provided - running the default search
## variofit: searching for best initial value ... selected values:
## sigmasq phi tausq kappa
## initial.value "9714.57" "0.14" "0" "0.5"
## status "est" "est" "est" "fix"
## loss value: 1611.59443447267
m.fit1<- variofit(vario = semivar.rob , cov.model = "matern", weights = "cressie" ,
kappa = 1.5)
## variofit: covariance model used is matern
## variofit: weights used: cressie
## variofit: minimisation function used: optim
## Warning in variofit(vario = semivar.rob, cov.model = "matern", weights =
## "cressie", : initial values not provided - running the default search
## variofit: searching for best initial value ... selected values:
## sigmasq phi tausq kappa
## initial.value "19429.14" "0.17" "1942.91" "1.5"
## status "est" "est" "est" "fix"
## loss value: 1187.07136806912
# Plot the empirical semivariogram
plot(semivar.rob, main = "Fitted Semivariograms",
xlab = "Distance", ylab = "Semivariance")
lines(e.fit1, col = "red", lwd = 2, lty = "dashed")
lines(m.fit1, col = "lightblue", lwd = 2, lty = "dashed")
m.fit1.cv <- xvalid(coords=coords, data=result[,2], model=m.fit1)
## xvalid: number of data locations = 110
## xvalid: number of validation locations = 110
## xvalid: performing cross-validation at location ... 1, 2, ..., 110,
## xvalid: end of cross-validation
e.fit1.cv <- xvalid(coords=coords, data=result[,2], model=e.fit1)
## xvalid: number of data locations = 110
## xvalid: number of validation locations = 110
## xvalid: performing cross-validation at location ... 1, 2, ..., 110,
## xvalid: end of cross-validation
# Matérn CV: mean standardized error (want ≈ 0), RMS standardized error (want ≈ 1), RMSE
mean(m.fit1.cv$std.error) ; sqrt(mean(m.fit1.cv$std.error^2)) ; sqrt(mean(m.fit1.cv$error^2))
## [1] 0.002534319
## [1] 1.430168
## [1] 53.44307
# Exponential CV: mean standardized error, RMS standardized error, RMSE
mean(e.fit1.cv$std.error) ; sqrt(mean(e.fit1.cv$std.error^2)) ; sqrt(mean(e.fit1.cv$error^2))
## [1] -0.002996479
## [1] 1.599902
## [1] 53.58407
We conducted cross-validation on both fitted semivariograms, the Matérn and the exponential models. The three reported metrics are the mean standardized error (ideally near 0), the root mean squared standardized error (ideally near 1), and the RMSE. The Matérn fit performed slightly better, with a lower RMSE (53.44 vs. 53.58) and a root mean squared standardized error closer to 1 (1.43 vs. 1.60). Consequently, we proceeded with kriging predictions based on the Matérn fit, yielding the following results.
df <- data.frame(x = coords[,1], y = coords[,2], realSum = result$realSum)
vienna.grid <- expand.grid(seq(16.2, 16.8, by = 0.01), seq(48, 48.45, by = 0.01))
vienna.krig <- krige.conv(as.geodata(df), locations = vienna.grid,
                          krige = krige.control(type.krige = "ok",
                                                cov.model = "matern",
                                                cov.pars = c(3149.57, 0.14)),  # fitted partial sill and range
                          output = output.control(signal = TRUE))
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
image(vienna.krig, xlab = "Longitude", ylab = "Latitude",
      values = vienna.krig$predict, xlim = c(16.2, 16.8), ylim = c(48, 48.5), col = heat.colors(15),
      x.leg = c(16.3, 16.8), y.leg = c(48.4, 48.45), main = "(a) Kriging predictions")
image(vienna.krig, xlab = "Longitude", ylab = "Latitude",
      values = sqrt(vienna.krig$krige.var), xlim = c(16.2, 16.8), ylim = c(48, 48.5), col = heat.colors(15),
      x.leg = c(16.3, 16.8), y.leg = c(48.4, 48.45), main = "(b) Kriging standard errors")
# Sampling locations (cluster centroids), for comparison with the error map
plot(coords)
Based on our kriging prediction maps, we identified higher Airbnb rental prices on the southern side of Vienna, with a few clusters of lower-priced predictions inside the higher-priced areas. There is also a discernible contrast between the lower- and higher-priced regions, with few exceptions. Our hypothesis is that the higher-priced Airbnbs are penetrating the lower-priced regions; one can clearly see how they tend to split those regions.
Upon reviewing our mean squared prediction error heat map, we observed reduced error levels within a particular circular region. This is interesting because it suggests a higher density of data in the regions with higher Airbnb prices. To check this, we plotted our coordinates on a regular grid and compared them with the mean squared prediction error (MSPE) heat map: a substantial portion of our sampling locations coincided with the area exhibiting the highest prediction accuracy.
In summary, we generated predictive heat maps for two European cities using ordinary kriging. By evaluating cross-validation metrics and fitting a suitable semivariogram, we identified areas of opportunity. This information equips interested investors with the tools to make informed decisions on property purchases, considering both predicted prices and their associated errors. To address the challenges posed by a near-singular covariance matrix, we employed the k-means algorithm, which facilitated data aggregation, identification of similarities, and derivation of the theoretical semivariogram.
Our spatial analysis could be improved by estimating the mean function of the data more accurately. The circular patterns in our kriging heat maps are evidence of a strongly nonlinear spatial trend in the coordinates. Fitting the semivariograms to the residuals of a well-specified mean function would likely improve our accuracy considerably and reveal more patterns in the heat maps.
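As a minimal sketch of that suggestion (assuming the aggregated Vienna data in `result`), one could fit an explicit second-order mean surface and estimate the semivariogram from its residuals. `variog`'s `trend` argument performs a similar detrending internally, but an explicit fit makes the residuals available for diagnostics:

# Hypothetical residual-based variogram (illustrative, not part of the original analysis)
trend.fit <- lm(realSum ~ lng + lat + I(lng^2) + I(lat^2) + I(lng * lat), data = result)
semivar.res <- variog(coords = result[, c("lng", "lat")], data = resid(trend.fit),
                      estimator.type = "modulus", max.dist = 0.225)
plot(semivar.res, main = "Residual Empirical Variogram")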