Hypothesis

H1: There are more warehouses/pop in large and medium cities than in smaller cities

Packages and libraries

library(pacman)
p_load(tidyverse, metaverse, eviatlas, citationchaser, PRISMA2020, topictagger, litsearchr, robvis, metaDigitise, synthesisr, bibliometrix, litsearchr, gmodels, ggpubr, kableExtra)

Data loading

# Read the semicolon-delimited source file; numbers in it use "," as the
# decimal mark and "." as the grouping mark.
data <- read_delim(
  "data.csv",
  delim = ";",
  escape_double = FALSE,
  locale = locale(decimal_mark = ",", grouping_mark = "."),
  trim_ws = TRUE
)

Data organization

#str(data)

# Keep the columns needed for H1, collapse "Medium" and "Large" into a single
# "Medium/Large" size class, and derive warehouses per million inhabitants
# (rounded up to a whole number) for both periods.
h1 <- data %>%
  select(
    metro, size,
    number_ware_t0, population_t0,
    number_ware_t1, population_t1
  ) %>%
  mutate(
    size = as.factor(case_when(
      size %in% c("Medium", "Large") ~ "Medium/Large",
      size == "Small" ~ "Small"
    )),
    ware_pop_t0 = ceiling(number_ware_t0 / (population_t0 / 1e6)),
    ware_pop_t1 = ceiling(number_ware_t1 / (population_t1 / 1e6))
  )

# Average warehouses per million inhabitants by size class. Cities with a
# missing t0 rate are excluded from both the t0 and the t1 average.
data_h1 <- h1 %>%
  drop_na(ware_pop_t0) %>%
  group_by(size) %>%
  summarise(
    t0 = mean(ware_pop_t0),
    t1 = mean(ware_pop_t1)
  )

Data Dictionary

Variable Name Description
metro The name of the metropolitan area.
size The size of the metropolitan area (small, medium, or large).
population_t0 The population of the metropolitan area at the start of the period covered by the dataset.
number_ware_t0 The number of warehouses in the metropolitan area at the start of the period covered by the dataset.
population_t1 The population of the metropolitan area at the end of the period covered by the dataset.
number_ware_t1 The number of warehouses in the metropolitan area at the end of the period covered by the dataset.
ware_pop_t0 Number of warehouses per million inhabitants for t0.
ware_pop_t1 Number of warehouses per million inhabitants for t1.

Complete data

# Render every row of h1 as a paper-themed table inside a scrollable box.
scroll_box(
  kable_paper(kbl(h1)),
  width = "900px",
  height = "600px"
)
metro size number_ware_t0 population_t0 number_ware_t1 population_t1 ware_pop_t0 ware_pop_t1
atlanta Medium/Large 132.0 2621089.0 401.00 3603409 51 112
belo horizonte Medium/Large 43.7 781333.3 156.39 5000000 56 32
berlin Medium/Large 18.0 3413084.6 22.00 4341000 6 6
bogota Medium/Large 347.0 8106481.0 475.00 8779734 43 55
bordeaux Small 11.0 583760.0 22.00 721744 19 31
brussels Medium/Large NA 1662000.0 10553.00 2500000 NA 4222
calgary Medium/Large 21.0 1021060.0 59.00 1310000 21 46
cali Medium/Large NA 2083171.0 27.00 2120000 NA 13
chicago Medium/Large 217.0 2676215.0 415.00 3202509 82 130
chongqing Medium/Large 401.0 17801658.0 3490.00 30000000 23 117
flevoland Small 60.0 371572.0 59.00 396879 162 149
gothenburg mea Small 132.0 810000.0 207.00 973000 163 213
gothenburg vgc Medium/Large 261.0 1495000.0 390.00 1615000 175 242
halifax Small 6.0 359183.0 9.00 390328 17 24
los angeles Medium/Large 220.0 12365597.0 515.00 13234696 18 39
montreal Medium/Large 79.0 2605738.0 70.00 2849318 31 25
noord holland Medium/Large 318.0 2614302.0 278.00 2700000 122 103
paris all Medium/Large 713.0 11356295.0 955.00 11900000 63 81
paris parcels Medium/Large 93.0 9485564.0 93.00 11771621 10 8
phoenix Medium/Large 41.0 3251884.0 183.00 4578519 13 40
sao paulo Medium/Large 228.0 15082000.0 2066.00 21600000 16 96
seattle Medium/Large 85.0 622023.0 212.00 789074 137 269
shenzhen Medium/Large 1430.0 9582772.0 1660.00 12000000 150 139
randstad Medium/Large 589.0 7629594.0 583.00 7100000 78 83
tokio Medium/Large 420.0 27106000.0 209.00 36000000 16 6
toronto ggh Medium/Large 217.0 7566300.0 350.00 8463688 29 42
toronto gta Medium/Large 165.0 5081826.0 228.00 6054191 33 38
utrecht Medium/Large 43.0 1222262.0 61.00 1200000 36 51
vancouver Medium/Large 135.0 2224515.0 134.00 2590921 61 52
winnipeg Small 26.0 621457.0 41.00 666832 42 62
zuid holland Medium/Large 168.0 3421459.0 185.00 3600000 50 52
new york Medium/Large 938.0 14983782.0 914.00 16118232 63 57
washington dc Medium/Large 285.0 4709434.0 318.00 5720217 61 56
san francisco Medium/Large 305.0 4123734.0 349.00 4647924 74 76
boston Medium/Large 290.0 2634378.0 294.00 2872310 111 103
philadelphia Medium/Large 288.0 4264068.0 340.00 4564258 68 75
dallas Medium/Large 338.0 2846428.0 402.00 4321973 119 94
miami Medium/Large 193.0 5007956.0 235.00 5969135 39 40
detroit Medium/Large 196.0 2201458.0 210.00 2350511 90 90
houston Medium/Large 221.0 1193312.0 298.00 1762483 186 170
cleveland Medium/Large 148.0 1978890.0 150.00 1898436 75 80
san diego Medium/Large 84.0 2813839.0 86.00 3280850 30 27
st louis Medium/Large 148.0 2148575.0 144.00 2311690 69 63
pittsburgh Medium/Large 92.0 1787955.0 98.00 1711755 52 58
denver Medium/Large 118.0 2316068.0 147.00 2908463 51 51
portland Medium/Large 160.0 1161090.0 163.00 1365871 138 120
tampa Medium/Large 63.0 2396038.0 79.00 2983928 27 27
orlando Medium/Large 75.0 3442581.0 91.00 3985594 22 23
kansas city Medium/Large 159.0 729993.0 153.00 796646 218 193
columbus Medium/Large 208.0 1625491.0 195.00 1785971 128 110
cincinnati Medium/Large 112.0 642221.0 122.00 683709 175 179
indianapolis Medium/Large 121.0 824209.0 171.00 1033843 147 166
milwaukee Medium/Large 101.0 1401336.0 98.00 1459422 73 68
charlotte Medium/Large 124.0 2091897.0 145.00 2712974 60 54
salt lake city Medium/Large 88.0 939169.0 117.00 1164912 94 101
san antonio Medium/Large 47.0 1747863.0 67.00 2405335 27 28
virginia beach Medium/Large 90.0 1931738.0 98.00 2106945 47 47
las vegas Medium/Large 51.0 23541.0 80.00 22513 2167 3554
new orleans Medium/Large 77.0 1337740.0 83.00 1260281 58 66
nashville Medium/Large 116.0 512532.0 121.00 559425 227 217
raleigh Medium/Large 76.0 781161.0 77.00 1238938 98 63
greensboro Medium/Large 88.0 720580.0 88.00 842338 123 105
louisville Medium/Large 81.0 1135588.0 89.00 1244745 72 72
grand rapids Medium/Large 62.0 250981.0 72.00 301251 248 240
buffalo Medium/Large 57.0 1170022.0 57.00 1133002 49 51
austin Medium/Large 38.0 1041753.0 50.00 1534758 37 33
birmingham Medium/Large 47.0 1051340.0 51.00 1144683 45 45
greenville Medium/Large 101.0 453531.0 97.00 566978 223 172
rochester Medium/Large 45.0 259409.0 48.00 263632 174 183
albany Small 54.0 825920.0 48.00 879085 66 55
dayton Small 54.0 269526.0 49.00 271791 201 181
richmond Medium/Large 58.0 990282.0 87.00 1186339 59 74
tulsa Medium/Large 39.0 822560.0 37.00 939783 48 40
tucson Medium/Large 33.0 843702.0 55.00 1009103 40 55
cape town Medium/Large 3899.0 4260700.0 4349.00 4592195 916 948
eThekwini Medium/Large 2673.0 3477000.0 2733.00 3702000 769 739
gauteng Medium/Large 8401.0 11190000.0 8766.00 12910000 751 680
seoul Medium/Large 984.0 10553000.0 3340.00 9963000 94 336

Statistics analysis

We performed a chi-square test to analyse the first hypothesis. To do so, we first categorized the cities into two groups based on their size: small or medium/large. We then calculated the number of warehouses per million inhabitants in each of these groups.

Next, we created a summary table that shows the average number of warehouses per million inhabitants in each size group (Table 1).

# Round both group averages up to whole numbers, then show the result
# (Table 1) as a compact paper-themed table inside a scrollable box.
data_h1 <- data_h1 %>%
  mutate(across(c(t0, t1), ceiling))

scroll_box(
  kable_paper(kbl(data_h1)),
  width = "300px",
  height = "100px"
)
size t0 t1
Medium/Large 142 170
Small 96 103

After that, we performed the chi-square test to determine whether there is a significant difference in warehouses per million inhabitants between the size groups. The null hypothesis for this test is that there is no significant difference between the distribution of the number of warehouses/population in small and medium/large cities. If the p-value of the chi-square test is less than the significance level (usually 0.05), we can reject the null hypothesis and conclude that there is a significant difference between the distribution of the number of warehouses/population in different city sizes.

Chi-square

# Pearson chi-squared test of the t0 rate against the size class.
# NOTE(review): given two vectors, chisq.test() cross-tabulates them, so each
# distinct ware_pop_t0 value is treated as its own category -- confirm that
# this is the intended analysis for a numeric rate.
chisq.test(x = h1$ware_pop_t0, y = h1$size)
## 
##  Pearson's Chi-squared test
## 
## data:  h1$ware_pop_t0 and h1$size
## X-squared = 76, df = 68, p-value = 0.2365
# Pearson chi-squared test of the t1 rate against the size class.
# NOTE(review): given two vectors, chisq.test() cross-tabulates them, so each
# distinct ware_pop_t1 value is treated as its own category -- confirm that
# this is the intended analysis for a numeric rate.
chisq.test(x = h1$ware_pop_t1, y = h1$size)
## 
##  Pearson's Chi-squared test
## 
## data:  h1$ware_pop_t1 and h1$size
## X-squared = 69.839, df = 66, p-value = 0.3499

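Both p-values (0.2365 for t0 and 0.3499 for t1) are greater than 0.05, so we cannot reject the null hypothesis: the test does not provide evidence of a significant difference in warehouse/pop between medium/large cities and small ones.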

T-test

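To test whether ware_pop_t0 is significantly different from ware_pop_t1, we use a paired t-test, because the two samples (ware_pop_t0 and ware_pop_t1) are related: they come from the same cities at different times. The paired test is run as a one-sample t-test on the per-city differences, which is why the output below is labelled "One Sample t-test".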

# Per-city difference, computed as ware_pop_t0 minus ware_pop_t1
# (a negative value means the rate was higher at t1 than at t0)
diff <- with(h1, ware_pop_t0 - ware_pop_t1)

# One-sample t-test of the paired differences against a mean of zero
t.test(x = diff, mu = 0)
## 
##  One Sample t-test
## 
## data:  diff
## t = -1.3891, df = 75, p-value = 0.1689
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -63.28553  11.28553
## sample estimates:
## mean of x 
##       -26

The output of the t-test gives the t-statistic and the p-value. Since the p-value (0.1689) is not less than the significance level (0.05), there is not enough evidence to conclude that time had a significant effect on warehouses per million inhabitants.

Remove outliers and organize the data for plot

# Flag outliers with the 1.5 * IQR rule, separately for each period

# t0: quartiles, fences, then one flag per city
quartiles_t0 <- quantile(h1$ware_pop_t0, probs = c(0.25, 0.75), na.rm = TRUE)
Q1_t0 <- quartiles_t0[1]
Q3_t0 <- quartiles_t0[2]
IQR_t0 <- Q3_t0 - Q1_t0
lower_t0 <- Q1_t0 - 1.5 * IQR_t0
upper_t0 <- Q3_t0 + 1.5 * IQR_t0
# A value is an outlier when it is not inside [lower_t0, upper_t0];
# a missing rate yields a missing flag
h1[["ware_pop_t0_is_outlier"]] <-
  !(h1$ware_pop_t0 >= lower_t0 & h1$ware_pop_t0 <= upper_t0)

# t1: same procedure
quartiles_t1 <- quantile(h1$ware_pop_t1, probs = c(0.25, 0.75), na.rm = TRUE)
Q1_t1 <- quartiles_t1[1]
Q3_t1 <- quartiles_t1[2]
IQR_t1 <- Q3_t1 - Q1_t1
lower_t1 <- Q1_t1 - 1.5 * IQR_t1
upper_t1 <- Q3_t1 + 1.5 * IQR_t1
h1[["ware_pop_t1_is_outlier"]] <-
  !(h1$ware_pop_t1 >= lower_t1 & h1$ware_pop_t1 <= upper_t1)

# Drop cities flagged as outliers in either period, then stack the two rate
# columns into long form. gather() puts the source column name in a key column
# named `ware_pop_t0` and the rates in a value column named `ware_pop_t1`;
# the key is then shortened to "t0" / "t1".
h1_plot <- h1 %>%
  filter(!ware_pop_t0_is_outlier, !ware_pop_t1_is_outlier) %>%
  select(size, ware_pop_t0, ware_pop_t1) %>%
  gather(key = "ware_pop_t0", value = "ware_pop_t1", -size) %>%
  mutate(ware_pop_t0 = sub("^ware_pop_", "", ware_pop_t0))

Boxplot

# Boxplots of the rate for each timeframe, one panel per size class.
# In h1_plot the rates are stored in `ware_pop_t1` and the timeframe label
# ("t0" / "t1") in `ware_pop_t0`.
ggplot(h1_plot, aes(y = ware_pop_t1, fill = ware_pop_t0)) +
  geom_boxplot() +
  facet_wrap(vars(size)) +
  labs(fill = "Timeframe", y = "Warehouse/pop")