library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
tomtommetro <- read.csv("tomtommetro.csv")
str(tomtommetro)
## 'data.frame': 492 obs. of 13 variables:
## $ city_key : chr "arequipa" "bogota" "mumbai" "medellin" ...
## $ city_name : chr "Arequipa" "Bogota" "Mumbai" "Medellin" ...
## $ country_code : chr "PE" "CO" "IN" "CO" ...
## $ country_name : chr "Peru" "Colombia" "India" "Colombia" ...
## $ continent : chr "south_america" "south_america" "asia" "south_america" ...
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ congestion_level_pct : num 68.8 67.4 61.5 60.7 60 60 59.7 59.5 58.8 58.4 ...
## $ congestion_level_delta_pct_pts: num 2.1 5 2.3 3 1.2 -0.2 -0.8 -0.4 0.7 -1.4 ...
## $ travel_time_per_10km_min : num 18 20.7 20.8 22.3 18.9 21.5 21.9 21.6 20.2 23.7 ...
## $ delay_time_per_10km_min : num 4.5 5.17 5.2 5.58 4.72 ...
## $ time_lost_per_year_rush_hours : int 122 111 105 101 106 88 112 92 117 89 ...
## $ highway_km_ratio : num 0.0173 0 0.2124 0 0.0184 ...
## $ population : int 1008290 10000000 18203000 2630000 12465000 2528163 1877155 2392877 1295925 22749315 ...
gallup <- read.csv("Gallup430Data.csv")
head(gallup)
## WP5 FIELD_DATE COUNTRYNEW INDEX_LE WP10268 WP16
## 1 United States 7/1/2006 United States Struggling
## 2 United States 7/1/2006 United States
## 3 United States 7/1/2006 United States Thriving
## 4 United States 7/1/2006 United States Thriving
## 5 United States 7/1/2006 United States Thriving Best possible
## 6 United States 7/1/2006 United States Struggling
## WP18
## 1
## 2 (DK)
## 3
## 4
## 5 Best possible
## 6 Best possible
wp16and18 <- read.csv("WP16and18.csv")
head(wp16and18)
## WP5 WP16 WP18
## 1 1 6 8
## 2 1 8 98
## 3 1 8 8
## 4 1 8 8
## 5 1 10 10
## 6 1 5 10
# removing all null values/other artifacts present in the Gallup data (98 represents null value)
gallup$WP16_num <- wp16and18$WP16
gallup$WP18_num <- wp16and18$WP18
head(gallup)
## WP5 FIELD_DATE COUNTRYNEW INDEX_LE WP10268 WP16
## 1 United States 7/1/2006 United States Struggling
## 2 United States 7/1/2006 United States
## 3 United States 7/1/2006 United States Thriving
## 4 United States 7/1/2006 United States Thriving
## 5 United States 7/1/2006 United States Thriving Best possible
## 6 United States 7/1/2006 United States Struggling
## WP18 WP16_num WP18_num
## 1 6 8
## 2 (DK) 8 98
## 3 8 8
## 4 8 8
## 5 Best possible 10 10
## 6 Best possible 5 10
gallup <- gallup %>%
mutate(
WP16_num = ifelse(WP16_num >= 98, NA, WP16_num),
WP18_num = ifelse(WP18_num >= 98, NA, WP18_num)
)
gallupavg <- gallup %>%
# Group data by country
group_by(COUNTRYNEW) %>%
# Calculatingn the mean life satisfaction per country + telling R to ignore NA
summarize(
avg_WP16 = mean(WP16_num, na.rm = TRUE),
avg_WP18 = mean(WP18_num, na.rm = TRUE)
)
head(gallupavg)
## # A tibble: 6 × 3
## COUNTRYNEW avg_WP16 avg_WP18
## <chr> <dbl> <dbl>
## 1 Afghanistan 3.36 4.10
## 2 Albania 5.14 6.32
## 3 Algeria 5.52 6.92
## 4 Angola 4.45 6.89
## 5 Argentina 6.33 7.45
## 6 Armenia 4.64 5.70
# grouping tomtom metro data by country
tomtomavg <- tomtommetro %>%
group_by(country_name) %>%
summarize(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))
head(tomtomavg)
## # A tibble: 6 × 9
## country_name rank congestion_level_pct congestion_level_delta_pct_pts
## <chr> <dbl> <dbl> <dbl>
## 1 Argentina 189. 35.6 2.33
## 2 Australia 230 33.3 1.13
## 3 Austria 324. 28.0 2.42
## 4 Bahrain 253 31.2 -0.2
## 5 Belgium 286. 30.1 0.52
## 6 Botswana 60 44.7 -1.5
## # ℹ 5 more variables: travel_time_per_10km_min <dbl>,
## # delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## # highway_km_ratio <dbl>, population <dbl>
# Merging the two datasets
galluptomtom <- gallupavg %>%
inner_join(tomtomavg, by = c("COUNTRYNEW" = "country_name"))
head(galluptomtom)
## # A tibble: 6 × 11
## COUNTRYNEW avg_WP16 avg_WP18 rank congestion_level_pct congestion_level_del…¹
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argentina 6.33 7.45 189. 35.6 2.33
## 2 Australia 7.32 7.60 230 33.3 1.13
## 3 Austria 7.22 7.51 324. 28.0 2.42
## 4 Bahrain 5.97 7.19 253 31.2 -0.2
## 5 Belgium 7.03 7.31 286. 30.1 0.52
## 6 Botswana 4.02 6.54 60 44.7 -1.5
## # ℹ abbreviated name: ¹​congestion_level_delta_pct_pts
## # ℹ 5 more variables: travel_time_per_10km_min <dbl>,
## # delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## # highway_km_ratio <dbl>, population <dbl>
gdpdata <- read.csv("countrygdp2025.csv")
head(gdpdata)
## X. Country GDP GDP..Full.Value. GDP.Growth
## 1 1 United States $30.62 trillion $30,615,743,000,000 2%
## 2 2 China $19.4 trillion $19,398,577,000,000 4.8%
## 3 3 Germany $5.01 trillion $5,013,574,000,000 0.2%
## 4 4 Japan $4.28 trillion $4,279,828,000,000 1.1%
## 5 5 India $4.13 trillion $4,125,213,000,000 6.6%
## 6 6 United Kingdom $3.96 trillion $3,958,780,000,000 1.3%
## GDP.per.Capita
## 1 $89,599
## 2 $13,806
## 3 $59,925
## 4 $34,713
## 5 $2,818
## 6 $56,661
gdpdata_clean <- gdpdata %>%
mutate(
# removing '$' and ',', then convert to numeric
GDPperCapita_num = as.numeric(gsub("[$,]", "", GDP.per.Capita))
)
head(gdpdata_clean)
## X. Country GDP GDP..Full.Value. GDP.Growth
## 1 1 United States $30.62 trillion $30,615,743,000,000 2%
## 2 2 China $19.4 trillion $19,398,577,000,000 4.8%
## 3 3 Germany $5.01 trillion $5,013,574,000,000 0.2%
## 4 4 Japan $4.28 trillion $4,279,828,000,000 1.1%
## 5 5 India $4.13 trillion $4,125,213,000,000 6.6%
## 6 6 United Kingdom $3.96 trillion $3,958,780,000,000 1.3%
## GDP.per.Capita GDPperCapita_num
## 1 $89,599 89599
## 2 $13,806 13806
## 3 $59,925 59925
## 4 $34,713 34713
## 5 $2,818 2818
## 6 $56,661 56661
final_data <- galluptomtom %>%
inner_join(gdpdata_clean, by = c("COUNTRYNEW" = "Country"))
head(final_data)
## # A tibble: 6 × 17
## COUNTRYNEW avg_WP16 avg_WP18 rank congestion_level_pct congestion_level_del…¹
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argentina 6.33 7.45 189. 35.6 2.33
## 2 Australia 7.32 7.60 230 33.3 1.13
## 3 Austria 7.22 7.51 324. 28.0 2.42
## 4 Bahrain 5.97 7.19 253 31.2 -0.2
## 5 Belgium 7.03 7.31 286. 30.1 0.52
## 6 Botswana 4.02 6.54 60 44.7 -1.5
## # ℹ abbreviated name: ¹​congestion_level_delta_pct_pts
## # ℹ 11 more variables: travel_time_per_10km_min <dbl>,
## # delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## # highway_km_ratio <dbl>, population <dbl>, X. <int>, GDP <chr>,
## # GDP..Full.Value. <chr>, GDP.Growth <chr>, GDP.per.Capita <chr>,
## # GDPperCapita_num <dbl>
# testing hypothesis 1: correlation between traffic and subjective wellbeing
h1_correlation <- cor.test(final_data$time_lost_per_year_rush_hours,
final_data$avg_WP16,
method = "pearson")
print(h1_correlation)
##
## Pearson's product-moment correlation
##
## data: final_data$time_lost_per_year_rush_hours and final_data$avg_WP16
## t = -4.3264, df = 56, p-value = 6.306e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6719408 -0.2781817
## sample estimates:
## cor
## -0.500514
# testing hypothesis 2: hierarchichal regression
# BLOCK 1: baseline model, GDP only
model_1 <- lm(avg_WP16 ~ GDPperCapita_num, data = final_data)
summary(model_1)
##
## Call:
## lm(formula = avg_WP16 ~ GDPperCapita_num, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.61068 -0.26535 0.05777 0.40461 1.13977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.497e+00 1.297e-01 42.400 < 2e-16 ***
## GDPperCapita_num 1.917e-05 2.529e-06 7.579 3.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6146 on 56 degrees of freedom
## Multiple R-squared: 0.5063, Adjusted R-squared: 0.4975
## F-statistic: 57.44 on 1 and 56 DF, p-value: 3.825e-10
# BLOCK 2: base model + traffic data. only time lost was used because of multicollinearity with other tomtom metrics
model_2 <- lm(avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours, data = final_data)
summary(model_2)
##
## Call:
## lm(formula = avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours,
## data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55405 -0.25373 0.05291 0.32927 1.01207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.998e+00 2.902e-01 20.664 < 2e-16 ***
## GDPperCapita_num 1.652e-05 2.830e-06 5.837 2.94e-07 ***
## time_lost_per_year_rush_hours -7.919e-03 4.134e-03 -1.916 0.0606 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6005 on 55 degrees of freedom
## Multiple R-squared: 0.5372, Adjusted R-squared: 0.5204
## F-statistic: 31.92 on 2 and 55 DF, p-value: 6.279e-10
anova(model_1, model_2)
## Analysis of Variance Table
##
## Model 1: avg_WP16 ~ GDPperCapita_num
## Model 2: avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 56 21.154
## 2 55 19.831 1 1.323 3.6693 0.06063 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Traffic-Happiness-Wealth Visualization
combined_plot <- ggplot(final_data, aes(x = time_lost_per_year_rush_hours,
y = avg_WP16,
color = GDPperCapita_num)) +
# dots with color mapping
geom_point(size = 4, alpha = 0.8) +
# trendline and confidence interval
geom_smooth(method = "lm", color = "darkred", fill = "lightcoral", alpha = 0.2) +
# labels
geom_text(aes(label = COUNTRYNEW), vjust = -1.2, size = 3, color = "black", check_overlap = TRUE) +
# color scale from GDP
scale_color_gradient(low = "blue", high = "gold",
labels = scales::dollar,
name = "GDP per Capita") +
# titles and captions
labs(
title = "The Impact of Urban Traffic on National Happiness",
subtitle = "Relationship controlled by national wealth (GDP per Capita)",
x = "Time Lost to Rush Hour Traffic (Hours/Year)",
y = "Average Life Evaluation (0-10)",
caption = "Data Sources: Gallup World Poll & TomTom Traffic Index"
) +
# themes
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
axis.title = element_text(face = "bold"),
panel.grid.minor = element_blank(),
legend.title = element_text(face = "bold")
)
print(combined_plot)
## `geom_smooth()` using formula = 'y ~ x'

nrow(gallupavg)
## [1] 168