library(readxl)   
library(dplyr)   
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
tomtommetro <- read.csv("tomtommetro.csv")
str(tomtommetro)
## 'data.frame':    492 obs. of  13 variables:
##  $ city_key                      : chr  "arequipa" "bogota" "mumbai" "medellin" ...
##  $ city_name                     : chr  "Arequipa" "Bogota" "Mumbai" "Medellin" ...
##  $ country_code                  : chr  "PE" "CO" "IN" "CO" ...
##  $ country_name                  : chr  "Peru" "Colombia" "India" "Colombia" ...
##  $ continent                     : chr  "south_america" "south_america" "asia" "south_america" ...
##  $ rank                          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ congestion_level_pct          : num  68.8 67.4 61.5 60.7 60 60 59.7 59.5 58.8 58.4 ...
##  $ congestion_level_delta_pct_pts: num  2.1 5 2.3 3 1.2 -0.2 -0.8 -0.4 0.7 -1.4 ...
##  $ travel_time_per_10km_min      : num  18 20.7 20.8 22.3 18.9 21.5 21.9 21.6 20.2 23.7 ...
##  $ delay_time_per_10km_min       : num  4.5 5.17 5.2 5.58 4.72 ...
##  $ time_lost_per_year_rush_hours : int  122 111 105 101 106 88 112 92 117 89 ...
##  $ highway_km_ratio              : num  0.0173 0 0.2124 0 0.0184 ...
##  $ population                    : int  1008290 10000000 18203000 2630000 12465000 2528163 1877155 2392877 1295925 22749315 ...
gallup <- read.csv("Gallup430Data.csv")
head(gallup)
##             WP5 FIELD_DATE    COUNTRYNEW   INDEX_LE WP10268          WP16
## 1 United States   7/1/2006 United States Struggling                      
## 2 United States   7/1/2006 United States                                 
## 3 United States   7/1/2006 United States   Thriving                      
## 4 United States   7/1/2006 United States   Thriving                      
## 5 United States   7/1/2006 United States   Thriving         Best possible
## 6 United States   7/1/2006 United States Struggling                      
##            WP18
## 1              
## 2          (DK)
## 3              
## 4              
## 5 Best possible
## 6 Best possible
wp16and18 <- read.csv("WP16and18.csv")
head(wp16and18)
##   WP5 WP16 WP18
## 1   1    6    8
## 2   1    8   98
## 3   1    8    8
## 4   1    8    8
## 5   1   10   10
## 6   1    5   10
# removing all null values/other artifacts present in the Gallup data (98 represents null value)
gallup$WP16_num <- wp16and18$WP16
gallup$WP18_num <- wp16and18$WP18
head(gallup)
##             WP5 FIELD_DATE    COUNTRYNEW   INDEX_LE WP10268          WP16
## 1 United States   7/1/2006 United States Struggling                      
## 2 United States   7/1/2006 United States                                 
## 3 United States   7/1/2006 United States   Thriving                      
## 4 United States   7/1/2006 United States   Thriving                      
## 5 United States   7/1/2006 United States   Thriving         Best possible
## 6 United States   7/1/2006 United States Struggling                      
##            WP18 WP16_num WP18_num
## 1                      6        8
## 2          (DK)        8       98
## 3                      8        8
## 4                      8        8
## 5 Best possible       10       10
## 6 Best possible        5       10
gallup <- gallup %>%
  mutate(
    WP16_num = ifelse(WP16_num >= 98, NA, WP16_num),
    WP18_num = ifelse(WP18_num >= 98, NA, WP18_num)
  )
  
gallupavg <- gallup %>%
  

  # Group data by country
  group_by(COUNTRYNEW) %>%
  
  # Calculatingn the mean life satisfaction per country + telling R to ignore NA 
  summarize(
    avg_WP16 = mean(WP16_num, na.rm = TRUE),
    avg_WP18 = mean(WP18_num, na.rm = TRUE)
  )

head(gallupavg)
## # A tibble: 6 × 3
##   COUNTRYNEW  avg_WP16 avg_WP18
##   <chr>          <dbl>    <dbl>
## 1 Afghanistan     3.36     4.10
## 2 Albania         5.14     6.32
## 3 Algeria         5.52     6.92
## 4 Angola          4.45     6.89
## 5 Argentina       6.33     7.45
## 6 Armenia         4.64     5.70
# grouping tomtom metro data by country
tomtomavg <- tomtommetro %>%
  group_by(country_name) %>%
  summarize(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))

head(tomtomavg)
## # A tibble: 6 × 9
##   country_name  rank congestion_level_pct congestion_level_delta_pct_pts
##   <chr>        <dbl>                <dbl>                          <dbl>
## 1 Argentina     189.                 35.6                           2.33
## 2 Australia     230                  33.3                           1.13
## 3 Austria       324.                 28.0                           2.42
## 4 Bahrain       253                  31.2                          -0.2 
## 5 Belgium       286.                 30.1                           0.52
## 6 Botswana       60                  44.7                          -1.5 
## # ℹ 5 more variables: travel_time_per_10km_min <dbl>,
## #   delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## #   highway_km_ratio <dbl>, population <dbl>
# Merging the two datasets
galluptomtom <- gallupavg %>%
  inner_join(tomtomavg, by = c("COUNTRYNEW" = "country_name"))
head(galluptomtom)
## # A tibble: 6 × 11
##   COUNTRYNEW avg_WP16 avg_WP18  rank congestion_level_pct congestion_level_del…¹
##   <chr>         <dbl>    <dbl> <dbl>                <dbl>                  <dbl>
## 1 Argentina      6.33     7.45  189.                 35.6                   2.33
## 2 Australia      7.32     7.60  230                  33.3                   1.13
## 3 Austria        7.22     7.51  324.                 28.0                   2.42
## 4 Bahrain        5.97     7.19  253                  31.2                  -0.2 
## 5 Belgium        7.03     7.31  286.                 30.1                   0.52
## 6 Botswana       4.02     6.54   60                  44.7                  -1.5 
## # ℹ abbreviated name: ¹​congestion_level_delta_pct_pts
## # ℹ 5 more variables: travel_time_per_10km_min <dbl>,
## #   delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## #   highway_km_ratio <dbl>, population <dbl>
gdpdata <- read.csv("countrygdp2025.csv")
head(gdpdata)
##   X.        Country             GDP    GDP..Full.Value. GDP.Growth
## 1  1  United States $30.62 trillion $30,615,743,000,000         2%
## 2  2          China  $19.4 trillion $19,398,577,000,000       4.8%
## 3  3        Germany  $5.01 trillion  $5,013,574,000,000       0.2%
## 4  4          Japan  $4.28 trillion  $4,279,828,000,000       1.1%
## 5  5          India  $4.13 trillion  $4,125,213,000,000       6.6%
## 6  6 United Kingdom  $3.96 trillion  $3,958,780,000,000       1.3%
##   GDP.per.Capita
## 1        $89,599
## 2        $13,806
## 3        $59,925
## 4        $34,713
## 5         $2,818
## 6        $56,661
gdpdata_clean <- gdpdata %>%
  mutate(
    # removing '$' and ',', then convert to numeric
    GDPperCapita_num = as.numeric(gsub("[$,]", "", GDP.per.Capita))
  )

head(gdpdata_clean)
##   X.        Country             GDP    GDP..Full.Value. GDP.Growth
## 1  1  United States $30.62 trillion $30,615,743,000,000         2%
## 2  2          China  $19.4 trillion $19,398,577,000,000       4.8%
## 3  3        Germany  $5.01 trillion  $5,013,574,000,000       0.2%
## 4  4          Japan  $4.28 trillion  $4,279,828,000,000       1.1%
## 5  5          India  $4.13 trillion  $4,125,213,000,000       6.6%
## 6  6 United Kingdom  $3.96 trillion  $3,958,780,000,000       1.3%
##   GDP.per.Capita GDPperCapita_num
## 1        $89,599            89599
## 2        $13,806            13806
## 3        $59,925            59925
## 4        $34,713            34713
## 5         $2,818             2818
## 6        $56,661            56661
final_data <- galluptomtom %>%
  inner_join(gdpdata_clean, by = c("COUNTRYNEW" = "Country"))

head(final_data)
## # A tibble: 6 × 17
##   COUNTRYNEW avg_WP16 avg_WP18  rank congestion_level_pct congestion_level_del…¹
##   <chr>         <dbl>    <dbl> <dbl>                <dbl>                  <dbl>
## 1 Argentina      6.33     7.45  189.                 35.6                   2.33
## 2 Australia      7.32     7.60  230                  33.3                   1.13
## 3 Austria        7.22     7.51  324.                 28.0                   2.42
## 4 Bahrain        5.97     7.19  253                  31.2                  -0.2 
## 5 Belgium        7.03     7.31  286.                 30.1                   0.52
## 6 Botswana       4.02     6.54   60                  44.7                  -1.5 
## # ℹ abbreviated name: ¹​congestion_level_delta_pct_pts
## # ℹ 11 more variables: travel_time_per_10km_min <dbl>,
## #   delay_time_per_10km_min <dbl>, time_lost_per_year_rush_hours <dbl>,
## #   highway_km_ratio <dbl>, population <dbl>, X. <int>, GDP <chr>,
## #   GDP..Full.Value. <chr>, GDP.Growth <chr>, GDP.per.Capita <chr>,
## #   GDPperCapita_num <dbl>
# testing hypothesis 1: correlation between traffic and subjective wellbeing
h1_correlation <- cor.test(final_data$time_lost_per_year_rush_hours, 
                           final_data$avg_WP16, 
                           method = "pearson")

print(h1_correlation)
## 
##  Pearson's product-moment correlation
## 
## data:  final_data$time_lost_per_year_rush_hours and final_data$avg_WP16
## t = -4.3264, df = 56, p-value = 6.306e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6719408 -0.2781817
## sample estimates:
##       cor 
## -0.500514
# testing hypothesis 2: hierarchichal regression
# BLOCK 1: baseline model, GDP only 
model_1 <- lm(avg_WP16 ~ GDPperCapita_num, data = final_data)
summary(model_1)
## 
## Call:
## lm(formula = avg_WP16 ~ GDPperCapita_num, data = final_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.61068 -0.26535  0.05777  0.40461  1.13977 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.497e+00  1.297e-01  42.400  < 2e-16 ***
## GDPperCapita_num 1.917e-05  2.529e-06   7.579 3.82e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6146 on 56 degrees of freedom
## Multiple R-squared:  0.5063, Adjusted R-squared:  0.4975 
## F-statistic: 57.44 on 1 and 56 DF,  p-value: 3.825e-10
# BLOCK 2: base model + traffic data. only time lost was used because of multicollinearity with other tomtom metrics
model_2 <- lm(avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours, data = final_data)
summary(model_2)
## 
## Call:
## lm(formula = avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours, 
##     data = final_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55405 -0.25373  0.05291  0.32927  1.01207 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.998e+00  2.902e-01  20.664  < 2e-16 ***
## GDPperCapita_num               1.652e-05  2.830e-06   5.837 2.94e-07 ***
## time_lost_per_year_rush_hours -7.919e-03  4.134e-03  -1.916   0.0606 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6005 on 55 degrees of freedom
## Multiple R-squared:  0.5372, Adjusted R-squared:  0.5204 
## F-statistic: 31.92 on 2 and 55 DF,  p-value: 6.279e-10
anova(model_1, model_2)
## Analysis of Variance Table
## 
## Model 1: avg_WP16 ~ GDPperCapita_num
## Model 2: avg_WP16 ~ GDPperCapita_num + time_lost_per_year_rush_hours
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     56 21.154                              
## 2     55 19.831  1     1.323 3.6693 0.06063 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)

# Traffic-Happiness-Wealth Visualization
combined_plot <- ggplot(final_data, aes(x = time_lost_per_year_rush_hours, 
                                        y = avg_WP16, 
                                        color = GDPperCapita_num)) +
  
  # dots with color mapping 
  geom_point(size = 4, alpha = 0.8) +
  
  # trendline and confidence interval
  geom_smooth(method = "lm", color = "darkred", fill = "lightcoral", alpha = 0.2) +
  
  # labels
  geom_text(aes(label = COUNTRYNEW), vjust = -1.2, size = 3, color = "black", check_overlap = TRUE) +
  
  # color scale from GDP
  scale_color_gradient(low = "blue", high = "gold", 
                       labels = scales::dollar, 
                       name = "GDP per Capita") +
  
  # titles and captions
  labs(
    title = "The Impact of Urban Traffic on National Happiness",
    subtitle = "Relationship controlled by national wealth (GDP per Capita)",
    x = "Time Lost to Rush Hour Traffic (Hours/Year)",
    y = "Average Life Evaluation (0-10)",
    caption = "Data Sources: Gallup World Poll & TomTom Traffic Index"
  ) +
  
  # themes
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    legend.title = element_text(face = "bold")
  )

print(combined_plot)
## `geom_smooth()` using formula = 'y ~ x'

nrow(gallupavg)
## [1] 168