# Urban AADT

# Working directory ----
# Relative paths used later in this script (such as the input CSV) are
# resolved against this folder. The default is the original machine-specific
# location; set the AADT_URBAN_DIR environment variable to point the script
# at another copy of the project without editing the code.
project_dir <- Sys.getenv(
  "AADT_URBAN_DIR",
  unset = "C:/Users/mvx13/OneDrive - Texas State University/Hackathon_Jinli/02_Projects/7187/0_AADT datacollect/Texas/1_DataProcess/0_UrbanModel"
)

# Fail early with the offending path in the message instead of setwd()'s
# generic "cannot change working directory" error.
if (!dir.exists(project_dir)) {
  stop("Working directory not found: ", project_dir, call. = FALSE)
}
setwd(project_dir)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
library(SmartEDA)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
# Read the filtered urban AADT table (path is relative to the working
# directory) and print its column names for reference.
dat <- read.csv(file = "AADT_SLD_Urban_Mimi_v2_filtered.csv")
colnames(dat)
##   [1] "OID_"       "FID_TMB_RP" "DIST_NM"    "CNTY_NM"    "TRFC_STATN"
##   [6] "LATEST_AAD" "AADT"       "filter"     "X2023"      "X2022"     
##  [11] "X2021"      "X2020"      "X2019"      "FID_EPA_SL" "GEOID10"   
##  [16] "GEOID20"    "STATEFP"    "COUNTYFP"   "TRACTCE"    "BLKGRPCE"  
##  [21] "CSA"        "CSA_Name"   "CBSA"       "CBSA_Name"  "CBSA_POP"  
##  [26] "CBSA_EMP"   "CBSA_WRK"   "Ac_Total"   "Ac_Water"   "Ac_Land"   
##  [31] "Ac_Unpr"    "TotPop"     "CountHU"    "HH"         "P_WrkAge"  
##  [36] "AutoOwn0"   "Pct_AO0"    "AutoOwn1"   "Pct_AO1"    "AutoOwn2p" 
##  [41] "Pct_AO2p"   "Workers"    "R_LowWageW" "R_MedWageW" "R_HiWageWk"
##  [46] "R_PCTLOWWA" "TotEmp"     "E5_Ret"     "E5_Off"     "E5_Ind"    
##  [51] "E5_Svc"     "E5_Ent"     "E8_Ret"     "E8_off"     "E8_Ind"    
##  [56] "E8_Svc"     "E8_Ent"     "E8_Ed"      "E8_Hlth"    "E8_Pub"    
##  [61] "E_LowWageW" "E_MedWageW" "E_HiWageWk" "E_PctLowWa" "D1A"       
##  [66] "D1B"        "D1C"        "D1C5_RET"   "D1C5_OFF"   "D1C5_IND"  
##  [71] "D1C5_SVC"   "D1C5_ENT"   "D1C8_RET"   "D1C8_OFF"   "D1C8_IND"  
##  [76] "D1C8_SVC"   "D1C8_ENT"   "D1C8_ED"    "D1C8_HLTH"  "D1C8_PUB"  
##  [81] "D1D"        "D1_FLAG"    "D2A_JPHH"   "D2B_E5MIX"  "D2B_E5MIXA"
##  [86] "D2B_E8MIX"  "D2B_E8MIXA" "D2A_EPHHM"  "D2C_TRPMX1" "D2C_TRPMX2"
##  [91] "D2C_TRIPEQ" "D2R_JOBPOP" "D2R_WRKEMP" "D2A_WRKEMP" "D2C_WREMLX"
##  [96] "D3A"        "D3AAO"      "D3AMM"      "D3APO"      "D3B"       
## [101] "D3BAO"      "D3BMM3"     "D3BMM4"     "D3BPO3"     "D3BPO4"    
## [106] "D4A"        "D4B025"     "D4B050"     "D4C"        "D4D"       
## [111] "D4E"        "D5AR"       "D5AE"       "D5BR"       "D5BE"      
## [116] "D5CR"       "D5CRI"      "D5CE"       "D5CEI"      "D5DR"      
## [121] "D5DRI"      "D5DE"       "D5DEI"      "D2A_Ranked" "D2B_Ranked"
## [126] "D3B_Ranked" "D4A_Ranked" "NatWalkInd" "Region"     "Households"
## [131] "Workers_1"  "Residents"  "Drivers"    "Vehicles"   "White"     
## [136] "Male"       "Lowwage"    "Medwage"    "Highwage"   "W_P_Lowwag"
## [141] "W_P_Medwag" "W_P_Highwa" "GasPrice"   "logd1a"     "logd1c"    
## [146] "logd3aao"   "logd3apo"   "d4bo25"     "d5dei_1"    "logd4d"    
## [151] "UPTpercap"  "B_C_consta" "B_C_male"   "B_C_ld1c"   "B_C_drvmve"
## [156] "B_C_ld1a"   "B_C_ld3apo" "B_C_inc1"   "B_C_gasp"   "B_N_consta"
## [161] "B_N_inc2"   "B_N_inc3"   "B_N_white"  "B_N_male"   "B_N_drvmve"
## [166] "B_N_gasp"   "B_N_ld1a"   "B_N_ld1c"   "B_N_ld3aao" "B_N_ld3apo"
## [171] "B_N_d4bo25" "B_N_d5dei"  "B_N_UPTpc"  "C_R_Househ" "C_R_Pop"   
## [176] "C_R_Worker" "C_R_Driver" "C_R_Vehicl" "C_R_White"  "C_R_Male"  
## [181] "C_R_Lowwag" "C_R_Medwag" "C_R_Highwa" "C_R_DrmV"   "NonCom_VMT"
## [186] "Com_VMT_Pe" "VMT_per_wo" "VMT_tot_mi" "VMT_tot_ma" "VMT_tot_av"
## [191] "GHG_per_wo" "Annual_GHG" "SLC_score"  "Y"          "X"         
## [196] "RU"
# Modelling columns ----
# Keep AADT together with the 30 other columns examined in the rest of the
# script.
dat1 <- dat[
  c(
    "AADT", "TotPop", "HH", "AutoOwn1", "AutoOwn2p", "Workers",
    "D1A", "D1B", "D1C5_RET", "D1C5_OFF", "D1C5_IND", "D1C5_SVC",
    "D1C8_RET", "D1C8_OFF", "D1C8_IND", "D1C8_SVC", "D1D",
    "D2A_JPHH", "D2B_E5MIX", "D2B_E8MIX", "D3AAO", "D3AMM",
    "D3APO", "D3BAO", "D3BMM3", "D3BMM4", "D3BPO3", "D3BPO4",
    "D5CR", "D5CE", "Drivers"
  )
]

# Retain rows whose AADT is strictly positive and below 4001. which() drops
# rows where the comparison is NA, matching what subset() does.
dat2 <- dat1[which(dat1$AADT > 0 & dat1$AADT < 4001), ]
c(nrow(dat2), ncol(dat2))
## [1] 27988    31
# Per-variable overview (type 2) with mean, median and variance added.
ExpData(
  data = dat2,
  type = 2,
  fun = c("mean", "median", "var")
)
##    Index Variable_Name Variable_Type Sample_n Missing_Count Per_of_Missing
## 1      1          AADT       integer    27988             0              0
## 2      2        TotPop       integer    27988             0              0
## 3      3            HH       integer    27988             0              0
## 4      4      AutoOwn1       integer    27988             0              0
## 5      5     AutoOwn2p       integer    27988             0              0
## 6      6       Workers       integer    27988             0              0
## 7      7           D1A       numeric    27988             0              0
## 8      8           D1B       numeric    27988             0              0
## 9      9      D1C5_RET       numeric    27988             0              0
## 10    10      D1C5_OFF       numeric    27988             0              0
## 11    11      D1C5_IND       numeric    27988             0              0
## 12    12      D1C5_SVC       numeric    27988             0              0
## 13    13      D1C8_RET       numeric    27988             0              0
## 14    14      D1C8_OFF       numeric    27988             0              0
## 15    15      D1C8_IND       numeric    27988             0              0
## 16    16      D1C8_SVC       numeric    27988             0              0
## 17    17           D1D       numeric    27988             0              0
## 18    18      D2A_JPHH       numeric    27988             0              0
## 19    19     D2B_E5MIX       numeric    27988             0              0
## 20    20     D2B_E8MIX       numeric    27988             0              0
## 21    21         D3AAO       numeric    27988             0              0
## 22    22         D3AMM       numeric    27988             0              0
## 23    23         D3APO       numeric    27988             0              0
## 24    24         D3BAO       numeric    27988             0              0
## 25    25        D3BMM3       numeric    27988             0              0
## 26    26        D3BMM4       numeric    27988             0              0
## 27    27        D3BPO3       numeric    27988             0              0
## 28    28        D3BPO4       numeric    27988             0              0
## 29    29          D5CR       numeric    27988             0              0
## 30    30          D5CE       numeric    27988             0              0
## 31    31       Drivers       numeric    27988             0              0
##    No_of_distinct_values    mean  median        var
## 1                   3900 1256.42  940.00 1154329.86
## 2                   3172 1971.74 1534.00 4409372.71
## 3                   1543  665.20  524.00  417485.75
## 4                    785  211.07  164.00   35179.94
## 5                   1226  418.02  305.00  262112.10
## 6                   1804  835.49  653.00  581459.67
## 7                   8150    1.74    1.29       3.34
## 8                   8157    4.48    3.25      20.62
## 9                   6558    0.23    0.04       0.38
## 10                  6443    0.46    0.03      12.11
## 11                  7304    0.57    0.09       5.88
## 12                  7727    1.27    0.15      34.72
## 13                  6558    0.23    0.04       0.38
## 14                  6360    0.27    0.02       3.33
## 15                  7304    0.57    0.09       5.88
## 16                  7296    0.47    0.06       8.10
## 17                  8173    4.57    2.63     126.74
## 18                  7825    3.29    0.71     311.37
## 19                  7432    0.67    0.73       0.05
## 20                  7537    0.67    0.73       0.05
## 21                  4731    1.47    0.51       5.87
## 22                  7720    2.45    1.65       6.68
## 23                  8173   12.39   12.08      56.74
## 24                  4133    3.74    1.20      49.83
## 25                  6821    9.76    5.90     134.37
## 26                  6267    7.25    2.61     138.81
## 27                  8109   54.52   43.63    2383.28
## 28                  7839   26.96   12.31    1347.68
## 29                  7940    0.01    0.00       0.00
## 30                  7928    0.01    0.00       0.00
## 31                  2711 1353.67 1045.44 2148727.93
# Reproducible row subsample ----
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)

# Draw up to 5,000 row indices of dat2 without replacement. The size is capped
# at nrow(dat2) because sample() raises an error when asked for more draws
# than there are rows (replace = FALSE); with 5,000 or more rows the draw is
# unchanged.
sample_indices <- sample(
  seq_len(nrow(dat2)),
  size = min(5000, nrow(dat2))
)

# Subset dat2 to the sampled rows, keeping all columns.
df_sample <- dat2[sample_indices, , drop = FALSE]


# Reshape the sample to long form: every column is stacked into a
# (variable, value) pair so each variable can be drawn in its own panel.
df_long <- pivot_longer(
  df_sample,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# One density curve per variable; axes are freed per panel because the
# variables are on very different scales.
ggplot(data = df_long, mapping = aes(x = value)) +
  facet_wrap(vars(variable), scales = "free") +
  geom_density(alpha = 0.3, fill = "blue") +
  labs(x = "Value", y = "Density") +
  theme_minimal()

# Density plots for every numeric column ----
# Apply the AADT window (0 < AADT < 4001) to the full table this time.
df_subset <- subset(dat, AADT > 0 & AADT < 4001)

# Keep only numeric columns. vapply() always returns a logical vector (sapply()
# returns a list for a zero-column frame), and drop = FALSE keeps the result a
# data frame even if a single column qualifies.
df_numeric_only <- df_subset[
  ,
  vapply(df_subset, is.numeric, logical(1)),
  drop = FALSE
]

# Stack all numeric columns into (variable, value) pairs for faceting.
df_long1 <- df_numeric_only %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )

# Missing values are deliberately left in place, so ggplot2 reports the rows
# it removes as non-finite when the plot is drawn.
ggplot(df_long1, aes(x = value)) +
  geom_density(fill = "blue", alpha = 0.3) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    x = "Value",
    y = "Density"
  ) +
  theme_minimal()
## Warning: Removed 8428 rows containing non-finite outside the scale range
## (`stat_density()`).

# Compare the AADT distribution of the sample against the filtered data ----

# Tag the filtered dataset.
df_original <- dat2 %>%
  mutate(source = "Original")

# Tag the sampled dataset.
df_smpl <- df_sample %>%
  mutate(source = "Sample")

# Stack both so `source` can drive the fill colour.
df_combined <- bind_rows(df_original, df_smpl)

ggplot(df_combined, aes(x = AADT, fill = source)) +
  geom_density(alpha = 0.4) +
  labs(
    x = "AADT",
    y = "Density",
    fill = "Dataset"
  ) +
  theme_minimal() +
  theme(
    # Place the legend inside the plotting area. Passing a numeric vector to
    # `legend.position` is deprecated since ggplot2 3.5.0; the coordinates now
    # go in `legend.position.inside`.
    # NOTE(review): requires ggplot2 >= 3.5.0. The wording of the
    # stat_density() warning recorded earlier in this file suggests that
    # version is in use; confirm before running on an older installation.
    # Coordinates are in [0,1] from left to right (x) and bottom to top (y).
    legend.position = "inside",
    legend.position.inside = c(0.8, 0.7),

    # Legend appearance: white boxed background, bold title, white keys.
    legend.background = element_rect(fill = "white", colour = "black"),
    legend.title = element_text(face = "bold"),
    legend.key = element_rect(fill = "white")
  )

# Correlation overview of the sampled variables ----
library(corrplot)

# Correlation matrix across all columns of the sample (cor() defaults).
cor1 <- cor(x = df_sample)

# Draw the same matrix twice: as circles, then as printed coefficients.
corrplot(corr = cor1, method = "circle")

corrplot(corr = cor1, method = "number")

# Cross-correlation summary from the lares package, on the same sample.
library(lares)
corr_cross(df_sample)