# NOTE(review): hard-coded, machine-specific working directory. The relative
# path passed to read.csv() below is resolved against it, so the script only
# runs on a machine with this exact folder layout.
setwd("C:/Users/mvx13/OneDrive - Texas State University/Hackathon_Jinli/02_Projects/7187/0_AADT datacollect/Texas/1_DataProcess/0_UrbanModel")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
library(SmartEDA)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
# Read the filtered input table; the relative path is resolved against the
# current working directory.
dat <- read.csv(file = "AADT_SLD_Urban_Mimi_v2_filtered.csv")
# Print the column names of the loaded table.
colnames(dat)
## [1] "OID_" "FID_TMB_RP" "DIST_NM" "CNTY_NM" "TRFC_STATN"
## [6] "LATEST_AAD" "AADT" "filter" "X2023" "X2022"
## [11] "X2021" "X2020" "X2019" "FID_EPA_SL" "GEOID10"
## [16] "GEOID20" "STATEFP" "COUNTYFP" "TRACTCE" "BLKGRPCE"
## [21] "CSA" "CSA_Name" "CBSA" "CBSA_Name" "CBSA_POP"
## [26] "CBSA_EMP" "CBSA_WRK" "Ac_Total" "Ac_Water" "Ac_Land"
## [31] "Ac_Unpr" "TotPop" "CountHU" "HH" "P_WrkAge"
## [36] "AutoOwn0" "Pct_AO0" "AutoOwn1" "Pct_AO1" "AutoOwn2p"
## [41] "Pct_AO2p" "Workers" "R_LowWageW" "R_MedWageW" "R_HiWageWk"
## [46] "R_PCTLOWWA" "TotEmp" "E5_Ret" "E5_Off" "E5_Ind"
## [51] "E5_Svc" "E5_Ent" "E8_Ret" "E8_off" "E8_Ind"
## [56] "E8_Svc" "E8_Ent" "E8_Ed" "E8_Hlth" "E8_Pub"
## [61] "E_LowWageW" "E_MedWageW" "E_HiWageWk" "E_PctLowWa" "D1A"
## [66] "D1B" "D1C" "D1C5_RET" "D1C5_OFF" "D1C5_IND"
## [71] "D1C5_SVC" "D1C5_ENT" "D1C8_RET" "D1C8_OFF" "D1C8_IND"
## [76] "D1C8_SVC" "D1C8_ENT" "D1C8_ED" "D1C8_HLTH" "D1C8_PUB"
## [81] "D1D" "D1_FLAG" "D2A_JPHH" "D2B_E5MIX" "D2B_E5MIXA"
## [86] "D2B_E8MIX" "D2B_E8MIXA" "D2A_EPHHM" "D2C_TRPMX1" "D2C_TRPMX2"
## [91] "D2C_TRIPEQ" "D2R_JOBPOP" "D2R_WRKEMP" "D2A_WRKEMP" "D2C_WREMLX"
## [96] "D3A" "D3AAO" "D3AMM" "D3APO" "D3B"
## [101] "D3BAO" "D3BMM3" "D3BMM4" "D3BPO3" "D3BPO4"
## [106] "D4A" "D4B025" "D4B050" "D4C" "D4D"
## [111] "D4E" "D5AR" "D5AE" "D5BR" "D5BE"
## [116] "D5CR" "D5CRI" "D5CE" "D5CEI" "D5DR"
## [121] "D5DRI" "D5DE" "D5DEI" "D2A_Ranked" "D2B_Ranked"
## [126] "D3B_Ranked" "D4A_Ranked" "NatWalkInd" "Region" "Households"
## [131] "Workers_1" "Residents" "Drivers" "Vehicles" "White"
## [136] "Male" "Lowwage" "Medwage" "Highwage" "W_P_Lowwag"
## [141] "W_P_Medwag" "W_P_Highwa" "GasPrice" "logd1a" "logd1c"
## [146] "logd3aao" "logd3apo" "d4bo25" "d5dei_1" "logd4d"
## [151] "UPTpercap" "B_C_consta" "B_C_male" "B_C_ld1c" "B_C_drvmve"
## [156] "B_C_ld1a" "B_C_ld3apo" "B_C_inc1" "B_C_gasp" "B_N_consta"
## [161] "B_N_inc2" "B_N_inc3" "B_N_white" "B_N_male" "B_N_drvmve"
## [166] "B_N_gasp" "B_N_ld1a" "B_N_ld1c" "B_N_ld3aao" "B_N_ld3apo"
## [171] "B_N_d4bo25" "B_N_d5dei" "B_N_UPTpc" "C_R_Househ" "C_R_Pop"
## [176] "C_R_Worker" "C_R_Driver" "C_R_Vehicl" "C_R_White" "C_R_Male"
## [181] "C_R_Lowwag" "C_R_Medwag" "C_R_Highwa" "C_R_DrmV" "NonCom_VMT"
## [186] "Com_VMT_Pe" "VMT_per_wo" "VMT_tot_mi" "VMT_tot_ma" "VMT_tot_av"
## [191] "GHG_per_wo" "Annual_GHG" "SLC_score" "Y" "X"
## [196] "RU"
# AADT plus the 30 other columns carried through the rest of this section.
model_cols <- c(
  "AADT", "TotPop", "HH", "AutoOwn1", "AutoOwn2p", "Workers",
  "D1A", "D1B", "D1C5_RET", "D1C5_OFF", "D1C5_IND", "D1C5_SVC",
  "D1C8_RET", "D1C8_OFF", "D1C8_IND", "D1C8_SVC", "D1D",
  "D2A_JPHH", "D2B_E5MIX", "D2B_E8MIX", "D3AAO", "D3AMM",
  "D3APO", "D3BAO", "D3BMM3", "D3BMM4", "D3BPO3", "D3BPO4",
  "D5CR", "D5CE", "Drivers"
)
dat1 <- dat[, model_cols]

# Keep rows with 0 < AADT < 4001. which() drops rows where the comparison is
# NA, matching what subset() does.
keep_rows <- which(dat1$AADT > 0 & dat1$AADT < 4001)
dat2 <- dat1[keep_rows, ]
dim(dat2)
## [1] 27988 31

# Per-column overview (type, sample size, missing count, distinct values)
# together with the mean, median and variance of every column.
ExpData(data = dat2, type = 2, fun = c("mean", "median", "var"))
## Index Variable_Name Variable_Type Sample_n Missing_Count Per_of_Missing
## 1 1 AADT integer 27988 0 0
## 2 2 TotPop integer 27988 0 0
## 3 3 HH integer 27988 0 0
## 4 4 AutoOwn1 integer 27988 0 0
## 5 5 AutoOwn2p integer 27988 0 0
## 6 6 Workers integer 27988 0 0
## 7 7 D1A numeric 27988 0 0
## 8 8 D1B numeric 27988 0 0
## 9 9 D1C5_RET numeric 27988 0 0
## 10 10 D1C5_OFF numeric 27988 0 0
## 11 11 D1C5_IND numeric 27988 0 0
## 12 12 D1C5_SVC numeric 27988 0 0
## 13 13 D1C8_RET numeric 27988 0 0
## 14 14 D1C8_OFF numeric 27988 0 0
## 15 15 D1C8_IND numeric 27988 0 0
## 16 16 D1C8_SVC numeric 27988 0 0
## 17 17 D1D numeric 27988 0 0
## 18 18 D2A_JPHH numeric 27988 0 0
## 19 19 D2B_E5MIX numeric 27988 0 0
## 20 20 D2B_E8MIX numeric 27988 0 0
## 21 21 D3AAO numeric 27988 0 0
## 22 22 D3AMM numeric 27988 0 0
## 23 23 D3APO numeric 27988 0 0
## 24 24 D3BAO numeric 27988 0 0
## 25 25 D3BMM3 numeric 27988 0 0
## 26 26 D3BMM4 numeric 27988 0 0
## 27 27 D3BPO3 numeric 27988 0 0
## 28 28 D3BPO4 numeric 27988 0 0
## 29 29 D5CR numeric 27988 0 0
## 30 30 D5CE numeric 27988 0 0
## 31 31 Drivers numeric 27988 0 0
## No_of_distinct_values mean median var
## 1 3900 1256.42 940.00 1154329.86
## 2 3172 1971.74 1534.00 4409372.71
## 3 1543 665.20 524.00 417485.75
## 4 785 211.07 164.00 35179.94
## 5 1226 418.02 305.00 262112.10
## 6 1804 835.49 653.00 581459.67
## 7 8150 1.74 1.29 3.34
## 8 8157 4.48 3.25 20.62
## 9 6558 0.23 0.04 0.38
## 10 6443 0.46 0.03 12.11
## 11 7304 0.57 0.09 5.88
## 12 7727 1.27 0.15 34.72
## 13 6558 0.23 0.04 0.38
## 14 6360 0.27 0.02 3.33
## 15 7304 0.57 0.09 5.88
## 16 7296 0.47 0.06 8.10
## 17 8173 4.57 2.63 126.74
## 18 7825 3.29 0.71 311.37
## 19 7432 0.67 0.73 0.05
## 20 7537 0.67 0.73 0.05
## 21 4731 1.47 0.51 5.87
## 22 7720 2.45 1.65 6.68
## 23 8173 12.39 12.08 56.74
## 24 4133 3.74 1.20 49.83
## 25 6821 9.76 5.90 134.37
## 26 6267 7.25 2.61 138.81
## 27 8109 54.52 43.63 2383.28
## 28 7839 26.96 12.31 1347.68
## 29 7940 0.01 0.00 0.00
## 30 7928 0.01 0.00 0.00
## 31 2711 1353.67 1045.44 2148727.93
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# Draw 5,000 distinct row positions (without replacement) from the rows of dat2.
sample_indices <- sample.int(nrow(dat2), size = 5000)
# Extract the sampled rows from dat2.
df_sample <- dat2[sample_indices, ]
# Reshape the sampled rows to long format: one row per (variable, value) pair,
# so that each original column can be drawn in its own facet.
# Every column is pivoted, so no column selection is needed beforehand (the
# former select(all_of(names(df_sample))) step selected all columns anyway).
df_long <- df_sample %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )

# Density curve of every variable in the sample, one panel per variable.
# scales = "free" gives each panel its own x and y axis ranges.
ggplot(df_long, aes(x = value)) +
  geom_density(fill = "blue", alpha = 0.3) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Value", y = "Density") +
  theme_minimal()

# Apply the same 0 < AADT < 4001 filter to the full table (all columns).
df_subset <- subset(dat, AADT > 0 & AADT < 4001)

# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() can return a list for degenerate input), and drop = FALSE keeps the
# result a data frame even if only one column qualifies.
is_numeric_col <- vapply(df_subset, is.numeric, logical(1))
df_numeric_only <- df_subset[, is_numeric_col, drop = FALSE]

### for all variables
# Long format: one row per (variable, value) pair across every numeric column.
df_long1 <- df_numeric_only %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )

# Density curve of every numeric variable, one panel per variable with
# independent axis ranges.
ggplot(df_long1, aes(x = value)) +
  geom_density(fill = "blue", alpha = 0.3) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Value", y = "Density") +
  theme_minimal()
## Warning: Removed 8428 rows containing non-finite outside the scale range
## (`stat_density()`).

# Tag the full filtered dataset so it can be told apart after stacking.
df_original <- dat2 %>%
  mutate(source = "Original")
# Tag the 5,000-row sample the same way.
df_smpl <- df_sample %>%
  mutate(source = "Sample")
# Stack both tables; `source` records which one each row came from.
df_combined <- bind_rows(df_original, df_smpl)

# Overlay the AADT density of the full dataset and of the sample to check
# visually how closely the sample follows the original distribution.
ggplot(df_combined, aes(x = AADT, fill = source)) +
  geom_density(alpha = 0.4) +
  labs(
    x = "AADT",
    y = "Density",
    fill = "Dataset"
  ) +
  theme_minimal() +
  theme(
    # Place the legend inside the plotting area. A numeric legend.position is
    # deprecated since ggplot2 3.5.0; the coordinates now go in
    # legend.position.inside, in [0, 1] from left to right (x) and bottom to
    # top (y).
    legend.position = "inside",
    legend.position.inside = c(0.8, 0.7),
    # Legend appearance: white boxed background, bold title, white keys.
    legend.background = element_rect(fill = "white", colour = "black"),
    legend.title = element_text(face = "bold"),
    legend.key = element_rect(fill = "white")
  )

library(corrplot)
# Pairwise correlation matrix of all columns in the sample.
cor1 <- cor(df_sample)
# Draw the matrix twice: first as circles, then as printed coefficients.
for (plot_method in c("circle", "number")) {
  corrplot(cor1, method = plot_method)
}

library(lares)
# Cross-correlation plot of the sampled variables from lares.
corr_cross(df = df_sample)

