# Lumping and One-Hot Encoding

# Make the project folder the working directory so the relative workbook path
# passed to read_excel() further down resolves.
# NOTE(review): this is an absolute, machine-specific path; setwd() stops with
# an error on any machine where this directory does not exist.
setwd("C:/Users/mvx13/OneDrive - Texas State University/Hackathon_Rohit/Papers/Ped_byVehType")
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.2
library(data.table)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(vip)
## Warning: package 'vip' was built under R version 4.2.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(xgboost) 
## Warning: package 'xgboost' was built under R version 4.2.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(readxl)


# Load the "Main" worksheet of the workbook and list the columns it provides.
dat1 <- read_excel(
  path = "LA_17_21_PedwithOtherVarFin.xlsx",
  sheet = "Main"
)
colnames(dat1)
##  [1] "CRASH_NUM1"     "PED_NUM"        "CRASH_PED_NUM1" "Ped_Impair"    
##  [5] "Ped_Act"        "Ped_Act1"       "Ped_Age"        "Ped_BAC"       
##  [9] "Ped_Cl_Low"     "Ped_Cl_Up"      "Ped_Cond"       "Ped_Cond1"     
## [13] "Ped_Inj"        "Ped_State"      "Ped_Gen"        "Ped_Rac"       
## [17] "Cited"          "Year"           "Seve"           "DrInj"         
## [21] "DOW"            "Season"         "Access"         "Align"         
## [25] "PriContrib"     "HAR"            "Lighting"       "Loc"           
## [29] "CrashType"      "NumVeh"         "RoadCond"       "RoadRel"       
## [33] "RoadType"       "TrkBus"         "Weather"        "HwyType"       
## [37] "NumOcc"         "MostHarm"       "DrAlcDrug"      "DrAge"         
## [41] "DrCond"         "DrDistract"     "DrRace"         "DrGen"         
## [45] "MovReason"      "NumOccV"        "PSL"            "PriorMov"      
## [49] "TCD"            "TCDCond"        "VehCond"        "VehLight"      
## [53] "VehType"        "Violation"      "VisObs"         "City"          
## [57] "Parish"         "Int"            "ESTAlc"         "Ped"           
## [61] "RwD"            "LaneDepart"     "Latitude"       "Longitude"
# Reorder and reduce the raw columns by position: the three identifiers, then
# Year, DrInj, Seve, six pedestrian attributes, and source columns 21 to 55.
dat2 <- dat1[, c(1:3, 18, 20, 19, 6:7, 12:13, 15:16, 21:55)]
colnames(dat2)
##  [1] "CRASH_NUM1"     "PED_NUM"        "CRASH_PED_NUM1" "Year"          
##  [5] "DrInj"          "Seve"           "Ped_Act1"       "Ped_Age"       
##  [9] "Ped_Cond1"      "Ped_Inj"        "Ped_Gen"        "Ped_Rac"       
## [13] "DOW"            "Season"         "Access"         "Align"         
## [17] "PriContrib"     "HAR"            "Lighting"       "Loc"           
## [21] "CrashType"      "NumVeh"         "RoadCond"       "RoadRel"       
## [25] "RoadType"       "TrkBus"         "Weather"        "HwyType"       
## [29] "NumOcc"         "MostHarm"       "DrAlcDrug"      "DrAge"         
## [33] "DrCond"         "DrDistract"     "DrRace"         "DrGen"         
## [37] "MovReason"      "NumOccV"        "PSL"            "PriorMov"      
## [41] "TCD"            "TCDCond"        "VehCond"        "VehLight"      
## [45] "VehType"        "Violation"      "VisObs"
# Turn every character column into a factor so summary() reports level counts.
# across(where(...)) replaces the superseded mutate_if().
dat2 <- dat2 %>%
  mutate(across(where(is.character), as.factor))
summary(dat2)
##                 CRASH_NUM1      PED_NUM                     CRASH_PED_NUM1
##  LA17_170309132154541:  23   Min.   : 1.000   LA17_1405131936324181:   1  
##  LA18_20180039626    :   9   1st Qu.: 1.000   LA17_17-0020881      :   1  
##  LA17_170712141541690:   8   Median : 1.000   LA17_17000092101     :   1  
##  LA1920190012302     :   6   Mean   : 1.115   LA17_17000102501     :   1  
##  LA17_20170012722    :   5   3rd Qu.: 1.000   LA17_1701021529437921:   1  
##  LA17_6246316        :   5   Max.   :23.000   LA17_1701021937334951:   1  
##  (Other)             :8157                    (Other)              :8207  
##       Year      DrInj    Seve    
##  Min.   :2017   A:  17   A: 951  
##  1st Qu.:2018   B: 102   B:3043  
##  Median :2019   C: 456   C:2719  
##  Mean   :2019   K:   5   K: 780  
##  3rd Qu.:2020   O:7633   O: 720  
##  Max.   :2021                    
##                                  
##                                         Ped_Act1                   Ped_Age    
##  Crossing, Entering Road At Intersection    :1882   25-45 years        :2848  
##  Crossing, Entering Road Not At Intersection:1778   46-65 years        :2071  
##  Other                                      :1159   Children           : 686  
##  Walking In Road - With Traffic             : 932   Infant             :  39  
##  Unknown                                    : 775   Older than 65 years: 559  
##  Not In Roadway                             : 653   Unknown            : 396  
##  (Other)                                    :1034   Young              :1614  
##         Ped_Cond1    Ped_Inj     Ped_Gen        Ped_Rac          DOW      
##  Normal      :3324   A: 874   F      :2816   B      :4365   Weekday:6022  
##  Unknown     :2316   B:2970   M      :5160   I      :   4   Weekend:2191  
##  Inattentive :1612   C:2780   Unknown: 237   O      : 260                 
##  Alc Impaired: 381   K: 734                  Unknown: 272                 
##  Other       : 246   O: 855                  W      :3312                 
##  Distracted  : 181                                                        
##  (Other)     : 153                                                        
##     Season                 Access           Align     
##  Autumn:2280   Full Control   : 344   Curve    : 243  
##  Spring:2034   No Control     :7052   Dip      :  15  
##  Summer:1700   Other          : 105   Hillcrest:  46  
##  Winter:2199   Partial Control: 712   On Grade :  97  
##                                       Other    :  56  
##                                       Straight :7756  
##                                                       
##                PriContrib        HAR                         Lighting   
##  Violations         :4223   Min.   :0.0000   Dark - Cont St Lts  :2191  
##  Pedestrian         :2095   1st Qu.:0.0000   Dark - No St Lts    :1222  
##  Move Prior to Crash:1223   Median :0.0000   Dark - St Lts at Int: 539  
##  Dr Condition       : 264   Mean   :0.2809   Daylight            :3940  
##  Lighting           : 105   3rd Qu.:1.0000   Dusk/Dawn           : 239  
##  Vision Obscure     : 100   Max.   :1.0000   Other               :  82  
##  (Other)            : 203                                               
##                  Loc             CrashType         NumVeh    
##  Business Cont     :2087   NC WMV     :5597   Multiple: 266  
##  Business Residen  :2813   Other      : 999   Single  :7312  
##  Manufac/Industrial: 148   Right Angle: 528   Two     : 635  
##  Open Country      : 411   Rear End   : 343                  
##  Other             : 131   Sideswipe  : 307                  
##  Residential       :2515   Left Turn  : 228                  
##  School            : 108   (Other)    : 211                  
##               RoadCond              RoadRel    
##  No Abnormalities :7740   Beyond ROW    :  24  
##  Other            : 201   Median        :  18  
##  Construction     :  56   On Roadway    :7339  
##  Water On Roadway :  49   Other         : 189  
##  Previous Crash   :  48   Shoulder      : 391  
##  Object In Roadway:  38   Shoulder Left :  90  
##  (Other)          :  81   Shoulder Right: 162  
##                       RoadType        TrkBus                  Weather    
##  One-Way Road             :1025   Min.   :0.0000   Clear          :6467  
##  Other                    : 155   1st Qu.:0.0000   Cloudy         :1064  
##  Two-Way Road Div.        :1834   Median :0.0000   Fog/Smoke      :  66  
##  Two-Way Road Undiv.      :4944   Mean   :0.0347   Other          :  84  
##  Two-Way Road With Barrier: 255   3rd Qu.:0.0000   Rain           : 524  
##                                   Max.   :1.0000   Sleet/Hail/Snow:   8  
##                                                                          
##         HwyType          NumOcc                     MostHarm   
##  City Street:4102   Multiple: 860   Other               :5940  
##  Interstate : 331   NA      :   1   MV in Transport     :1872  
##  Other      :  53   Single  :6267   Unknown             : 203  
##  Parish Road:1140   Two     :1085   Parked Motor Vehicle: 144  
##  State Hwy  :1682                   Animal              :  20  
##  Toll Road  :   3                   Ditch               :  14  
##  U.S. Hwy   : 902                   (Other)             :  20  
##                      DrAlcDrug                    DrAge     
##  Alcohol And Drugs Present:  84   25-45 years        :2614  
##  Alcohol Present          : 281   46-65 years        :1776  
##  Drugs Present            :  35   Older than 65 years: 778  
##  Neither Alc/Drugs        :5439   Unknown            :1807  
##  Unknown                  :2374   Young              :1238  
##                                                             
##                                                             
##             DrCond                       DrDistract         DrRace    
##  Normal        :4084   Cell Phone             :  50   Afri-Amcn:3152  
##  Unknown       :2156   Not Distracted         :4492   Caucasian:3119  
##  Inattentive   :1237   Other Electronic Device:  11   Other    :1942  
##  Distracted    : 224   Other Inside Vehicle   : 120                   
##  Other         : 199   Other Outside Vehicle  : 258                   
##  Alc (Impaired): 198   Unknown                :3282                   
##  (Other)       : 115                                                  
##      DrGen                    MovReason        NumOccV    
##  F      :2632   Normal Movement    :3771   Multiple: 413  
##  M      :3891   Unknown            :1753   Single  :7008  
##  Unknown:1690   Driver Violation   :1358   Two     : 792  
##                 To Avoid Pedestrian: 411                  
##                 Driver Condition   : 316                  
##                 Other              : 308                  
##                 (Other)            : 296                  
##                PSL                                    PriorMov   
##  25 MPH or Lower :3129   Changing Lanes On Multi-Lane Road:  74  
##  30-45 MPH       :3938   Making Left Turn                 : 725  
##  50-60 MPH       : 949   Other                            :1534  
##  65-70 MPH       : 188   Other Or Unknown                 : 736  
##  75 MPH and Above:   9   Proceeding Straight Ahead        :4864  
##                          Slowing To Stop                  :  94  
##                          Stopped                          : 186  
##                      TCD                     TCDCond    
##  No Control            :1950   Defective Markings:  20  
##  White Dashed Line     :1797   Functioning       :5686  
##  Yellow No Passing Line: 790   No Controls       :1763  
##  Yellow Dashed Line    : 745   Not Functioning   :  17  
##  Stop Sign             : 737   Obscured          :   6  
##  Green Signal On       : 634   Unknown           : 721  
##  (Other)               :1560                            
##                 VehCond                       VehLight   
##  Defective          :  76   Daytime Running Lights: 508  
##  No Defects Observed:5881   Headlights Off        :1364  
##  Other              : 136   Headlights On         :3398  
##  Unknown            :2120   Unknown               :2943  
##                                                          
##                                                          
##                                                          
##                VehType                           Violation   
##  Car               :3516   No Violations              :3572  
##  Light Truck/Van   :2023   Other                      :2465  
##  SUV               :1829   Unknown                    :1280  
##  Other             : 516   Failure to Yield           : 579  
##  Medium/Large Truck: 201   Disregarded Traffic Control: 128  
##  Motorcycle        :  56   Improper Backing           :  96  
##  (Other)           :  72   (Other)                    :  93  
##                  VisObs    
##  Moving Vehicles    :  94  
##  No Obscurements    :5337  
##  Other              : 482  
##  Rain/Snow Winshield: 155  
##  Unknown            :2145  
##                            
## 
# Row count per vehicle type.
table(dat2[["VehType"]])
## 
##            Bicycle                Bus                Car  Emergency Vehicle 
##                 11                 20               3516                 16 
##     Farm Equipment    Light Truck/Van Medium/Large Truck         Motor Home 
##                  1               2023                201                  4 
##         Motorcycle              Other         School Bus                SUV 
##                 56                516                 20               1829
# Keep only the rows whose VehType is "Light Truck/Van" or "SUV".
suv1 <- dat2[dat2$VehType %in% c("Light Truck/Van", "SUV"), ]
dim(suv1)
## [1] 3852   47
# Narrow the light-truck/SUV subset to the response (Ped_Inj) and the 35
# candidate predictors, in the order listed here.
suv2 <- suv1 %>%
  select(all_of(c(
    "Ped_Inj", "Ped_Gen", "CrashType", "Ped_Age", "Ped_Act1", "Ped_Cond1",
    "HwyType", "Violation", "TCD", "DrAge", "Loc", "PriorMov",
    "PSL", "Lighting", "Season", "DrCond", "VisObs", "RoadType",
    "DrRace", "VehLight", "Ped_Rac", "MostHarm", "Weather", "NumOcc",
    "RoadRel", "VehType", "RoadCond", "DrGen", "Access", "TCDCond",
    "VehCond", "NumVeh", "DOW", "Align", "HAR", "TrkBus"
  )))

# Short code = original column name, one pair per entry so the two name
# vectors cannot drift out of step.
rename_map <- c(
  PIn = "Ped_Inj",   PGn = "Ped_Gen",   CTy = "CrashType",
  PAg = "Ped_Age",   PAc = "Ped_Act1",  PCn = "Ped_Cond1",
  HTy = "HwyType",   Vln = "Violation", TCD = "TCD",
  DAg = "DrAge",     Loc = "Loc",       PMv = "PriorMov",
  PSL = "PSL",       Lgh = "Lighting",  Ssn = "Season",
  DCn = "DrCond",    VsO = "VisObs",    RTy = "RoadType",
  DRc = "DrRace",    VLg = "VehLight",  PRc = "Ped_Rac",
  MHr = "MostHarm",  Wth = "Weather",   NOc = "NumOcc",
  RRl = "RoadRel",   VTy = "VehType",   RCn = "RoadCond",
  DGn = "DrGen",     Acs = "Access",    TCn = "TCDCond",
  VCn = "VehCond",   NVh = "NumVeh",    DOW = "DOW",
  Aln = "Align",     HAR = "HAR",       TBs = "TrkBus"
)

# Original column names, in selection order.
oldnames <- unname(rename_map)

# Matching three-letter codes, in the same order.
newnames <- names(rename_map)

# Side-by-side lookup of original and short names, matched on their position
# (varid) in the two vectors.
on <- data.frame(oldnames = oldnames, varid = seq_along(oldnames))
nn <- data.frame(newnames = newnames, varid = seq_along(newnames))
onn <- on %>%
  left_join(nn, by = "varid")
head(onn)
##    oldnames varid newnames
## 1   Ped_Inj     1      PIn
## 2   Ped_Gen     2      PGn
## 3 CrashType     3      CTy
## 4   Ped_Age     4      PAg
## 5  Ped_Act1     5      PAc
## 6 Ped_Cond1     6      PCn
# Replace the original column names with their short codes.
# all_of() marks `oldnames` as an external character vector, which the bare
# form no longer allows without a tidyselect deprecation warning. Columns are
# selected in `oldnames` order, so element i of `newnames` lands on
# oldnames[i] regardless of the column order in suv2.
suv2 <- suv2 %>%
  rename_with(.fn = function(current) newnames, .cols = all_of(oldnames))
library(xgboost) 
# Fit a GBM
# Boosted-tree model with PIn as the response and every other column of suv2
# as a predictor; data.matrix() turns factor predictors into their integer
# level codes.
# NOTE(review): PIn is a factor (levels A, B, C, K, O) passed as the label of a
# regression objective; presumably xgboost coerces it to its alphabetical
# integer codes - confirm that this ordering is the intended severity scale
# before interpreting the importances.
set.seed(102)  # for reproducibility
bst <- xgboost(
  data = data.matrix(subset(suv2, select = -PIn)),
  label = suv2$PIn,
  objective = "reg:squarederror",
  nrounds = 100,
  max_depth = 5,
  eta = 0.3,
  verbose = 0  # suppress printing
)

library(vip)
# xgboost's own importance table is stored in vi_bst; vi() prints vip's
# importance tibble for the same fitted model.
vi_bst <- xgb.importance(model = bst)
vi(bst) 
## # A tibble: 35 × 2
##    Variable Importance
##    <chr>         <dbl>
##  1 PGn          0.118 
##  2 CTy          0.0739
##  3 PAg          0.0668
##  4 TCD          0.0582
##  5 PCn          0.0524
##  6 PAc          0.0509
##  7 HTy          0.0426
##  8 Lgh          0.0382
##  9 Ssn          0.0373
## 10 Vln          0.0359
## # ℹ 25 more rows
library(ggplot2)
# Variable-importance bar chart for the boosted model. num_features = 42 is
# only an upper bound: the importance tibble printed above has 35 rows.
vip(bst, num_features = 42) +
  theme_bw(base_size = 14)

# Keep the response (PIn) and 21 predictors for lumping and encoding.
# NOTE(review): presumably these are the predictors retained from the
# variable-importance ranking - confirm the selection rule.
suv3 <- suv2[, c(
  "PIn", "PGn", "CTy", "PAg", "PAc", "PCn", "HTy",
  "Vln", "TCD", "DAg", "Loc", "PMv", "PSL", "Lgh",
  "Ssn", "DCn", "VsO", "RTy", "DRc", "VLg",
  "PRc", "MHr"
)]
dim(suv3)
## [1] 3852   22
# Make sure any remaining character column is a factor before lumping.
# across(where(...)) replaces the superseded mutate_if().
suv4 <- suv3 %>%
  mutate(across(where(is.character), as.factor))

library(forcats)
## Warning: package 'forcats' was built under R version 4.2.3
library(dplyr)
library(forcats)

# Predictors whose levels are lumped (every suv4 column except the response).
factor_columns <- c(
  "PGn", "CTy", "PAg", "PAc", "PCn", "HTy",
  "Vln", "TCD", "DAg", "Loc", "PMv", "PSL", "Lgh",
  "Ssn", "DCn", "VsO", "RTy", "DRc", "VLg",
  "PRc", "MHr"
)

# Keep the two most frequent levels of each predictor and pool the rest into
# "other". all_of() marks `factor_columns` as an external vector, and the
# lumping arguments are passed through an explicit function instead of
# across()'s `...`; both earlier forms are deprecated.
# NOTE(review): the pooled level "other" differs only by case from the
# existing data level "Other" (e.g. CTy, Vln, PMv, MHr), so both appear as
# separate categories downstream - confirm that this is intended.
suv4a <- suv4 %>%
  mutate(across(
    all_of(factor_columns),
    function(col) fct_lump_n(col, n = 2, other_level = "other")
  ))
# Attach a running row number, used later as the join key.
suv4a[["id"]] <- seq_len(nrow(suv4a))
dim(suv4a)
## [1] 3852   23
library(data.table)
library(mltools)
## Warning: package 'mltools' was built under R version 4.2.3
# One-hot encode the lumped predictors. The response is dropped by name
# rather than by column position, so the step no longer depends on PIn being
# the first column.
suv4b <- one_hot(as.data.table(select(suv4a, -PIn)))
# Re-stamp the row id so the join key is present and aligned.
suv4b$id <- seq_len(nrow(suv4b))
dim(suv4b)
## [1] 3852   64
# Put the response back beside the indicator columns, matching rows on id.
# PIn and id are picked by name instead of positions 1 and 23.
suv4c <- left_join(select(suv4a, PIn, id), suv4b, by = "id")
head(suv4c)
## # A tibble: 6 × 65
##   PIn      id PGn_F PGn_M PGn_other `CTy_NC WMV` CTy_Other CTy_other
##   <fct> <int> <int> <int>     <int>        <int>     <int>     <int>
## 1 C         1     0     1         0            0         1         0
## 2 C         2     0     1         0            1         0         0
## 3 O         3     0     1         0            0         0         1
## 4 B         4     0     1         0            0         0         1
## 5 C         5     0     1         0            0         0         1
## 6 B         6     1     0         0            0         0         1
## # ℹ 57 more variables: `PAg_25-45 years` <int>, `PAg_46-65 years` <int>,
## #   PAg_other <int>, `PAc_Crossing, Entering Road At Intersection` <int>,
## #   `PAc_Crossing, Entering Road Not At Intersection` <int>, PAc_other <int>,
## #   PCn_Normal <int>, PCn_Unknown <int>, PCn_other <int>,
## #   `HTy_City Street` <int>, `HTy_State Hwy` <int>, HTy_other <int>,
## #   `Vln_No Violations` <int>, Vln_Other <int>, Vln_other <int>,
## #   `TCD_No Control` <int>, `TCD_White Dashed Line` <int>, TCD_other <int>, …
## names(suv4c) = gsub(pattern = "`*`", replacement = "", x = names(suv4c))
## head(suv4c)

# Full list of the encoded column names.
colnames(suv4c)
##  [1] "PIn"                                            
##  [2] "id"                                             
##  [3] "PGn_F"                                          
##  [4] "PGn_M"                                          
##  [5] "PGn_other"                                      
##  [6] "CTy_NC WMV"                                     
##  [7] "CTy_Other"                                      
##  [8] "CTy_other"                                      
##  [9] "PAg_25-45 years"                                
## [10] "PAg_46-65 years"                                
## [11] "PAg_other"                                      
## [12] "PAc_Crossing, Entering Road At Intersection"    
## [13] "PAc_Crossing, Entering Road Not At Intersection"
## [14] "PAc_other"                                      
## [15] "PCn_Normal"                                     
## [16] "PCn_Unknown"                                    
## [17] "PCn_other"                                      
## [18] "HTy_City Street"                                
## [19] "HTy_State Hwy"                                  
## [20] "HTy_other"                                      
## [21] "Vln_No Violations"                              
## [22] "Vln_Other"                                      
## [23] "Vln_other"                                      
## [24] "TCD_No Control"                                 
## [25] "TCD_White Dashed Line"                          
## [26] "TCD_other"                                      
## [27] "DAg_25-45 years"                                
## [28] "DAg_46-65 years"                                
## [29] "DAg_other"                                      
## [30] "Loc_Business Residen"                           
## [31] "Loc_Residential"                                
## [32] "Loc_other"                                      
## [33] "PMv_Other"                                      
## [34] "PMv_Proceeding Straight Ahead"                  
## [35] "PMv_other"                                      
## [36] "PSL_25 MPH or Lower"                            
## [37] "PSL_30-45 MPH"                                  
## [38] "PSL_other"                                      
## [39] "Lgh_Dark - Cont St Lts"                         
## [40] "Lgh_Daylight"                                   
## [41] "Lgh_other"                                      
## [42] "Ssn_Autumn"                                     
## [43] "Ssn_Winter"                                     
## [44] "Ssn_other"                                      
## [45] "DCn_Normal"                                     
## [46] "DCn_Unknown"                                    
## [47] "DCn_other"                                      
## [48] "VsO_No Obscurements"                            
## [49] "VsO_Unknown"                                    
## [50] "VsO_other"                                      
## [51] "RTy_Two-Way Road Div."                          
## [52] "RTy_Two-Way Road Undiv."                        
## [53] "RTy_other"                                      
## [54] "DRc_Afri-Amcn"                                  
## [55] "DRc_Caucasian"                                  
## [56] "DRc_other"                                      
## [57] "VLg_Headlights On"                              
## [58] "VLg_Unknown"                                    
## [59] "VLg_other"                                      
## [60] "PRc_B"                                          
## [61] "PRc_W"                                          
## [62] "PRc_other"                                      
## [63] "MHr_MV in Transport"                            
## [64] "MHr_Other"                                      
## [65] "MHr_other"
# suv4d holds the current column names as values and carries their
# abbreviate() short forms as its names attribute.
suv4d <- names(suv4c)
names(suv4d) <- abbreviate(suv4d)
names(suv4d)
##  [1] "PIn"    "id"     "PG_F"   "PG_M"   "PGn_"   "CT_W"   "CT_O"   "CTy_"  
##  [9] "PA_2y"  "PA_4y"  "PAg_"   "PERAI"  "PERNAI" "PAc_"   "PC_N"   "PC_U"  
## [17] "PCn_"   "HT_S"   "HT_H"   "HTy_"   "V_NV"   "Vl_O"   "Vln_"   "TCDC"  
## [25] "TCDL"   "TCD_"   "DA_2y"  "DA_4y"  "DAg_"   "L_BR"   "Lc_R"   "Lc_t"  
## [33] "PM_O"   "PMSA"   "PMv_"   "PMoL"   "PSLM"   "PSL_"   "L-CSL"  "Lg_D"  
## [41] "Lgh_"   "Ss_A"   "Ss_W"   "Ssn_"   "DC_N"   "DC_U"   "DCn_"   "VO_O"  
## [49] "VO_U"   "VsO_"   "RTRD"   "RTRU"   "RTy_"   "DR_A"   "DR_C"   "DRc_"  
## [57] "VL_O"   "VL_U"   "VLg_"   "PR_B"   "PR_W"   "PRc_"   "MHiT"   "MH_O"  
## [65] "MHr_"
# Swap every column name for its abbreviated form. everything() selects all
# columns in their existing order, which is the order names(suv4d) was built
# in, and avoids passing a bare external character vector as the selection
# (deprecated in tidyselect).
# NOTE(review): some abbreviations are not syntactic R names (e.g. `L-CSL`)
# and need backticks when referenced.
suv4e <- suv4c %>%
  rename_with(.fn = function(current) names(suv4d), .cols = everything())
head(suv4e)
## # A tibble: 6 × 65
##   PIn      id  PG_F  PG_M  PGn_  CT_W  CT_O  CTy_ PA_2y PA_4y  PAg_ PERAI PERNAI
##   <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>  <int>
## 1 C         1     0     1     0     0     1     0     0     0     1     0      1
## 2 C         2     0     1     0     1     0     0     0     0     1     0      1
## 3 O         3     0     1     0     0     0     1     0     0     1     0      0
## 4 B         4     0     1     0     0     0     1     0     1     0     0      0
## 5 C         5     0     1     0     0     0     1     1     0     0     0      0
## 6 B         6     1     0     0     0     0     1     1     0     0     0      0
## # ℹ 52 more variables: PAc_ <int>, PC_N <int>, PC_U <int>, PCn_ <int>,
## #   HT_S <int>, HT_H <int>, HTy_ <int>, V_NV <int>, Vl_O <int>, Vln_ <int>,
## #   TCDC <int>, TCDL <int>, TCD_ <int>, DA_2y <int>, DA_4y <int>, DAg_ <int>,
## #   L_BR <int>, Lc_R <int>, Lc_t <int>, PM_O <int>, PMSA <int>, PMv_ <int>,
## #   PMoL <int>, PSLM <int>, PSL_ <int>, `L-CSL` <int>, Lg_D <int>, Lgh_ <int>,
## #   Ss_A <int>, Ss_W <int>, Ssn_ <int>, DC_N <int>, DC_U <int>, DCn_ <int>,
## #   VO_O <int>, VO_U <int>, VsO_ <int>, RTRD <int>, RTRU <int>, RTy_ <int>, …
# Per-column summary; for the 0/1 indicator columns the mean is the share of
# rows in that category.
print(summary(suv4e))
##  PIn            id              PG_F             PG_M             PGn_      
##  A: 391   Min.   :   1.0   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  B:1331   1st Qu.: 963.8   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  C:1348   Median :1926.5   Median :0.0000   Median :1.0000   Median :0.000  
##  K: 380   Mean   :1926.5   Mean   :0.3494   Mean   :0.6236   Mean   :0.027  
##  O: 402   3rd Qu.:2889.2   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.000  
##           Max.   :3852.0   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##       CT_W             CT_O             CTy_            PA_2y       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.6882   Mean   :0.1197   Mean   :0.1921   Mean   :0.3528  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      PA_4y             PAg_           PERAI            PERNAI      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.2552   Mean   :0.392   Mean   :0.2323   Mean   :0.2126  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       PAc_            PC_N             PC_U             PCn_       
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.555   Mean   :0.4123   Mean   :0.2721   Mean   :0.3157  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       HT_S             HT_H             HTy_             V_NV       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4592   Mean   :0.2214   Mean   :0.3193   Mean   :0.4792  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       Vl_O             Vln_             TCDC             TCDL       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2892   Mean   :0.2316   Mean   :0.2251   Mean   :0.2251  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       TCD_            DA_2y            DA_4y             DAg_       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.5498   Mean   :0.3313   Mean   :0.2747   Mean   :0.3941  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       L_BR             Lc_R             Lc_t             PM_O      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.000  
##  Mean   :0.3424   Mean   :0.2934   Mean   :0.3642   Mean   :0.197  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##       PMSA             PMv_           PMoL             PSLM       
##  Min.   :0.0000   Min.   :0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00   Median :0.0000   Median :0.0000  
##  Mean   :0.5929   Mean   :0.21   Mean   :0.3614   Mean   :0.4844  
##  3rd Qu.:1.0000   3rd Qu.:0.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.00   Max.   :1.0000   Max.   :1.0000  
##       PSL_            L-CSL             Lg_D             Lgh_       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1542   Mean   :0.2378   Mean   :0.4995   Mean   :0.2627  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       Ss_A             Ss_W             Ssn_             DC_N       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.2809   Mean   :0.2684   Mean   :0.4507   Mean   :0.5449  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       DC_U             DCn_             VO_O             VO_U       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.2022   Mean   :0.2529   Mean   :0.7147   Mean   :0.1999  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       VsO_              RTRD             RTRU             RTy_       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.08541   Mean   :0.2253   Mean   :0.6077   Mean   :0.1669  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       DR_A             DR_C             DRc_             VL_O       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3432   Mean   :0.4821   Mean   :0.1747   Mean   :0.4463  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       VL_U             VLg_             PR_B            PR_W       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :1.000   Median :0.0000  
##  Mean   :0.2985   Mean   :0.2552   Mean   :0.507   Mean   :0.4356  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000   Max.   :1.0000  
##       PRc_              MHiT             MH_O             MHr_        
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :1.0000   Median :0.00000  
##  Mean   :0.05737   Mean   :0.2227   Mean   :0.7357   Mean   :0.04154  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000
# Lookup of full encoded column names against their abbreviations, matched on
# position (varid).
on <- as.data.frame(names(suv4c))
on[["varid"]] <- seq_len(nrow(on))
nn <- as.data.frame(names(suv4d))
nn[["varid"]] <- seq_len(nrow(nn))
onn <- on %>%
  left_join(nn, by = "varid")
head(onn)
##   names(suv4c) varid names(suv4d)
## 1          PIn     1          PIn
## 2           id     2           id
## 3        PGn_F     3         PG_F
## 4        PGn_M     4         PG_M
## 5    PGn_other     5         PGn_
## 6   CTy_NC WMV     6         CT_W