Visualization

# Directory holding the input CSVs. The trailing slash is significant: every
# path in this script is formed by plain concatenation with paste0().
data_path <- "/Users/xiangjiang/Google Drive/Data sharing/data/"
training_file <- paste0(data_path, "training.csv")
testing_file <- paste0(data_path, "test.csv")

# Read both data sets into data frames (first CSV row is the header)
dtrain <- read.csv(file = training_file, header = TRUE)
dtest <- read.csv(file = testing_file, header = TRUE)

# Per-column summary statistics of the training set
summary(object = dtrain)
##     EventId        DER_mass_MMC    DER_mass_transverse_met_lep
##  Min.   :100000   Min.   :-999.0   Min.   :  0.0              
##  1st Qu.:162500   1st Qu.:  78.1   1st Qu.: 19.2              
##  Median :225000   Median : 105.0   Median : 46.5              
##  Mean   :225000   Mean   : -49.0   Mean   : 49.2              
##  3rd Qu.:287499   3rd Qu.: 130.6   3rd Qu.: 73.6              
##  Max.   :349999   Max.   :1192.0   Max.   :690.1              
##   DER_mass_vis       DER_pt_h      DER_deltaeta_jet_jet DER_mass_jet_jet
##  Min.   :   6.3   Min.   :   0.0   Min.   :-999.0       Min.   :-999    
##  1st Qu.:  59.4   1st Qu.:  14.1   1st Qu.:-999.0       1st Qu.:-999    
##  Median :  73.8   Median :  38.5   Median :-999.0       Median :-999    
##  Mean   :  81.2   Mean   :  57.9   Mean   :-708.4       Mean   :-601    
##  3rd Qu.:  92.3   3rd Qu.:  79.2   3rd Qu.:   0.5       3rd Qu.:  83    
##  Max.   :1349.4   Max.   :2835.0   Max.   :   8.5       Max.   :4975    
##  DER_prodeta_jet_jet DER_deltar_tau_lep   DER_pt_tot       DER_sum_pt    
##  Min.   :-999.0      Min.   :0.208      Min.   :   0.0   Min.   :  46.1  
##  1st Qu.:-999.0      1st Qu.:1.810      1st Qu.:   2.8   1st Qu.:  77.5  
##  Median :-999.0      Median :2.491      Median :  12.3   Median : 120.7  
##  Mean   :-709.4      Mean   :2.373      Mean   :  18.9   Mean   : 158.4  
##  3rd Qu.:  -4.6      3rd Qu.:2.961      3rd Qu.:  27.6   3rd Qu.: 200.5  
##  Max.   :  16.7      Max.   :5.684      Max.   :2835.0   Max.   :1852.5  
##  DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality
##  Min.   : 0.047       Min.   :-1.414         Min.   :-999          
##  1st Qu.: 0.883       1st Qu.:-1.371         1st Qu.:-999          
##  Median : 1.280       Median :-0.356         Median :-999          
##  Mean   : 1.438       Mean   :-0.128         Mean   :-709          
##  3rd Qu.: 1.777       3rd Qu.: 1.225         3rd Qu.:   0          
##  Max.   :19.773       Max.   : 1.414         Max.   :   1          
##    PRI_tau_pt     PRI_tau_eta      PRI_tau_phi        PRI_lep_pt   
##  Min.   : 20.0   Min.   :-2.499   Min.   :-3.1420   Min.   : 26.0  
##  1st Qu.: 24.6   1st Qu.:-0.925   1st Qu.:-1.5750   1st Qu.: 32.4  
##  Median : 31.8   Median :-0.023   Median :-0.0330   Median : 40.5  
##  Mean   : 38.7   Mean   :-0.011   Mean   :-0.0082   Mean   : 46.7  
##  3rd Qu.: 45.0   3rd Qu.: 0.898   3rd Qu.: 1.5650   3rd Qu.: 53.4  
##  Max.   :764.4   Max.   : 2.497   Max.   : 3.1420   Max.   :560.3  
##   PRI_lep_eta       PRI_lep_phi         PRI_met        PRI_met_phi     
##  Min.   :-2.5050   Min.   :-3.1420   Min.   :   0.1   Min.   :-3.1420  
##  1st Qu.:-1.0140   1st Qu.:-1.5220   1st Qu.:  21.4   1st Qu.:-1.5750  
##  Median :-0.0450   Median : 0.0860   Median :  34.8   Median :-0.0240  
##  Mean   :-0.0195   Mean   : 0.0435   Mean   :  41.7   Mean   :-0.0101  
##  3rd Qu.: 0.9590   3rd Qu.: 1.6180   3rd Qu.:  51.9   3rd Qu.: 1.5610  
##  Max.   : 2.5030   Max.   : 3.1420   Max.   :2842.6   Max.   : 3.1420  
##  PRI_met_sumet     PRI_jet_num    PRI_jet_leading_pt PRI_jet_leading_eta
##  Min.   :  13.7   Min.   :0.000   Min.   :-999.0     Min.   :-999.0     
##  1st Qu.: 123.0   1st Qu.:0.000   1st Qu.:-999.0     1st Qu.:-999.0     
##  Median : 179.7   Median :1.000   Median :  39.0     Median :  -1.9     
##  Mean   : 209.8   Mean   :0.979   Mean   :-348.3     Mean   :-399.3     
##  3rd Qu.: 263.4   3rd Qu.:2.000   3rd Qu.:  75.3     3rd Qu.:   0.4     
##  Max.   :2004.0   Max.   :3.000   Max.   :1120.6     Max.   :   4.5     
##  PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta
##  Min.   :-999.0      Min.   :-999.0        Min.   :-999.0        
##  1st Qu.:-999.0      1st Qu.:-999.0        1st Qu.:-999.0        
##  Median :  -2.1      Median :-999.0        Median :-999.0        
##  Mean   :-399.3      Mean   :-692.4        Mean   :-709.1        
##  3rd Qu.:   0.5      3rd Qu.:  33.7        3rd Qu.:  -2.5        
##  Max.   :   3.1      Max.   : 721.5        Max.   :   4.5        
##  PRI_jet_subleading_phi PRI_jet_all_pt       Weight      Label     
##  Min.   :-999.0         Min.   :   0.0   Min.   :0.002   b:164333  
##  1st Qu.:-999.0         1st Qu.:   0.0   1st Qu.:0.019   s: 85667  
##  Median :-999.0         Median :  40.5   Median :1.156             
##  Mean   :-709.1         Mean   :  73.1   Mean   :1.647             
##  3rd Qu.:  -2.3         3rd Qu.: 109.9   3rd Qu.:2.404             
##  Max.   :   3.1         Max.   :1633.4   Max.   :7.823
# Per-column summary statistics of the test set
summary(object = dtest)
##     EventId        DER_mass_MMC    DER_mass_transverse_met_lep
##  Min.   :350000   Min.   :-999.0   Min.   :  0.0              
##  1st Qu.:487500   1st Qu.:  78.2   1st Qu.: 19.3              
##  Median :625000   Median : 105.1   Median : 46.5              
##  Mean   :625000   Mean   : -49.0   Mean   : 49.3              
##  3rd Qu.:762499   3rd Qu.: 130.8   3rd Qu.: 73.6              
##  Max.   :899999   Max.   :1949.3   Max.   :968.7              
##   DER_mass_vis       DER_pt_h      DER_deltaeta_jet_jet DER_mass_jet_jet
##  Min.   :   6.8   Min.   :   0.0   Min.   :-999.0       Min.   :-999    
##  1st Qu.:  59.4   1st Qu.:  14.2   1st Qu.:-999.0       1st Qu.:-999    
##  Median :  73.7   Median :  38.5   Median :-999.0       Median :-999    
##  Mean   :  81.1   Mean   :  57.8   Mean   :-707.4       Mean   :-600    
##  3rd Qu.:  92.2   3rd Qu.:  79.2   3rd Qu.:   0.5       3rd Qu.:  84    
##  Max.   :1265.0   Max.   :1337.2   Max.   :   8.7       Max.   :4795    
##  DER_prodeta_jet_jet DER_deltar_tau_lep   DER_pt_tot      DER_sum_pt    
##  Min.   :-999.0      Min.   :0.237      Min.   :  0.0   Min.   :  46.1  
##  1st Qu.:-999.0      1st Qu.:1.815      1st Qu.:  2.8   1st Qu.:  77.5  
##  Median :-999.0      Median :2.492      Median : 12.4   Median : 120.7  
##  Mean   :-708.4      Mean   :2.374      Mean   : 19.0   Mean   : 158.7  
##  3rd Qu.:  -4.5      3rd Qu.:2.962      3rd Qu.: 27.6   3rd Qu.: 201.0  
##  Max.   :  17.3      Max.   :5.751      Max.   :759.4   Max.   :2079.2  
##  DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality
##  Min.   : 0.06        Min.   :-1.414         Min.   :-999          
##  1st Qu.: 0.89        1st Qu.:-1.371         1st Qu.:-999          
##  Median : 1.28        Median :-0.356         Median :-999          
##  Mean   : 1.44        Mean   :-0.127         Mean   :-708          
##  3rd Qu.: 1.78        3rd Qu.: 1.230         3rd Qu.:   0          
##  Max.   :32.23        Max.   : 1.414         Max.   :   1          
##    PRI_tau_pt     PRI_tau_eta       PRI_tau_phi        PRI_lep_pt   
##  Min.   : 20.0   Min.   :-2.4990   Min.   :-3.1420   Min.   : 26.0  
##  1st Qu.: 24.6   1st Qu.:-0.9260   1st Qu.:-1.5880   1st Qu.: 32.4  
##  Median : 31.8   Median :-0.0210   Median :-0.0420   Median : 40.6  
##  Mean   : 38.7   Mean   :-0.0119   Mean   :-0.0158   Mean   : 46.7  
##  3rd Qu.: 44.9   3rd Qu.: 0.8990   3rd Qu.: 1.5570   3rd Qu.: 53.4  
##  Max.   :627.0   Max.   : 2.5000   Max.   : 3.1420   Max.   :701.3  
##   PRI_lep_eta       PRI_lep_phi         PRI_met        PRI_met_phi     
##  Min.   :-2.5080   Min.   :-3.1420   Min.   :   0.1   Min.   :-3.1420  
##  1st Qu.:-1.0110   1st Qu.:-1.5080   1st Qu.:  21.4   1st Qu.:-1.5740  
##  Median :-0.0380   Median : 0.0970   Median :  34.8   Median :-0.0170  
##  Mean   :-0.0188   Mean   : 0.0518   Mean   :  41.6   Mean   :-0.0082  
##  3rd Qu.: 0.9560   3rd Qu.: 1.6170   3rd Qu.:  51.9   3rd Qu.: 1.5580  
##  Max.   : 2.5060   Max.   : 3.1420   Max.   :1254.4   Max.   : 3.1420  
##  PRI_met_sumet     PRI_jet_num   PRI_jet_leading_pt PRI_jet_leading_eta
##  Min.   :  13.8   Min.   :0.00   Min.   :-999.0     Min.   :-999.0     
##  1st Qu.: 123.0   1st Qu.:0.00   1st Qu.:-999.0     1st Qu.:-999.0     
##  Median : 179.9   Median :1.00   Median :  39.0     Median :  -1.9     
##  Mean   : 209.9   Mean   :0.98   Mean   :-348.9     Mean   :-399.9     
##  3rd Qu.: 263.9   3rd Qu.:2.00   3rd Qu.:  75.5     3rd Qu.:   0.4     
##  Max.   :2190.3   Max.   :3.00   Max.   :1163.4     Max.   :   4.5     
##  PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta
##  Min.   :-999.0      Min.   :-999.0        Min.   :-999.0        
##  1st Qu.:-999.0      1st Qu.:-999.0        1st Qu.:-999.0        
##  Median :  -2.1      Median :-999.0        Median :-999.0        
##  Mean   :-399.9      Mean   :-691.3        Mean   :-708.2        
##  3rd Qu.:   0.5      3rd Qu.:  33.8        3rd Qu.:  -2.4        
##  Max.   :   3.1      Max.   : 817.8        Max.   :   4.5        
##  PRI_jet_subleading_phi PRI_jet_all_pt  
##  Min.   :-999.0         Min.   :   0.0  
##  1st Qu.:-999.0         1st Qu.:   0.0  
##  Median :-999.0         Median :  40.5  
##  Mean   :-708.2         Mean   :  73.2  
##  3rd Qu.:  -2.3         3rd Qu.: 110.5  
##  Max.   :   3.1         Max.   :1860.2
# Visualize pairwise relationships among data-frame columns 2-5 for the first
# 10000 training events (or all of them if there are fewer), coloring each
# point by its class label.
plot_rows <- seq_len(min(10000, nrow(dtrain)))
# `col` needs numeric palette indices. read.csv() only yields a factor for
# Label under R < 4.0; under R >= 4.0 it is character, and strings such as
# "b"/"s" are not valid color names. Converting through factor() gives the same
# 1/2 codes in both cases (levels sort alphabetically: b = 1, s = 2).
label_colors <- as.integer(factor(dtrain$Label))[plot_rows]
pairs(dtrain[plot_rows, 2:5], col = label_colors)

plot of chunk unnamed-chunk-1

Preprocessing

# Columns that are not model inputs: the event identifier, the per-event
# weight and the class label. Features are selected by name so that the
# training and test matrices contain exactly the same columns; a positional
# range of 2:32 would also pull Weight into the training features, which the
# test set does not have.
non_feature_cols <- c("EventId", "Weight", "Label")
feature_cols <- setdiff(names(dtrain), non_feature_cols)
stopifnot(all(feature_cols %in% names(dtest)))

# NOTE(review): several columns have a minimum of -999 in summary(), which
# looks like a missing-value sentinel. It is standardized here as an ordinary
# value, exactly as before -- confirm whether it should be recoded first.

# Normalize training data
dtrain$Label <- factor(dtrain$Label)
id.train <- dtrain[1]
# Factor codes are 1-based in level order, so subtracting 1 yields 0/1 labels
# (first level -> 0, second level -> 1).
label.train <- as.numeric(dtrain$Label) - 1
scaled.train <- scale(dtrain[feature_cols])

# Save normalized training data
write.csv(id.train, paste0(data_path, "Norm_id_train.csv"), row.names = FALSE)
write.csv(label.train, paste0(data_path, "Norm_label_train.csv"), row.names = FALSE)
write.csv(scaled.train, paste0(data_path, "Norm_scaled_train.csv"), row.names = FALSE)

# Normalize test data with the training means and standard deviations, so
# that both sets go through the identical transformation.
id.test <- dtest[1]
scaled.test <- scale(
  dtest[feature_cols],
  center = attr(scaled.train, "scaled:center"),
  scale = attr(scaled.train, "scaled:scale")
)

# Save normalized test data
write.csv(id.test, paste0(data_path, "Norm_id_test.csv"), row.names = FALSE)
write.csv(scaled.test, paste0(data_path, "Norm_scaled_test.csv"), row.names = FALSE)