# Visualization ----
# Location of the input CSV files. The trailing slash is required because
# file names are appended with paste0() here and further down the script.
data_path <- "/Users/xiangjiang/Google Drive/Data sharing/data/"
training_file <- paste0(data_path, "training.csv")
testing_file <- paste0(data_path, "test.csv")
# Load training and test data.
# stringsAsFactors is set explicitly for the training file: since R 4.0.0
# read.csv() keeps strings as character by default, but the rest of this
# script (class counts in summary(), colouring by Label in pairs()) expects
# Label to be a factor. The test file is read with the defaults, as before.
dtrain <- read.csv(training_file, stringsAsFactors = TRUE)
dtest <- read.csv(testing_file)
# Print per-column summary statistics (min, quartiles, mean, max) for the
# training set; the captured output follows as `##` lines.
# NOTE(review): many columns report a minimum of -999 in the output below --
# presumably a missing-value sentinel; confirm before relying on the means.
summary(dtrain)
## EventId DER_mass_MMC DER_mass_transverse_met_lep
## Min. :100000 Min. :-999.0 Min. : 0.0
## 1st Qu.:162500 1st Qu.: 78.1 1st Qu.: 19.2
## Median :225000 Median : 105.0 Median : 46.5
## Mean :225000 Mean : -49.0 Mean : 49.2
## 3rd Qu.:287499 3rd Qu.: 130.6 3rd Qu.: 73.6
## Max. :349999 Max. :1192.0 Max. :690.1
## DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet
## Min. : 6.3 Min. : 0.0 Min. :-999.0 Min. :-999
## 1st Qu.: 59.4 1st Qu.: 14.1 1st Qu.:-999.0 1st Qu.:-999
## Median : 73.8 Median : 38.5 Median :-999.0 Median :-999
## Mean : 81.2 Mean : 57.9 Mean :-708.4 Mean :-601
## 3rd Qu.: 92.3 3rd Qu.: 79.2 3rd Qu.: 0.5 3rd Qu.: 83
## Max. :1349.4 Max. :2835.0 Max. : 8.5 Max. :4975
## DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot DER_sum_pt
## Min. :-999.0 Min. :0.208 Min. : 0.0 Min. : 46.1
## 1st Qu.:-999.0 1st Qu.:1.810 1st Qu.: 2.8 1st Qu.: 77.5
## Median :-999.0 Median :2.491 Median : 12.3 Median : 120.7
## Mean :-709.4 Mean :2.373 Mean : 18.9 Mean : 158.4
## 3rd Qu.: -4.6 3rd Qu.:2.961 3rd Qu.: 27.6 3rd Qu.: 200.5
## Max. : 16.7 Max. :5.684 Max. :2835.0 Max. :1852.5
## DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality
## Min. : 0.047 Min. :-1.414 Min. :-999
## 1st Qu.: 0.883 1st Qu.:-1.371 1st Qu.:-999
## Median : 1.280 Median :-0.356 Median :-999
## Mean : 1.438 Mean :-0.128 Mean :-709
## 3rd Qu.: 1.777 3rd Qu.: 1.225 3rd Qu.: 0
## Max. :19.773 Max. : 1.414 Max. : 1
## PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt
## Min. : 20.0 Min. :-2.499 Min. :-3.1420 Min. : 26.0
## 1st Qu.: 24.6 1st Qu.:-0.925 1st Qu.:-1.5750 1st Qu.: 32.4
## Median : 31.8 Median :-0.023 Median :-0.0330 Median : 40.5
## Mean : 38.7 Mean :-0.011 Mean :-0.0082 Mean : 46.7
## 3rd Qu.: 45.0 3rd Qu.: 0.898 3rd Qu.: 1.5650 3rd Qu.: 53.4
## Max. :764.4 Max. : 2.497 Max. : 3.1420 Max. :560.3
## PRI_lep_eta PRI_lep_phi PRI_met PRI_met_phi
## Min. :-2.5050 Min. :-3.1420 Min. : 0.1 Min. :-3.1420
## 1st Qu.:-1.0140 1st Qu.:-1.5220 1st Qu.: 21.4 1st Qu.:-1.5750
## Median :-0.0450 Median : 0.0860 Median : 34.8 Median :-0.0240
## Mean :-0.0195 Mean : 0.0435 Mean : 41.7 Mean :-0.0101
## 3rd Qu.: 0.9590 3rd Qu.: 1.6180 3rd Qu.: 51.9 3rd Qu.: 1.5610
## Max. : 2.5030 Max. : 3.1420 Max. :2842.6 Max. : 3.1420
## PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta
## Min. : 13.7 Min. :0.000 Min. :-999.0 Min. :-999.0
## 1st Qu.: 123.0 1st Qu.:0.000 1st Qu.:-999.0 1st Qu.:-999.0
## Median : 179.7 Median :1.000 Median : 39.0 Median : -1.9
## Mean : 209.8 Mean :0.979 Mean :-348.3 Mean :-399.3
## 3rd Qu.: 263.4 3rd Qu.:2.000 3rd Qu.: 75.3 3rd Qu.: 0.4
## Max. :2004.0 Max. :3.000 Max. :1120.6 Max. : 4.5
## PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta
## Min. :-999.0 Min. :-999.0 Min. :-999.0
## 1st Qu.:-999.0 1st Qu.:-999.0 1st Qu.:-999.0
## Median : -2.1 Median :-999.0 Median :-999.0
## Mean :-399.3 Mean :-692.4 Mean :-709.1
## 3rd Qu.: 0.5 3rd Qu.: 33.7 3rd Qu.: -2.5
## Max. : 3.1 Max. : 721.5 Max. : 4.5
## PRI_jet_subleading_phi PRI_jet_all_pt Weight Label
## Min. :-999.0 Min. : 0.0 Min. :0.002 b:164333
## 1st Qu.:-999.0 1st Qu.: 0.0 1st Qu.:0.019 s: 85667
## Median :-999.0 Median : 40.5 Median :1.156
## Mean :-709.1 Mean : 73.1 Mean :1.647
## 3rd Qu.: -2.3 3rd Qu.: 109.9 3rd Qu.:2.404
## Max. : 3.1 Max. :1633.4 Max. :7.823
# Same per-column summary for the test set, for comparison with the training
# summary above. The captured output below lists no Weight or Label column.
summary(dtest)
## EventId DER_mass_MMC DER_mass_transverse_met_lep
## Min. :350000 Min. :-999.0 Min. : 0.0
## 1st Qu.:487500 1st Qu.: 78.2 1st Qu.: 19.3
## Median :625000 Median : 105.1 Median : 46.5
## Mean :625000 Mean : -49.0 Mean : 49.3
## 3rd Qu.:762499 3rd Qu.: 130.8 3rd Qu.: 73.6
## Max. :899999 Max. :1949.3 Max. :968.7
## DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet
## Min. : 6.8 Min. : 0.0 Min. :-999.0 Min. :-999
## 1st Qu.: 59.4 1st Qu.: 14.2 1st Qu.:-999.0 1st Qu.:-999
## Median : 73.7 Median : 38.5 Median :-999.0 Median :-999
## Mean : 81.1 Mean : 57.8 Mean :-707.4 Mean :-600
## 3rd Qu.: 92.2 3rd Qu.: 79.2 3rd Qu.: 0.5 3rd Qu.: 84
## Max. :1265.0 Max. :1337.2 Max. : 8.7 Max. :4795
## DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot DER_sum_pt
## Min. :-999.0 Min. :0.237 Min. : 0.0 Min. : 46.1
## 1st Qu.:-999.0 1st Qu.:1.815 1st Qu.: 2.8 1st Qu.: 77.5
## Median :-999.0 Median :2.492 Median : 12.4 Median : 120.7
## Mean :-708.4 Mean :2.374 Mean : 19.0 Mean : 158.7
## 3rd Qu.: -4.5 3rd Qu.:2.962 3rd Qu.: 27.6 3rd Qu.: 201.0
## Max. : 17.3 Max. :5.751 Max. :759.4 Max. :2079.2
## DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality
## Min. : 0.06 Min. :-1.414 Min. :-999
## 1st Qu.: 0.89 1st Qu.:-1.371 1st Qu.:-999
## Median : 1.28 Median :-0.356 Median :-999
## Mean : 1.44 Mean :-0.127 Mean :-708
## 3rd Qu.: 1.78 3rd Qu.: 1.230 3rd Qu.: 0
## Max. :32.23 Max. : 1.414 Max. : 1
## PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt
## Min. : 20.0 Min. :-2.4990 Min. :-3.1420 Min. : 26.0
## 1st Qu.: 24.6 1st Qu.:-0.9260 1st Qu.:-1.5880 1st Qu.: 32.4
## Median : 31.8 Median :-0.0210 Median :-0.0420 Median : 40.6
## Mean : 38.7 Mean :-0.0119 Mean :-0.0158 Mean : 46.7
## 3rd Qu.: 44.9 3rd Qu.: 0.8990 3rd Qu.: 1.5570 3rd Qu.: 53.4
## Max. :627.0 Max. : 2.5000 Max. : 3.1420 Max. :701.3
## PRI_lep_eta PRI_lep_phi PRI_met PRI_met_phi
## Min. :-2.5080 Min. :-3.1420 Min. : 0.1 Min. :-3.1420
## 1st Qu.:-1.0110 1st Qu.:-1.5080 1st Qu.: 21.4 1st Qu.:-1.5740
## Median :-0.0380 Median : 0.0970 Median : 34.8 Median :-0.0170
## Mean :-0.0188 Mean : 0.0518 Mean : 41.6 Mean :-0.0082
## 3rd Qu.: 0.9560 3rd Qu.: 1.6170 3rd Qu.: 51.9 3rd Qu.: 1.5580
## Max. : 2.5060 Max. : 3.1420 Max. :1254.4 Max. : 3.1420
## PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta
## Min. : 13.8 Min. :0.00 Min. :-999.0 Min. :-999.0
## 1st Qu.: 123.0 1st Qu.:0.00 1st Qu.:-999.0 1st Qu.:-999.0
## Median : 179.9 Median :1.00 Median : 39.0 Median : -1.9
## Mean : 209.9 Mean :0.98 Mean :-348.9 Mean :-399.9
## 3rd Qu.: 263.9 3rd Qu.:2.00 3rd Qu.: 75.5 3rd Qu.: 0.4
## Max. :2190.3 Max. :3.00 Max. :1163.4 Max. : 4.5
## PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta
## Min. :-999.0 Min. :-999.0 Min. :-999.0
## 1st Qu.:-999.0 1st Qu.:-999.0 1st Qu.:-999.0
## Median : -2.1 Median :-999.0 Median :-999.0
## Mean :-399.9 Mean :-691.3 Mean :-708.2
## 3rd Qu.: 0.5 3rd Qu.: 33.8 3rd Qu.: -2.4
## Max. : 3.1 Max. : 817.8 Max. : 4.5
## PRI_jet_subleading_phi PRI_jet_all_pt
## Min. :-999.0 Min. : 0.0
## 1st Qu.:-999.0 1st Qu.: 0.0
## Median :-999.0 Median : 40.5
## Mean :-708.2 Mean : 73.2
## 3rd Qu.: -2.3 3rd Qu.: 110.5
## Max. : 3.1 Max. :1860.2
# Visualize pairwise relationships among the variables: scatterplot matrix of
# columns 2-5 for (at most) the first 10,000 events, coloured by class label.
# seq_len(min(...)) avoids NA rows if the data has fewer than 10,000 rows.
plot.rows <- seq_len(min(10000, nrow(dtrain)))
# Colour by integer level codes of Label (computed on the full column so the
# codes do not depend on which classes occur in the plotted subset). Passing
# the raw column fails when Label is character, because "b" and "s" are not
# colour names.
label.codes <- as.integer(factor(dtrain$Label))[plot.rows]
pairs(dtrain[plot.rows, 2:5], col = label.codes)
# Preprocessing ----
# Normalize training data
# Columns 2-31 are the 30 feature columns shared by the training and test
# files; in the training file column 32 is Weight and column 33 is Label
# (see the summary output above). Weight must not be scaled as a feature:
# it is absent from the test file and is not an input variable.
feature.cols <- 2:31
stopifnot(
  identical(names(dtrain)[feature.cols], names(dtest)[feature.cols])
)
# factor() orders levels alphabetically ("b", "s"), so the level codes minus
# one give 0 for "b" and 1 for "s".
dtrain$Label <- factor(dtrain$Label)
id.train <- dtrain[1]
label.train <- as.numeric(dtrain$Label) - 1
# NOTE(review): the -999 values visible in the summaries are included in the
# column means and standard deviations computed here -- presumably they are
# missing-value sentinels; confirm whether they should be handled first.
scaled.train <- scale(dtrain[feature.cols])
# Save normalized training data
write.csv(
  id.train,
  paste0(data_path, "Norm_id_train.csv"),
  row.names = FALSE
)
write.csv(
  label.train,
  paste0(data_path, "Norm_label_train.csv"),
  row.names = FALSE
)
write.csv(
  scaled.train,
  paste0(data_path, "Norm_scaled_train.csv"),
  row.names = FALSE
)
# Normalize test data
# Reuse the centering and scaling parameters estimated on the training set so
# that both files are expressed on the same scale.
id.test <- dtest[1]
scaled.test <- scale(
  dtest[feature.cols],
  center = attr(scaled.train, "scaled:center"),
  scale = attr(scaled.train, "scaled:scale")
)
# Save normalized test data
write.csv(
  id.test,
  paste0(data_path, "Norm_id_test.csv"),
  row.names = FALSE
)
write.csv(
  scaled.test,
  paste0(data_path, "Norm_scaled_test.csv"),
  row.names = FALSE
)