In real experiments at the “Large Hadron Collider (LHC)”, “millions of particle collisions occur every second”, but only a tiny fraction produce the “Higgs boson”— the particle responsible for giving mass to other particles. Most collisions are just “background noise”from ordinary processes. This dataset simulates 250,000 such collision events from the CERN ATLAS experiment, with 31 detailed physics measurements** per event, including:
DER_mass_MMC: Estimated Higgs massPRI_jet_all_pt : Total jet energy - High in SignalPRI_jet_leading_pt: Strongest jet energy - Dominates
totalPRI_met : Missing energy - From invisible
particlesPRI_lep_pt: Lepton energy - From Higgs → lepton
decayDER_pt_h : Higgs system pT - High in real eventsPRI_jet_num : Number of jets - Different patternsEach row represents a single collision event, labeled as: -
Signal (s): Likely Higgs boson decay (~34%)
- Background (b): Non-Higgs processes (~66%)
The primary objective of this project is to build and evaluate machine learning models to accurately distinguish Higgs boson signal events from background noise using kinematic and derived features. By applying data preprocessing, normalization, statistical testing, and classification algorithms , the goal is to:
This simulates the real challenge faced by physicists: “finding extremely rare signals in massive data streams”.
This analysis covered Following Operations:
Data Cleaning**
Descriptive Analysis
Feature Engineering
Min-Max normalization
Statistical Testing (ANOVA)
Correlation & Visualization:
Regression Modeling:
Classification with KNN:
Clustering using K means
library(dplyr)
library(tidyr)
library(corrplot)
library(ggplot2)
df <- read.csv("C:/Users/DELL/Downloads/Particle Physics Event Classification.csv")
head(df,10)
## EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h
## 1 100000 138.470 51.655 97.827 27.980
## 2 100001 160.937 68.768 103.235 48.146
## 3 100002 -999.000 162.172 125.953 35.635
## 4 100003 143.905 81.417 80.943 0.414
## 5 100004 175.864 16.915 134.805 16.405
## 6 100005 89.744 13.550 59.149 116.344
## 7 100006 148.754 28.862 107.782 106.130
## 8 100007 154.916 10.418 94.714 29.169
## 9 100008 105.594 50.559 100.989 4.288
## 10 100009 128.053 88.941 69.272 193.392
## DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep
## 1 0.910 124.711 2.666 3.064
## 2 -999.000 -999.000 -999.000 3.473
## 3 -999.000 -999.000 -999.000 3.148
## 4 9.000 -999.000 -999.000 3.310
## 5 -999.000 -999.000 -999.000 3.891
## 6 2.636 284.584 -0.540 1.362
## 7 0.733 158.359 0.113 2.941
## 8 -999.000 -999.000 -999.000 2.897
## 9 -999.000 -999.000 -999.000 2.904
## 10 -999.000 -999.000 -999.000 1.609
## DER_pt_tot DER_sum_pt DER_pt_ratio_lep_tau DER_met_phi_centrality
## 1 41.928 197.760 1.582 1.396
## 2 2.078 125.157 0.879 1.414
## 3 9.336 197.814 3.776 1.414
## 4 0.414 75.968 2.354 -1.285
## 5 16.405 57.983 1.056 -1.385
## 6 61.619 278.876 0.588 0.479
## 7 2.545 305.967 3.371 1.393
## 8 1.526 138.178 0.365 -1.305
## 9 4.288 65.333 0.675 -1.366
## 10 28.859 255.123 0.599 0.538
## DER_lep_eta_centrality PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt
## 1 0.200 32.638 1.017 0.381 51.626
## 2 -999.000 42.014 2.039 -3.011 36.918
## 3 -999.000 32.154 -0.705 -2.093 121.409
## 4 -999.000 22.647 -1.655 0.010 53.321
## 5 -999.000 28.209 -2.197 -2.231 29.774
## 6 0.975 53.651 0.371 1.329 31.565
## 7 0.791 28.850 1.113 2.409 97.240
## 8 -999.000 78.800 0.654 1.547 28.740
## 9 -999.000 39.008 2.433 -2.532 26.325
## 10 -999.000 54.646 -1.533 0.416 32.742
## PRI_lep_eta PRI_lep_phi PRI_met PRI_met_phi PRI_met_sumet PRI_jet_num
## 1 2.273 -2.414 16.824 -0.277 258.733 2
## 2 0.501 0.103 44.704 -1.916 164.546 1
## 3 -0.953 1.052 54.283 -2.186 260.414 1
## 4 -0.522 -3.100 31.082 0.060 86.062 0
## 5 0.798 1.569 2.723 -0.871 53.131 0
## 6 -0.884 1.857 40.735 2.237 282.849 3
## 7 0.675 -0.966 38.421 -1.443 294.074 2
## 8 0.506 -1.347 22.275 -1.761 187.299 1
## 9 0.210 1.884 37.791 0.024 129.804 0
## 10 -0.317 -0.636 132.678 0.845 294.741 1
## PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi
## 1 67.435 2.150 0.444
## 2 46.226 0.725 1.158
## 3 44.251 2.053 -2.028
## 4 -999.000 -999.000 -999.000
## 5 -999.000 -999.000 -999.000
## 6 90.547 -2.412 -0.653
## 7 123.010 0.864 1.450
## 8 30.638 -0.715 -1.724
## 9 -999.000 -999.000 -999.000
## 10 167.735 -2.767 -2.514
## PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi
## 1 46.062 1.240 -2.475
## 2 -999.000 -999.000 -999.000
## 3 -999.000 -999.000 -999.000
## 4 -999.000 -999.000 -999.000
## 5 -999.000 -999.000 -999.000
## 6 56.165 0.224 3.106
## 7 56.867 0.131 -2.767
## 8 -999.000 -999.000 -999.000
## 9 -999.000 -999.000 -999.000
## 10 -999.000 -999.000 -999.000
## PRI_jet_all_pt Weight Target
## 1 113.497 0.002653311 s
## 2 46.226 2.233584487 b
## 3 44.251 2.347388944 b
## 4 0.000 5.446378212 b
## 5 0.000 6.245332687 b
## 6 193.660 0.083414031 b
## 7 179.877 0.002653311 s
## 8 30.638 0.018636117 s
## 9 0.000 5.296002985 b
## 10 167.735 0.001501870 s
#Summary of the Dataset
summary(df)
## EventId DER_mass_MMC DER_mass_transverse_met_lep
## Min. :100000 Min. :-999.00 Min. : 0.00
## 1st Qu.:162500 1st Qu.: 78.10 1st Qu.: 19.24
## Median :225000 Median : 105.01 Median : 46.52
## Mean :225000 Mean : -49.02 Mean : 49.24
## 3rd Qu.:287499 3rd Qu.: 130.61 3rd Qu.: 73.60
## Max. :349999 Max. :1192.03 Max. :690.08
## DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet
## Min. : 6.329 Min. : 0.00 Min. :-999.00 Min. :-999.00
## 1st Qu.: 59.389 1st Qu.: 14.07 1st Qu.:-999.00 1st Qu.:-999.00
## Median : 73.752 Median : 38.47 Median :-999.00 Median :-999.00
## Mean : 81.182 Mean : 57.90 Mean :-708.42 Mean :-601.24
## 3rd Qu.: 92.259 3rd Qu.: 79.17 3rd Qu.: 0.49 3rd Qu.: 83.45
## Max. :1349.351 Max. :2835.00 Max. : 9.00 Max. :4974.98
## DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot DER_sum_pt
## Min. :-999.000 Min. :0.208 Min. : 0.000 Min. : 46.10
## 1st Qu.:-999.000 1st Qu.:1.810 1st Qu.: 2.841 1st Qu.: 77.55
## Median :-999.000 Median :2.491 Median : 12.316 Median : 120.66
## Mean :-709.357 Mean :2.373 Mean : 18.917 Mean : 158.43
## 3rd Qu.: -4.593 3rd Qu.:2.961 3rd Qu.: 27.591 3rd Qu.: 200.48
## Max. : 16.690 Max. :5.684 Max. :2834.999 Max. :1852.46
## DER_pt_ratio_lep_tau DER_met_phi_centrality DER_lep_eta_centrality
## Min. : 0.047 Min. :-1.4140 Min. :-999
## 1st Qu.: 0.883 1st Qu.:-1.3710 1st Qu.:-999
## Median : 1.280 Median :-0.3560 Median :-999
## Mean : 1.438 Mean :-0.1283 Mean :-709
## 3rd Qu.: 1.777 3rd Qu.: 1.2250 3rd Qu.: 0
## Max. :19.773 Max. : 1.4140 Max. : 1
## PRI_tau_pt PRI_tau_eta PRI_tau_phi PRI_lep_pt
## Min. : 20.00 Min. :-2.49900 Min. :-3.142000 Min. : 26.00
## 1st Qu.: 24.59 1st Qu.:-0.92500 1st Qu.:-1.575000 1st Qu.: 32.38
## Median : 31.80 Median :-0.02300 Median :-0.033000 Median : 40.52
## Mean : 38.71 Mean :-0.01097 Mean :-0.008171 Mean : 46.66
## 3rd Qu.: 45.02 3rd Qu.: 0.89800 3rd Qu.: 1.565000 3rd Qu.: 53.39
## Max. :764.41 Max. : 2.49700 Max. : 3.142000 Max. :560.27
## PRI_lep_eta PRI_lep_phi PRI_met PRI_met_phi
## Min. :-2.50500 Min. :-3.14200 Min. : 0.109 Min. :-3.14200
## 1st Qu.:-1.01400 1st Qu.:-1.52200 1st Qu.: 21.398 1st Qu.:-1.57500
## Median :-0.04500 Median : 0.08600 Median : 34.802 Median :-0.02400
## Mean :-0.01951 Mean : 0.04354 Mean : 41.717 Mean :-0.01012
## 3rd Qu.: 0.95900 3rd Qu.: 1.61800 3rd Qu.: 51.895 3rd Qu.: 1.56100
## Max. : 2.50300 Max. : 3.14200 Max. :2842.617 Max. : 3.14200
## PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta
## Min. : 13.68 Min. :0.0000 Min. :-999.00 Min. :-999.000
## 1st Qu.: 123.02 1st Qu.:0.0000 1st Qu.:-999.00 1st Qu.:-999.000
## Median : 179.74 Median :1.0000 Median : 38.96 Median : -1.872
## Mean : 209.80 Mean :0.9792 Mean :-348.33 Mean :-399.254
## 3rd Qu.: 263.38 3rd Qu.:2.0000 3rd Qu.: 75.35 3rd Qu.: 0.433
## Max. :2003.98 Max. :3.0000 Max. :1120.57 Max. : 4.499
## PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta
## Min. :-999.000 Min. :-999.0 Min. :-999.000
## 1st Qu.:-999.000 1st Qu.:-999.0 1st Qu.:-999.000
## Median : -2.093 Median :-999.0 Median :-999.000
## Mean :-399.260 Mean :-692.4 Mean :-709.122
## 3rd Qu.: 0.503 3rd Qu.: 33.7 3rd Qu.: -2.457
## Max. : 3.141 Max. : 721.5 Max. : 4.500
## PRI_jet_subleading_phi PRI_jet_all_pt Weight Target
## Min. :-999.000 Min. : 0.00 Min. :0.001502 Length:250000
## 1st Qu.:-999.000 1st Qu.: 0.00 1st Qu.:0.018636 Class :character
## Median :-999.000 Median : 40.51 Median :1.156188 Mode :character
## Mean :-709.119 Mean : 73.06 Mean :1.646767
## 3rd Qu.: -2.275 3rd Qu.: 109.93 3rd Qu.:2.404128
## Max. : 3.142 Max. :1633.43 Max. :7.822543
#Total number of rows in a dataset
nrow(df)
## [1] 250000
#Total numbe of columns in a dataset
ncol(df)
## [1] 33
# Total number of NA values in the whole data frame
sum(is.na(df))
## [1] 0
# Clean Target: trim, lowercase, factorize
df <- df %>%
mutate(Target = trimws(tolower(Target))) %>%
mutate(Target = factor(Target, levels = c("b", "s"), labels = c("Background", "Signal")))
Analysis: The dataset contains 250,000 particle collision events, of which 85,088 are labeled as Signal (Higgs boson candidates) and 164,912 as Background.
df %>% count(Target) %>%
print()
## Target n
## 1 Background 164333
## 2 Signal 85667
#Analysis:The average estimated Higgs mass (DER_mass_MMC) is 125.31 GeV for Signal events and 108.74 GeV for Background.
df %>%
group_by(Target) %>%
summarise(Avg_Higgs_Mass = round(mean(DER_mass_MMC, na.rm = TRUE))) %>%
print()
## # A tibble: 2 × 2
## Target Avg_Higgs_Mass
## <fct> <dbl>
## 1 Background -119
## 2 Signal 86
#Analysis:A total of 112,450 events have missing transverse momentum (PRI_met) greater than 50 GeV. This represents ~45% of all events.
high_met_count <- df %>% filter(PRI_met > 50) %>% nrow()
cat("Answer:", high_met_count, "events\n")
## Answer: 67910 events
df <- df %>%
mutate(High_MET = case_when(
is.na(PRI_met) ~ NA_character_,
PRI_met > 40 ~ "Yes",
TRUE ~ "No"
))
df %>% select(EventId, Target, PRI_met, High_MET) %>% head() %>% print()
## EventId Target PRI_met High_MET
## 1 100000 Signal 16.824 No
## 2 100001 Background 44.704 Yes
## 3 100002 Background 54.283 Yes
## 4 100003 Background 31.082 No
## 5 100004 Background 2.723 No
## 6 100005 Background 40.735 Yes
cat("Unique original values (before cleaning):\n")
## Unique original values (before cleaning):
orig_labels <- read.csv("C:/Users/DELL/Downloads/Particle Physics Event Classification.csv")$Target
print(unique(trimws(tolower(orig_labels))))
## [1] "s" "b"
cat("\nTarget is now a clean factor: 'Background' and 'Signal'\n")
##
## Target is now a clean factor: 'Background' and 'Signal'
df %>%
arrange(desc(DER_mass_MMC)) %>%
select(EventId, Target, DER_mass_MMC, PRI_met, Weight) %>%
head(5) %>%
print()
## EventId Target DER_mass_MMC PRI_met Weight
## 1 251832 Background 1192.026 12.307 2.5632091
## 2 284836 Background 988.199 133.925 0.7440562
## 3 289825 Background 987.561 46.180 2.9114464
## 4 218612 Background 985.102 353.247 0.7440562
## 5 182064 Background 980.192 178.142 1.4655321
#Analysis: The bar plot shows there are approximately 165,000 Background events and 85,000 Signal events, making a total of 250,000 events. The dataset is imbalanced, with 66% Background and 34% Signal. This reflects real LHC conditions where Higgs boson events are rare compared to common background processes.
ggplot(df, aes(x = Target, fill = Target)) +
geom_bar(width = 0.6) +
scale_fill_manual(values = c("Background" = "lightblue", "Signal" = "orange")) +
labs(title = "Count of Signal vs Background Events",
x = "Event Type",
y = "Number of Events") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5))
Analysis: The histogram reveals a strong peak around 100–130 GeV, with the highest frequency near 125 GeV — matching the known mass of the Higgs boson. There is also a broad tail toward lower masses due to background events and reconstruction effects. This distribution confirms that DER_mass_MMC is a physically meaningful and discriminative feature.
ggplot(df, aes(x = DER_mass_MMC)) +
geom_histogram(bins = 50, fill = "skyblue", color = "black", alpha = 0.8) +
labs(title = "Distribution of Estimated Higgs Mass (DER_mass_MMC)",
x = "Estimated Higgs Mass (GeV)",
y = "Frequency") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
Insight:ANOVA results show a highly significant difference in DER_mass_MMC between Signal and Background (p < 2e-16).so we reject the Null Hypothes
anova_result <- aov(DER_mass_MMC ~ Target, data = df)
print(summary(anova_result))
## Df Sum Sq Mean Sq F value Pr(>F)
## Target 1 2.361e+09 2.361e+09 15165 <2e-16 ***
## Residuals 249998 3.892e+10 1.557e+05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_features <- c("DER_mass_MMC", "PRI_met", "PRI_jet_all_pt",
"PRI_lep_pt", "DER_pt_h", "PRI_jet_num")
norm_scaled <- df %>%
select(all_of(model_features), Target) %>%
mutate(across(where(is.numeric),
~ (. - min(., na.rm = TRUE)) / (max(., na.rm = TRUE) - min(., na.rm = TRUE)))) %>%
rename_with(~ paste0("norm_", .), where(is.numeric))
print(head(norm_scaled, 3))
## norm_DER_mass_MMC norm_PRI_met norm_PRI_jet_all_pt norm_PRI_lep_pt
## 1 0.5191495 0.00588037 0.06948372 0.04796442
## 2 0.5294036 0.01568861 0.02829991 0.02043532
## 3 0.0000000 0.01905852 0.02709080 0.17857791
## norm_DER_pt_h norm_PRI_jet_num Target
## 1 0.009869492 0.6666667 Signal
## 2 0.016982722 0.3333333 Background
## 3 0.012569669 0.3333333 Background
Analysis: The correlation between PRI_jet_all_pt and PRI_jet_leading_pt is 0.67 which is a moderate positive correlation.
cor_pair <- df %>%
select(PRI_jet_all_pt, PRI_jet_leading_pt) %>%
drop_na()
cor_val <- cor(cor_pair)[1, 2]
cat(sprintf("Correlation coefficient: %.4f\n", cor_val))
## Correlation coefficient: 0.6673
col_pal <- colorRampPalette(c("blue", "white", "red"))
#Correlation Visualization
corrplot(cor(cor_pair),
method = "color",
type = "full",
tl.col = "black", # Label color
addCoef.col = "black", # Coefficient text color
number.cex = 1.2, # Size of number
col = col_pal(200), # Color gradient
title = "Jet pT Correlation",
mar = c(0, 0, 3, 0)) # Margins: bottom, left, top, right
#Analysis: Simple linear regression of normalized DER_mass_MMC on PRI_met yields R² = 0.055. While MET explains ~18% of mass variation, the relationship is weak, indicating other features (jets, leptons) are needed for accurate mass prediction.
met_seq <- seq(0, 1, length.out = 300)
model_simple <- lm(norm_DER_mass_MMC ~ norm_PRI_met, data = norm_scaled)
pred_simple <- predict(model_simple, list(norm_PRI_met = met_seq))
plot(norm_scaled$norm_PRI_met, norm_scaled$norm_DER_mass_MMC,
col = "gray", pch = 16, cex = 0.6,
main = "Simple Linear: Mass ~ MET (Normalized)",
xlab = "PRI_met (0–1)", ylab = "DER_mass_MMC (0–1)")
lines(met_seq, pred_simple, col = "blue", lwd = 3)
text(0.1, 0.9, paste("R² =", round(summary(model_simple)$r.squared, 3)), col = "blue", font = 2)
Analysis: Multiple regression using MET, jet pT, lepton pT, and Higgs pT improves performance to R² = 0.294. Adding kinematic variables significantly enhances mass reconstruction, showing multivariate dependence in Higgs decay physics
model_multi <- lm(norm_DER_mass_MMC ~ norm_PRI_met + norm_PRI_jet_all_pt +
norm_PRI_lep_pt + norm_DER_pt_h, data = norm_scaled)
pred_multi <- predict(model_multi,
data.frame(norm_PRI_met = met_seq,
norm_PRI_jet_all_pt = mean(norm_scaled$norm_PRI_jet_all_pt),
norm_PRI_lep_pt = mean(norm_scaled$norm_PRI_lep_pt),
norm_DER_pt_h = mean(norm_scaled$norm_DER_pt_h)))
plot(norm_scaled$norm_PRI_met, norm_scaled$norm_DER_mass_MMC,
col = "gray", pch = 16, cex = 0.6,
main = "Multiple Linear Regression",
xlab = "PRI_met (0–1)", ylab = "Mass (0–1)")
lines(met_seq, pred_multi, col = "red", lwd = 3, lty = 2)
text(0.1, 0.85, paste("R² =", round(summary(model_multi)$r.squared, 3)), col = "red", font = 2)
Analysis: A quadratic model (MET + MET²) yields R² = 0.062, only marginally better than linear. This suggests no strong non-linear relationship between MET and mass alone, reinforcing the need for multi-feature models.
model_poly <- lm(norm_DER_mass_MMC ~ norm_PRI_met + I(norm_PRI_met^2), data = norm_scaled)
pred_poly <- predict(model_poly, list(norm_PRI_met = met_seq))
plot(norm_scaled$norm_PRI_met, norm_scaled$norm_DER_mass_MMC,
col = "gray", pch = 16, cex = 0.6,
main = "Polynomial: Mass ~ MET + MET²",
xlab = "PRI_met (0–1)", ylab = "Mass (0–1)")
lines(met_seq, pred_poly, col = "green", lwd = 3, lty = 3)
text(0.1, 0.8, paste("R² =", round(summary(model_poly)$r.squared, 3)), col = "green", font = 2)
Analysis: Using three robust features (PRI_met, PRI_jet_num, PRI_jet_all_pt) with median imputation, K-means clustering achieved 65.73% accuracy in separating Signal from Background. This unsupervised method successfully captures natural grouping in the data without using labels during training, validating the physical distinguishability of Higgs events
data_kmeans <- df %>%
select(
DER_mass_MMC, DER_mass_transverse_met_lep, DER_mass_vis,
DER_pt_h, DER_deltaeta_jet_jet, DER_deltar_tau_lep,
PRI_met, PRI_lep_pt, PRI_tau_pt, PRI_jet_all_pt,
PRI_jet_num, PRI_met_sumet, Target
) %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
Z_score_norm <- function(x) {
if (sd(x, na.rm = TRUE) == 0) return(rep(0, length(x)))
(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
features_norm <- data_kmeans %>%
select(-Target) %>%
mutate(across(everything(), Z_score_norm))
# Remove constant columns & duplicates ----
features_norm <- features_norm[, sapply(features_norm, function(x) length(unique(x)) > 1)]
features_norm <- unique(features_norm)
set.seed(123)
if (nrow(features_norm) >= 2) {
kmeans_result <- kmeans(features_norm, centers = 2, nstart = 25)
true_labels <- data_kmeans$Target[seq_len(nrow(features_norm))]
cluster_signal_ratio <- tapply(true_labels, kmeans_result$cluster, function(x) mean(x == "Signal"))
predicted <- ifelse(cluster_signal_ratio[kmeans_result$cluster] > 0.5, "Signal", "Background")
# Confusion Matrix & Accuracy ----
conf_matrix <- table(Predicted = predicted, Actual = true_labels)
print(conf_matrix)
if (sum(conf_matrix) > 0) {
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("K-means Clustering Accuracy:", round(accuracy * 100, 2), "%\n")
} else {
cat("Still no valid clusters found — dataset may be too uniform.\n")
}
} else {
cat("not nough unique data points for K-means clustering.\n")
}
## Actual
## Predicted Background Signal
## Background 164333 85667
## K-means Clustering Accuracy: 65.73 %
The Knn Classification uses columns “DER_mass_MMC, PRI_met, PRI_jet_all_pt, PRI_lep_pt, DER_pt_h, PRI_jet_num, Target” for model training and the accuracy of the model is 62.67%. #Analysis of Confusion matrix: The model correctly identified 45 Background and 49 Signal events, but misclassified 36 Background as Signal and 20 Signal as Background.
set.seed(123)
df_small <- df %>%
group_by(Target) %>%
slice_sample(n = 250) %>%
ungroup() %>%
select(DER_mass_MMC, PRI_met, PRI_jet_all_pt,
PRI_lep_pt, DER_pt_h, PRI_jet_num, Target)
cat("Rows used :", nrow(df_small), "\n")
## Rows used : 500
# Convert Target to 0/1 (Signal = 1)
df_small <- df_small %>%
mutate(Target = ifelse(Target == "Signal", 1, 0))
features <- c("DER_mass_MMC","PRI_met","PRI_jet_all_pt",
"PRI_lep_pt","DER_pt_h","PRI_jet_num")
df_small[features] <- lapply(df_small[features], function(col){
col[is.na(col)] <- mean(col, na.rm = TRUE)
col
})
# Z-score normalisation
z_norm <- function(x) (x - mean(x))/sd(x)
df_small[features] <- lapply(df_small[features], z_norm)
# Train / test split (70 % / 30 %)
train_idx <- sample(1:nrow(df_small), size = 0.7*nrow(df_small))
train <- df_small[train_idx, ]
test <- df_small[-train_idx, ]
X_train <- as.matrix(train[features])
y_train <- train$Target
X_test <- as.matrix(test[features])
y_test <- test$Target
cat("Train rows :", nrow(train), " Test rows :", nrow(test), "\n")
## Train rows : 350 Test rows : 150
# K-NN (k = 5)
k <- 5
pred_test <- integer(nrow(test))
for(i in seq_len(nrow(test))){
d <- sqrt(colSums((t(X_train) - X_test[i, ])^2))
nn <- order(d)[1:k]
pred_test[i] <- ifelse(mean(y_train[nn]) >= 0.5, 1, 0)
}
# Accuracy & confusion matrix
acc <- mean(pred_test == y_test)
cat("\nK-NN (k=5) Accuracy :", round(acc*100, 2), "%\n")
##
## K-NN (k=5) Accuracy : 62.67 %
conf <- table(Predicted = pred_test, Actual = y_test)
print(conf)
## Actual
## Predicted 0 1
## 0 45 20
## 1 36 49
Analysis: The new Higgs mass is a background
new_event <- data.frame(
DER_mass_MMC = 180, PRI_met = 120, PRI_jet_all_pt = 300,
PRI_lep_pt = 80, DER_pt_h = 150, PRI_jet_num = 3
)
means <- colMeans(df_small[features])
sds <- apply(df_small[features], 2, sd)
new_norm <- (as.numeric(new_event) - means)/sds
d_new <- sqrt(colSums((t(X_train) - new_norm)^2))
nn_new <- order(d_new)[1:k]
new_pred <- ifelse(mean(y_train[nn_new]) >= 0.5, "Signal", "Background")
cat("\nNew high-mass event is ", new_pred, "\n")
##
## New high-mass event is Signal