## In this section the data are read in, the basic structure is shown, and data leakage is avoided
data <- read.csv("~/Downloads/ObesityDataSet.csv")
dim(data)
## [1] 2111   17
cat("Der Datensatz hat", ncol(data), "Variablen und", nrow(data), "Zeilen\n\n")
## Der Datensatz hat 17 Variablen und 2111 Zeilen
str(data)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
summary(data)
##       Age           Gender              Height          Weight      
##  Min.   :14.00   Length:2111        Min.   :1.450   Min.   : 39.00  
##  1st Qu.:19.95   Class :character   1st Qu.:1.630   1st Qu.: 65.47  
##  Median :22.78   Mode  :character   Median :1.700   Median : 83.00  
##  Mean   :24.31                      Mean   :1.702   Mean   : 86.59  
##  3rd Qu.:26.00                      3rd Qu.:1.768   3rd Qu.:107.43  
##  Max.   :61.00                      Max.   :1.980   Max.   :173.00  
##      CALC               FAVC                FCVC            NCP       
##  Length:2111        Length:2111        Min.   :1.000   Min.   :1.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:2.659  
##  Mode  :character   Mode  :character   Median :2.386   Median :3.000  
##                                        Mean   :2.419   Mean   :2.686  
##                                        3rd Qu.:3.000   3rd Qu.:3.000  
##                                        Max.   :3.000   Max.   :4.000  
##      SCC               SMOKE                CH2O      
##  Length:2111        Length:2111        Min.   :1.000  
##  Class :character   Class :character   1st Qu.:1.585  
##  Mode  :character   Mode  :character   Median :2.000  
##                                        Mean   :2.008  
##                                        3rd Qu.:2.477  
##                                        Max.   :3.000  
##  family_history_with_overweight      FAF              TUE        
##  Length:2111                    Min.   :0.0000   Min.   :0.0000  
##  Class :character               1st Qu.:0.1245   1st Qu.:0.0000  
##  Mode  :character               Median :1.0000   Median :0.6253  
##                                 Mean   :1.0103   Mean   :0.6579  
##                                 3rd Qu.:1.6667   3rd Qu.:1.0000  
##                                 Max.   :3.0000   Max.   :2.0000  
##      CAEC              MTRANS           NObeyesdad       
##  Length:2111        Length:2111        Length:2111       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
# Avoid data leakage: remove Height and Weight from the dataset
data <- data[, !(names(data) %in% c("Height", "Weight"))]
names(data)
##  [1] "Age"                            "Gender"                        
##  [3] "CALC"                           "FAVC"                          
##  [5] "FCVC"                           "NCP"                           
##  [7] "SCC"                            "SMOKE"                         
##  [9] "CH2O"                           "family_history_with_overweight"
## [11] "FAF"                            "TUE"                           
## [13] "CAEC"                           "MTRANS"                        
## [15] "NObeyesdad"
## This section provides an overview table of the categorical and numeric variables

# Identify categorical variables (character columns)
categorical_columns <- sapply(data, is.character)

# Identify numeric variables
numeric_columns <- sapply(data, is.numeric)

# Overview of the categorical variables and their response options
categorical_summary <- data.frame(
  Variable = names(data)[categorical_columns],
  Antwortmöglichkeiten = sapply(
    data[, categorical_columns],
    function(col) paste(unique(col), collapse = ", ")
  ),
  Antworthäufigkeit = sapply(
    data[, categorical_columns],
    function(col) {
# Build a frequency table and format it as text
      tab <- table(col)
      paste(names(tab), ": ", tab, sep = "", collapse = " | ")
    }
  )
)

print(categorical_summary)
##                                                      Variable
## Gender                                                 Gender
## CALC                                                     CALC
## FAVC                                                     FAVC
## SCC                                                       SCC
## SMOKE                                                   SMOKE
## family_history_with_overweight family_history_with_overweight
## CAEC                                                     CAEC
## MTRANS                                                 MTRANS
## NObeyesdad                                         NObeyesdad
##                                                                                                                                          Antwortmöglichkeiten
## Gender                                                                                                                                           Female, Male
## CALC                                                                                                                        no, Sometimes, Frequently, Always
## FAVC                                                                                                                                                  no, yes
## SCC                                                                                                                                                   no, yes
## SMOKE                                                                                                                                                 no, yes
## family_history_with_overweight                                                                                                                        yes, no
## CAEC                                                                                                                        Sometimes, Frequently, Always, no
## MTRANS                                                                                            Public_Transportation, Walking, Automobile, Motorbike, Bike
## NObeyesdad                     Normal_Weight, Overweight_Level_I, Overweight_Level_II, Obesity_Type_I, Insufficient_Weight, Obesity_Type_II, Obesity_Type_III
##                                                                                                                                                                                      Antworthäufigkeit
## Gender                                                                                                                                                                       Female: 1043 | Male: 1068
## CALC                                                                                                                                            Always: 1 | Frequently: 70 | no: 639 | Sometimes: 1401
## FAVC                                                                                                                                                                               no: 245 | yes: 1866
## SCC                                                                                                                                                                                 no: 2015 | yes: 96
## SMOKE                                                                                                                                                                               no: 2067 | yes: 44
## family_history_with_overweight                                                                                                                                                     no: 385 | yes: 1726
## CAEC                                                                                                                                           Always: 53 | Frequently: 242 | no: 51 | Sometimes: 1765
## MTRANS                                                                                                           Automobile: 457 | Bike: 7 | Motorbike: 11 | Public_Transportation: 1580 | Walking: 56
## NObeyesdad                     Insufficient_Weight: 272 | Normal_Weight: 287 | Obesity_Type_I: 351 | Obesity_Type_II: 297 | Obesity_Type_III: 324 | Overweight_Level_I: 290 | Overweight_Level_II: 290
# Overview of the numeric variables with their min/max values and comments
numeric_summary <- data.frame(
  Variable = names(data)[numeric_columns],
  Min = sapply(data[, numeric_columns], min),
  Max = sapply(data[, numeric_columns], max),
  Kommentar = ""
)
# Kommentar = substantive meaning of each numeric variable
numeric_summary$Kommentar[numeric_summary$Variable == "Age"] <- "Age in years"
numeric_summary$Kommentar[numeric_summary$Variable == "FCVC"] <- "Frequency of vegetable consumption"
numeric_summary$Kommentar[numeric_summary$Variable == "NCP"] <- "Number of main meals per day"
numeric_summary$Kommentar[numeric_summary$Variable == "CH2O"] <- "Water intake"
numeric_summary$Kommentar[numeric_summary$Variable == "FAF"] <- "Physical activity"
numeric_summary$Kommentar[numeric_summary$Variable == "TUE"] <- "Technology use"

print(numeric_summary)
##      Variable Min Max                          Kommentar
## Age       Age  14  61                       Age in years
## FCVC     FCVC   1   3 Frequency of vegetable consumption
## NCP       NCP   1   4       Number of main meals per day
## CH2O     CH2O   1   3                       Water intake
## FAF       FAF   0   3                  Physical activity
## TUE       TUE   0   2                     Technology use
## Graphical display of the target variable and its relationships

# First, order the classes of the target variable logically (underweight -> normal weight -> overweight -> obesity)

data$Gewichtsklasse <- factor(
  data$NObeyesdad,
  levels = c(
    "Insufficient_Weight",
    "Normal_Weight",
    "Overweight_Level_I",
    "Overweight_Level_II",
    "Obesity_Type_I",
    "Obesity_Type_II",
    "Obesity_Type_III"
  ),
  labels = c(
    "Untergewicht",
    "Normalgewicht",
    "Übergewicht I",
    "Übergewicht II",
    "Adipositas I",
    "Adipositas II",
    "Adipositas III"
  ),
  ordered = TRUE
)

# Distribution of the target variable as a bar chart

table(data$Gewichtsklasse)
## 
##   Untergewicht  Normalgewicht  Übergewicht I Übergewicht II   Adipositas I 
##            272            287            290            290            351 
##  Adipositas II Adipositas III 
##            297            324
barplot(
  table(data$Gewichtsklasse),
  main = "Distribution of the weight classes (NObeyesdad)",
  xlab = "Weight class",
  ylab = "Number of persons",
  las = 2,
  col = "lightblue",
  cex.names = 0.5
)

# Histograms of the numeric variables

par(mfrow = c(2, 3))  # 2 rows, 3 columns

hist(data$Age,
     main = "Age",
     xlab = "Years",
     ylab = "Number of persons",
     col = "lightgray")

hist(data$FCVC,
     main = "Vegetable consumption (FCVC)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "lightgreen")

hist(data$NCP,
     main = "Main meals (NCP)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "salmon")

hist(data$FAF,
     main = "Physical activity (FAF)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "orange")

hist(data$TUE,
     main = "Technology use (TUE)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "purple")

hist(data$CH2O,
     main = "Water intake (CH2O)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "lightyellow")

par(mfrow = c(1, 1))

# Initial look at the relationships between lifestyle variables and the target variable (weight class)

par(mfrow = c(1, 2))

# Family history of overweight vs. weight class
barplot(
  table(data$family_history_with_overweight, data$Gewichtsklasse),
  beside = TRUE,
  col = c("blue", "red"),
  main = "Family history of overweight\nand weight classes",
  xlab = "Weight class",
  ylab = "Number of persons",
  legend.text = c("No", "Yes"),
  args.legend = list(x = "topright", inset = 0.02),
  las = 2,
  cex.names = 0.5
)
# High-caloric food consumption (FAVC) vs. weight class
barplot(
  table(data$FAVC, data$Gewichtsklasse),
  beside = TRUE,
  col = c("lightgreen", "orange"),
  main = "High-caloric diet (FAVC)\nand weight classes",
  xlab = "Weight class",
  ylab = "Number of persons",
  legend.text = c("No", "Yes"),
  args.legend = list(x = "topright", inset = 0.02),
  las = 2,
  cex.names = 0.5
)

par(mfrow = c(1, 1))

## Logistic Regression Model

First, a logistic regression model is built to examine which variables are associated with overweight. To this end, the 7 weight classes are collapsed into a binary target variable and the categorical variables are converted to factors. The logistic regression comprises an initial model containing all potentially relevant predictors, followed by a reduced model that keeps only the predictors identified as significant. Both models are evaluated with a confusion matrix, accuracy, precision, recall, and F1 score. A k-fold cross-validation is performed to assess the overall quality and generalizability of each model.

library(boot)

## In this section the target variable is converted into a binary target and the categorical columns are converted to factors

# Create the binary target (0 = not overweight: Untergewicht, Normalgewicht; 1 = overweight: Übergewicht I/II + Adipositas I-III)
data$Is_Overweight <- ifelse(
  data$Gewichtsklasse %in% c(
    "Übergewicht I",
    "Übergewicht II",
    "Adipositas I",
    "Adipositas II",
    "Adipositas III"
  ),
  1, 0
)

# Encode the binary target as a factor
data$Is_Overweight <- factor(
  data$Is_Overweight,
  levels = c(0, 1),
  labels = c("Nein", "Ja")
)
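
A short sanity check (an added sketch, not in the original output) confirms that the mapping from the seven classes to the binary target is correct; only Untergewicht and Normalgewicht should fall into the "Nein" column:

# Cross-tabulate weight class against the binary target (added sketch)
table(data$Gewichtsklasse, data$Is_Overweight)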

# Check the class distribution
table(data$Is_Overweight)
## 
## Nein   Ja 
##  559 1552
prop.table(table(data$Is_Overweight)) * 100
## 
##     Nein       Ja 
## 26.48034 73.51966
# Data fix: the level CALC == "Always" occurs only once; during cross-validation it can end up in the test fold and cause an error, so "Always" is merged into "Frequently"
data$CALC <- as.character(data$CALC)
data$CALC[data$CALC == "Always"] <- "Frequently"
data$CALC <- as.factor(data$CALC)

# Set categorical variables as factors
factor_cols <- c(
  "Gender",
  "family_history_with_overweight",
  "FAVC",
  "CAEC",
  "SMOKE",
  "SCC",
  "CALC",
  "MTRANS"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)

str(data)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ CALC                          : Factor w/ 3 levels "Frequently","no",..: 2 3 1 1 3 3 3 3 1 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
##  $ Gewichtsklasse                : Ord.factor w/ 7 levels "Untergewicht"<..: 2 2 2 3 4 2 2 2 2 2 ...
##  $ Is_Overweight                 : Factor w/ 2 levels "Nein","Ja": 1 1 1 2 2 1 1 1 1 1 ...
# Split the dataset
set.seed(1)
trainIndex <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[trainIndex, ]
test_data  <- data[-trainIndex, ]
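
The simple random split above does not guarantee that both classes appear in the training and test sets in their original proportions. As an alternative, a stratified split could be drawn, e.g. with caret::createDataPartition (caret is loaded later for the tree models anyway; this sketch is not part of the original analysis):

# Stratified 80/20 split by the binary target (alternative sketch)
library(caret)
set.seed(1)
idx <- createDataPartition(data$Is_Overweight, p = 0.8, list = FALSE)
train_strat <- data[idx, ]
test_strat  <- data[-idx, ]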

## 1. Logistic regression with all variables
log_model <- glm(
  Is_Overweight ~ Age + Gender + family_history_with_overweight +
    FAVC + FCVC + NCP + CAEC + SMOKE + CH2O + FAF + TUE + CALC + MTRANS,
  data = train_data,
  family = binomial
)
summary(log_model)
## 
## Call:
## glm(formula = Is_Overweight ~ Age + Gender + family_history_with_overweight + 
##     FAVC + FCVC + NCP + CAEC + SMOKE + CH2O + FAF + TUE + CALC + 
##     MTRANS, family = binomial, data = train_data)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -7.57719    1.01804  -7.443 9.85e-14 ***
## Age                                0.21617    0.02107  10.260  < 2e-16 ***
## GenderMale                         0.30913    0.18020   1.716 0.086251 .  
## family_history_with_overweightyes  2.40519    0.19975  12.041  < 2e-16 ***
## FAVCyes                            0.67015    0.23392   2.865 0.004172 ** 
## FCVC                               0.07218    0.15817   0.456 0.648140    
## NCP                               -0.37700    0.10801  -3.490 0.000482 ***
## CAECFrequently                    -0.91748    0.49702  -1.846 0.064899 .  
## CAECno                             2.63999    0.61905   4.265 2.00e-05 ***
## CAECSometimes                      2.48091    0.44376   5.591 2.26e-08 ***
## SMOKEyes                          -0.67556    0.63138  -1.070 0.284634    
## CH2O                               0.27874    0.14652   1.902 0.057108 .  
## FAF                               -0.39174    0.10072  -3.889 0.000101 ***
## TUE                               -0.21582    0.13505  -1.598 0.110025    
## CALCno                            -1.23669    0.49511  -2.498 0.012497 *  
## CALCSometimes                     -0.82540    0.48937  -1.687 0.091668 .  
## MTRANSBike                        -0.70101    1.22465  -0.572 0.567037    
## MTRANSMotorbike                    1.42292    1.07205   1.327 0.184412    
## MTRANSPublic_Transportation        1.28517    0.26316   4.884 1.04e-06 ***
## MTRANSWalking                      0.16969    0.49549   0.342 0.732004    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1969.6  on 1687  degrees of freedom
## Residual deviance: 1020.1  on 1668  degrees of freedom
## AIC: 1060.1
## 
## Number of Fisher Scoring iterations: 6
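The coefficients are on the log-odds scale; exponentiating them yields odds ratios, which are easier to interpret. For example, the estimate for family_history_with_overweightyes (2.405) corresponds to an odds ratio of exp(2.405) ≈ 11.1, i.e. roughly 11-fold higher odds of being overweight given a family history, holding the other predictors fixed. A one-line sketch (not in the original output):

# Odds ratios for all predictors (added sketch)
round(exp(coef(log_model)), 2)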
# Model predictions
prob <- predict(log_model, newdata = test_data, type = "response")
pred <- ifelse(prob >= 0.5, "Ja", "Nein")

# Confusion matrix and model accuracy
cm <- table(Predicted = pred, Actual = test_data$Is_Overweight)

cat("Confusion Matrix:\n")
## Confusion Matrix:
print(cm)
##          Actual
## Predicted Nein  Ja
##      Ja     37 303
##      Nein   66  17
cat("\n")
TP <- cm["Ja", "Ja"]
FP <- cm["Ja", "Nein"]
FN <- cm["Nein", "Ja"]
TN <- cm["Nein", "Nein"]

accuracy <- (TP + TN) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Model performance:\n")
## Model performance:
cat("Accuracy  :", round(accuracy, 3), "\n")
## Accuracy  : 0.872
cat("Precision :", round(precision, 3), "\n")
## Precision : 0.891
cat("Recall    :", round(recall, 3), "\n")
## Recall    : 0.947
cat("F1-Score  :", round(f1_score, 3), "\n")
## F1-Score  : 0.918
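The metrics above depend on the fixed 0.5 threshold. A threshold-free complement would be the ROC curve and AUC, for example with the pROC package (an added sketch; pROC is not used elsewhere in this analysis and would need to be installed):

# ROC curve and AUC on the test data (added sketch; assumes pROC is available)
library(pROC)
roc_obj <- roc(response = test_data$Is_Overweight, predictor = prob,
               levels = c("Nein", "Ja"))
auc(roc_obj)
plot(roc_obj, main = "ROC curve, full logistic model")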
# Cross-validation of predictive performance

set.seed(1)
cv_results_full <- cv.glm(train_data, log_model, K = 10)
cv_error_full <- cv_results_full$delta[1]

cat("CV prediction error (mean squared error):", round(cv_error_full, 3), "\n")
## CV prediction error (mean squared error): 0.089
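Note that cv.glm's default cost is the mean squared difference between the 0/1 response and the predicted probability (a Brier-type score), not a misclassification rate. To cross-validate the misclassification rate at the 0.5 threshold directly, a custom cost function can be supplied (added sketch):

# 10-fold CV of the misclassification rate at the 0.5 threshold (added sketch)
cost_misclass <- function(y, phat) mean((phat > 0.5) != y)
set.seed(1)
cv.glm(train_data, log_model, cost = cost_misclass, K = 10)$delta[1]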
## 2. Logistic regression with significant variables only
log_model_sig <- glm(
  Is_Overweight ~ Age + family_history_with_overweight + 
    FAVC + NCP + CAEC + CH2O + FAF + CALC + MTRANS,
  data = train_data,
  family = binomial
)
summary(log_model_sig)
## 
## Call:
## glm(formula = Is_Overweight ~ Age + family_history_with_overweight + 
##     FAVC + NCP + CAEC + CH2O + FAF + CALC + MTRANS, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -7.65173    0.94391  -8.106 5.21e-16 ***
## Age                                0.21974    0.02082  10.554  < 2e-16 ***
## family_history_with_overweightyes  2.40562    0.19919  12.077  < 2e-16 ***
## FAVCyes                            0.68993    0.23415   2.946 0.003214 ** 
## NCP                               -0.35668    0.10755  -3.316 0.000912 ***
## CAECFrequently                    -0.93797    0.49497  -1.895 0.058093 .  
## CAECno                             2.84668    0.61669   4.616 3.91e-06 ***
## CAECSometimes                      2.49411    0.44271   5.634 1.76e-08 ***
## CH2O                               0.32044    0.14330   2.236 0.025337 *  
## FAF                               -0.38542    0.09914  -3.888 0.000101 ***
## CALCno                            -1.15811    0.48724  -2.377 0.017460 *  
## CALCSometimes                     -0.74126    0.48287  -1.535 0.124759    
## MTRANSBike                        -0.72172    1.25075  -0.577 0.563920    
## MTRANSMotorbike                    1.56284    1.06390   1.469 0.141842    
## MTRANSPublic_Transportation        1.15540    0.25450   4.540 5.63e-06 ***
## MTRANSWalking                      0.06606    0.48390   0.137 0.891416    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1969.6  on 1687  degrees of freedom
## Residual deviance: 1026.9  on 1672  degrees of freedom
## AIC: 1058.9
## 
## Number of Fisher Scoring iterations: 6
# Predictions of the reduced model
prob <- predict(log_model_sig, newdata = test_data, type = "response")
pred <- ifelse(prob >= 0.5, "Ja", "Nein")

# Confusion matrix and accuracy of the reduced model
cm <- table(Predicted = pred, Actual = test_data$Is_Overweight)

cat("Confusion Matrix:\n")
## Confusion Matrix:
print(cm)
##          Actual
## Predicted Nein  Ja
##      Ja     36 302
##      Nein   67  18
cat("\n")
TP <- cm["Ja", "Ja"]
FP <- cm["Ja", "Nein"]
FN <- cm["Nein", "Ja"]
TN <- cm["Nein", "Nein"]

accuracy <- (TP + TN) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Model performance:\n")
## Model performance:
cat("Accuracy  :", round(accuracy, 3), "\n")
## Accuracy  : 0.872
cat("Precision :", round(precision, 3), "\n")
## Precision : 0.893
cat("Recall    :", round(recall, 3), "\n")
## Recall    : 0.944
cat("F1-Score  :", round(f1_score, 3), "\n")
## F1-Score  : 0.918
# Cross-validation of predictive performance

set.seed(1)
cv_results_reduced <- cv.glm(train_data, log_model_sig, K = 10)
cv_error_reduced <- cv_results_reduced$delta[1]

cat("CV prediction error (mean squared error):", round(cv_error_reduced, 3), "\n")
## CV prediction error (mean squared error): 0.09
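
Because the reduced model is nested in the full model, a likelihood-ratio test can check whether the dropped predictors (Gender, FCVC, SMOKE, TUE) jointly add significant information (added sketch). From the reported deviances, the test statistic is 1026.9 − 1020.1 = 6.8 on 4 degrees of freedom, which is not significant at the 5% level and thus supports the simpler model:

# Likelihood-ratio test: reduced vs. full model (added sketch)
anova(log_model_sig, log_model, test = "Chisq")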

## Decision Tree

A decision tree is now fitted to the full seven-class target NObeyesdad. An initial, unpruned tree is grown and evaluated on the test data; cross-validation is then used to determine the optimal tree size, and the resulting pruned tree is evaluated in the same way.

## In this section the required libraries are loaded, the target variable is set as a factor, and the dataset is split

# Load required libraries
library(tree)
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
# Set the target variable as a factor
data$NObeyesdad <- as.factor(data$NObeyesdad)

# Split the dataset into training and test data
set.seed(1)
trainIndex <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[trainIndex, ]
test_data  <- data[-trainIndex, ]

cat("Daten erfolgreich geteilt:\n")
## Daten erfolgreich geteilt:
cat("Trainingsdaten:", nrow(train_data), "Zeilen\n")
## Trainingsdaten: 1688 Zeilen
cat("Testdaten:     ", nrow(test_data), "Zeilen\n\n")
## Testdaten:      423 Zeilen
## 1. Initial unpruned decision tree

# Fit the model
tree_model <- tree(
  NObeyesdad ~ . -Is_Overweight -Gewichtsklasse, 
  data = train_data
)
summary(tree_model)
## 
## Classification tree:
## tree(formula = NObeyesdad ~ . - Is_Overweight - Gewichtsklasse, 
##     data = train_data)
## Variables actually used in tree construction:
## [1] "FCVC"                           "Age"                           
## [3] "family_history_with_overweight" "NCP"                           
## [5] "FAVC"                           "CALC"                          
## [7] "Gender"                         "CAEC"                          
## [9] "MTRANS"                        
## Number of terminal nodes:  19 
## Residual mean deviance:  2.012 = 3357 / 1669 
## Misclassification error rate: 0.4082 = 689 / 1688
# Visualize the decision tree
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
title("Unpruned decision tree")

# Evaluate the decision tree with a confusion matrix
tree_pred <- predict(tree_model, newdata = test_data, type = "class")
cm_dec_tree <- confusionMatrix(tree_pred, test_data$NObeyesdad)
print(cm_dec_tree)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  22             1              0
##   Normal_Weight                        21            24              5
##   Obesity_Type_I                        9            11             40
##   Obesity_Type_II                       0             0              6
##   Obesity_Type_III                      0             1              1
##   Overweight_Level_I                    1             0              1
##   Overweight_Level_II                   0            13             18
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               0                0                  3
##   Normal_Weight                     7                1                 11
##   Obesity_Type_I                    4                0                 27
##   Obesity_Type_II                  44                0                  5
##   Obesity_Type_III                  0               82                  1
##   Overweight_Level_I                1                0                  6
##   Overweight_Level_II               1                0                  5
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   1
##   Normal_Weight                         2
##   Obesity_Type_I                       11
##   Obesity_Type_II                      10
##   Obesity_Type_III                      0
##   Overweight_Level_I                    0
##   Overweight_Level_II                  27
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5792          
##                  95% CI : (0.5306, 0.6267)
##     No Information Rate : 0.1962          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5052          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.41509              0.48000
## Specificity                             0.98649              0.87399
## Pos Pred Value                          0.81481              0.33803
## Neg Pred Value                          0.92172              0.92614
## Prevalence                              0.12530              0.11820
## Detection Rate                          0.05201              0.05674
## Detection Prevalence                    0.06383              0.16785
## Balanced Accuracy                       0.70079              0.67700
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                        0.56338                 0.7719
## Specificity                        0.82386                 0.9426
## Pos Pred Value                     0.39216                 0.6769
## Neg Pred Value                     0.90343                 0.9637
## Prevalence                         0.16785                 0.1348
## Detection Rate                     0.09456                 0.1040
## Detection Prevalence               0.24113                 0.1537
## Balanced Accuracy                  0.69362                 0.8573
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           0.9880                   0.10345
## Specificity                           0.9912                   0.99178
## Pos Pred Value                        0.9647                   0.66667
## Neg Pred Value                        0.9970                   0.87440
## Prevalence                            0.1962                   0.13712
## Detection Rate                        0.1939                   0.01418
## Detection Prevalence                  0.2009                   0.02128
## Balanced Accuracy                     0.9896                   0.54761
##                      Class: Overweight_Level_II
## Sensitivity                             0.52941
## Specificity                             0.90054
## Pos Pred Value                          0.42187
## Neg Pred Value                          0.93315
## Prevalence                              0.12057
## Detection Rate                          0.06383
## Detection Prevalence                    0.15130
## Balanced Accuracy                       0.71497
# Overall accuracy of the model
overall_acc <- as.numeric(cm_dec_tree$overall["Accuracy"])
cat("Overall accuracy (test data): ",
    round(overall_acc * 100, 3), " %\n", sep = "")
## Overall accuracy (test data): 57.92 %
# Per-class performance metrics
by_class <- cm_dec_tree$byClass
class_metrics <- by_class[, c("Pos Pred Value", "Sensitivity", "Balanced Accuracy")]
class_metrics <- round(class_metrics, 3)
print(class_metrics)
##                            Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight          0.815       0.415             0.701
## Class: Normal_Weight                0.338       0.480             0.677
## Class: Obesity_Type_I               0.392       0.563             0.694
## Class: Obesity_Type_II              0.677       0.772             0.857
## Class: Obesity_Type_III             0.965       0.988             0.990
## Class: Overweight_Level_I           0.667       0.103             0.548
## Class: Overweight_Level_II          0.422       0.529             0.715
# Compute F1 scores and print the extended table
precision <- class_metrics[, "Pos Pred Value"]
recall <- class_metrics[, "Sensitivity"]
f1_scores_dec_tree <- 2 * (precision * recall) / (precision + recall)
class_metrics <- cbind(class_metrics, "F1-Score" = f1_scores_dec_tree)
class_metrics <- round(class_metrics, 3)
print(class_metrics)
##                            Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight          0.815       0.415             0.701
## Class: Normal_Weight                0.338       0.480             0.677
## Class: Obesity_Type_I               0.392       0.563             0.694
## Class: Obesity_Type_II              0.677       0.772             0.857
## Class: Obesity_Type_III             0.965       0.988             0.990
## Class: Overweight_Level_I           0.667       0.103             0.548
## Class: Overweight_Level_II          0.422       0.529             0.715
##                            F1-Score
## Class: Insufficient_Weight    0.550
## Class: Normal_Weight          0.397
## Class: Obesity_Type_I         0.462
## Class: Obesity_Type_II        0.721
## Class: Obesity_Type_III       0.976
## Class: Overweight_Level_I     0.178
## Class: Overweight_Level_II    0.469
# Compute and print the mean (macro) F1 score
mean_f1_dec_tree <- mean(f1_scores_dec_tree)
cat("\nMean F1 score (decision tree):", round(mean_f1_dec_tree, 3), "\n")
## 
## Mean F1 score (decision tree): 0.536
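Since the same per-class F1 computation recurs for the pruned tree and the random forests below, a small helper (hypothetical, not in the original code) could compute the macro-averaged F1 directly from any caret::confusionMatrix object:

# Macro-averaged F1 from a caret confusionMatrix (hypothetical helper)
macro_f1 <- function(cm_obj) {
  prec <- cm_obj$byClass[, "Pos Pred Value"]
  rec  <- cm_obj$byClass[, "Sensitivity"]
  mean(2 * prec * rec / (prec + rec), na.rm = TRUE)
}
macro_f1(cm_dec_tree)  # approximately reproduces the 0.536 above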
# Compare training vs. test accuracy (overfitting check)
tree_pred_train <- predict(tree_model, newdata = train_data, type = "class")
cm_train_tree <- confusionMatrix(tree_pred_train, train_data$NObeyesdad)
train_accuracy <- as.numeric(cm_train_tree$overall["Accuracy"])
cat("Training accuracy (unpruned tree): ",
    round(train_accuracy * 100, 3), " %\n", sep = "")
## Training accuracy (unpruned tree): 59.182 %
cat("Test accuracy (unpruned tree): ",
    round(overall_acc * 100, 3), " %\n", sep = "")
## Test accuracy (unpruned tree): 57.92 %
## 2. Pruned decision tree

# Cross-validation to tune the tree size
cv_result <- cv.tree(tree_model, FUN = prune.misclass)
print(cv_result)
## $size
##  [1] 19 18 17 16 14 13 12 10  9  7  6  5  4  3  2  1
## 
## $dev
##  [1]  713  714  714  730  730  765  819  843  880  886  957 1010 1021 1128 1181
## [16] 1408
## 
## $k
##  [1]  -Inf   4.0   6.0   9.0   9.5  16.0  19.0  21.0  25.0  26.0  39.0  47.0
## [13]  48.0  80.0  92.0 221.0
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# Plot: tree size vs. CV error
plot(cv_result$size, cv_result$dev, type = "b",
     xlab = "Tree size (number of terminal nodes)",
     ylab = "CV misclassifications",
     main = "Cross-validation for tuning the tree size")
grid()

# Optimal tree size (minimum CV error)
optimal_size <- cv_result$size[which.min(cv_result$dev)]
cat("Optimal tree size after cross-validation: ", optimal_size, "\n", sep = "")
## Optimal tree size after cross-validation: 19
# Prune the tree. Note: CV selects 19 terminal nodes, i.e. the full tree,
# so pruning changes nothing here and the results below match the unpruned tree
pruned_tree <- prune.misclass(tree_model, best = optimal_size)
plot(pruned_tree)
text(pruned_tree, pretty = 0, cex = 0.7)
title(paste("Beschnittener Entscheidungsbaum (size =", optimal_size, ")"))

# Evaluate the pruned tree with a confusion matrix
pruned_pred_test <- predict(pruned_tree, newdata = test_data, type = "class")
cm_pruned <- confusionMatrix(pruned_pred_test, test_data$NObeyesdad)
print(cm_pruned)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  22             1              0
##   Normal_Weight                        21            24              5
##   Obesity_Type_I                        9            11             40
##   Obesity_Type_II                       0             0              6
##   Obesity_Type_III                      0             1              1
##   Overweight_Level_I                    1             0              1
##   Overweight_Level_II                   0            13             18
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               0                0                  3
##   Normal_Weight                     7                1                 11
##   Obesity_Type_I                    4                0                 27
##   Obesity_Type_II                  44                0                  5
##   Obesity_Type_III                  0               82                  1
##   Overweight_Level_I                1                0                  6
##   Overweight_Level_II               1                0                  5
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   1
##   Normal_Weight                         2
##   Obesity_Type_I                       11
##   Obesity_Type_II                      10
##   Obesity_Type_III                      0
##   Overweight_Level_I                    0
##   Overweight_Level_II                  27
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5792          
##                  95% CI : (0.5306, 0.6267)
##     No Information Rate : 0.1962          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5052          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.41509              0.48000
## Specificity                             0.98649              0.87399
## Pos Pred Value                          0.81481              0.33803
## Neg Pred Value                          0.92172              0.92614
## Prevalence                              0.12530              0.11820
## Detection Rate                          0.05201              0.05674
## Detection Prevalence                    0.06383              0.16785
## Balanced Accuracy                       0.70079              0.67700
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                        0.56338                 0.7719
## Specificity                        0.82386                 0.9426
## Pos Pred Value                     0.39216                 0.6769
## Neg Pred Value                     0.90343                 0.9637
## Prevalence                         0.16785                 0.1348
## Detection Rate                     0.09456                 0.1040
## Detection Prevalence               0.24113                 0.1537
## Balanced Accuracy                  0.69362                 0.8573
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           0.9880                   0.10345
## Specificity                           0.9912                   0.99178
## Pos Pred Value                        0.9647                   0.66667
## Neg Pred Value                        0.9970                   0.87440
## Prevalence                            0.1962                   0.13712
## Detection Rate                        0.1939                   0.01418
## Detection Prevalence                  0.2009                   0.02128
## Balanced Accuracy                     0.9896                   0.54761
##                      Class: Overweight_Level_II
## Sensitivity                             0.52941
## Specificity                             0.90054
## Pos Pred Value                          0.42187
## Neg Pred Value                          0.93315
## Prevalence                              0.12057
## Detection Rate                          0.06383
## Detection Prevalence                    0.15130
## Balanced Accuracy                       0.71497
# Overall accuracy of the pruned model
overall_acc <- as.numeric(cm_pruned$overall["Accuracy"])
cat("Overall accuracy (test data): ",
    round(overall_acc * 100, 3), " %\n", sep = "")
## Overall accuracy (test data): 57.92 %
# Per-class performance metrics
by_class <- cm_pruned$byClass
class_metrics <- by_class[, c("Pos Pred Value", "Sensitivity", "Balanced Accuracy")]
class_metrics <- round(class_metrics, 3)
print(class_metrics)
##                            Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight          0.815       0.415             0.701
## Class: Normal_Weight                0.338       0.480             0.677
## Class: Obesity_Type_I               0.392       0.563             0.694
## Class: Obesity_Type_II              0.677       0.772             0.857
## Class: Obesity_Type_III             0.965       0.988             0.990
## Class: Overweight_Level_I           0.667       0.103             0.548
## Class: Overweight_Level_II          0.422       0.529             0.715
# Compute F1 scores and print the extended table
precision <- class_metrics[, "Pos Pred Value"]
recall <- class_metrics[, "Sensitivity"]
f1_scores_pruned <- 2 * (precision * recall) / (precision + recall)
class_metrics <- cbind(class_metrics, "F1-Score" = f1_scores_pruned)
class_metrics <- round(class_metrics, 3)
print(class_metrics)
##                            Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight          0.815       0.415             0.701
## Class: Normal_Weight                0.338       0.480             0.677
## Class: Obesity_Type_I               0.392       0.563             0.694
## Class: Obesity_Type_II              0.677       0.772             0.857
## Class: Obesity_Type_III             0.965       0.988             0.990
## Class: Overweight_Level_I           0.667       0.103             0.548
## Class: Overweight_Level_II          0.422       0.529             0.715
##                            F1-Score
## Class: Insufficient_Weight    0.550
## Class: Normal_Weight          0.397
## Class: Obesity_Type_I         0.462
## Class: Obesity_Type_II        0.721
## Class: Obesity_Type_III       0.976
## Class: Overweight_Level_I     0.178
## Class: Overweight_Level_II    0.469
# Compute and print the mean (macro) F1 score
mean_f1_pruned <- mean(f1_scores_pruned)
cat("\nMean F1 score (pruned tree):", round(mean_f1_pruned, 3), "\n")
## 
## Mean F1 score (pruned tree): 0.536

## Random Forest

In this section a random forest is fitted to the seven-class target. First, a baseline model with default settings is trained and evaluated; then the number of randomly selected predictors per split (mtry) is tuned via 5-fold cross-validation, the influence of the number of trees (ntree) is examined, and a final tuned model is compared with the baseline. Finally, variable importance is inspected.

## In this section the required libraries are loaded and the target variable is set as a factor
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
test_data$NObeyesdad  <- as.factor(test_data$NObeyesdad)


## 1. In this section an initial random forest model is built and evaluated

# Fit the random forest model
set.seed(1)

rf_base <- randomForest(
  NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
  data = train_data,
  ntree = 500,
  importance = TRUE
)

print(rf_base)
## 
## Call:
##  randomForest(formula = NObeyesdad ~ . - Is_Overweight - Gewichtsklasse,      data = train_data, ntree = 500, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.98%
## Confusion matrix:
##                     Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight                 195            21              1
## Normal_Weight                         9           191             10
## Obesity_Type_I                        0            15            241
## Obesity_Type_II                       0             3              2
## Obesity_Type_III                      0             0              0
## Overweight_Level_I                    4            28             15
## Overweight_Level_II                   1            24             19
##                     Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight               0                0                  2
## Normal_Weight                     4                0                 12
## Obesity_Type_I                    8                2                  6
## Obesity_Type_II                 230                0                  1
## Obesity_Type_III                  0              240                  0
## Overweight_Level_I                3                1                178
## Overweight_Level_II              14                0                  4
##                     Overweight_Level_II class.error
## Insufficient_Weight                   0 0.109589041
## Normal_Weight                        11 0.194092827
## Obesity_Type_I                        8 0.139285714
## Obesity_Type_II                       4 0.041666667
## Obesity_Type_III                      1 0.004149378
## Overweight_Level_I                    3 0.232758621
## Overweight_Level_II                 177 0.259414226
# Confusion matrix and accuracy assessment on the test data
rf_pred_test_base <- predict(rf_base, newdata = test_data, type = "class")
cm_rf_base <- confusionMatrix(rf_pred_test_base, test_data$NObeyesdad)
print(cm_rf_base)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  49             1              0
##   Normal_Weight                         4            42              3
##   Obesity_Type_I                        0             2             64
##   Obesity_Type_II                       0             1              3
##   Obesity_Type_III                      0             0              0
##   Overweight_Level_I                    0             4              1
##   Overweight_Level_II                   0             0              0
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               0                0                  1
##   Normal_Weight                     2                1                  7
##   Obesity_Type_I                    0                0                  3
##   Obesity_Type_II                  51                0                  2
##   Obesity_Type_III                  1               82                  0
##   Overweight_Level_I                0                0                 42
##   Overweight_Level_II               3                0                  3
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   0
##   Normal_Weight                         3
##   Obesity_Type_I                        5
##   Obesity_Type_II                       3
##   Obesity_Type_III                      0
##   Overweight_Level_I                    0
##   Overweight_Level_II                  40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8747          
##                  95% CI : (0.8393, 0.9047)
##     No Information Rate : 0.1962          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.853           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9245              0.84000
## Specificity                              0.9946              0.94638
## Pos Pred Value                           0.9608              0.67742
## Neg Pred Value                           0.9892              0.97784
## Prevalence                               0.1253              0.11820
## Detection Rate                           0.1158              0.09929
## Detection Prevalence                     0.1206              0.14657
## Balanced Accuracy                        0.9596              0.89319
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9014                 0.8947
## Specificity                         0.9716                 0.9754
## Pos Pred Value                      0.8649                 0.8500
## Neg Pred Value                      0.9799                 0.9835
## Prevalence                          0.1678                 0.1348
## Detection Rate                      0.1513                 0.1206
## Detection Prevalence                0.1749                 0.1418
## Balanced Accuracy                   0.9365                 0.9351
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           0.9880                   0.72414
## Specificity                           0.9971                   0.98630
## Pos Pred Value                        0.9880                   0.89362
## Neg Pred Value                        0.9971                   0.95745
## Prevalence                            0.1962                   0.13712
## Detection Rate                        0.1939                   0.09929
## Detection Prevalence                  0.1962                   0.11111
## Balanced Accuracy                     0.9925                   0.85522
##                      Class: Overweight_Level_II
## Sensitivity                             0.78431
## Specificity                             0.98387
## Pos Pred Value                          0.86957
## Neg Pred Value                          0.97082
## Prevalence                              0.12057
## Detection Rate                          0.09456
## Detection Prevalence                    0.10875
## Balanced Accuracy                       0.88409
overall_acc_rf_base <- as.numeric(cm_rf_base$overall["Accuracy"])
cat("Overall Accuracy (RF Baseline, Test): ",
    round(overall_acc_rf_base * 100, 2), " %\n", sep = "")
## Overall Accuracy (RF Baseline, Test): 87.47 %
rf_class_metrics_base <- round(
  cm_rf_base$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")],
  3
)
print(rf_class_metrics_base)
##                            Sensitivity Specificity Balanced Accuracy
## Class: Insufficient_Weight       0.925       0.995             0.960
## Class: Normal_Weight             0.840       0.946             0.893
## Class: Obesity_Type_I            0.901       0.972             0.936
## Class: Obesity_Type_II           0.895       0.975             0.935
## Class: Obesity_Type_III          0.988       0.997             0.993
## Class: Overweight_Level_I        0.724       0.986             0.855
## Class: Overweight_Level_II       0.784       0.984             0.884
## 2. In this section the optimized random forest model is built with hyperparameter tuning

set.seed(1)

train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
test_data$NObeyesdad  <- as.factor(test_data$NObeyesdad)

# Cross-validation setup
train_control <- trainControl(
  method = "cv",
  number = 5
)

# Determine the number of predictors p (excluding the target variables)
p <- ncol(train_data[, !(names(train_data) %in% 
                           c("NObeyesdad", "Is_Overweight", "Gewichtsklasse"))])

default_mtry <- floor(sqrt(p))
cat("p =", p, "| Default mtry =", default_mtry, "\n")
## p = 14 | Default mtry = 3
# Hyperparameter tuning: mtry (5-fold CV)
tune_grid <- expand.grid(
  mtry = seq(default_mtry - 1, default_mtry + 5, by = 1)
)

rf_cv <- train(
  NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
  data = train_data,
  method = "rf",
  trControl = train_control,
  tuneGrid = tune_grid,
  metric = "Accuracy",
  ntree = 500
)

print(rf_cv)
## Random Forest 
## 
## 1688 samples
##   16 predictor
##    7 classes: 'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1352, 1349, 1350, 1351, 1350 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7517567  0.7098946
##   3     0.8204927  0.7903290
##   4     0.8365007  0.8090327
##   5     0.8471675  0.8215118
##   6     0.8448165  0.8187502
##   7     0.8477593  0.8222016
##   8     0.8442160  0.8180558
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
best_mtry <- rf_cv$bestTune$mtry
mtry_final <- 5  # chosen manually: CV accuracy at mtry = 5 (0.8472) lies
                 # on the plateau around the CV optimum at mtry = 7 (0.8478)
cat("Chosen mtry:", mtry_final, "\n")
## Chosen mtry: 5
# Plot CV error vs. mtry
cv_results <- rf_cv$results
cv_results$CV_Error <- 1 - cv_results$Accuracy

plot(
  cv_results$mtry,
  cv_results$CV_Error,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "Number of randomly selected predictors (mtry)",
  ylab = "Cross-validation error",
  main = "CV error as a function of mtry"
)

# Analysis of the number of trees (ntree) with mtry fixed
ntree_values <- c(100, 300, 500, 800)
test_accuracy <- numeric(length(ntree_values))

set.seed(1)

for (i in seq_along(ntree_values)) {
  rf_tmp <- randomForest(
    NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
    data = train_data,
    mtry = mtry_final,
    ntree = ntree_values[i]
  )
  
  preds <- predict(rf_tmp, newdata = test_data)
  test_accuracy[i] <- mean(preds == test_data$NObeyesdad)
}

ntree_results <- data.frame(
  ntree = ntree_values,
  Test_Accuracy = test_accuracy
)

ntree_results$Test_Error <- 1 - ntree_results$Test_Accuracy
print(ntree_results)
##   ntree Test_Accuracy Test_Error
## 1   100     0.8723404  0.1276596
## 2   300     0.8794326  0.1205674
## 3   500     0.8841608  0.1158392
## 4   800     0.8841608  0.1158392
# Plot test error vs. number of trees
plot(
  ntree_results$ntree,
  ntree_results$Test_Error,
  type = "b",
  pch = 19,
  col = "darkgreen",
  xlab = "Number of trees (ntree)",
  ylab = "Test error",
  main = "Test error as a function of the number of trees"
)
grid()

# Analysis: OOB error over ntree for different mtry values

mtry_values <- c(3, 5, 6, 7, 9)
ntree_max <- 500

set.seed(1)

rf_models <- lapply(mtry_values, function(m) {
  randomForest(
    NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
    data = train_data,
    mtry = m,
    ntree = ntree_max
  )
})

names(rf_models) <- paste0("m=", mtry_values)

err_curves <- lapply(rf_models, function(rf) {
  rf$err.rate[, "OOB"]
})

plot(
  1:ntree_max,
  err_curves[[1]],
  type = "l",
  col = "red",
  ylim = range(unlist(err_curves)),
  xlab = "Number of trees (ntree)",
  ylab = "OOB misclassification rate",
  main = "Random forest: OOB error vs. number of trees for different mtry"
)

cols <- c("red", "orange", "green", "blue", "cyan")

for (i in 2:length(err_curves)) {
  lines(1:ntree_max, err_curves[[i]], col = cols[i])
}

legend(
  "topright",
  legend = names(err_curves),
  col = cols,
  lty = 1,
  bty = "n"
)

# Final random forest model
set.seed(1)

rf_final <- randomForest(
  NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
  data = train_data,
  mtry = mtry_final,
  ntree = 500,
  importance = TRUE
)

# Confusion matrix and accuracy for the final random forest model
rf_pred_test <- predict(rf_final, newdata = test_data)
cm_rf_final  <- confusionMatrix(rf_pred_test, test_data$NObeyesdad)

cat("\nRandom Forest nach Hyperparameter-Tuning:\n")
## 
## Random Forest nach Hyperparameter-Tuning:
cat("\nConfusion Matrix:\n")
## 
## Confusion Matrix:
print(cm_rf_final)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Obesity_Type_I
##   Insufficient_Weight                  49             0              0
##   Normal_Weight                         4            40              2
##   Obesity_Type_I                        0             4             67
##   Obesity_Type_II                       0             1              1
##   Obesity_Type_III                      0             0              0
##   Overweight_Level_I                    0             3              1
##   Overweight_Level_II                   0             2              0
##                      Reference
## Prediction            Obesity_Type_II Obesity_Type_III Overweight_Level_I
##   Insufficient_Weight               0                0                  1
##   Normal_Weight                     2                1                  7
##   Obesity_Type_I                    0                0                  2
##   Obesity_Type_II                  52                0                  2
##   Obesity_Type_III                  1               82                  0
##   Overweight_Level_I                0                0                 44
##   Overweight_Level_II               2                0                  2
##                      Reference
## Prediction            Overweight_Level_II
##   Insufficient_Weight                   0
##   Normal_Weight                         3
##   Obesity_Type_I                        6
##   Obesity_Type_II                       3
##   Obesity_Type_III                      0
##   Overweight_Level_I                    0
##   Overweight_Level_II                  39
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8818         
##                  95% CI : (0.8471, 0.911)
##     No Information Rate : 0.1962         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8612         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9245              0.80000
## Specificity                              0.9973              0.94906
## Pos Pred Value                           0.9800              0.67797
## Neg Pred Value                           0.9893              0.97253
## Prevalence                               0.1253              0.11820
## Detection Rate                           0.1158              0.09456
## Detection Prevalence                     0.1182              0.13948
## Balanced Accuracy                        0.9609              0.87453
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9437                 0.9123
## Specificity                         0.9659                 0.9809
## Pos Pred Value                      0.8481                 0.8814
## Neg Pred Value                      0.9884                 0.9863
## Prevalence                          0.1678                 0.1348
## Detection Rate                      0.1584                 0.1229
## Detection Prevalence                0.1868                 0.1395
## Balanced Accuracy                   0.9548                 0.9466
##                      Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity                           0.9880                    0.7586
## Specificity                           0.9971                    0.9890
## Pos Pred Value                        0.9880                    0.9167
## Neg Pred Value                        0.9971                    0.9627
## Prevalence                            0.1962                    0.1371
## Detection Rate                        0.1939                    0.1040
## Detection Prevalence                  0.1962                    0.1135
## Balanced Accuracy                     0.9925                    0.8738
##                      Class: Overweight_Level_II
## Sensitivity                              0.7647
## Specificity                              0.9839
## Pos Pred Value                           0.8667
## Neg Pred Value                           0.9683
## Prevalence                               0.1206
## Detection Rate                           0.0922
## Detection Prevalence                     0.1064
## Balanced Accuracy                        0.8743
cat("\nGesamtmetriken:\n")
## 
## Gesamtmetriken:
cat("Accuracy: ", round(cm_rf_final$overall["Accuracy"] * 100, 2), " %\n", sep = "")
## Accuracy: 88.18 %
## Comparison of the initial random forest with the hyperparameter-tuned random forest

# F1, precision, recall per class for the initial RF
byc <- cm_rf_base$byClass
if (is.matrix(byc)) {
  metrics <- byc[, c("Precision", "Recall", "F1")]
  metrics <- round(metrics * 100, 2)
  print(metrics)
  
  f1_score_rf_base <- mean(byc[, "F1"], na.rm = TRUE)
  cat("Mean F1 score: ", round(f1_score_rf_base * 100, 2), " %\n", sep = "")
}
##                            Precision Recall    F1
## Class: Insufficient_Weight     96.08  92.45 94.23
## Class: Normal_Weight           67.74  84.00 75.00
## Class: Obesity_Type_I          86.49  90.14 88.28
## Class: Obesity_Type_II         85.00  89.47 87.18
## Class: Obesity_Type_III        98.80  98.80 98.80
## Class: Overweight_Level_I      89.36  72.41 80.00
## Class: Overweight_Level_II     86.96  78.43 82.47
## Mean F1 score: 86.57 %
# F1, precision, recall per class for the tuned RF
byc <- cm_rf_final$byClass
if (is.matrix(byc)) {
  metrics <- byc[, c("Precision", "Recall", "F1")]
  metrics <- round(metrics * 100, 2)
  print(metrics)

  f1_score_rf_final <- mean(byc[, "F1"], na.rm = TRUE)
  cat("Mean F1 score: ", round(f1_score_rf_final * 100, 2), " %\n", sep = "")
}
##                            Precision Recall    F1
## Class: Insufficient_Weight     98.00  92.45 95.15
## Class: Normal_Weight           67.80  80.00 73.39
## Class: Obesity_Type_I          84.81  94.37 89.33
## Class: Obesity_Type_II         88.14  91.23 89.66
## Class: Obesity_Type_III        98.80  98.80 98.80
## Class: Overweight_Level_I      91.67  75.86 83.02
## Class: Overweight_Level_II     86.67  76.47 81.25
## Mean F1 score: 87.23 %
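To summarize the comparison, a compact overview can be assembled from the objects computed above (added sketch; baseline mtry = 3 as reported by rf_base, tuned mtry = 5):

# Side-by-side comparison of the baseline and tuned random forest (added sketch)
data.frame(
  Model    = c("RF baseline (mtry = 3)", "RF tuned (mtry = 5)"),
  Accuracy = round(100 * c(cm_rf_base$overall["Accuracy"],
                           cm_rf_final$overall["Accuracy"]), 2),
  Mean_F1  = round(100 * c(f1_score_rf_base, f1_score_rf_final), 2)
)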
## Variable importance
# Plot 1: permutation importance (mean decrease in accuracy)
varImpPlot(rf_final,
           type = 1,
           main = "Variable importance (accuracy)",
           col = "blue",
           pch = 19)

# Plot 2: impurity importance (mean decrease in Gini)
varImpPlot(rf_final,
           type = 2,
           main = "Variable importance (Gini)",
           col = "red",
           pch = 19)
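
Beyond the plots, the underlying importance values can also be printed as a sorted table (added sketch):

# Permutation importance (mean decrease in accuracy), sorted (added sketch)
imp <- importance(rf_final, type = 1)
imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE]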