## This section reads in the data, presents its basic structure, and removes variables that would cause data leakage
data <- read.csv("~/Downloads/ObesityDataSet.csv")
dim(data)
## [1] 2111 17
cat("Der Datensatz hat", ncol(data), "Variablen und", nrow(data), "Zeilen\n\n")
## Der Datensatz hat 17 Variablen und 2111 Zeilen
str(data)
## 'data.frame': 2111 obs. of 17 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
summary(data)
## Age Gender Height Weight
## Min. :14.00 Length:2111 Min. :1.450 Min. : 39.00
## 1st Qu.:19.95 Class :character 1st Qu.:1.630 1st Qu.: 65.47
## Median :22.78 Mode :character Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## CALC FAVC FCVC NCP
## Length:2111 Length:2111 Min. :1.000 Min. :1.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.659
## Mode :character Mode :character Median :2.386 Median :3.000
## Mean :2.419 Mean :2.686
## 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000
## SCC SMOKE CH2O
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:1.585
## Mode :character Mode :character Median :2.000
## Mean :2.008
## 3rd Qu.:2.477
## Max. :3.000
## family_history_with_overweight FAF TUE
## Length:2111 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000
## Mode :character Median :1.0000 Median :0.6253
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## CAEC MTRANS NObeyesdad
## Length:2111 Length:2111 Length:2111
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Avoid data leakage: the target NObeyesdad is derived directly from BMI, so Height and Weight are removed before modeling
data <- data[, !(names(data) %in% c("Height", "Weight"))]
names(data)
## [1] "Age" "Gender"
## [3] "CALC" "FAVC"
## [5] "FCVC" "NCP"
## [7] "SCC" "SMOKE"
## [9] "CH2O" "family_history_with_overweight"
## [11] "FAF" "TUE"
## [13] "CAEC" "MTRANS"
## [15] "NObeyesdad"
## This section provides an overview table of the categorical and numeric variables
# Identify categorical variables (character columns)
categorical_columns <- sapply(data, is.character)
# Identify numeric variables
numeric_columns <- sapply(data, is.numeric)
# Overview of the categorical variables and their response options
categorical_summary <- data.frame(
  Variable = names(data)[categorical_columns],
  Response_options = sapply(
    data[, categorical_columns],
    function(col) paste(unique(col), collapse = ", ")
  ),
  Response_counts = sapply(
    data[, categorical_columns],
    function(col) {
      # Build a frequency table and format it as text
      tab <- table(col)
      paste(names(tab), ": ", tab, sep = "", collapse = " | ")
    }
  )
)
print(categorical_summary)
## Variable
## Gender Gender
## CALC CALC
## FAVC FAVC
## SCC SCC
## SMOKE SMOKE
## family_history_with_overweight family_history_with_overweight
## CAEC CAEC
## MTRANS MTRANS
## NObeyesdad NObeyesdad
## Response_options
## Gender Female, Male
## CALC no, Sometimes, Frequently, Always
## FAVC no, yes
## SCC no, yes
## SMOKE no, yes
## family_history_with_overweight yes, no
## CAEC Sometimes, Frequently, Always, no
## MTRANS Public_Transportation, Walking, Automobile, Motorbike, Bike
## NObeyesdad Normal_Weight, Overweight_Level_I, Overweight_Level_II, Obesity_Type_I, Insufficient_Weight, Obesity_Type_II, Obesity_Type_III
## Response_counts
## Gender Female: 1043 | Male: 1068
## CALC Always: 1 | Frequently: 70 | no: 639 | Sometimes: 1401
## FAVC no: 245 | yes: 1866
## SCC no: 2015 | yes: 96
## SMOKE no: 2067 | yes: 44
## family_history_with_overweight no: 385 | yes: 1726
## CAEC Always: 53 | Frequently: 242 | no: 51 | Sometimes: 1765
## MTRANS Automobile: 457 | Bike: 7 | Motorbike: 11 | Public_Transportation: 1580 | Walking: 56
## NObeyesdad Insufficient_Weight: 272 | Normal_Weight: 287 | Obesity_Type_I: 351 | Obesity_Type_II: 297 | Obesity_Type_III: 324 | Overweight_Level_I: 290 | Overweight_Level_II: 290
# Overview of the numeric variables with min/max values and comments on their meaning
# (fractional values for ordinal items such as FCVC are presumably artifacts of the
# synthetically generated records in this dataset)
numeric_summary <- data.frame(
  Variable = names(data)[numeric_columns],
  Min = sapply(data[, numeric_columns], min),
  Max = sapply(data[, numeric_columns], max),
  Comment = ""
)
# Comment = substantive meaning of each numeric variable
numeric_summary$Comment[numeric_summary$Variable == "Age"] <- "Age in years"
numeric_summary$Comment[numeric_summary$Variable == "FCVC"] <- "Frequency of vegetable consumption"
numeric_summary$Comment[numeric_summary$Variable == "NCP"] <- "Number of main meals per day"
numeric_summary$Comment[numeric_summary$Variable == "CH2O"] <- "Daily water intake"
numeric_summary$Comment[numeric_summary$Variable == "FAF"] <- "Physical activity"
numeric_summary$Comment[numeric_summary$Variable == "TUE"] <- "Technology use"
print(numeric_summary)
## Variable Min Max Comment
## Age Age 14 61 Age in years
## FCVC FCVC 1 3 Frequency of vegetable consumption
## NCP NCP 1 4 Number of main meals per day
## CH2O CH2O 1 3 Daily water intake
## FAF FAF 0 3 Physical activity
## TUE TUE 0 2 Technology use
## Graphical display of the target variable and selected relationships
# First, order the classes of the target variable logically (underweight -> normal weight -> overweight -> obesity)
data$Gewichtsklasse <- factor(
data$NObeyesdad,
levels = c(
"Insufficient_Weight",
"Normal_Weight",
"Overweight_Level_I",
"Overweight_Level_II",
"Obesity_Type_I",
"Obesity_Type_II",
"Obesity_Type_III"
),
labels = c(
"Untergewicht",
"Normalgewicht",
"Übergewicht I",
"Übergewicht II",
"Adipositas I",
"Adipositas II",
"Adipositas III"
),
ordered = TRUE
)
# Distribution of the target variable as a bar chart
table(data$Gewichtsklasse)
##
## Untergewicht Normalgewicht Übergewicht I Übergewicht II Adipositas I
## 272 287 290 290 351
## Adipositas II Adipositas III
## 297 324
barplot(
table(data$Gewichtsklasse),
main = "Verteilung der Gewichtsklassen (NObeyesdad)",
xlab = "Gewichtsklasse",
ylab = "Anzahl Personen",
las = 2,
col = "lightblue",
cex.names = 0.5
)
# Histograms of the numeric variables
par(mfrow = c(2, 3)) # 2 rows, 3 columns
hist(data$Age,
     main = "Age",
     xlab = "Years",
     ylab = "Number of persons",
     col = "lightgray")
hist(data$FCVC,
     main = "Vegetable consumption (FCVC)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "lightgreen")
hist(data$NCP,
     main = "Main meals (NCP)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "salmon")
hist(data$FAF,
     main = "Physical activity (FAF)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "orange")
hist(data$TUE,
     main = "Technology use (TUE)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "purple")
hist(data$CH2O,
     main = "Water intake (CH2O)",
     xlab = "Value",
     ylab = "Number of persons",
     col = "lightyellow")
par(mfrow = c(1, 1))
# Initial look at the relationships between lifestyle variables and the target weight class
par(mfrow = c(1, 2))
# Family history of overweight vs. weight class
barplot(
  table(data$family_history_with_overweight, data$Gewichtsklasse),
  beside = TRUE,
  col = c("blue", "red"),
  main = "Family history of overweight\nand weight classes",
  xlab = "Weight class",
  ylab = "Number of persons",
  legend.text = c("No", "Yes"),
  args.legend = list(x = "topright", inset = 0.02),
  las = 2,
  cex.names = 0.5
)
# High-caloric food consumption (FAVC) vs. weight class
barplot(
  table(data$FAVC, data$Gewichtsklasse),
  beside = TRUE,
  col = c("lightgreen", "orange"),
  main = "High-caloric diet (FAVC)\nand weight classes",
  xlab = "Weight class",
  ylab = "Number of persons",
  legend.text = c("No", "Yes"),
  args.legend = list(x = "topright", inset = 0.02),
  las = 2,
  cex.names = 0.5
)
par(mfrow = c(1, 1))
## Logistic regression model
First, a logistic regression model is built for an explanatory analysis of which variables are associated with overweight. To this end, the seven weight classes are collapsed into a binary target variable and the categorical variables are converted to factors. The logistic regression comprises an initial model containing all potentially relevant variables and a subsequent, reduced model that keeps only the variables identified as significant. The models are evaluated with a confusion matrix, accuracy, precision, recall, and F1 score. A k-fold cross-validation is performed to assess the overall quality and generalizability of each model.
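For reference, the evaluation metrics named above are computed from the confusion-matrix counts TP, FP, FN, and TN exactly as in the code further below:

$$
\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN}, \qquad
\text{Precision} = \frac{TP}{TP + FP}
$$

$$
\text{Recall} = \frac{TP}{TP + FN}, \qquad
F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
$$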
library(boot)
## This section converts the target into a binary variable and sets the categorical variables as factors
# Create the binary target variable (0 = not overweight (underweight, normal weight); 1 = overweight (overweight + obesity))
data$Is_Overweight <- ifelse(
data$Gewichtsklasse %in% c(
"Übergewicht I",
"Übergewicht II",
"Adipositas I",
"Adipositas II",
"Adipositas III"
),
1, 0
)
# Encode the binary target as a labeled factor
data$Is_Overweight <- factor(
data$Is_Overweight,
levels = c(0, 1),
labels = c("Nein", "Ja")
)
# Check the class distribution
table(data$Is_Overweight)
##
## Nein Ja
## 559 1552
prop.table(table(data$Is_Overweight)) * 100
##
## Nein Ja
## 26.48034 73.51966
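Given this roughly 74/26 imbalance, a useful sanity check is the majority-class baseline that any classifier has to beat; the short sketch below is not part of the original pipeline and also motivates why precision, recall, and F1 are reported alongside accuracy.
# Minimal sketch: accuracy of always predicting the majority class "Ja"
baseline_acc <- max(prop.table(table(data$Is_Overweight)))
cat("Majority-class baseline accuracy:", round(baseline_acc, 3), "\n")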
# Data correction: CALC == "Always" occurs only once; if that single case falls into a CV test fold it causes an error, so "Always" is merged into "Frequently"
data$CALC <- as.character(data$CALC)
data$CALC[data$CALC == "Always"] <- "Frequently"
data$CALC <- as.factor(data$CALC)
# Set the categorical variables as factors
factor_cols <- c(
"Gender",
"family_history_with_overweight",
"FAVC",
"CAEC",
"SMOKE",
"SCC",
"CALC",
"MTRANS"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
str(data)
## 'data.frame': 2111 obs. of 17 variables:
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
## $ CALC : Factor w/ 3 levels "Frequently","no",..: 2 3 1 1 3 3 3 3 1 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
## $ Gewichtsklasse : Ord.factor w/ 7 levels "Untergewicht"<..: 2 2 2 3 4 2 2 2 2 2 ...
## $ Is_Overweight : Factor w/ 2 levels "Nein","Ja": 1 1 1 2 2 1 1 1 1 1 ...
# Split the dataset (80% training, 20% test)
set.seed(1)
trainIndex <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]
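The random split above does not stratify by the target. A hedged alternative, assuming the caret package (loaded later for the tree models anyway), preserves the Nein/Ja proportions in both partitions; the analysis below keeps the original split, so this is illustration only and the variable names are illustrative.
# Sketch of a stratified 80/20 split: createDataPartition samples within
# each class of Is_Overweight
library(caret)
set.seed(1)
strat_idx <- createDataPartition(data$Is_Overweight, p = 0.8, list = FALSE)
train_strat <- data[strat_idx, ]
test_strat <- data[-strat_idx, ]
prop.table(table(train_strat$Is_Overweight)) # ~ same shares as the full data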
## 1. Logistic regression with all variables
log_model <- glm(
Is_Overweight ~ Age + Gender + family_history_with_overweight +
FAVC + FCVC + NCP + CAEC + SMOKE + CH2O + FAF + TUE + CALC + MTRANS,
data = train_data,
family = binomial
)
summary(log_model)
##
## Call:
## glm(formula = Is_Overweight ~ Age + Gender + family_history_with_overweight +
## FAVC + FCVC + NCP + CAEC + SMOKE + CH2O + FAF + TUE + CALC +
## MTRANS, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.57719 1.01804 -7.443 9.85e-14 ***
## Age 0.21617 0.02107 10.260 < 2e-16 ***
## GenderMale 0.30913 0.18020 1.716 0.086251 .
## family_history_with_overweightyes 2.40519 0.19975 12.041 < 2e-16 ***
## FAVCyes 0.67015 0.23392 2.865 0.004172 **
## FCVC 0.07218 0.15817 0.456 0.648140
## NCP -0.37700 0.10801 -3.490 0.000482 ***
## CAECFrequently -0.91748 0.49702 -1.846 0.064899 .
## CAECno 2.63999 0.61905 4.265 2.00e-05 ***
## CAECSometimes 2.48091 0.44376 5.591 2.26e-08 ***
## SMOKEyes -0.67556 0.63138 -1.070 0.284634
## CH2O 0.27874 0.14652 1.902 0.057108 .
## FAF -0.39174 0.10072 -3.889 0.000101 ***
## TUE -0.21582 0.13505 -1.598 0.110025
## CALCno -1.23669 0.49511 -2.498 0.012497 *
## CALCSometimes -0.82540 0.48937 -1.687 0.091668 .
## MTRANSBike -0.70101 1.22465 -0.572 0.567037
## MTRANSMotorbike 1.42292 1.07205 1.327 0.184412
## MTRANSPublic_Transportation 1.28517 0.26316 4.884 1.04e-06 ***
## MTRANSWalking 0.16969 0.49549 0.342 0.732004
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1969.6 on 1687 degrees of freedom
## Residual deviance: 1020.1 on 1668 degrees of freedom
## AIC: 1060.1
##
## Number of Fisher Scoring iterations: 6
# Model prediction on the test data
prob <- predict(log_model, newdata = test_data, type = "response")
pred <- ifelse(prob >= 0.5, "Ja", "Nein")
# Confusion matrix and model accuracy
cm <- table(Predicted = pred, Actual = test_data$Is_Overweight)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(cm)
## Actual
## Predicted Nein Ja
## Ja 37 303
## Nein 66 17
cat("\n")
TP <- cm["Ja", "Ja"]
FP <- cm["Ja", "Nein"]
FN <- cm["Nein", "Ja"]
TN <- cm["Nein", "Nein"]
accuracy <- (TP + TN) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Modellgüte:\n")
## Modellgüte:
cat("Accuracy :", round(accuracy, 3), "\n")
## Accuracy : 0.872
cat("Precision :", round(precision, 3), "\n")
## Precision : 0.891
cat("Recall :", round(recall, 3), "\n")
## Recall : 0.947
cat("F1-Score :", round(f1_score, 3), "\n")
## F1-Score : 0.918
# 10-fold cross-validation of predictive performance
set.seed(1)
cv_results_full <- cv.glm(train_data, log_model, K = 10)
cv_error_full <- cv_results_full$delta[1]
cat("Average CV prediction error:", round(cv_error_full, 3), "\n")
## Average CV prediction error: 0.089
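Note that cv.glm's default cost is the average squared difference between the observed 0/1 response and the predicted probability, so the value above is a squared-error estimate rather than a share of misclassified cases. A sketch of the corresponding 10-fold misclassification rate, using the cost-function form from the cv.glm documentation:
# Misclassification cost: a case counts as an error when the predicted
# probability falls on the wrong side of the 0.5 threshold
cost_misclass <- function(r, pi = 0) mean(abs(r - pi) > 0.5)
set.seed(1)
cv_misclass <- cv.glm(train_data, log_model, cost = cost_misclass, K = 10)
cat("10-fold CV misclassification rate:", round(cv_misclass$delta[1], 3), "\n")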
## 2. Logistic regression with the significant variables
log_model_sig <- glm(
Is_Overweight ~ Age + family_history_with_overweight +
FAVC + NCP + CAEC + CH2O + FAF + CALC + MTRANS,
data = train_data,
family = binomial
)
summary(log_model_sig)
##
## Call:
## glm(formula = Is_Overweight ~ Age + family_history_with_overweight +
## FAVC + NCP + CAEC + CH2O + FAF + CALC + MTRANS, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.65173 0.94391 -8.106 5.21e-16 ***
## Age 0.21974 0.02082 10.554 < 2e-16 ***
## family_history_with_overweightyes 2.40562 0.19919 12.077 < 2e-16 ***
## FAVCyes 0.68993 0.23415 2.946 0.003214 **
## NCP -0.35668 0.10755 -3.316 0.000912 ***
## CAECFrequently -0.93797 0.49497 -1.895 0.058093 .
## CAECno 2.84668 0.61669 4.616 3.91e-06 ***
## CAECSometimes 2.49411 0.44271 5.634 1.76e-08 ***
## CH2O 0.32044 0.14330 2.236 0.025337 *
## FAF -0.38542 0.09914 -3.888 0.000101 ***
## CALCno -1.15811 0.48724 -2.377 0.017460 *
## CALCSometimes -0.74126 0.48287 -1.535 0.124759
## MTRANSBike -0.72172 1.25075 -0.577 0.563920
## MTRANSMotorbike 1.56284 1.06390 1.469 0.141842
## MTRANSPublic_Transportation 1.15540 0.25450 4.540 5.63e-06 ***
## MTRANSWalking 0.06606 0.48390 0.137 0.891416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1969.6 on 1687 degrees of freedom
## Residual deviance: 1026.9 on 1672 degrees of freedom
## AIC: 1058.9
##
## Number of Fisher Scoring iterations: 6
# Prediction of the reduced model
prob <- predict(log_model_sig, newdata = test_data, type = "response")
pred <- ifelse(prob >= 0.5, "Ja", "Nein")
# Confusion matrix and accuracy of the reduced model
cm <- table(Predicted = pred, Actual = test_data$Is_Overweight)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(cm)
## Actual
## Predicted Nein Ja
## Ja 36 302
## Nein 67 18
cat("\n")
TP <- cm["Ja", "Ja"]
FP <- cm["Ja", "Nein"]
FN <- cm["Nein", "Ja"]
TN <- cm["Nein", "Nein"]
accuracy <- (TP + TN) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Modellgüte:\n")
## Modellgüte:
cat("Accuracy :", round(accuracy, 3), "\n")
## Accuracy : 0.872
cat("Precision :", round(precision, 3), "\n")
## Precision : 0.893
cat("Recall :", round(recall, 3), "\n")
## Recall : 0.944
cat("F1-Score :", round(f1_score, 3), "\n")
## F1-Score : 0.918
# 10-fold cross-validation of predictive performance
set.seed(1)
cv_results_reduced <- cv.glm(train_data, log_model_sig, K = 10)
cv_error_reduced <- cv_results_reduced$delta[1]
cat("Average CV prediction error:", round(cv_error_reduced, 3), "\n")
## Average CV prediction error: 0.09
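Both models thus perform almost identically out of sample. Since the reduced model is nested in the full one, a likelihood-ratio test is a natural complement to the AIC comparison (1058.9 vs. 1060.1); a minimal sketch:
# LRT of the nested models: a non-significant p-value indicates the dropped
# predictors (Gender, FCVC, SMOKE, TUE) add no measurable fit
anova(log_model_sig, log_model, test = "Chisq")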
The construction of a decision tree …
## This section loads the required libraries, sets the target variable as a factor, and splits the dataset
# Load the required libraries
library(tree)
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
# Set the target variable as a factor
data$NObeyesdad <- as.factor(data$NObeyesdad)
# Split the dataset into training and test data
set.seed(1)
trainIndex <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]
cat("Daten erfolgreich geteilt:\n")
## Daten erfolgreich geteilt:
cat("Trainingsdaten:", nrow(train_data), "Zeilen\n")
## Trainingsdaten: 1688 Zeilen
cat("Testdaten: ", nrow(test_data), "Zeilen\n\n")
## Testdaten: 423 Zeilen
## 1. Initial unpruned decision tree
# Fit the model
tree_model <- tree(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data
)
summary(tree_model)
##
## Classification tree:
## tree(formula = NObeyesdad ~ . - Is_Overweight - Gewichtsklasse,
## data = train_data)
## Variables actually used in tree construction:
## [1] "FCVC" "Age"
## [3] "family_history_with_overweight" "NCP"
## [5] "FAVC" "CALC"
## [7] "Gender" "CAEC"
## [9] "MTRANS"
## Number of terminal nodes: 19
## Residual mean deviance: 2.012 = 3357 / 1669
## Misclassification error rate: 0.4082 = 689 / 1688
# Visualize the decision tree
plot(tree_model)
text(tree_model, pretty = 0, cex = 0.7)
title("Unbeschnittener Entscheidungsbaum")
# Evaluate the decision tree with a confusion matrix
tree_pred <- predict(tree_model, newdata = test_data, type = "class")
cm_dec_tree <- confusionMatrix(tree_pred, test_data$NObeyesdad)
print(cm_dec_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 22 1 0
## Normal_Weight 21 24 5
## Obesity_Type_I 9 11 40
## Obesity_Type_II 0 0 6
## Obesity_Type_III 0 1 1
## Overweight_Level_I 1 0 1
## Overweight_Level_II 0 13 18
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 3
## Normal_Weight 7 1 11
## Obesity_Type_I 4 0 27
## Obesity_Type_II 44 0 5
## Obesity_Type_III 0 82 1
## Overweight_Level_I 1 0 6
## Overweight_Level_II 1 0 5
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 1
## Normal_Weight 2
## Obesity_Type_I 11
## Obesity_Type_II 10
## Obesity_Type_III 0
## Overweight_Level_I 0
## Overweight_Level_II 27
##
## Overall Statistics
##
## Accuracy : 0.5792
## 95% CI : (0.5306, 0.6267)
## No Information Rate : 0.1962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5052
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.41509 0.48000
## Specificity 0.98649 0.87399
## Pos Pred Value 0.81481 0.33803
## Neg Pred Value 0.92172 0.92614
## Prevalence 0.12530 0.11820
## Detection Rate 0.05201 0.05674
## Detection Prevalence 0.06383 0.16785
## Balanced Accuracy 0.70079 0.67700
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.56338 0.7719
## Specificity 0.82386 0.9426
## Pos Pred Value 0.39216 0.6769
## Neg Pred Value 0.90343 0.9637
## Prevalence 0.16785 0.1348
## Detection Rate 0.09456 0.1040
## Detection Prevalence 0.24113 0.1537
## Balanced Accuracy 0.69362 0.8573
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 0.9880 0.10345
## Specificity 0.9912 0.99178
## Pos Pred Value 0.9647 0.66667
## Neg Pred Value 0.9970 0.87440
## Prevalence 0.1962 0.13712
## Detection Rate 0.1939 0.01418
## Detection Prevalence 0.2009 0.02128
## Balanced Accuracy 0.9896 0.54761
## Class: Overweight_Level_II
## Sensitivity 0.52941
## Specificity 0.90054
## Pos Pred Value 0.42187
## Neg Pred Value 0.93315
## Prevalence 0.12057
## Detection Rate 0.06383
## Detection Prevalence 0.15130
## Balanced Accuracy 0.71497
# Print the model's overall accuracy
overall_acc <- as.numeric(cm_dec_tree$overall["Accuracy"])
cat("Overall Accuracy (test data): ",
    round(overall_acc * 100, 3), " %\n", sep = "")
## Overall Accuracy (test data): 57.92 %
# Per-class accuracy metrics
by_class <- cm_dec_tree$byClass
class_metrics <- by_class[, c("Pos Pred Value", "Sensitivity", "Balanced Accuracy")]
class_metrics <- round(class_metrics, 3)
print(class_metrics)
## Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight 0.815 0.415 0.701
## Class: Normal_Weight 0.338 0.480 0.677
## Class: Obesity_Type_I 0.392 0.563 0.694
## Class: Obesity_Type_II 0.677 0.772 0.857
## Class: Obesity_Type_III 0.965 0.988 0.990
## Class: Overweight_Level_I 0.667 0.103 0.548
## Class: Overweight_Level_II 0.422 0.529 0.715
# Compute F1 scores and print the extended table
precision <- class_metrics[, "Pos Pred Value"]
recall <- class_metrics[, "Sensitivity"]
f1_scores_dec_tree <- 2 * (precision * recall) / (precision + recall)
class_metrics <- cbind(class_metrics, "F1-Score" = f1_scores_dec_tree)
class_metrics <- round(class_metrics, 3)
print(class_metrics)
## Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight 0.815 0.415 0.701
## Class: Normal_Weight 0.338 0.480 0.677
## Class: Obesity_Type_I 0.392 0.563 0.694
## Class: Obesity_Type_II 0.677 0.772 0.857
## Class: Obesity_Type_III 0.965 0.988 0.990
## Class: Overweight_Level_I 0.667 0.103 0.548
## Class: Overweight_Level_II 0.422 0.529 0.715
## F1-Score
## Class: Insufficient_Weight 0.550
## Class: Normal_Weight 0.397
## Class: Obesity_Type_I 0.462
## Class: Obesity_Type_II 0.721
## Class: Obesity_Type_III 0.976
## Class: Overweight_Level_I 0.178
## Class: Overweight_Level_II 0.469
# Compute and print the mean F1 score
mean_f1_dec_tree <- mean(f1_scores_dec_tree)
cat("\nMean F1 score (decision tree):", round(mean_f1_dec_tree, 3), "\n")
##
## Mean F1 score (decision tree): 0.536
# Compare training and test accuracy to check for overfitting
tree_pred_train <- predict(tree_model, newdata = train_data, type = "class")
cm_train_tree <- confusionMatrix(tree_pred_train, train_data$NObeyesdad)
train_accuracy <- as.numeric(cm_train_tree$overall["Accuracy"])
cat("Trainingsgenauigkeit (unbeschnittener Baum): ",
round(train_accuracy * 100, 3), " %\n", sep = "")
## Trainingsgenauigkeit (unbeschnittener Baum): 59.182 %
cat("Testgenauigkeit (unbeschnittener Baum): ",
round(overall_acc * 100, 3), " %\n", sep = "")
## Testgenauigkeit (unbeschnittener Baum): 57.92 %
## 2. Pruned decision tree
# Cross-validation to find the optimal tree size
cv_result <- cv.tree(tree_model, FUN = prune.misclass)
print(cv_result)
## $size
## [1] 19 18 17 16 14 13 12 10 9 7 6 5 4 3 2 1
##
## $dev
## [1] 713 714 714 730 730 765 819 843 880 886 957 1010 1021 1128 1181
## [16] 1408
##
## $k
## [1] -Inf 4.0 6.0 9.0 9.5 16.0 19.0 21.0 25.0 26.0 39.0 47.0
## [13] 48.0 80.0 92.0 221.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot: tree size vs. CV error
plot(cv_result$size, cv_result$dev, type = "b",
     xlab = "Tree size (number of terminal nodes)",
     ylab = "CV misclassifications",
     main = "Cross-validation for tree size selection")
grid()
# Optimal tree size (minimum of the CV errors)
optimal_size <- cv_result$size[which.min(cv_result$dev)]
cat("Optimal tree size from cross-validation: ", optimal_size, "\n", sep = "")
## Optimal tree size from cross-validation: 19
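Because the CV-optimal size of 19 equals the number of terminal nodes of the unpruned tree, prune.misclass(tree_model, best = 19) returns the tree unchanged, which is why all results in this subsection match the unpruned tree exactly. A hedged alternative, an assumption on our part rather than part of the original analysis, trades a little CV error for a simpler tree:
# Sketch: pick the smallest tree whose CV error is within ~2% of the minimum
tol <- 0.02 * min(cv_result$dev)
size_parsimonious <- min(cv_result$size[cv_result$dev <= min(cv_result$dev) + tol])
cat("Parsimonious tree size:", size_parsimonious, "\n")
pruned_alt <- prune.misclass(tree_model, best = size_parsimonious)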
# Prune the tree
pruned_tree <- prune.misclass(tree_model, best = optimal_size)
plot(pruned_tree)
text(pruned_tree, pretty = 0, cex = 0.7)
title(paste("Pruned decision tree (size =", optimal_size, ")"))
# Evaluate the pruned decision tree with a confusion matrix
pruned_pred_test <- predict(pruned_tree, newdata = test_data, type = "class")
cm_pruned <- confusionMatrix(pruned_pred_test, test_data$NObeyesdad)
print(cm_pruned)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 22 1 0
## Normal_Weight 21 24 5
## Obesity_Type_I 9 11 40
## Obesity_Type_II 0 0 6
## Obesity_Type_III 0 1 1
## Overweight_Level_I 1 0 1
## Overweight_Level_II 0 13 18
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 3
## Normal_Weight 7 1 11
## Obesity_Type_I 4 0 27
## Obesity_Type_II 44 0 5
## Obesity_Type_III 0 82 1
## Overweight_Level_I 1 0 6
## Overweight_Level_II 1 0 5
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 1
## Normal_Weight 2
## Obesity_Type_I 11
## Obesity_Type_II 10
## Obesity_Type_III 0
## Overweight_Level_I 0
## Overweight_Level_II 27
##
## Overall Statistics
##
## Accuracy : 0.5792
## 95% CI : (0.5306, 0.6267)
## No Information Rate : 0.1962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5052
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.41509 0.48000
## Specificity 0.98649 0.87399
## Pos Pred Value 0.81481 0.33803
## Neg Pred Value 0.92172 0.92614
## Prevalence 0.12530 0.11820
## Detection Rate 0.05201 0.05674
## Detection Prevalence 0.06383 0.16785
## Balanced Accuracy 0.70079 0.67700
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.56338 0.7719
## Specificity 0.82386 0.9426
## Pos Pred Value 0.39216 0.6769
## Neg Pred Value 0.90343 0.9637
## Prevalence 0.16785 0.1348
## Detection Rate 0.09456 0.1040
## Detection Prevalence 0.24113 0.1537
## Balanced Accuracy 0.69362 0.8573
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 0.9880 0.10345
## Specificity 0.9912 0.99178
## Pos Pred Value 0.9647 0.66667
## Neg Pred Value 0.9970 0.87440
## Prevalence 0.1962 0.13712
## Detection Rate 0.1939 0.01418
## Detection Prevalence 0.2009 0.02128
## Balanced Accuracy 0.9896 0.54761
## Class: Overweight_Level_II
## Sensitivity 0.52941
## Specificity 0.90054
## Pos Pred Value 0.42187
## Neg Pred Value 0.93315
## Prevalence 0.12057
## Detection Rate 0.06383
## Detection Prevalence 0.15130
## Balanced Accuracy 0.71497
# Print the pruned model's overall accuracy
overall_acc <- as.numeric(cm_pruned$overall["Accuracy"])
cat("Overall Accuracy (test data): ",
    round(overall_acc * 100, 3), " %\n", sep = "")
## Overall Accuracy (test data): 57.92 %
# Per-class accuracy metrics
by_class <- cm_pruned$byClass
class_metrics <- by_class[, c("Pos Pred Value", "Sensitivity", "Balanced Accuracy")]
class_metrics <- round(class_metrics, 3)
print(class_metrics)
## Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight 0.815 0.415 0.701
## Class: Normal_Weight 0.338 0.480 0.677
## Class: Obesity_Type_I 0.392 0.563 0.694
## Class: Obesity_Type_II 0.677 0.772 0.857
## Class: Obesity_Type_III 0.965 0.988 0.990
## Class: Overweight_Level_I 0.667 0.103 0.548
## Class: Overweight_Level_II 0.422 0.529 0.715
# Compute F1 scores and print the extended table
precision <- class_metrics[, "Pos Pred Value"]
recall <- class_metrics[, "Sensitivity"]
f1_scores_pruned <- 2 * (precision * recall) / (precision + recall)
class_metrics <- cbind(class_metrics, "F1-Score" = f1_scores_pruned)
class_metrics <- round(class_metrics, 3)
print(class_metrics)
## Pos Pred Value Sensitivity Balanced Accuracy
## Class: Insufficient_Weight 0.815 0.415 0.701
## Class: Normal_Weight 0.338 0.480 0.677
## Class: Obesity_Type_I 0.392 0.563 0.694
## Class: Obesity_Type_II 0.677 0.772 0.857
## Class: Obesity_Type_III 0.965 0.988 0.990
## Class: Overweight_Level_I 0.667 0.103 0.548
## Class: Overweight_Level_II 0.422 0.529 0.715
## F1-Score
## Class: Insufficient_Weight 0.550
## Class: Normal_Weight 0.397
## Class: Obesity_Type_I 0.462
## Class: Obesity_Type_II 0.721
## Class: Obesity_Type_III 0.976
## Class: Overweight_Level_I 0.178
## Class: Overweight_Level_II 0.469
# Compute and print the mean F1 score
mean_f1_pruned <- mean(f1_scores_pruned)
cat("\nMean F1 score (pruned tree):", round(mean_f1_pruned, 3), "\n")
##
## Mean F1 score (pruned tree): 0.536
In this section, …
## This section loads the required libraries and sets the target variable as a factor
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
test_data$NObeyesdad <- as.factor(test_data$NObeyesdad)
## 1. This section builds and evaluates an initial random forest model
# Fit the random forest model
set.seed(1)
rf_base <- randomForest(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data,
ntree = 500,
importance = TRUE
)
print(rf_base)
##
## Call:
## randomForest(formula = NObeyesdad ~ . - Is_Overweight - Gewichtsklasse, data = train_data, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.98%
## Confusion matrix:
## Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 195 21 1
## Normal_Weight 9 191 10
## Obesity_Type_I 0 15 241
## Obesity_Type_II 0 3 2
## Obesity_Type_III 0 0 0
## Overweight_Level_I 4 28 15
## Overweight_Level_II 1 24 19
## Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 2
## Normal_Weight 4 0 12
## Obesity_Type_I 8 2 6
## Obesity_Type_II 230 0 1
## Obesity_Type_III 0 240 0
## Overweight_Level_I 3 1 178
## Overweight_Level_II 14 0 4
## Overweight_Level_II class.error
## Insufficient_Weight 0 0.109589041
## Normal_Weight 11 0.194092827
## Obesity_Type_I 8 0.139285714
## Obesity_Type_II 4 0.041666667
## Obesity_Type_III 1 0.004149378
## Overweight_Level_I 3 0.232758621
## Overweight_Level_II 177 0.259414226
# Confusion matrix and accuracy evaluation on the test data
rf_pred_test_base <- predict(rf_base, newdata = test_data, type = "class")
cm_rf_base <- confusionMatrix(rf_pred_test_base, test_data$NObeyesdad)
print(cm_rf_base)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 49 1 0
## Normal_Weight 4 42 3
## Obesity_Type_I 0 2 64
## Obesity_Type_II 0 1 3
## Obesity_Type_III 0 0 0
## Overweight_Level_I 0 4 1
## Overweight_Level_II 0 0 0
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 1
## Normal_Weight 2 1 7
## Obesity_Type_I 0 0 3
## Obesity_Type_II 51 0 2
## Obesity_Type_III 1 82 0
## Overweight_Level_I 0 0 42
## Overweight_Level_II 3 0 3
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 3
## Obesity_Type_I 5
## Obesity_Type_II 3
## Obesity_Type_III 0
## Overweight_Level_I 0
## Overweight_Level_II 40
##
## Overall Statistics
##
## Accuracy : 0.8747
## 95% CI : (0.8393, 0.9047)
## No Information Rate : 0.1962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.853
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9245 0.84000
## Specificity 0.9946 0.94638
## Pos Pred Value 0.9608 0.67742
## Neg Pred Value 0.9892 0.97784
## Prevalence 0.1253 0.11820
## Detection Rate 0.1158 0.09929
## Detection Prevalence 0.1206 0.14657
## Balanced Accuracy 0.9596 0.89319
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9014 0.8947
## Specificity 0.9716 0.9754
## Pos Pred Value 0.8649 0.8500
## Neg Pred Value 0.9799 0.9835
## Prevalence 0.1678 0.1348
## Detection Rate 0.1513 0.1206
## Detection Prevalence 0.1749 0.1418
## Balanced Accuracy 0.9365 0.9351
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 0.9880 0.72414
## Specificity 0.9971 0.98630
## Pos Pred Value 0.9880 0.89362
## Neg Pred Value 0.9971 0.95745
## Prevalence 0.1962 0.13712
## Detection Rate 0.1939 0.09929
## Detection Prevalence 0.1962 0.11111
## Balanced Accuracy 0.9925 0.85522
## Class: Overweight_Level_II
## Sensitivity 0.78431
## Specificity 0.98387
## Pos Pred Value 0.86957
## Neg Pred Value 0.97082
## Prevalence 0.12057
## Detection Rate 0.09456
## Detection Prevalence 0.10875
## Balanced Accuracy 0.88409
overall_acc_rf_base <- as.numeric(cm_rf_base$overall["Accuracy"])
cat("Overall Accuracy (RF Baseline, Test): ",
round(overall_acc_rf_base * 100, 2), " %\n", sep = "")
## Overall Accuracy (RF Baseline, Test): 87.47 %
rf_class_metrics_base <- round(
cm_rf_base$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")],
3
)
print(rf_class_metrics_base)
## Sensitivity Specificity Balanced Accuracy
## Class: Insufficient_Weight 0.925 0.995 0.960
## Class: Normal_Weight 0.840 0.946 0.893
## Class: Obesity_Type_I 0.901 0.972 0.936
## Class: Obesity_Type_II 0.895 0.975 0.935
## Class: Obesity_Type_III 0.988 0.997 0.993
## Class: Overweight_Level_I 0.724 0.986 0.855
## Class: Overweight_Level_II 0.784 0.984 0.884
## 2. This section builds the optimized random forest model via hyperparameter tuning
set.seed(1)
train_data$NObeyesdad <- as.factor(train_data$NObeyesdad)
test_data$NObeyesdad <- as.factor(test_data$NObeyesdad)
# Cross-validation setup
train_control <- trainControl(
method = "cv",
number = 5
)
# Determine the number of predictors p (excluding the target variables)
p <- ncol(train_data[, !(names(train_data) %in%
                           c("NObeyesdad", "Is_Overweight", "Gewichtsklasse"))])
default_mtry <- floor(sqrt(p))  # floor(sqrt(p)) is randomForest's default mtry for classification
cat("p =", p, "| Default mtry =", default_mtry, "\n")
## p = 14 | Default mtry = 3
# Hyperparameter tuning: mtry (5-fold CV)
tune_grid <- expand.grid(
mtry = seq(default_mtry - 1, default_mtry + 5, by = 1)
)
rf_cv <- train(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "Accuracy",
ntree = 500
)
print(rf_cv)
## Random Forest
##
## 1688 samples
## 16 predictor
## 7 classes: 'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1352, 1349, 1350, 1351, 1350
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7517567 0.7098946
## 3 0.8204927 0.7903290
## 4 0.8365007 0.8090327
## 5 0.8471675 0.8215118
## 6 0.8448165 0.8187502
## 7 0.8477593 0.8222016
## 8 0.8442160 0.8180558
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
best_mtry <- rf_cv$bestTune$mtry
mtry_final <- 5  # chosen instead of best_mtry = 7: the CV error plateaus from mtry = 5 onward, so the simpler setting is preferred
cat("Selected mtry:", mtry_final, "\n")
## Selected mtry: 5
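As a cross-check of this choice, the OOB error can also drive the mtry search directly; a sketch with randomForest::tuneRF, where the tuning parameters (ntreeTry, stepFactor, improve) are illustrative assumptions:
# OOB-based mtry search: starts at the default value and scales it by
# stepFactor as long as the OOB error improves by at least `improve`
predictors <- train_data[, !(names(train_data) %in%
                               c("NObeyesdad", "Is_Overweight", "Gewichtsklasse"))]
set.seed(1)
tune_oob <- tuneRF(predictors, train_data$NObeyesdad,
                   mtryStart = default_mtry, ntreeTry = 300,
                   stepFactor = 1.5, improve = 0.01)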
# Plot CV error vs. mtry
cv_results <- rf_cv$results
cv_results$CV_Error <- 1 - cv_results$Accuracy
plot(
  cv_results$mtry,
  cv_results$CV_Error,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "Number of randomly selected predictors (mtry)",
  ylab = "Cross-validation error",
  main = "CV error as a function of mtry"
)
# Analyze the number of trees (ntree) at fixed mtry (note: this evaluates against the test set, which risks an optimistic choice; the OOB curves below are the cleaner criterion)
ntree_values <- c(100, 300, 500, 800)
test_accuracy <- numeric(length(ntree_values))
set.seed(1)
for (i in seq_along(ntree_values)) {
rf_tmp <- randomForest(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data,
mtry = mtry_final,
ntree = ntree_values[i]
)
preds <- predict(rf_tmp, newdata = test_data)
test_accuracy[i] <- mean(preds == test_data$NObeyesdad)
}
ntree_results <- data.frame(
ntree = ntree_values,
Test_Accuracy = test_accuracy
)
ntree_results$Test_Error <- 1 - ntree_results$Test_Accuracy
print(ntree_results)
## ntree Test_Accuracy Test_Error
## 1 100 0.8723404 0.1276596
## 2 300 0.8794326 0.1205674
## 3 500 0.8841608 0.1158392
## 4 800 0.8841608 0.1158392
# Plot test error vs. number of trees
plot(
  ntree_results$ntree,
  ntree_results$Test_Error,
  type = "b",
  pch = 19,
  col = "darkgreen",
  xlab = "Number of trees (ntree)",
  ylab = "Test error",
  main = "Test error as a function of the number of trees"
)
grid()
# Analysis: OOB error over ntree for different mtry values
mtry_values <- c(3, 5, 6, 7, 9)
ntree_max <- 500
set.seed(1)
rf_models <- lapply(mtry_values, function(m) {
randomForest(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data,
mtry = m,
ntree = ntree_max
)
})
names(rf_models) <- paste0("m=", mtry_values)
err_curves <- lapply(rf_models, function(rf) {
rf$err.rate[, "OOB"]
})
plot(
  1:ntree_max,
  err_curves[[1]],
  type = "l",
  col = "red",
  ylim = range(unlist(err_curves)),
  xlab = "Number of trees (ntree)",
  ylab = "OOB misclassification rate",
  main = "Random forest: OOB error vs. number of trees for different mtry"
)
cols <- c("red", "orange", "green", "blue", "cyan")
for (i in 2:length(err_curves)) {
lines(1:ntree_max, err_curves[[i]], col = cols[i])
}
legend(
"topright",
legend = names(err_curves),
col = cols,
lty = 1,
bty = "n"
)
# Final random forest model
set.seed(1)
rf_final <- randomForest(
NObeyesdad ~ . -Is_Overweight -Gewichtsklasse,
data = train_data,
mtry = mtry_final,
ntree = 500,
importance = TRUE
)
# Confusion matrix and accuracy of the final random forest model
rf_pred_test <- predict(rf_final, newdata = test_data)
cm_rf_final <- confusionMatrix(rf_pred_test, test_data$NObeyesdad)
cat("\nRandom forest after hyperparameter tuning:\n")
##
## Random forest after hyperparameter tuning:
cat("\nConfusion Matrix:\n")
##
## Confusion Matrix:
print(cm_rf_final)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Obesity_Type_I
## Insufficient_Weight 49 0 0
## Normal_Weight 4 40 2
## Obesity_Type_I 0 4 67
## Obesity_Type_II 0 1 1
## Obesity_Type_III 0 0 0
## Overweight_Level_I 0 3 1
## Overweight_Level_II 0 2 0
## Reference
## Prediction Obesity_Type_II Obesity_Type_III Overweight_Level_I
## Insufficient_Weight 0 0 1
## Normal_Weight 2 1 7
## Obesity_Type_I 0 0 2
## Obesity_Type_II 52 0 2
## Obesity_Type_III 1 82 0
## Overweight_Level_I 0 0 44
## Overweight_Level_II 2 0 2
## Reference
## Prediction Overweight_Level_II
## Insufficient_Weight 0
## Normal_Weight 3
## Obesity_Type_I 6
## Obesity_Type_II 3
## Obesity_Type_III 0
## Overweight_Level_I 0
## Overweight_Level_II 39
##
## Overall Statistics
##
## Accuracy : 0.8818
## 95% CI : (0.8471, 0.911)
## No Information Rate : 0.1962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8612
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9245 0.80000
## Specificity 0.9973 0.94906
## Pos Pred Value 0.9800 0.67797
## Neg Pred Value 0.9893 0.97253
## Prevalence 0.1253 0.11820
## Detection Rate 0.1158 0.09456
## Detection Prevalence 0.1182 0.13948
## Balanced Accuracy 0.9609 0.87453
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9437 0.9123
## Specificity 0.9659 0.9809
## Pos Pred Value 0.8481 0.8814
## Neg Pred Value 0.9884 0.9863
## Prevalence 0.1678 0.1348
## Detection Rate 0.1584 0.1229
## Detection Prevalence 0.1868 0.1395
## Balanced Accuracy 0.9548 0.9466
## Class: Obesity_Type_III Class: Overweight_Level_I
## Sensitivity 0.9880 0.7586
## Specificity 0.9971 0.9890
## Pos Pred Value 0.9880 0.9167
## Neg Pred Value 0.9971 0.9627
## Prevalence 0.1962 0.1371
## Detection Rate 0.1939 0.1040
## Detection Prevalence 0.1962 0.1135
## Balanced Accuracy 0.9925 0.8738
## Class: Overweight_Level_II
## Sensitivity 0.7647
## Specificity 0.9839
## Pos Pred Value 0.8667
## Neg Pred Value 0.9683
## Prevalence 0.1206
## Detection Rate 0.0922
## Detection Prevalence 0.1064
## Balanced Accuracy 0.8743
cat("\nGesamtmetriken:\n")
##
## Gesamtmetriken:
cat("Accuracy: ", round(cm_rf_final$overall["Accuracy"] * 100, 2), " %\n", sep = "")
## Accuracy: 88.18 %
## Comparison of the initial random forest with the hyperparameter-tuned random forest
# Precision, recall, and F1 per class for the initial RF
byc <- cm_rf_base$byClass
if (is.matrix(byc)) {
  metrics <- byc[, c("Precision", "Recall", "F1")]
  metrics <- round(metrics * 100, 2)
  print(metrics)
  f1_score_rf_base <- mean(byc[, "F1"], na.rm = TRUE)
  cat("Mean F1: ", round(f1_score_rf_base * 100, 2), " %\n", sep = "")
}
## Precision Recall F1
## Class: Insufficient_Weight 96.08 92.45 94.23
## Class: Normal_Weight 67.74 84.00 75.00
## Class: Obesity_Type_I 86.49 90.14 88.28
## Class: Obesity_Type_II 85.00 89.47 87.18
## Class: Obesity_Type_III 98.80 98.80 98.80
## Class: Overweight_Level_I 89.36 72.41 80.00
## Class: Overweight_Level_II 86.96 78.43 82.47
## Mean F1: 86.57 %
# Precision, recall, and F1 per class for the tuned RF
byc <- cm_rf_final$byClass
if (is.matrix(byc)) {
  metrics <- byc[, c("Precision", "Recall", "F1")]
  metrics <- round(metrics * 100, 2)
  print(metrics)
  f1_score_rf_final <- mean(byc[, "F1"], na.rm = TRUE)
  cat("Mean F1: ", round(f1_score_rf_final * 100, 2), " %\n", sep = "")
}
## Precision Recall F1
## Class: Insufficient_Weight 98.00 92.45 95.15
## Class: Normal_Weight 67.80 80.00 73.39
## Class: Obesity_Type_I 84.81 94.37 89.33
## Class: Obesity_Type_II 88.14 91.23 89.66
## Class: Obesity_Type_III 98.80 98.80 98.80
## Class: Overweight_Level_I 91.67 75.86 83.02
## Class: Overweight_Level_II 86.67 76.47 81.25
## Mean F1: 87.23 %
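To condense the comparison, the overall metrics of both forests can be placed side by side; a small convenience sketch using the objects defined above (the mtry values are taken from the model printouts):
# Side-by-side summary of the baseline and the tuned random forest
rf_comparison <- data.frame(
  Model = c("RF baseline (mtry = 3)", "RF tuned (mtry = 5)"),
  Accuracy = round(100 * c(overall_acc_rf_base,
                           as.numeric(cm_rf_final$overall["Accuracy"])), 2),
  Mean_F1 = round(100 * c(f1_score_rf_base, f1_score_rf_final), 2)
)
print(rf_comparison)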
## Importance of the individual variables
# Plot 1: mean decrease in accuracy
varImpPlot(rf_final,
           type = 1,
           main = "Variable importance (mean decrease in accuracy)",
           col = "blue",
           pch = 19)
# Plot 2: mean decrease in Gini index
varImpPlot(rf_final,
           type = 2,
           main = "Variable importance (mean decrease in Gini)",
           col = "red",
           pch = 19)
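The two plots can be complemented by printing the importance scores numerically; a minimal sketch ranking the predictors by mean decrease in accuracy:
# Extract permutation importance (type = 1) and list the strongest predictors
imp <- importance(rf_final, type = 1)
imp_ranked <- imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE]
print(round(head(imp_ranked, 5), 2))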