R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

# Example chunk: summarise the `cars` data set; the rendered output
# (columns `speed` and `dist`) is shown on the `##` lines below.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Packages used below: caTools supplies sample.split(); ROCR supplies
# prediction() and performance().
library(caTools)
library(ROCR)

=========================

📊 ΦΟΡΤΩΣΗ DATASET

=========================

# Load the ticket-price data set from the working directory.
# The file name must be in plain ASCII double quotes; typographic quotes
# do not parse as an R string.
data <- read.csv("airline_ticket_prices_dataset.csv")

=========================

🔧 DATA PREPARATION

=========================

Μετατροπή categorical σε factors

# Convert the categorical columns to factors so that glm() treats them as
# categorical predictors rather than plain strings.
categorical_cols <- c("Airline", "Origin", "Destination", "Class")
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)

Δημιουργία binary μεταβλητής (target)

# Binary target: 1 when the ticket price is above the median price, else 0.
# na.rm = TRUE keeps the threshold defined when some prices are missing;
# without it a single NA would make the median, and so every label, NA.
# Rows whose own price is NA still get an NA label.
median_price <- median(data$Price_USD, na.rm = TRUE)

data$HighPrice <- ifelse(data$Price_USD > median_price, 1, 0)

=========================

🔀 TRAIN / TEST SPLIT

=========================

# Fix the RNG seed so the split is reproducible.
set.seed(923) # replace with your own last 2 digits

# Logical vector marking the rows assigned to the training set
# (SplitRatio = 0.65, split on the HighPrice label).
split <- sample.split(data$HighPrice, SplitRatio = 0.65)

train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)

# Report the partition sizes, one per line.
cat("Train size:", nrow(train), "\n")
cat("Test size:", nrow(test), "\n")

=========================

📈 LOGISTIC MODEL

=========================

# Logistic regression of the HighPrice label on route, carrier, cabin class,
# distance and booking lead time, fitted on the training partition only.
model <- glm(
  formula = HighPrice ~ Airline + Origin + Destination + Distance_km +
    Class + Days_Before_Departure,
  family = binomial,
  data = train
)

# Coefficient table and fit statistics.
summary(model)

=========================

🔮 PREDICTIONS

=========================

# Predicted probabilities for the held-out rows. type = "response" returns
# values on the probability scale instead of the default link (log-odds)
# scale.
predictTest <- predict(model, newdata = test, type = "response")

# Quick look at the first few predicted probabilities.
head(predictTest)

=========================

🔢 CONFUSION MATRIX

=========================

# Turn probabilities into class labels using a 0.5 cut-off.
predictClass <- ifelse(predictTest > 0.5, 1, 0)

# Confusion matrix: rows are predicted classes, columns are actual classes.
# Both factors carry explicit levels 0 and 1, so the table is always 2 x 2,
# even if the model predicts only one class; otherwise a missing row would
# make the cm[2, 2]-style lookups further down fail.
cm <- table(
  Predicted = factor(predictClass, levels = c(0, 1)),
  Actual = factor(test$HighPrice, levels = c(0, 1))
)
print(cm)

=========================

📏 METRICS

=========================

# Cells of the confusion matrix (rows = Predicted, columns = Actual),
# addressed by class label rather than by position.
true_neg <- cm["0", "0"]
false_pos <- cm["1", "0"]
false_neg <- cm["0", "1"]
true_pos <- cm["1", "1"]

# Share of all test rows classified correctly.
accuracy <- (true_neg + true_pos) / sum(cm)

# True-positive rate: correctly flagged high-price rows / all actual 1s.
sensitivity <- true_pos / (true_pos + false_neg)

# True-negative rate: correctly flagged low-price rows / all actual 0s.
specificity <- true_neg / (true_neg + false_pos)

cat("Accuracy:", accuracy, "\n")
cat("Sensitivity:", sensitivity, "\n")
cat("Specificity:", specificity, "\n")

=========================

📉 BASELINE MODEL

=========================

# Baseline accuracy: the share of the most frequent class in the test set,
# i.e. the accuracy of always predicting that class.
baseline <- max(prop.table(table(test$HighPrice)))

cat("Baseline Accuracy:", baseline, "\n")

=========================

📊 ROC & AUC

=========================

# Pair the predicted probabilities with the true labels for ROCR.
ROCRpred <- prediction(predictTest, test$HighPrice)

# ROC curve: true-positive rate against false-positive rate.
ROCRperf <- performance(ROCRpred, "tpr", "fpr")

plot(ROCRperf, colorize = TRUE, main = "ROC Curve")

# Area under the ROC curve. performance() returns an S4 object; the numeric
# AUC is the first element of its y.values slot.
auc <- performance(ROCRpred, "auc")
cat("AUC:", auc@y.values[[1]], "\n")

=========================

🧹 NA.OMIT + ΝΕΑ SETS

=========================

# Drop every row that has at least one missing value.
data2 <- na.omit(data)

# Re-split the cleaned data with the same 65 % training ratio. No new seed
# is set here, so this split continues the current RNG stream.
split2 <- sample.split(data2$HighPrice, SplitRatio = 0.65)

train2 <- subset(data2, split2 == TRUE)
test2 <- subset(data2, split2 == FALSE)

# Report the new partition sizes, one per line.
cat("Train2 size:", nrow(train2), "\n")
cat("Test2 size:", nrow(test2), "\n")