R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.


---
title: "Clustering"
output: html_document
---

# Load the airline ticket price records from the CSV in the working directory.
data <- read.csv(file = "airline_ticket_prices_dataset.csv")

# Preview the first six rows.
head(x = data, n = 6L)
##   Ticket_ID          Airline   Origin Destination Distance_km    Class
## 1         1           Saudia   Mumbai       Paris        2643  Economy
## 2         2           Etihad Istanbul      Jeddah        8266 Business
## 3         3  British Airways    Dubai      Riyadh        2698    First
## 4         4         Emirates   London    Istanbul        7956    First
## 5         5    Qatar Airways     Doha       Dubai        1308 Business
## 6         6 Turkish Airlines   Mumbai    Istanbul        9382    First
##   Days_Before_Departure Price_USD
## 1                    28    555.30
## 2                    98   2070.27
## 3                    21   1883.11
## 4                    24   5349.65
## 5                    60    516.19
## 6                    19   6495.84
# Show the structure of the loaded data frame (column names, types, leading values).
str(data)
## 'data.frame':    250 obs. of  8 variables:
##  $ Ticket_ID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Airline              : chr  "Saudia" "Etihad" "British Airways" "Emirates" ...
##  $ Origin               : chr  "Mumbai" "Istanbul" "Dubai" "London" ...
##  $ Destination          : chr  "Paris" "Jeddah" "Riyadh" "Istanbul" ...
##  $ Distance_km          : int  2643 8266 2698 7956 1308 9382 3506 6122 7150 3760 ...
##  $ Class                : chr  "Economy" "Business" "First" "First" ...
##  $ Days_Before_Departure: int  28 98 21 24 60 19 85 88 24 36 ...
##  $ Price_USD            : num  555 2070 1883 5350 516 ...
# Keep only the numeric columns. vapply() guarantees a logical mask with one
# entry per column, whereas sapply() can silently change its return type
# (e.g. an empty list for a zero-column data frame).
# NOTE(review): this selection also keeps Ticket_ID, which looks like a row
# identifier rather than a feature -- confirm whether it should be dropped
# before scaling and clustering.
numeric_data <- data[vapply(data, is.numeric, logical(1))]

# Per-column summary statistics (min, quartiles, median, mean, max).
summary(numeric_data)
##    Ticket_ID       Distance_km    Days_Before_Departure   Price_USD     
##  Min.   :  1.00   Min.   :  336   Min.   :  1.00        Min.   : 106.1  
##  1st Qu.: 63.25   1st Qu.: 2992   1st Qu.: 31.25        1st Qu.: 818.7  
##  Median :125.50   Median : 6132   Median : 60.50        Median :1531.2  
##  Mean   :125.50   Mean   : 6000   Mean   : 59.02        Mean   :2006.1  
##  3rd Qu.:187.75   3rd Qu.: 8833   3rd Qu.: 86.75        3rd Qu.:2590.4  
##  Max.   :250.00   Max.   :11903   Max.   :118.00        Max.   :8852.7
# Standardise every numeric column (centre on the mean, divide by the
# standard deviation) so the distance computation is not driven by units.
scaled_data <- scale(numeric_data, center = TRUE, scale = TRUE)

# Pairwise distances between the scaled rows.
dist_matrix <- dist(scaled_data)

# Agglomerative clustering using the "ward.D2" linkage.
hc <- hclust(dist_matrix, method = "ward.D2")

# Number of groups to outline on the dendrogram and to cut the tree into.
n_hc_groups <- 3

# Draw the dendrogram, then box the groups in red on top of it.
plot(hc)
rect.hclust(hc, k = n_hc_groups, border = "red")

# One cluster label per row, followed by the size of each cluster.
clusters_hc <- cutree(hc, k = n_hc_groups)
table(clusters_hc)
## clusters_hc
##   1   2   3 
## 144  86  20
# Fix the RNG seed so the random starting centres chosen by kmeans() are
# reproducible from run to run.
set.seed(123)

# k-means with 3 centres on the scaled data; nstart=25 requests 25 random
# starts.
kmeans_result <- kmeans(scaled_data, centers=3, nstart=25)

# Cluster centres, in the scaled units of each column.
kmeans_result$centers
##     Ticket_ID Distance_km Days_Before_Departure  Price_USD
## 1 -0.09047971   0.8054445           -0.91535950  1.0927416
## 2 -0.21060074  -1.0106156           -0.08469529 -0.6935627
## 3  0.31608632   0.4837855            0.85959419 -0.1179749
# Number of observations assigned to each k-means cluster.
table(kmeans_result$cluster)
## 
##  1  2  3 
## 70 96 84
# Elbow method: total within-cluster sum of squares for k = 1..10.
# The range of k is defined once so the result vector, the loop and the plot
# cannot drift apart.
k_values <- 1:10
wss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  km <- kmeans(scaled_data, centers = k_values[i], nstart = 10)
  wss[i] <- km$tot.withinss
}

# Label the axes explicitly; the defaults would be the deparsed argument
# expressions rather than meaningful names.
plot(
  k_values, wss,
  type = "b",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster sum of squares"
)

library(ggplot2)

# Attach each row's k-means cluster as a factor so ggplot uses a discrete
# colour scale.
data$cluster <- as.factor(kmeans_result$cluster)

# Map aesthetics to named columns of `data` instead of indexing a separate
# data frame by position: the first numeric column is Ticket_ID, so the
# positional mapping put an identifier on the x axis and produced raw
# expressions as axis labels.
ggplot(data, aes(x = Distance_km, y = Price_USD, color = cluster)) +
  geom_point(size = 3) +
  labs(x = "Distance (km)", y = "Price (USD)", color = "Cluster")