This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print per-column summary statistics (min, quartiles, mean, max) for `cars`.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
---
title: "Clustering"
output: html_document
---
# Load the airline ticket price data set from the working directory and
# preview its first rows.
data <- read.csv(file = "airline_ticket_prices_dataset.csv")
head(x = data)
## Ticket_ID Airline Origin Destination Distance_km Class
## 1 1 Saudia Mumbai Paris 2643 Economy
## 2 2 Etihad Istanbul Jeddah 8266 Business
## 3 3 British Airways Dubai Riyadh 2698 First
## 4 4 Emirates London Istanbul 7956 First
## 5 5 Qatar Airways Doha Dubai 1308 Business
## 6 6 Turkish Airlines Mumbai Istanbul 9382 First
## Days_Before_Departure Price_USD
## 1 28 555.30
## 2 98 2070.27
## 3 21 1883.11
## 4 24 5349.65
## 5 60 516.19
## 6 19 6495.84
# Show the structure of `data`: its dimensions plus each column's type and
# first few values.
str(data)
## 'data.frame': 250 obs. of 8 variables:
## $ Ticket_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Airline : chr "Saudia" "Etihad" "British Airways" "Emirates" ...
## $ Origin : chr "Mumbai" "Istanbul" "Dubai" "London" ...
## $ Destination : chr "Paris" "Jeddah" "Riyadh" "Istanbul" ...
## $ Distance_km : int 2643 8266 2698 7956 1308 9382 3506 6122 7150 3760 ...
## $ Class : chr "Economy" "Business" "First" "First" ...
## $ Days_Before_Departure: int 28 98 21 24 60 19 85 88 24 36 ...
## $ Price_USD : num 555 2070 1883 5350 516 ...
# Keep only the numeric (integer or double) columns of `data`.
# vapply() is used instead of sapply() so the column selector is always a
# logical vector, even if `data` were to have zero columns.
is_numeric_col <- vapply(data, is.numeric, logical(1))
numeric_data <- data[is_numeric_col]
# NOTE(review): Ticket_ID is numeric and is therefore kept here. It looks like
# a sequential row identifier (range 1..250 in the summary), so it should not
# be treated as a feature in any downstream modelling.
summary(numeric_data)
## Ticket_ID Distance_km Days_Before_Departure Price_USD
## Min. : 1.00 Min. : 336 Min. : 1.00 Min. : 106.1
## 1st Qu.: 63.25 1st Qu.: 2992 1st Qu.: 31.25 1st Qu.: 818.7
## Median :125.50 Median : 6132 Median : 60.50 Median :1531.2
## Mean :125.50 Mean : 6000 Mean : 59.02 Mean :2006.1
## 3rd Qu.:187.75 3rd Qu.: 8833 3rd Qu.: 86.75 3rd Qu.:2590.4
## Max. :250.00 Max. :11903 Max. :118.00 Max. :8852.7
# Hierarchical clustering on standardized features.
#
# Ticket_ID is excluded before scaling: it is a row identifier, not a property
# of the ticket, and including it lets row order influence the distances.
# setdiff() is a no-op if the column is already absent.
# NOTE: cluster sizes and k-means centers printed further down in this
# document were generated with Ticket_ID included; re-knit to refresh them.
feature_cols <- setdiff(names(numeric_data), "Ticket_ID")
scaled_data <- scale(numeric_data[, feature_cols, drop = FALSE])

# Pairwise distances between standardized observations (dist() defaults to
# Euclidean), then Ward linkage.
dist_matrix <- dist(scaled_data)
hc <- hclust(dist_matrix, method = "ward.D2")

# One place for the number of clusters so the dendrogram boxes and the cut
# always agree.
n_clusters_hc <- 3

plot(hc)
rect.hclust(hc, k = n_clusters_hc, border = "red")

# Cluster membership per observation and the resulting cluster sizes.
clusters_hc <- cutree(hc, k = n_clusters_hc)
table(clusters_hc)
## clusters_hc
## 1 2 3
## 144 86 20
# Fit k-means with three clusters on the standardized data. The seed is fixed
# so that the 25 random starts give the same result on every run.
set.seed(123)
kmeans_result <- kmeans(x = scaled_data, centers = 3, nstart = 25)
# Cluster centers, expressed in the standardized units produced by scale().
kmeans_result[["centers"]]
## Ticket_ID Distance_km Days_Before_Departure Price_USD
## 1 -0.09047971 0.8054445 -0.91535950 1.0927416
## 2 -0.21060074 -1.0106156 -0.08469529 -0.6935627
## 3 0.31608632 0.4837855 0.85959419 -0.1179749
# Count how many observations k-means assigned to each cluster.
table(kmeans_result$cluster)
##
## 1 2 3
## 70 96 84
# Elbow plot: total within-cluster sum of squares for k = 1..10.
# The candidate k values are defined once so the fits and the plot axis cannot
# drift apart.
k_values <- 1:10

# One k-means fit (10 random starts) per candidate k; vapply() guarantees a
# numeric vector of the same length as `k_values` and leaves no loop
# temporaries behind in the workspace.
wss <- vapply(
  k_values,
  function(k) kmeans(scaled_data, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

# Explicit axis labels replace the default deparsed expressions.
plot(
  k_values, wss,
  type = "b",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster sum of squares"
)
library(ggplot2)
# Attach the k-means cluster label to each ticket as a factor so ggplot2
# treats it as a discrete colour scale.
data$cluster <- as.factor(kmeans_result$cluster)

# Scatter plot of the clusters. Aesthetics refer to columns of `data` by name
# rather than indexing a separate data frame inside aes(): this keeps the
# mapping tied to the plotted data, gives meaningful axes, and avoids putting
# the Ticket_ID row identifier on an axis.
ggplot(data, aes(x = Distance_km, y = Price_USD, color = cluster)) +
  geom_point(size = 3) +
  labs(x = "Distance (km)", y = "Price (USD)", color = "Cluster")