setwd("C:\\Users\\Yang\\Desktop\\Business Data mining\\dataset 1\\Airfares")
Airfares_data <- read.csv ("Airfares.csv")
str(Airfares_data)
## 'data.frame': 638 obs. of 19 variables:
## $ S_CODE : Factor w/ 8 levels "*","DCA","EWR",..: 1 1 1 8 7 1 1 1 1 1 ...
## $ S_CITY : Factor w/ 51 levels "Albuquerque NM",..: 14 3 7 9 9 11 14 18 23 25 ...
## $ E_CODE : Factor w/ 8 levels "*","DCA","EWR",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ E_CITY : Factor w/ 68 levels "Amarillo TX",..: 1 2 2 2 2 2 2 2 2 2 ...
## $ COUPON : num 1 1.06 1.06 1.06 1.06 1.01 1.28 1.15 1.33 1.6 ...
## $ NEW : int 3 3 3 3 3 3 3 3 3 2 ...
## $ VACATION: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ SW : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 1 2 2 2 ...
## $ HI : num 5292 5419 9185 2657 2657 ...
## $ S_INCOME: Factor w/ 50 levels "$14,600","$18,933",..: 37 34 44 41 41 29 37 33 35 26 ...
## $ E_INCOME: Factor w/ 67 levels "$14,600","$18,851",..: 7 57 57 57 57 57 57 57 57 57 ...
## $ S_POP : int 3036732 3532657 5787293 7830332 7830332 2230955 3036732 1440377 3770125 1694803 ...
## $ E_POP : int 205711 7145897 7145897 7145897 7145897 7145897 7145897 7145897 7145897 7145897 ...
## $ SLOT : Factor w/ 2 levels "Controlled","Free": 2 2 2 1 2 2 2 2 2 2 ...
## $ GATE : Factor w/ 2 levels "Constrained",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ DISTANCE: int 312 576 364 612 612 309 1220 921 1249 964 ...
## $ PAX : int 7864 8820 6452 25144 25144 13386 4625 5512 7811 4657 ...
## $ FARE : Factor w/ 451 levels "$100.36","$100.80",..: 371 170 219 427 427 345 241 48 168 41 ...
## $ X : logi NA NA NA NA NA NA ...
Conslusion: there are 19 variables and 638 observations in this dataframe.
head(sort(table(Airfares_data$E_CITY),decreasing=TRUE))
##
## New York/Newark NY Washington DC Phoenix AZ
## 75 54 25
## Baltimore/Wash Intl MD Orlando FL San Francisco CA
## 23 23 21
pie(head(sort(table(Airfares_data$E_CITY),decreasing=TRUE)))
Conclusion: the top 5 ending cities based on the flights number are following by New York(NY), Washingto(DC), Phoenix(AZ), Baltimore/Wash(MD), Orlando(FL), and San Francisco (CA).
City_pop <- aggregate (E_POP ~ E_CITY, mean, data = Airfares_data)
City_pop5 <- with(City_pop,head(City_pop[order(-E_POP),]))
City_pop5
## E_CITY E_POP
## 29 Los Angeles CA 9056076
## 38 New York/Newark NY 8621121
## 9 Chicago IL 7830332
## 2 Baltimore/Wash Intl MD 7145897
## 5 Boston MA 5787293
## 43 Philadelphia/Camden PA 4948339
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.2
City_pop5$E_CITY <- with(City_pop5, factor(E_CITY , levels = E_CITY [order(-E_POP)]))
qplot(E_CITY, E_POP, data = City_pop5, color = E_CITY, main="Top 5 Ending City's Population", xlab="Ending city", ylab="Populatoin of ending city",label = E_CITY) + theme(axis.text.x = element_text(angle=90, vjust=0.5))
Conclusion: The ending city which has the highest populaton is Las Angelas (CA), following by New York(NY), Chicago(IL), Baltimore/Wash(MD), and Philadelphia/Camden(PA).