this:
## Load the original airline data set from its GitHub-hosted CSV file
urlfile <- "https://raw.githubusercontent.com/baruab/Week2_HW/main/USAirlines.csv"
datain <- read.csv(file = urlfile)
# Wrap the parsed table in a data frame used by the statements that follow
airline_data <- data.frame(datain)
#View(airline_data)
# Descriptive statistics for every column of the full data set
print("Summarize Airline data")
## [1] "Summarize Airline data"
summary(airline_data)
## X firm year output
## Min. : 1.00 Min. :1.0 Min. :1970 Min. :0.03768
## 1st Qu.:23.25 1st Qu.:2.0 1st Qu.:1973 1st Qu.:0.14213
## Median :45.50 Median :3.5 Median :1977 Median :0.30503
## Mean :45.50 Mean :3.5 Mean :1977 Mean :0.54499
## 3rd Qu.:67.75 3rd Qu.:5.0 3rd Qu.:1981 3rd Qu.:0.94528
## Max. :90.00 Max. :6.0 Max. :1984 Max. :1.93646
## cost price load
## Min. : 68978 Min. : 103795 Min. :0.4321
## 1st Qu.: 292046 1st Qu.: 129848 1st Qu.:0.5288
## Median : 637001 Median : 357434 Median :0.5661
## Mean :1122524 Mean : 471683 Mean :0.5605
## 3rd Qu.:1345968 3rd Qu.: 849840 3rd Qu.:0.5947
## Max. :4748320 Max. :1015610 Max. :0.6763
# Arithmetic mean of the cost and load columns over the full data set
print(paste("Mean Cost is", format(mean(airline_data[["cost"]]))))
## [1] "Mean Cost is 1122524"
print(paste("Mean Load is", format(mean(airline_data[["load"]]))))
## [1] "Mean Load is 0.5604602"
# Median of the same two columns
print(paste("Median Cost is", format(median(airline_data[["cost"]]))))
## [1] "Median Cost is 637001"
print(paste("Median Load is", format(median(airline_data[["load"]]))))
## [1] "Median Load is 0.566085"
# Keep only the rows where firm is 2 and only the year, cost and load
# columns; which() drops rows whose condition is NA, as subset() does
airline2_subdata <- airline_data[
  which(airline_data$firm == 2),
  c("year", "cost", "load"),
  drop = FALSE
]
# Give the retained columns descriptive display names
names(airline2_subdata) <- c("Year", "Total Cost", "Capacity Utilization")
#View(airline2_subdata)
# Descriptive statistics for the firm 2 subset
cat("\n")
print("Summarize Airline Subset data")
## [1] "Summarize Airline Subset data"
summary(airline2_subdata)
## Year Total Cost Capacity Utilization
## Min. :1970 Min. : 569292 Min. :0.4734
## 1st Qu.:1974 1st Qu.:1101632 1st Qu.:0.5196
## Median :1977 Median :1709270 Median :0.5402
## Mean :1977 Mean :2127884 Mean :0.5471
## 3rd Qu.:1980 3rd Qu.:3347720 3rd Qu.:0.5694
## Max. :1984 Max. :4209390 Max. :0.6287
# Arithmetic mean of the two renamed measure columns in the firm 2 subset
print(paste(
  "Subdata:: Mean Total Cost is",
  format(mean(airline2_subdata[["Total Cost"]]))
))
## [1] "Subdata:: Mean Total Cost is 2127884"
print(paste(
  "Subdata:: Mean Capacity Util(Load) is",
  format(mean(airline2_subdata[["Capacity Utilization"]]))
))
## [1] "Subdata:: Mean Capacity Util(Load) is 0.5470946"
# Median of the same two columns
print(paste(
  "Subdata:: Median Total Cost is",
  format(median(airline2_subdata[["Total Cost"]]))
))
## [1] "Subdata:: Median Total Cost is 1709270"
print(paste(
  "Subdata:: Median Capacity Util(Load) is",
  format(median(airline2_subdata[["Capacity Utilization"]]))
))
## [1] "Subdata:: Median Capacity Util(Load) is 0.540163"
# Print the written comparison between the full data set and the firm 2
# subset. cat() does not append a newline on its own, so one is supplied
# explicitly to keep any later console output off the end of this message.
cat(
  "Comparison Analysis:\n The mean and median values in the two datasets are way different, due to the second dataset being subset of the first with less row of data",
  "\n",
  sep = ""
)
## Comparison Analysis:
## The mean and median values in the two datasets are way different, due to the second dataset being subset of the first with less row of data
# Label each row of the firm 2 subset by whether its total cost falls below
# the subset's mean total cost. The mean is computed from the column itself
# instead of being pasted in as a rounded constant, so the threshold always
# matches the data. Rows exactly equal to the mean get "Higher than mean".
# NOTE: this overwrites the numeric `Total Cost` column with text labels.
# The guard stops a second run from silently comparing labels to a number.
stopifnot(is.numeric(airline2_subdata$`Total Cost`))
airline2_subdata$`Total Cost` <- ifelse(
  airline2_subdata$`Total Cost` < mean(airline2_subdata$`Total Cost`),
  "Less than mean",
  "Higher than mean"
)
# Show the relabelled subset
airline2_subdata
## Year Total Cost Capacity Utilization
## 16 1970 Less than mean 0.490851
## 17 1971 Less than mean 0.473449
## 18 1972 Less than mean 0.503013
## 19 1973 Less than mean 0.512501
## 20 1974 Less than mean 0.566782
## 21 1975 Less than mean 0.558133
## 22 1976 Less than mean 0.558799
## 23 1977 Less than mean 0.572070
## 24 1978 Less than mean 0.624763
## 25 1979 Higher than mean 0.628706
## 26 1980 Higher than mean 0.589150
## 27 1981 Higher than mean 0.532612
## 28 1982 Higher than mean 0.526652
## 29 1983 Higher than mean 0.540163
## 30 1984 Higher than mean 0.528775