Case 1

Read file & divide the groups (AmericanEagle, Hawaiian, United):

d0=read.csv("Baggage.csv",header = T)
attach(d0)
AmericanEagle=d0[Airline=="American Eagle",]
Hawaiian=d0[Airline=="Hawaiian",]
United=d0[Airline=="United",]

1.Explore baggage complaints over time: create 3 time series plots for the variable Baggage by Date for each of the airlines separately.

library(forecast)
AmericanEagle_Baggage_ts= ts(AmericanEagle$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(AmericanEagle_Baggage_ts,main="American Eagle",xlab="Date",ylab="Baggage Complaints",type="b")+grid()

Hawaiian_Baggage_ts= ts(Hawaiian$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(Hawaiian_Baggage_ts,main="Hawaiian",xlab="Date",ylab="Baggage Complaints",type="b")+grid()

United_Baggage_ts= ts(United$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(United_Baggage_ts,main="United",xlab="Date",ylab="Baggage Complaints",type="b")+grid()

2.Briefly describe what patterns, if any, you see in the plots in 1.

  1. American Eagle: The baggage complaints increased from 2004 to 2007, and after the beginning of 2007, the complaints started decreasing.
  2. Hawaiian: The baggage complaints slowly increased until 2008 and then sharply decreased. At the end of 2010, the complaints suddenly increased.
  3. United: During the period of 2004 to 2006, the trend of complaints are slightly increased. The end of 2006 and the beginning of 2007, there are the high complaints. After 2007, the complaints started decreasing.

3.To compare each month’s data in different years, superimpose each year’s Baggage by month on a single graph for each airline. Your answer consists of 3 time series plots.

#AmericanEagle
A2004=AmericanEagle[which(AmericanEagle$Year==2004),]
A2005=AmericanEagle[which(AmericanEagle$Year==2005),]
A2006=AmericanEagle[which(AmericanEagle$Year==2006),]
A2007=AmericanEagle[which(AmericanEagle$Year==2007),]
A2008=AmericanEagle[which(AmericanEagle$Year==2008),]
A2009=AmericanEagle[which(AmericanEagle$Year==2009),]
A2010=AmericanEagle[which(AmericanEagle$Year==2010),]
plot(A2004$Baggage~A2004$Month,type="b",xlab="Month",ylab="Baggage Complaints",
     main="American Eagle",ylim=c(5000,37000),xlim=c(0,14),color="black",pch=1)
lines(A2005$Baggage~A2005$Month,col="red",type="b",pch=9)
lines(A2006$Baggage~A2006$Month,col="orange",type="b",pch=10)
lines(A2007$Baggage~A2007$Month,col="yellow3",type="b",pch=15)
lines(A2008$Baggage~A2008$Month,col="green",type="b",,pch=16)
lines(A2009$Baggage~A2009$Month,col="blue",type="b",pch=17)
lines(A2010$Baggage~A2010$Month,col="palevioletred1",type="b",pch=18)
Name=c("2004","2005","2006","2007","2008","2009","2010")
legend("topright", Name, ncol=3,
       col=c("black","red","orange","yellow3","green","blue","palevioletred1"),
       bg = "gray90", pch = c(1,9,10,15,16,17,18),lty =1)

#Hawaiian
H2004=Hawaiian[which(Hawaiian$Year==2004),]
H2005=Hawaiian[which(Hawaiian$Year==2005),]
H2006=Hawaiian[which(Hawaiian$Year==2006),]
H2007=Hawaiian[which(Hawaiian$Year==2007),]
H2008=Hawaiian[which(Hawaiian$Year==2008),]
H2009=Hawaiian[which(Hawaiian$Year==2009),]
H2010=Hawaiian[which(Hawaiian$Year==2010),]
plot(H2004$Baggage~H2004$Month,type="b",xlab="Month",ylab="Baggage Complaints",
     main="Hawaiian",ylim=c(1000,3500),xlim=c(0,14),color="black",pch=1)
lines(H2005$Baggage~H2005$Month,col="red",type="b",pch=9)
lines(H2006$Baggage~H2006$Month,col="orange",type="b",pch=10)
lines(H2007$Baggage~H2007$Month,col="yellow3",type="b",pch=15)
lines(H2008$Baggage~H2008$Month,col="green",type="b",,pch=16)
lines(H2009$Baggage~H2009$Month,col="blue",type="b",pch=17)
lines(H2010$Baggage~H2010$Month,col="palevioletred1",type="b",pch=18)
legend("topright", Name, ncol=3,
       col=c("black","red","orange","yellow3","green","blue","palevioletred1"),
       bg = "gray90", pch = c(1,9,10,15,16,17,18),lty =1)

#United
U2004=United[which(United$Year==2004),]
U2005=United[which(United$Year==2005),]
U2006=United[which(United$Year==2006),]
U2007=United[which(United$Year==2007),]
U2008=United[which(United$Year==2008),]
U2009=United[which(United$Year==2009),]
U2010=United[which(United$Year==2010),]
plot(U2004$Baggage~U2004$Month,type="b",xlab="Month",ylab="Baggage Complaints",main="United",
     ylim=c(8000,52000),xlim=c(0,14),color="black",pch=1)
lines(U2005$Baggage~U2005$Month,col="red",type="b",pch=9)
lines(U2006$Baggage~U2006$Month,col="orange",type="b",pch=10)
lines(U2007$Baggage~U2007$Month,col="yellow3",type="b",pch=15)
lines(U2008$Baggage~U2008$Month,col="green",type="b",,pch=16)
lines(U2009$Baggage~U2009$Month,col="blue",type="b",pch=17)
lines(U2010$Baggage~U2010$Month,col="palevioletred1",type="b",pch=18)
legend("topright", Name, ncol=3,
       col=c("black","red","orange","yellow3","green","blue","palevioletred1"),
       bg = "gray90", pch = c(1,9,10,15,16,17,18),lty =1)

4.Briefly describe what patterns, if any, you see in the plots in 3.

In general, there are high baggage complaints in June to August and December in a year. For American Eagle, the number of the baggae complaints increased from 2004 to 2006 and then decreased until 2010. For Hawaiian, the number of the baggage complaints increased from 2004 and reached the highest complaint in 2008 and after it decreased. However, in 2010, there were abnormally high complaint in Oct, Noc, and Dec compared to the previous year. *For United, the baggage complaints increased from 2004 to 2006 and 2007 and after 2007 it started decreasing. After 2008, the baggage complaints imrpoved a lot until 2010.

5.To better compare the baggage complaints for three airlines, plot all three airline Baggage data by Date on one graph.

AmericanEagle_Baggage_ts= ts(AmericanEagle$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(AmericanEagle_Baggage_ts,col="red",pch=5,ylim=c(1000,45000),xlab="Date",ylab="Baggage Complaints",type="b")+grid()
Hawaiian_Baggage_ts= ts(Hawaiian$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
par(new=T)
plot.ts(Hawaiian_Baggage_ts,col="blue",pch=6,ylim=c(1000,45000),xlab="Date",ylab="Baggage Complaints",type="b")+grid()
United_Baggage_ts= ts(United$Baggage, frequency=12, start=c(2004,1), end=c(2010,12))
par(new=T)
plot.ts(United_Baggage_ts,col="green",pch=7,ylim=c(1000,45000),xlab="Date",ylab="Baggage Complaints",type="b")+grid()
legend("topright", c("United","American Eagle","Hawaiian"), ncol=1,
       col=c("green","red","blue"),
       bg = "gray90", pch = c(7,5,6),lty =3)

6.Based on the graph in question 5., do some airlines have better baggage handling practices?

It’s difficult to tell that which airline company has the better baggage handling practice since the numbers of the enplaned flights are hugely different. It’s obvious that if you have more people taking the flights, you will definitely get more baggage complaints. Therefore, the number of baggage complaints can’t mean to which company has the best practice. However, from the graph, after 2007, the complaints in United and American Eagle decreased, so it can show that these two company have some good practices to handle the complaints and make them decrease in the following years. For Hawaiian, it’s difficult to tell the trend of the baggage complaints since the y-scale is to broad for Hawaiian. putting the 3 companies’ baggage complaints into one graph is not the good way to discuss about whether the company improve the practice to handle the complaints.

7.Based on the graph in question 5., which airline has the best record? The worst?

Based on the number of the complaints the airline received, the United got the highest number of complaints; Hawaiian got the least number of the complaints. It makes sense that they got more complaints since there are more enplanned flights for United.

8.Based on the graph in question 5., are complaints getting better or worse over time?

From the graph, it seems that they are getting better for dealing with the complaints. It’s hard to tell whether Hawaiian is getting better or worse because of the broad y-scale that need to fit the other airplane companies.

9.Are the conclusions, you have drawn based on the graphs of the raw data you created, accurate? Are there any potential factors that may distort your conclusions and should be taken into consideration?

Not really. It’s not approriate to make these 3 company into one graph since the numbers of the complaints from differenct companies are hugely different. It may sacrifice the small number of the complaint in Hawaiian Airline making it difficult to tell the trend. It’s more fair to get the ratio of complaints to enplaned paddengers and get the numbers from these 3 companies to make the comparison.

10.Report the average of scheduled flights and the average of enplaned passengers by airline.

#scheduled flights:
round(colMeans(AmericanEagle[,6,drop=FALSE]),2)
## Scheduled 
##  41314.05
round(colMeans(Hawaiian[,6,drop=FALSE]),2)
## Scheduled 
##   4844.68
round(colMeans(United[,6,drop=FALSE]),2)
## Scheduled 
##   38225.3
#enplaned flights:
round(colMeans(AmericanEagle[,8,drop=FALSE]),2)
## Enplaned 
##  1396726
round(colMeans(Hawaiian[,8,drop=FALSE]),2)
## Enplaned 
## 594174.2
round(colMeans(United[,8,drop=FALSE]),2)
## Enplaned 
##  4620712
AmericanEagle Hawaiian United
41,314.05 4,844.68 38,225.30
1,396,726 594,174 4,620,712

11.What insights, ideas, and concerns does the data in the table in 10. provide you with?

It provides me with the average of scheduled flights and the average of enplaned passengers by different airlines. For United and American Eagle, United got the smaller number of scheduled flights than American Eagle, but United had more people actually taking the flights than American Eagle. It may because United have better baggage practice so people were willing to take United’s flights rather than American Eagle. For Hawaiian, the number of enplaned paseengers and scheduled flights is far less than the other two companies so it’s hard to tell the performance of the company. It’s better to get the percentage or ratio to make the comparison among these 3 companies.

12.Create a KPI that adjusts the total number of passenger complaints for size.

#Baggage % = Baggage / Enplaned ×100 %. Display average Baggage % for each airline.
paste(round((sum(AmericanEagle[,5])/sum(AmericanEagle[,8]))*100,2),"%")
## [1] "1.05 %"
paste(round((sum(Hawaiian[,5])/sum(Hawaiian[,8]))*100,2),"%")
## [1] "0.27 %"
paste(round((sum(United[,5])/sum(United[,8]))*100,2),"%")
## [1] "0.47 %"
AmericanEagle Hawaiian United
1.05% 0.27% 0.47%

13.Do the results in question 12 support your previous conclusions? Briefly explain.

No. From the table of enplaned passengers and scheduled flights, I expected United may have better performance than American Eagle since there were more people taking United’s airplanes than American Eagle. Actually, United got the highest percentage of baggage complaints from passengers. The second highest is American Eagle and Hawaiian seems to have the best baggage complaints practice to improve this problem.

14.Superimpose all three time series on one graph to display Baggage % by Date.

AmericanEagle$Baggage_percernt=round(((AmericanEagle[,5])/(AmericanEagle[,8]))*100,2)
AmericanEagle_BaggagePercent_ts= ts(AmericanEagle$Baggage_percernt, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(AmericanEagle_BaggagePercent_ts,col="red",pch=5,ylim=c(0,3),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b")+grid()
par(new=T)
Hawaiian$Baggage_percernt=round(((Hawaiian[,5])/(Hawaiian[,8]))*100,2)
Hawaiian_BaggagePercent_ts= ts(Hawaiian$Baggage_percernt, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(Hawaiian_BaggagePercent_ts,col="blue",pch=6,ylim=c(0,3),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b")+grid()
par(new=T)
United$Baggage_percernt=round(((United[,5])/(United[,8]))*100,2)
United_BaggagePercent_ts= ts(United$Baggage_percernt, frequency=12, start=c(2004,1), end=c(2010,12))
plot.ts(United_BaggagePercent_ts,col="green",pch=7,ylim=c(0,3),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b")+grid()
legend("topright", c("United","American Eagle","Hawaiian"), ncol=1,
       col=c("green","red","blue"),
       bg = "gray90", pch = c(7,5,6),lty =3)

15.In addition to the graph in question 14., would plotting each series on a separate graph be beneficial and why? Create a graph to support your answer.

I think making the baggage complaint percentages among the 3 companies in one graph is better since it’s easier to make the comparison that which company get the high percentage of the complaits from passengers and have the bad performance on dealing with complaints.

plot.ts(AmericanEagle_BaggagePercent_ts,col="red",pch=5,ylim=c(0,3),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b",main="American Eagle")+grid()

plot.ts(Hawaiian_BaggagePercent_ts,col="blue",pch=6,ylim=c(0,0.5),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b",main="Hawaiian")+grid()

plot.ts(United_BaggagePercent_ts,col="green",pch=7,ylim=c(0,1),xlab="Date",ylab="Baggage Complaints Percentage (%)",type="b",main="United")+grid()

16.Based on the analysis of KPI Baggage %, have any of your conclusions drawn in questions 6. - 8. changed? Briefly discuss.

Based on KPI baggage complaint percentage, it’s more clear and straightforward to compare baggage complaints among 3 companies since the scale of the companies is differenct, it’s hard to tell which company has the better solutions to deal with it merely based on the number of the baggage complaints. It may lead to a bias by only looking at the number of the complaints.

17.Superimpose time series plots of monthly averages of Baggage % by time for the three airlines.

new_data_AmericanEagle=data.frame(matrix(0, ncol = 2, nrow = 12))
colnames(new_data_AmericanEagle)=c("Month","MonthlyBaggage")
new_data_AmericanEagle$Month= ("Month"=1:nrow(new_data_AmericanEagle))

new_data_Hawaiian=data.frame(matrix(0, ncol = 2, nrow = 12))
colnames(new_data_Hawaiian)=c("Month","MonthlyBaggage")
new_data_Hawaiian$Month= ("Month"=1:nrow(new_data_Hawaiian))

new_data_United=data.frame(matrix(0, ncol = 2, nrow = 12))
colnames(new_data_United)=c("Month","MonthlyBaggage")
new_data_United$Month= ("Month"=1:nrow(new_data_United))

for (i in c(1:12)){
  new_data_AmericanEagle$MonthlyBaggage[i]=round(mean((AmericanEagle[AmericanEagle$Month==i,"Baggage"])/
                                         (AmericanEagle[AmericanEagle$Month==i,"Enplaned"])*100),2)
}
for (i in c(1:12)){
  new_data_Hawaiian$MonthlyBaggage[i]=round(mean((Hawaiian[Hawaiian$Month==i,"Baggage"])/
                                      (Hawaiian[Hawaiian$Month==i,"Enplaned"])*100),2)
}
for (i in c(1:12)){
  new_data_United$MonthlyBaggage[i]=round(mean((United[United$Month==i,"Baggage"])/
                                      (United[United$Month==i,"Enplaned"])*100),2)
}

plot(AmericanEagle$Baggage_percernt~AmericanEagle$Month,pch=20,col="red",ylim=c(0,2.5),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="American Eagle Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_AmericanEagle$MonthlyBaggage~new_data_AmericanEagle$Month,ylim=c(0,2.5),type="b",col="red",pch=5)

plot(Hawaiian$Baggage_percernt~Hawaiian$Month,pch=20,col="blue",ylim=c(0,0.5),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="Hawaiian Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_Hawaiian$MonthlyBaggage~new_data_Hawaiian$Month,type="b",col="blue",pch=6)

plot(United$Baggage_percernt~United$Month,pch=20,col="green",ylim=c(0,1),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="United Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_United$MonthlyBaggage~new_data_United$Month,type="b",col="green",pch=7)

plot(new_data_AmericanEagle$MonthlyBaggage~new_data_AmericanEagle$Month,ylim=c(0,2),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="Monthly Average of Baggage Complaints (2004-2010)",type="b",col="red",pch=5)
axis(1,at=c(1:12))
lines(new_data_Hawaiian$MonthlyBaggage~new_data_Hawaiian$Month,type="b",col="blue",pch=6)
lines(new_data_United$MonthlyBaggage~new_data_United$Month,type="b",col="green",pch=7)
legend("topright", c("United","American Eagle","Hawaiian"), ncol=1,
       col=c("green","red","blue"),
       bg = "gray90", pch = c(7,5,6),lty =3)

18.Discuss common patterns all three time series exhibit in question 17.

Generally speaking, these 3 companies have the higher complaint during June to August and December to January. United has the more obvious seasonal fluctuation compared to American eagle and Hawaiian.

19.1. Create a timeplot of Baggage %, add average line of Baggage % and a Trendline (regression line) of monthly average Baggage %’s for each airline. Hint: total of 3 charts; each chart 3 superimposed trajectories.

A=round(mean(new_data_AmericanEagle$MonthlyBaggage),2)
H=round(mean(new_data_Hawaiian$MonthlyBaggage),2)
U=round(mean(new_data_United$MonthlyBaggage),2)
new_data_AmericanEagle$Average=A
new_data_Hawaiian$Average=H
new_data_United$Average=U

M_A=lm(new_data_AmericanEagle$MonthlyBaggage~Month)
M_H=lm(new_data_Hawaiian$MonthlyBaggage~Month)
M_U=lm(new_data_United$MonthlyBaggage~Month)

plot(AmericanEagle$Baggage_percernt~AmericanEagle$Month,pch=20,col="red",ylim=c(0,2.5),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="American Eagle Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_AmericanEagle$MonthlyBaggage~new_data_AmericanEagle$Month,ylim=c(0,2.5),type="b",col="red",pch=5)
lines(new_data_AmericanEagle$Average,lty=2,col="red",lwd=2)
abline(M_A,lty=3,col="blue",lwd=2)
legend("topright", c("Average line","Regression line"), ncol=1,
       col=c("red","blue"),bg = "gray90", lty = c(2,3))

plot(Hawaiian$Baggage_percernt~Hawaiian$Month,pch=20,col="blue",ylim=c(0,0.5),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="Hawaiian Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_Hawaiian$MonthlyBaggage~new_data_Hawaiian$Month,type="b",col="blue",pch=6)
lines(new_data_Hawaiian$Average,lty=2,col="red",lwd=2)
abline(M_H,lty=3,col="blue",lwd=2)
legend("topright", c("Average line","Regression line"), ncol=1,
       col=c("red","blue"),bg = "gray90", lty = c(2,3))

plot(United$Baggage_percernt~United$Month,pch=20,col="green",ylim=c(0,1),ylab="Monthly Average of Baggage Complaints (%)",xlab="Month",main="United Monthly Average of Baggage Complaints (2004-2010)")
axis(1,at=c(1:12))
lines(new_data_United$MonthlyBaggage~new_data_United$Month,type="b",col="green",pch=7)
lines(new_data_United$Average,lty=2,col="red",lwd=2)
abline(M_U,lty=3,col="blue",lwd=2)
legend("topright", c("Average line","Regression line"), ncol=1,
       col=c("red","blue"),bg = "gray90", lty = c(2,3))

20.Prepare a brief (one paragraph) executive summary of your findings.

From the graphs of the monthly average baggage (%), it can tell that there are not big difference between average line and regression line. They have the similar seasonal trend. The baggage complaints rely on the seasonal factor with some level.

Case 2

Read file:

ceo=read.csv("CEOcompensation.csv",header = T)

1.What is the number of female CEOs? (Answer is an integer)

length(which(ceo$Gender=="F"))
## [1] 2

2.What is the age of a youngest CEO? (Answer is an integer)

ceo[which.min(ceo$Age),"Age"]
## [1] 45

3.What is the age of the oldest CEO? (Answer is an integer)

ceo[which.max(ceo$Age),"Age"]
## [1] 81

4.What is the average age of a CEO? (Use two decimal digit precision)

round(colMeans(ceo[,"Age",drop=F]),2)
##   Age 
## 58.38

5.What is the total CEO 2008 salary? (Use two decimal digit precision)

round(sum(ceo[,"X2008.Salary",drop=F]),2)
## [1] 201.8

6.How many CEOs have joined a company as a CEO? (Hint: CEOs can always be founders. Founders can’t always be CEOs)

length(which(ceo$Years.with.company==ceo$Years.as.company.CEO))
## [1] 40

7.What is the average amount of time a CEO worked for a company before becoming a CEO? (Use two decimal digit precision)

ceo$year_before_ceo=ceo[,"Years.with.company"]-ceo[,"Years.as.company.CEO"]
round(colMeans(ceo[,"year_before_ceo",drop=F]),2)
## year_before_ceo 
##           11.51

8.Which industry in the data set has largest number CEO’s?

which.max(table(ceo$Industry))
## Oil & Gas Operations 
##                   19

9.What is the average CEO 2008 Compensation? Note that 2008 compensation for a CEO consists of a total four components: Salary, Bonus, other (including vested restricted stock grants, LTIP (long-term incentive plan) payouts, and perks), and stock gains. (Use two decimal digit precision)

ceo$compensation=ceo[,"X2008.Salary"]+ceo[,"X2008.Bonus"]+ceo[,"X2008.Other"]+ceo[,"X2008.Stock.gains"]
round(mean(ceo$compensation),2)
## [1] 18.68

10.Which CEO did get paid the largest compensation amount in 2008?

ceo[which.max(ceo$compensation),"CEO"]
## [1] Lawrence J Ellison
## 179 Levels: A Greig Woodring Alan G Lafley ... William R Johnson

11.What is the corresponding amount? (Use two decimal digit precision)

ceo[which.max(ceo$compensation),"compensation"]
## [1] 556.98

12.Which industry does correspond to the second largest total CEO compensation in 2008?

sort(ceo$compensation, decreasing = TRUE)
##   [1] 556.98 222.64 154.58 116.93  90.47  87.48  68.62  61.31  51.28  49.26
##  [11]  47.55  44.49  42.27  39.27  39.22  36.71  36.01  35.13  34.59  33.05
##  [21]  30.33  30.04  29.75  28.98  27.14  26.94  26.76  26.24  25.51  25.33
##  [31]  24.29  24.24  23.22  23.11  22.95  20.96  20.73  19.95  19.75  19.31
##  [41]  19.16  18.90  18.68  18.59  18.52  18.12  18.00  17.83  17.55  16.01
##  [51]  15.99  15.89  15.69  14.04  13.94  13.74  13.67  13.20  13.17  12.98
##  [61]  12.60  12.55  12.52  12.05  11.77  11.55  11.48  11.43  11.31  11.30
##  [71]  11.09  11.07  11.05  10.50  10.47  10.44  10.30  10.26  10.25  10.02
##  [81]   9.66   9.52   9.29   9.29   9.24   9.09   8.99   8.72   8.61   8.23
##  [91]   8.18   7.85   7.76   7.74   7.65   7.54   7.22   7.02   7.02   6.77
## [101]   6.76   6.74   6.65   6.61   6.32   6.27   6.12   5.98   5.87   5.85
## [111]   5.83   5.78   5.73   5.69   5.68   5.57   5.30   5.24   5.11   5.08
## [121]   5.03   5.01   4.88   4.68   4.66   4.53   4.49   4.47   4.40   4.30
## [131]   4.17   4.16   4.12   4.11   4.11   4.01   3.90   3.90   3.82   3.70
## [141]   3.66   3.62   3.47   3.47   3.39   3.34   3.31   3.08   3.07   2.97
## [151]   2.86   2.83   2.78   2.64   2.60   2.39   2.31   2.30   2.16   2.16
## [161]   2.05   1.99   1.92   1.71   1.54   1.48   1.41   1.40   1.35   1.35
## [171]   1.33   1.28   1.18   1.07   1.03   0.97   0.77   0.10   0.00
ceo[ceo$compensation==222.64,"Industry"]
## [1] Oil & Gas Operations
## 26 Levels: Aerospace & Defense Banking ... Utilities

13.Consider the following age groups: [45 – 50), [50 – 55), [55 – 60), [60 – 70), and [70 or more). Analyze age groups by industry and determine which age group corresponds to largest CEO average salary in 2008? Hint: 1. left end point is included; 2. nested if helps assign age category

ceo[,"AgeCategory"]=NA
sum_1=0
sum_2=0
sum_3=0
sum_4=0
sum_5=0
count_1=0
count_2=0
count_3=0
count_4=0
count_5=0
for (i in 1:length(ceo$Nr)){
  if (ceo$Age[i]>=45&ceo$Age[i]<50){
    sum_1=sum_1+ceo$X2008.Salary[i]
    count_1=count_1+1
    average_1=sum_1/count_1
  }
  if (ceo$Age[i]>=50&ceo$Age[i]<55){
    sum_2=sum_2+ceo$X2008.Salary[i]
    count_2=count_2+1
    average_2=sum_2/count_2
  }
  if (ceo$Age[i]>=55&ceo$Age[i]<60){
    sum_3=sum_3+ceo$X2008.Salary[i]
    count_3=count_3+1
    average_3=sum_3/count_3
  }
  if (ceo$Age[i]>=60&ceo$Age[i]<70){
    sum_4=sum_4+ceo$X2008.Salary[i]
    count_4=count_4+1
    average_4=sum_4/count_4
  }
  if (ceo$Age[i]>=70){
    sum_5=sum_5+ceo$X2008.Salary[i]
    count_5=count_5+1
    average_5=sum_5/count_5
  }
}
round(average_1,2)
## [1] 0.97
round(average_2,2)
## [1] 1.11
round(average_3,2)
## [1] 1.07
round(average_4,2)
## [1] 1.13
round(average_5,2)
## [1] 1.79
45–50 50-55 55-60 60-70 70 or more
0.97 million 1.11 million 1.07 million 1.13 million 1.79 million

14.How many CEO’s have received 100% or larger compensation relative to their respective median compensation?

IndustryMedian=read.csv("IndustryMedians.csv",header = T)
for (i in 1:length(ceo$Nr)){
  for (j in 1:length(IndustryMedian$Industry)){
    if (ceo$Industry[i]==IndustryMedian$Industry[j]){
      ceo$differece[i]=((ceo$compensation[i]-IndustryMedian$Total.compensation[j])/IndustryMedian$Total.compensation[j])*100
    }
  }
}
length(which(ceo$differece>=100))
## [1] 63

15.Is the following formula always true? Total median compensation = Median Salary + Median Bonus + Median Other + Median Stock Gains

No. If there are the value mostly toward to highest value or lowest value, it may lead to get the different median from the way that you get each person’s compensation first and calculate the median of it.