R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Summary statistics for each column of `cars`; the `##` lines that follow are the recorded output.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Regression Analysis of Southwest Airlines Data to predict fares

The goal of the analysis is to predict the airfares for new routes.

# Packages used by this analysis. One call per line: several calls on a single
# line without a separator do not parse.
library(ggplot2)   # ggplot(), aes(), geom_density()
library(car)       # qqPlot(), residualPlot(), residualPlots()
library(MASS)      # studres()
library(corrplot)  # corrplot()

# Load the training and test sets.
# The directory is written with forward slashes and straight quotes: inside an
# R string a single backslash starts an escape sequence ("\I", "\T" and "\A"
# are rejected, "\a" silently becomes a bell character), and typographic quotes
# are not string delimiters.
data_dir <- "E:/ISB/Term 2 - SA2/Assignment"

air_train <- read.csv(file.path(data_dir, "airfares_train.csv"), header = TRUE)

air_test <- read.csv(file.path(data_dir, "airfares_test.csv"), header = TRUE)

# NOTE(review): attach() is kept only because code outside this section may
# rely on bare column names; everything visible here uses air_train$<column>.
# Prefer explicit air_train$<column> or with(air_train, ...) in new code.
attach(air_train)
# List the column names of the training set.
colnames(air_train)

# Print the full training set.
air_train

Check how many observations we have.

# Number of training observations.
nrow(air_train)
# R version information for this session.
version

# Boxplots of each numeric variable, titled with the column name.
# Each panel layout is set with par(mfrow = ...) before its group of plots.
par(mfrow = c(1, 2))
boxplot(air_train$COUPON, main = "COUPON")
# boxplot(air_train$NEW, main = "NEW")
boxplot(air_train$HI, main = "HI")

par(mfrow = c(3, 2))
for (col_name in c("S_INCOME", "E_INCOME", "S_POP", "E_POP", "DISTANCE")) {
  boxplot(air_train[[col_name]], main = col_name)
}

par(mfrow = c(1, 2))
boxplot(air_train$FARE, main = "FARE")
boxplot(air_train$PAX, main = "PAX")

# Density of the raw fares.
dat <- data.frame(x = air_train$FARE)
# Map the column `x` of `dat` instead of air_train$FARE: aes() is evaluated
# against the data frame given to ggplot(), so `$` references to another object
# bypass it. The axis label is set explicitly because the mapped name is now `x`.
ggplot(dat, aes(x = x)) +
  geom_density(fill = "violet") +
  labs(x = "FARE")

Try a logarithmic transformation of FARE to see whether its distribution looks closer to normal.

# Density of the log-transformed fares.
dat <- data.frame(x = log(air_train$FARE))
# Map the column `x` of `dat` instead of recomputing log(air_train$FARE) inside
# aes(); the axis label is set explicitly because the mapped name is now `x`.
ggplot(dat, aes(x = x)) +
  geom_density(fill = "violet") +
  labs(x = "log(FARE)")

Superimpose a normal distribution on the FARE density to see the overlap.

# Overlay the FARE density with a random normal sample that has the same size,
# mean and standard deviation.
fare <- air_train$FARE
# Sample size follows the data (it was hard-coded as 510); a mismatch between
# the label vector and the value vector makes data.frame() fail.
n_obs <- length(fare)
norm <- rnorm(n_obs, mean = mean(fare), sd = sd(fare))
dat <- data.frame(
  cond = factor(rep(c("FARE", "Normal"), each = n_obs)),
  x = c(fare, norm)
)
ggplot(dat, aes(x, fill = cond)) +
  geom_density(alpha = .3)

Superimpose a normal distribution on the density of log(FARE).

# Overlay the log(FARE) density with a random normal sample that has the same
# size, mean and standard deviation as log(FARE).
log_fare <- log(air_train$FARE)
# Sample size follows the data (it was hard-coded as 510); a mismatch between
# the label vector and the value vector makes data.frame() fail.
n_obs <- length(log_fare)
lnorm <- rnorm(n_obs, mean = mean(log_fare), sd = sd(log_fare))
dat <- data.frame(
  cond = factor(rep(c("LFARE", "Log Normal"), each = n_obs)),
  x = c(log_fare, lnorm)
)
ggplot(dat, aes(x, fill = cond)) +
  geom_density(alpha = .3)

# Keep only the numeric columns for the scatterplot matrix.
# vapply() always returns a logical vector (sapply() can change its return type
# on unusual input), and drop = FALSE keeps a data frame even when a single
# column is selected.
is_num <- vapply(air_train, is.numeric, logical(1))
scplotdata <- air_train[, is_num, drop = FALSE]

# Pairwise scatterplots of every numeric column.
pairs(scplotdata, col = "dodgerblue4", pch = 20)

Correlation Matrix

# Pairwise correlations of the numeric columns, rounded to two decimals.
mcor <- round(cor(scplotdata), 2)
mcor
library(corrplot)
# Render the correlation plot to an image file.
# NOTE(review): png() writes PNG data but the file name ends in ".jpg";
# confirm which format is intended before renaming or switching device.
png(
  height = 1200, width = 1500, pointsize = 20,
  filename = "Correlation Matrix2.jpg"
)
corrplot(mcor)
# Close the device so the file is finalised and later plots are not captured
# by it; invisible() suppresses the device name that dev.off() returns.
invisible(dev.off())

updateR

# Baseline linear model: FARE regressed on all candidate predictors.
# Predictors are named in the formula and resolved through `data = air_train`.
# This fixes the NEW term, which referred to `airlines_train` (no such object
# is created in this section), and gives the coefficients plain column names so
# predict(model_1, newdata = ...) uses the data frame it is given.
model_1 <- lm(
  FARE ~ COUPON + NEW + HI + S_INCOME + E_INCOME +
    S_POP + E_POP + DISTANCE + PAX,
  data = air_train
)
summary(model_1)

# QQ plot for model_1 (the title previously said Model_3).
qqPlot(model_1, main = "QQ Plot of residuals: Model_1")

# Histogram of the studentized residuals with a standard normal density curve
# drawn over it for comparison.
residual1 <- studres(model_1)
hist(residual1,
  freq = FALSE,
  main = "Distribution of Studentized Residuals:Model_1"
)
# 40 evenly spaced points across the residual range; length.out is spelled out
# instead of relying on partial matching of `length`.
xfit1 <- seq(min(residual1), max(residual1), length.out = 40)
yfit1 <- dnorm(xfit1)
lines(xfit1, yfit1)

# Residual plots for model_1.
# NOTE(review): newer car releases may expect point labelling as
# id = list(n = 5) rather than id.n = 5 — confirm against the installed version.
residualPlot(model_1, id.n = 5)

residualPlots(model_1, id.n = 5)

Transformations