# Linear regression

# Load the airfoil self-noise data set from a local CSV file.
# read_csv() is namespace-qualified so this step works even when readr has
# not been attached with library().
airfoil_self_noise <- readr::read_csv(
  "C:/Users/leelavathi.a/Desktop/airfoil_self_noise.csv"
)

# Draw a random 70% of the row indices for the training set; the remaining
# rows form the test set. floor() makes explicit the truncation that sample()
# would otherwise apply silently to a fractional size.
# NOTE(review): the draw is not seeded, so the split differs between runs.
d <- sample(
  x = nrow(airfoil_self_noise),
  size = floor(nrow(airfoil_self_noise) * 0.7)
)
train <- airfoil_self_noise[d, ] # 1052 rows per the original note -- TODO confirm
test <- airfoil_self_noise[-d, ]

# Check missing values: number of NA entries in each column.
colSums(is.na(airfoil_self_noise))

# Correlation

# Correlation matrix of every column in the training set, computed once and
# kept in M for the steps that follow.
M <- cor(train)

# Print the full matrix first ...
M

# ... then its leading rows rounded to two decimals for a compact view.
head(round(M, digits = 2))

# Correlogram: visualizing the correlation matrix

# Correlogram of the correlation matrix M, with each cell drawn as a number.
# corrplot() is namespace-qualified so the call works even when the corrplot
# package has not been attached with library().
a <- corrplot::corrplot(M, method = "number")

# Same matrix again, showing the full matrix with variables reordered by
# hierarchical clustering, purple text labels rotated by 45 degrees.
corrplot::corrplot(
  M,
  type = "full",
  order = "hclust",
  tl.col = "purple",
  tl.srt = 45,
  method = "number"
)

# Linear model of sound pressure level on all remaining columns.
# The model is fitted on the training rows only: fitting on the complete data
# set would let the test rows influence the coefficients and make the
# hold-out error computed later look better than it really is.
regmodel <- lm(Sound_pressure_level ~ ., data = train)
summary(regmodel)

# Set graphic output

# Split the plotting device into a 2 x 2 grid so the diagnostic plots drawn
# by plot() on the fitted model share one page. The previous graphical
# settings are saved so they can be restored afterwards.
old_par <- par(mfrow = c(2, 2))

# Create residual plots.
plot(regmodel)

# Restore the previous layout so later plots are not squeezed into the grid.
par(old_par)

# Among all the plots, Residuals vs. Fitted catches my attention.
# Not exactly, but there are signs of heteroskedasticity in this data:
# remember the funnel shape? A similar pattern is visible here.
# To overcome this, we'll build another model with log(y).

# Metrics supplies rmse(), used below.
library(Metrics)

# Refit the model with a log-transformed response, keeping the same
# predictors and data as the existing regmodel.
regmodel <- update(regmodel, log(Sound_pressure_level) ~ .)
summary(regmodel)

# Predict on the test set; predictions are on the log scale, so exponentiate
# them to return to the original scale of the response.
regpred <- predict(regmodel, test)
regpred <- exp(regpred)

# Root mean squared error between observed and predicted values on the test set.
rmse(actual = test$Sound_pressure_level, predicted = regpred)

# Save the output of boxplot.
# NOTE(review): this reuses the name `d`, replacing any earlier object of that
# name -- confirm nothing later still needs the previous value.
# NOTE(review): `border` expects a colour; TRUE is kept from the original --
# confirm intent.
d <- boxplot(
  train$Displacement,
  varwidth = TRUE,
  outline = TRUE,
  border = TRUE,
  plot = TRUE
)
d$out # enlist outlier observations

# LM
#
# leela
#
# June 5, 2017

# Build the training-set correlation matrix a single time and store it as M.
M <- cor(train)

# Display the unrounded matrix.
M

# Display the first rows of the matrix rounded to two decimal places.
head(round(M, digits = 2))

# Set graphic output