Library
library("car")
library("tidyverse")
library("ggplot2")
library("corrplot")
Import data
housing_data<-read.csv("Housing_data.csv", header = TRUE, sep = ',')
data<-housing_data
Structure of the data
str(housing_data)
## 'data.frame': 506 obs. of 10 variables:
## $ PCCR : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ PRLZ : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS: num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ AVR : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : int 296 242 242 222 222 222 311 311 311 311 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Visualization of the data
plot(housing_data)

Missing value treatment of the data set
missing_value<-is.na(housing_data)
### There are no missing value in dataframe so the dataframe is clean
Check is there any catagorial variable in dataframe or not
sapply(housing_data,class)
## PCCR PRLZ INDUS NOX AVR AGE DIS
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## RAD TAX MEDV
## "integer" "integer" "numeric"
### There are no catagorial variables in the dataframe.
To check all of the variables are continous or not plot histogram using ggplot2
Variable PCCR
ggplot(data = housing_data) +
geom_histogram(mapping = aes(x = PCCR),binwidth = .5)

### PCCR variable is continous.
### We can apply this method to other variables to check wheather it is continuous or not.
Let’s check for outliers in variables:
outliers_PCCR = boxplot(data$PCCR, plot=FALSE)$out
o_PCCR=data$PCCR %in% outliers_PCCR
boxplot(o_PCCR)

### There are outliers in variable PCCR.
Find indices of outlier in PCCR
which(data$PCCR %in% boxplot(data$PCCR)$out)

## [1] 368 372 374 375 376 377 378 379 380 381 382 383 385 386 387 388 389
## [18] 393 395 399 400 401 402 403 404 405 406 407 408 410 411 412 413 414
## [35] 415 416 417 418 419 420 421 423 426 427 428 430 432 435 436 437 438
## [52] 439 440 441 442 444 445 446 448 449 455 469 470 478 479 480
### Similarly, we can check the outiers of other variables.
Visualization of correlation between variables as ‘ellipse’ and ‘number’
corrplot(cor(data[,c(1:10)]), "ellipse")

corrplot(cor(data[,c(1:10)]), "number")

Regression analysis
linearmodel<-lm(housing_data$MEDV~housing_data$PCCR+housing_data$PRLZ+housing_data$INDUS+housing_data$NOX+housing_data$AVR+housing_data$AGE+housing_data$DIS+housing_data$RAD+housing_data$TAX)
summary(linearmodel)
##
## Call:
## lm(formula = housing_data$MEDV ~ housing_data$PCCR + housing_data$PRLZ +
## housing_data$INDUS + housing_data$NOX + housing_data$AVR +
## housing_data$AGE + housing_data$DIS + housing_data$RAD +
## housing_data$TAX)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.343 -2.961 -0.721 1.991 37.928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.141691 4.141297 0.517 0.605279
## housing_data$PCCR -0.190101 0.038266 -4.968 9.33e-07 ***
## housing_data$PRLZ 0.074194 0.015544 4.773 2.39e-06 ***
## housing_data$INDUS -0.079583 0.072204 -1.102 0.270911
## housing_data$NOX -11.565479 4.267974 -2.710 0.006965 **
## housing_data$AVR 6.824466 0.409219 16.677 < 2e-16 ***
## housing_data$AGE -0.053242 0.014742 -3.612 0.000335 ***
## housing_data$DIS -1.755763 0.236635 -7.420 5.13e-13 ***
## housing_data$RAD 0.185892 0.077187 2.408 0.016390 *
## housing_data$TAX -0.016690 0.004436 -3.762 0.000189 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.664 on 496 degrees of freedom
## Multiple R-squared: 0.6275, Adjusted R-squared: 0.6207
## F-statistic: 92.82 on 9 and 496 DF, p-value: < 2.2e-16
### From the summary of the model, it can be said that the most important variables are PCCR, PRLZ, AVR, AGE, DIS & TAX.
Find the coefficient of the linear model
coef(linearmodel)
## (Intercept) housing_data$PCCR housing_data$PRLZ
## 2.14169061 -0.19010058 0.07419353
## housing_data$INDUS housing_data$NOX housing_data$AVR
## -0.07958348 -11.56547928 6.82446595
## housing_data$AGE housing_data$DIS housing_data$RAD
## -0.05324220 -1.75576345 0.18589153
## housing_data$TAX
## -0.01669028
Calculation of Mean Square Error (MSE)
### Predictions using our linear regression model
predict_data<-predict(linearmodel,housing_data)
mymse<-function(yhat,y){
mse=sum(((yhat-y)^2))/length(y)
return(mse)
}
mymse(predict_data,housing_data$MEDV)
## [1] 31.44927
Simplify the model by removing the intercept (has highest p value) from the previous model
simplyfy_linearmodel<-lm(housing_data$MEDV~housing_data$PCCR+housing_data$PRLZ+ housing_data$NOX+housing_data$AVR+housing_data$AGE+housing_data$DIS+housing_data$RAD+housing_data$TAX+0)
Summary of the simplified model
summary(simplyfy_linearmodel)
##
## Call:
## lm(formula = housing_data$MEDV ~ housing_data$PCCR + housing_data$PRLZ +
## housing_data$NOX + housing_data$AVR + housing_data$AGE +
## housing_data$DIS + housing_data$RAD + housing_data$TAX +
## 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.926 -2.914 -0.689 2.019 38.258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## housing_data$PCCR -0.187190 0.038045 -4.920 1.18e-06 ***
## housing_data$PRLZ 0.075002 0.014878 5.041 6.48e-07 ***
## housing_data$NOX -11.688749 3.404328 -3.433 0.000646 ***
## housing_data$AVR 7.050511 0.256677 27.468 < 2e-16 ***
## housing_data$AGE -0.053114 0.014603 -3.637 0.000304 ***
## housing_data$DIS -1.636549 0.180841 -9.050 < 2e-16 ***
## housing_data$RAD 0.197106 0.071401 2.761 0.005983 **
## housing_data$TAX -0.018385 0.003808 -4.828 1.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.661 on 498 degrees of freedom
## Multiple R-squared: 0.9467, Adjusted R-squared: 0.9459
## F-statistic: 1106 on 8 and 498 DF, p-value: < 2.2e-16
Plot original vs fitted model
ggplot(housing_data,aes(x=1:length(housing_data$MEDV),y=housing_data$MEDV))+
geom_line(col='blue')+
geom_point(col='blue')+
geom_line(aes(x=1:length(MEDV),y=fitted.values(simplyfy_linearmodel)),col='red')+
geom_point(aes(x=1:length(MEDV),y=fitted.values(simplyfy_linearmodel)),col='red')+
xlab("Explanatory varriables")+
ylab("Prices of the house")+
ggtitle('Actual ("blue") vs Fitted ("red") Values')

Residual Analysis
plot(residuals(simplyfy_linearmodel),xlab="Exploratory Variables",ylab="Residuals",main="Residual Analysis",col="blue",pch=16,ylim=c(-8,8))
abline(a=0,b=0,col="black",lty=2)
abline(a=-3*sd(residuals(simplyfy_linearmodel)),b=0,col="red",lty=2)

Normality test of the simplified model
res <- scale(simplyfy_linearmodel$residuals)
hist(res)

qqPlot(simplyfy_linearmodel)

## [1] 369 373
Density plot of simplified model
plot(density(residuals(simplyfy_linearmodel)),col='blue',main='Distribution of Residuals')

Summary of the result
### From the exploratory data analysis, we found that there are no categorial variable. Only numerical and integer variable. There is no missing value in data set.
### From correlation analysis, it can be shown that variable RAD and Tax are strongly correlated with each other in absolute term.
### The final regression model consists with independent variables PCCR, PRLZ, NOX, AVR, AGE, DIS, RAD, TAX and the dependent variable is MEDV.
### The value of adjusted R square (Goodness of Fit Statistics) suggests that the model has 94.57% explanatory power and it is a better fit model.
### The p-value of F-statistics is less than .05 which indicates that the model is statistically significant.
Recommendation
### In order to increase the price of the house, data scientist should focus more on the variable PRLZ, AVR, RAD.