library(MASS)
library(ggplot2)
library(pastecs)
## Loading required package: boot
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
library(qpcR)
## Loading required package: minpack.lm
## Loading required package: rgl
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:boot':
##
## salinity
## Loading required package: Matrix
library(MPV)
##
## Attaching package: 'MPV'
## The following object is masked from 'package:qpcR':
##
## PRESS
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## stackloss
library(lattice)
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(stats)
library(foreign)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:robustbase':
##
## cushny
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:boot':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:robustbase':
##
## heart
## The following object is masked from 'package:boot':
##
## aml
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(HSAUR)
## Loading required package: tools
##
## Attaching package: 'HSAUR'
## The following object is masked from 'package:robustbase':
##
## epilepsy
library(MVN)
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
library(caTools) # for spliting the sample
#to remove all the files or data set from the r environment
#rm(list=ls())
#to save any file in the R environment into csv to the working directory
#write.table(Boston, file = "boston.csv", sep = ",", col.names = NA)
#save the data package file in txt format
#write.table(reg1, "reg2.txt", sep="\t")
reg1<-read.csv("E:/4 Inurture/2016-17/Final Project/nitesh/nitesh dallu3.csv")
set.seed(1000)
split1 = sample.split(reg1$id, SplitRatio = 0.75)
summary(split1)
## Mode FALSE TRUE NA's
## logical 104485 313454 0
# create a sub sample for the same
reg1train<-subset(reg1,split1==TRUE)
reg1Val<-subset(reg1,split1==FALSE)
summary(reg1train)
## id egkm mkg wmm
## Min. : 1 Min. : 0 Min. : 390 Min. : 500
## 1st Qu.:104480 1st Qu.:115 1st Qu.:1320 1st Qu.:2600
## Median :208908 Median :132 Median :1503 Median :2685
## Mean :208954 Mean :139 Mean :1534 Mean :2702
## 3rd Qu.:313444 3rd Qu.:154 3rd Qu.:1695 3rd Qu.:2810
## Max. :417939 Max. :598 Max. :4670 Max. :6000
## NA's :19127 NA's :18660 NA's :35479
## at1mm at2mm eccm3 epkw
## Min. : 155 Min. : 500 Min. : 97 Min. : 1.0
## 1st Qu.:1531 1st Qu.:1508 1st Qu.:1461 1st Qu.: 80.0
## Median :1554 Median :1551 Median :1798 Median : 103.0
## Mean :1587 Mean :1583 Mean :1879 Mean : 117.2
## 3rd Qu.:1585 3rd Qu.:1586 3rd Qu.:1997 3rd Qu.: 132.0
## Max. :4325 Max. :4325 Max. :8382 Max. :1461.0
## NA's :36990 NA's :47043 NA's :20486 NA's :82357
## MS MP FT
## Min. : 1.00 Min. : 1.00 Min. :1.000
## 1st Qu.: 6.00 1st Qu.: 4.00 1st Qu.:1.000
## Median :11.00 Median :10.00 Median :1.000
## Mean :12.17 Mean : 8.46 Mean :1.685
## 3rd Qu.:18.00 3rd Qu.:14.00 3rd Qu.:2.000
## Max. :28.00 Max. :14.00 Max. :5.000
## NA's :364
summary(reg1Val)
## id egkm mkg wmm
## Min. : 6 Min. : 0.0 Min. : 417 Min. : 920
## 1st Qu.:104509 1st Qu.:115.0 1st Qu.:1322 1st Qu.:2600
## Median :209116 Median :132.0 Median :1503 Median :2685
## Mean :209019 Mean :138.9 Mean :1534 Mean :2702
## 3rd Qu.:313480 3rd Qu.:154.0 3rd Qu.:1695 3rd Qu.:2810
## Max. :417930 Max. :548.0 Max. :4501 Max. :5605
## NA's :6377 NA's :6252 NA's :11795
## at1mm at2mm eccm3 epkw
## Min. : 501 Min. : 480 Min. : 197 Min. : 5.0
## 1st Qu.:1531 1st Qu.:1508 1st Qu.:1461 1st Qu.: 80.0
## Median :1554 Median :1551 Median :1798 Median : 103.0
## Mean :1588 Mean :1585 Mean :1879 Mean : 117.3
## 3rd Qu.:1585 3rd Qu.:1586 3rd Qu.:1997 3rd Qu.: 133.0
## Max. :4325 Max. :4325 Max. :8382 Max. :1770.0
## NA's :12247 NA's :15548 NA's :6842 NA's :27523
## MS MP FT
## Min. : 1.00 Min. : 1.000 Min. :1.000
## 1st Qu.: 6.00 1st Qu.: 4.000 1st Qu.:1.000
## Median :11.00 Median :10.000 Median :1.000
## Mean :12.17 Mean : 8.468 Mean :1.684
## 3rd Qu.:18.00 3rd Qu.:14.000 3rd Qu.:2.000
## Max. :28.00 Max. :14.000 Max. :5.000
## NA's :107
#stepwise regression model
library(leaps)
fit1<-regsubsets(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1train,nbest=1)
# view results
summary(fit1)
## Subset selection object
## Call: regsubsets.formula(egkm ~ mkg + wmm + at1mm + at2mm + eccm3 +
## epkw, data = reg1train, nbest = 1)
## 6 Variables (and intercept)
## Forced in Forced out
## mkg FALSE FALSE
## wmm FALSE FALSE
## at1mm FALSE FALSE
## at2mm FALSE FALSE
## eccm3 FALSE FALSE
## epkw FALSE FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
## mkg wmm at1mm at2mm eccm3 epkw
## 1 ( 1 ) " " " " " " " " "*" " "
## 2 ( 1 ) "*" " " " " " " " " "*"
## 3 ( 1 ) "*" " " " " " " "*" "*"
## 4 ( 1 ) "*" " " " " "*" "*" "*"
## 5 ( 1 ) "*" " " "*" "*" "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" "*" "*"
plot(fit1, scale ="r2")
fit<-lm(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1train)
#fit<-lm(egkm ~ mkg+wmm+at2mm+eccm3+epkw, data=reg1train)
#fit<-lm(egkm ~ mkg+wmm+at1mm+eccm3+epkw, data=reg1train)
summary(fit)
##
## Call:
## lm(formula = egkm ~ mkg + wmm + at1mm + at2mm + eccm3 + epkw,
## data = reg1train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -284.87 -16.16 -1.50 13.38 462.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.3583902 0.8393361 44.509 < 2e-16 ***
## mkg 0.0430914 0.0003393 127.006 < 2e-16 ***
## wmm -0.0027376 0.0004039 -6.778 1.22e-11 ***
## at1mm -0.0267420 0.0015466 -17.291 < 2e-16 ***
## at2mm 0.0311850 0.0015453 20.181 < 2e-16 ***
## eccm3 0.0070140 0.0001828 38.365 < 2e-16 ***
## epkw 0.2001550 0.0019445 102.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.44 on 202266 degrees of freedom
## (111181 observations deleted due to missingness)
## Multiple R-squared: 0.5692, Adjusted R-squared: 0.5692
## F-statistic: 4.454e+04 on 6 and 202266 DF, p-value: < 2.2e-16
vif(fit)
## mkg wmm at1mm at2mm eccm3 epkw
## 4.268903 2.867027 57.223503 58.463379 5.808925 4.632971
#no multicollinearity
fit2<-lm(egkm ~ mkg+wmm+at1mm+eccm3+epkw, data=reg1train)
fit2<-lm(egkm ~ mkg+wmm+at2mm+eccm3+epkw, data=reg1train)
summary(fit2)
#plot to find randomness of the residuals
plot(fit2$residuals, c(1:length(fit2$residuals)))
#normality for residual values
qqnorm(fit2$residuals, pch=20, cex=1)
qqline(fit2$residuals, col="red",lwd=3)
boxplot(fit2$residuals)
#sapiro test has to be applied
#there seems to be a lot of outliers which are creating the problem of non normality of residuals
#autocorrelation
dwtest(fit2)
#test for heteroscadisticity
plot(fit2$residuals, fit2$fitted.values)
#from the plot it is not easy to analyze the pattern of the data so we will apply bptest
bptest(fit2)
#estimating the model for the validation data set
fit3<-lm(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1Val)
summary(fit3)
##
## Call:
## lm(formula = egkm ~ mkg + wmm + at1mm + at2mm + eccm3 + epkw,
## data = reg1Val)
##
## Residuals:
## Min 1Q Median 3Q Max
## -328.62 -16.18 -1.56 13.54 236.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.3046241 1.4484978 25.754 < 2e-16 ***
## mkg 0.0425349 0.0005891 72.208 < 2e-16 ***
## wmm -0.0022165 0.0006969 -3.181 0.00147 **
## at1mm -0.0290895 0.0026897 -10.815 < 2e-16 ***
## at2mm 0.0331733 0.0026870 12.346 < 2e-16 ***
## eccm3 0.0073950 0.0003138 23.566 < 2e-16 ***
## epkw 0.1940894 0.0032857 59.072 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.47 on 67514 degrees of freedom
## (36964 observations deleted due to missingness)
## Multiple R-squared: 0.5625, Adjusted R-squared: 0.5625
## F-statistic: 1.447e+04 on 6 and 67514 DF, p-value: < 2.2e-16
# plot a table of models showing variables in each model.
# models are ordered by the selection statistic.
validity of the model
#compare the best estimated model using step wise regression model of the test data and estimating the same model from the validation data set
#compare the model fit2 and fit2 with respect to R square, adjusted r square and regression coefficients
since there is problem of autocorrelation and heteroscadisticity, it seems that there should be more variables
Durbin-Watson test show that there is some autocorrelation remaining in the residuals. This means there is some information remaining in the residuals that can be exploited to obtain better forecasts. The forecasts from the current model are still unbiased, but will have larger prediction intervals than they need to.
Heteroscadisticity is the basic assumption for using Least square method for estimation of coefficients, if there is heteroscadisticity means that the coefficients are not estimated properly, still we can consider them by increasing the confidence interval.