library(MASS)
library(ggplot2)
library(pastecs)
## Loading required package: boot
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:boot':
## 
##     logit
library(qpcR)
## Loading required package: minpack.lm
## Loading required package: rgl
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:boot':
## 
##     salinity
## Loading required package: Matrix
library(MPV)
## 
## Attaching package: 'MPV'
## The following object is masked from 'package:qpcR':
## 
##     PRESS
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     stackloss
library(lattice)
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
library(stats)
library(foreign)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:robustbase':
## 
##     cushny
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:boot':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:robustbase':
## 
##     heart
## The following object is masked from 'package:boot':
## 
##     aml
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(HSAUR)
## Loading required package: tools
## 
## Attaching package: 'HSAUR'
## The following object is masked from 'package:robustbase':
## 
##     epilepsy
library(MVN)
## sROC 0.1-2 loaded
## 
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
## 
##     mardia
library(caTools) # for spliting the sample
#to remove all the files or data set from the r environment
#rm(list=ls())


#to save any file in the R environment into csv to the working directory

#write.table(Boston, file = "boston.csv", sep = ",", col.names = NA)

#save the data package file in txt format
#write.table(reg1, "reg2.txt", sep="\t")

reg1<-read.csv("E:/4 Inurture/2016-17/Final Project/nitesh/nitesh dallu3.csv")

spliting the data into test and validation

set.seed(1000)
split1 = sample.split(reg1$id, SplitRatio = 0.75)
summary(split1)
##    Mode   FALSE    TRUE    NA's 
## logical  104485  313454       0
# create a sub sample for the same 

reg1train<-subset(reg1,split1==TRUE)
reg1Val<-subset(reg1,split1==FALSE)

summary(reg1train)
##        id              egkm            mkg             wmm       
##  Min.   :     1   Min.   :  0     Min.   : 390    Min.   : 500   
##  1st Qu.:104480   1st Qu.:115     1st Qu.:1320    1st Qu.:2600   
##  Median :208908   Median :132     Median :1503    Median :2685   
##  Mean   :208954   Mean   :139     Mean   :1534    Mean   :2702   
##  3rd Qu.:313444   3rd Qu.:154     3rd Qu.:1695    3rd Qu.:2810   
##  Max.   :417939   Max.   :598     Max.   :4670    Max.   :6000   
##                   NA's   :19127   NA's   :18660   NA's   :35479  
##      at1mm           at2mm           eccm3            epkw       
##  Min.   : 155    Min.   : 500    Min.   :  97    Min.   :   1.0  
##  1st Qu.:1531    1st Qu.:1508    1st Qu.:1461    1st Qu.:  80.0  
##  Median :1554    Median :1551    Median :1798    Median : 103.0  
##  Mean   :1587    Mean   :1583    Mean   :1879    Mean   : 117.2  
##  3rd Qu.:1585    3rd Qu.:1586    3rd Qu.:1997    3rd Qu.: 132.0  
##  Max.   :4325    Max.   :4325    Max.   :8382    Max.   :1461.0  
##  NA's   :36990   NA's   :47043   NA's   :20486   NA's   :82357   
##        MS              MP              FT       
##  Min.   : 1.00   Min.   : 1.00   Min.   :1.000  
##  1st Qu.: 6.00   1st Qu.: 4.00   1st Qu.:1.000  
##  Median :11.00   Median :10.00   Median :1.000  
##  Mean   :12.17   Mean   : 8.46   Mean   :1.685  
##  3rd Qu.:18.00   3rd Qu.:14.00   3rd Qu.:2.000  
##  Max.   :28.00   Max.   :14.00   Max.   :5.000  
##                                  NA's   :364
summary(reg1Val)
##        id              egkm            mkg            wmm       
##  Min.   :     6   Min.   :  0.0   Min.   : 417   Min.   : 920   
##  1st Qu.:104509   1st Qu.:115.0   1st Qu.:1322   1st Qu.:2600   
##  Median :209116   Median :132.0   Median :1503   Median :2685   
##  Mean   :209019   Mean   :138.9   Mean   :1534   Mean   :2702   
##  3rd Qu.:313480   3rd Qu.:154.0   3rd Qu.:1695   3rd Qu.:2810   
##  Max.   :417930   Max.   :548.0   Max.   :4501   Max.   :5605   
##                   NA's   :6377    NA's   :6252   NA's   :11795  
##      at1mm           at2mm           eccm3           epkw       
##  Min.   : 501    Min.   : 480    Min.   : 197   Min.   :   5.0  
##  1st Qu.:1531    1st Qu.:1508    1st Qu.:1461   1st Qu.:  80.0  
##  Median :1554    Median :1551    Median :1798   Median : 103.0  
##  Mean   :1588    Mean   :1585    Mean   :1879   Mean   : 117.3  
##  3rd Qu.:1585    3rd Qu.:1586    3rd Qu.:1997   3rd Qu.: 133.0  
##  Max.   :4325    Max.   :4325    Max.   :8382   Max.   :1770.0  
##  NA's   :12247   NA's   :15548   NA's   :6842   NA's   :27523   
##        MS              MP               FT       
##  Min.   : 1.00   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 6.00   1st Qu.: 4.000   1st Qu.:1.000  
##  Median :11.00   Median :10.000   Median :1.000  
##  Mean   :12.17   Mean   : 8.468   Mean   :1.684  
##  3rd Qu.:18.00   3rd Qu.:14.000   3rd Qu.:2.000  
##  Max.   :28.00   Max.   :14.000   Max.   :5.000  
##                                   NA's   :107

Finding the model

#stepwise regression model
library(leaps)
fit1<-regsubsets(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1train,nbest=1)
# view results 
summary(fit1)
## Subset selection object
## Call: regsubsets.formula(egkm ~ mkg + wmm + at1mm + at2mm + eccm3 + 
##     epkw, data = reg1train, nbest = 1)
## 6 Variables  (and intercept)
##       Forced in Forced out
## mkg       FALSE      FALSE
## wmm       FALSE      FALSE
## at1mm     FALSE      FALSE
## at2mm     FALSE      FALSE
## eccm3     FALSE      FALSE
## epkw      FALSE      FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
##          mkg wmm at1mm at2mm eccm3 epkw
## 1  ( 1 ) " " " " " "   " "   "*"   " " 
## 2  ( 1 ) "*" " " " "   " "   " "   "*" 
## 3  ( 1 ) "*" " " " "   " "   "*"   "*" 
## 4  ( 1 ) "*" " " " "   "*"   "*"   "*" 
## 5  ( 1 ) "*" " " "*"   "*"   "*"   "*" 
## 6  ( 1 ) "*" "*" "*"   "*"   "*"   "*"
plot(fit1, scale ="r2")

fitting the best model and checking the multicollinearity

fit<-lm(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1train)

#fit<-lm(egkm ~ mkg+wmm+at2mm+eccm3+epkw, data=reg1train)
#fit<-lm(egkm ~ mkg+wmm+at1mm+eccm3+epkw, data=reg1train)

summary(fit)
## 
## Call:
## lm(formula = egkm ~ mkg + wmm + at1mm + at2mm + eccm3 + epkw, 
##     data = reg1train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -284.87  -16.16   -1.50   13.38  462.36 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.3583902  0.8393361  44.509  < 2e-16 ***
## mkg          0.0430914  0.0003393 127.006  < 2e-16 ***
## wmm         -0.0027376  0.0004039  -6.778 1.22e-11 ***
## at1mm       -0.0267420  0.0015466 -17.291  < 2e-16 ***
## at2mm        0.0311850  0.0015453  20.181  < 2e-16 ***
## eccm3        0.0070140  0.0001828  38.365  < 2e-16 ***
## epkw         0.2001550  0.0019445 102.936  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.44 on 202266 degrees of freedom
##   (111181 observations deleted due to missingness)
## Multiple R-squared:  0.5692, Adjusted R-squared:  0.5692 
## F-statistic: 4.454e+04 on 6 and 202266 DF,  p-value: < 2.2e-16
vif(fit)
##       mkg       wmm     at1mm     at2mm     eccm3      epkw 
##  4.268903  2.867027 57.223503 58.463379  5.808925  4.632971
#no multicollinearity

residual analysis

fit2<-lm(egkm ~ mkg+wmm+at1mm+eccm3+epkw, data=reg1train)
fit2<-lm(egkm ~ mkg+wmm+at2mm+eccm3+epkw, data=reg1train)

summary(fit2)

#plot to find randomness of the residuals
plot(fit2$residuals, c(1:length(fit2$residuals)))


#normality for residual values
qqnorm(fit2$residuals, pch=20, cex=1)
qqline(fit2$residuals, col="red",lwd=3)
boxplot(fit2$residuals)
#sapiro test has to be applied

#there seems to be a lot of outliers which are creating the problem of non normality of residuals

#autocorrelation
dwtest(fit2)

#test for heteroscadisticity 
plot(fit2$residuals, fit2$fitted.values)

#from the plot it is not easy to analyze the pattern of the data so we will apply bptest

bptest(fit2)
#estimating the model for the validation data set
fit3<-lm(egkm ~ mkg+wmm+at1mm+at2mm+eccm3+epkw, data=reg1Val)

summary(fit3)
## 
## Call:
## lm(formula = egkm ~ mkg + wmm + at1mm + at2mm + eccm3 + epkw, 
##     data = reg1Val)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -328.62  -16.18   -1.56   13.54  236.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.3046241  1.4484978  25.754  < 2e-16 ***
## mkg          0.0425349  0.0005891  72.208  < 2e-16 ***
## wmm         -0.0022165  0.0006969  -3.181  0.00147 ** 
## at1mm       -0.0290895  0.0026897 -10.815  < 2e-16 ***
## at2mm        0.0331733  0.0026870  12.346  < 2e-16 ***
## eccm3        0.0073950  0.0003138  23.566  < 2e-16 ***
## epkw         0.1940894  0.0032857  59.072  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.47 on 67514 degrees of freedom
##   (36964 observations deleted due to missingness)
## Multiple R-squared:  0.5625, Adjusted R-squared:  0.5625 
## F-statistic: 1.447e+04 on 6 and 67514 DF,  p-value: < 2.2e-16
# plot a table of models showing variables in each model.
# models are ordered by the selection statistic.

validity of the model

#compare the best estimated model using step wise regression model of the test data and estimating the same model from the validation data set

#compare the model fit2 and fit2 with respect to R square, adjusted r square and regression coefficients

since there is problem of autocorrelation and heteroscadisticity, it seems that there should be more variables

Durbin-Watson test show that there is some autocorrelation remaining in the residuals. This means there is some information remaining in the residuals that can be exploited to obtain better forecasts. The forecasts from the current model are still unbiased, but will have larger prediction intervals than they need to.

Heteroscadisticity is the basic assumption for using Least square method for estimation of coefficients, if there is heteroscadisticity means that the coefficients are not estimated properly, still we can consider them by increasing the confidence interval.