Multi Linear Regression Assignment With Outliers

Data Description

Size: 21613 obs. of 21 variables:

Attributes:
id - Unique ID for each home sold
date - Date of the home sale
price - Price of each home sold
bedrooms - Number of bedrooms
bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
sqft_living - Square footage of the apartments interior living space
sqft_lot - Square footage of the land space
floors - Number of floors
waterfront - A variable for whether the apartment was overlooking the waterfront or not
view - An index from 0 to 4 of how good the view of the property was
condition - An index from 1 to 5 on the condition of the apartment,
grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
sqft_above - The square footage of the interior housing space that is above ground level
sqft_basement - The square footage of the interior housing space that is below ground level
yr_built - The year the house was initially built
yr_renovated - The year of the house’s last renovation
zipcode - What zipcode area the house is in
lat - Lattitude
long - Longitude
sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors
sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors

Setup

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(corrgram)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

Functions

detect_outliers <- function(inp, na.rm=TRUE) {
  i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
  i.max <- 1.5 * IQR(inp, na.rm=na.rm)
  otp <- inp
  otp[inp < (i.qnt[1] - i.max)] <- NA
  otp[inp > (i.qnt[2] + i.max)] <- NA
  #inp <- count(inp[is.na(otp)])
  sum(is.na(otp))
}

Except_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}

Remove_Outliers <- function ( z, na.rm = TRUE){
 Out <- Except_outliers(z)
 Out <-as.data.frame (Out)
 z <- Out$Out[match(z, Out$Out)]
 z
}

Graph_Boxplot <- function (input, na.rm = TRUE){
Plot <- ggplot(dfrModel, aes(x="", y=input)) +
            geom_boxplot(aes(fill=input), color="green") +
            labs(title="Outliers")
Plot
}

Dataset

dfrModel <- read.csv("./house-data.csv", header=T, stringsAsFactors=F)
head(dfrModel)

##           id            date   price bedrooms bathrooms sqft_living
## 1 7129300520 20141013T000000  221900        3      1.00        1180
## 2 6414100192 20141209T000000  538000        3      2.25        2570
## 3 5631500400 20150225T000000  180000        2      1.00         770
## 4 2487200875 20141209T000000  604000        4      3.00        1960
## 5 1954400510 20150218T000000  510000        3      2.00        1680
## 6 7237550310 20140512T000000 1225000        4      4.50        5420
##   sqft_lot floors waterfront view condition grade sqft_above sqft_basement
## 1     5650      1          0    0         3     7       1180             0
## 2     7242      2          0    0         3     7       2170           400
## 3    10000      1          0    0         3     6        770             0
## 4     5000      1          0    0         5     7       1050           910
## 5     8080      1          0    0         3     8       1680             0
## 6   101930      1          0    0         3    11       3890          1530
##   yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1     1955            0   98178 47.5112 -122.257          1340       5650
## 2     1951         1991   98125 47.7210 -122.319          1690       7639
## 3     1933            0   98028 47.7379 -122.233          2720       8062
## 4     1965            0   98136 47.5208 -122.393          1360       5000
## 5     1987            0   98074 47.6168 -122.045          1800       7503
## 6     2001            0   98053 47.6561 -122.005          4760     101930

Observation
Since id, unknown let’s drop it.

dfrModel <- select(dfrModel, -c(id))
head(dfrModel)

##              date   price bedrooms bathrooms sqft_living sqft_lot floors
## 1 20141013T000000  221900        3      1.00        1180     5650      1
## 2 20141209T000000  538000        3      2.25        2570     7242      2
## 3 20150225T000000  180000        2      1.00         770    10000      1
## 4 20141209T000000  604000        4      3.00        1960     5000      1
## 5 20150218T000000  510000        3      2.00        1680     8080      1
## 6 20140512T000000 1225000        4      4.50        5420   101930      1
##   waterfront view condition grade sqft_above sqft_basement yr_built
## 1          0    0         3     7       1180             0     1955
## 2          0    0         3     7       2170           400     1951
## 3          0    0         3     6        770             0     1933
## 4          0    0         5     7       1050           910     1965
## 5          0    0         3     8       1680             0     1987
## 6          0    0         3    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930

Summary

#summary(dfrModel)
lapply(dfrModel, FUN=summary)

## $date
##    Length     Class      Mode 
##     21613 character character 
## 
## $price
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   75000  321950  450000  540088  645000 7700000 
## 
## $bedrooms
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   3.000   3.371   4.000  33.000 
## 
## $bathrooms
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.750   2.250   2.115   2.500   8.000 
## 
## $sqft_living
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     290    1427    1910    2080    2550   13540 
## 
## $sqft_lot
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     520    5040    7618   15107   10688 1651359 
## 
## $floors
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.500   1.494   2.000   3.500 
## 
## $waterfront
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.007542 0.000000 1.000000 
## 
## $view
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2343  0.0000  4.0000 
## 
## $condition
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.000   3.409   4.000   5.000 
## 
## $grade
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.000   7.000   7.657   8.000  13.000 
## 
## $sqft_above
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     290    1190    1560    1788    2210    9410 
## 
## $sqft_basement
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0   291.5   560.0  4820.0 
## 
## $yr_built
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1900    1951    1975    1971    1997    2015 
## 
## $yr_renovated
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0    84.4     0.0  2015.0 
## 
## $zipcode
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   98001   98033   98065   98078   98118   98199 
## 
## $lat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.16   47.47   47.57   47.56   47.68   47.78 
## 
## $long
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -122.5  -122.3  -122.2  -122.2  -122.1  -121.3 
## 
## $sqft_living15
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     399    1490    1840    1987    2360    6210 
## 
## $sqft_lot15
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     651    5100    7620   12768   10083  871200

Data Cleaning

dfrModel$Sold_Yr <- substr(dfrModel$date, 1, 4)
dfrModel$Sold_Yr <- as.integer(dfrModel$Sold_Yr)
dfrModel$Sold_Month <- substr(dfrModel$date, 5, 6)
dfrModel$Sold_Month <- as.integer(dfrModel$Sold_Month)
dfrModel$yrs_Old <- (dfrModel$Sold_Yr - dfrModel$yr_built)
dfrModel <- select(dfrModel, -c(date))

dfrModel$renovated <- ifelse(dfrModel$yr_renovated == 0, 0, 1)

#dfrModel$sqft_lot_new <- (dfrModel$sqft_lot+dfrModel$sqft_lot15)/2
#dfrModel$sqft_living_new <- (dfrModel$sqft_living+dfrModel$sqft_living15)/2
#dfrModel$sqft_area <- (dfrModel$sqft_above+dfrModel$sqft_basement)/2
#dfrModel <- select(dfrModel, -c(sqft_lot, sqft_lot15, sqft_living, sqft_living15, sqft_above, sqft_basement, date))

#dfrModel$price <- Remove_Outliers(dfrModel$price)
#dfrModel$sqft_lot_new <- Remove_Outliers(dfrModel$sqft_lot_new)
#dfrModel <- dfrModel[complete.cases(dfrModel), ]
head(dfrModel)

##     price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 1  221900        3      1.00        1180     5650      1          0    0
## 2  538000        3      2.25        2570     7242      2          0    0
## 3  180000        2      1.00         770    10000      1          0    0
## 4  604000        4      3.00        1960     5000      1          0    0
## 5  510000        3      2.00        1680     8080      1          0    0
## 6 1225000        4      4.50        5420   101930      1          0    0
##   condition grade sqft_above sqft_basement yr_built yr_renovated zipcode
## 1         3     7       1180             0     1955            0   98178
## 2         3     7       2170           400     1951         1991   98125
## 3         3     6        770             0     1933            0   98028
## 4         5     7       1050           910     1965            0   98136
## 5         3     8       1680             0     1987            0   98074
## 6         3    11       3890          1530     2001            0   98053
##       lat     long sqft_living15 sqft_lot15 Sold_Yr Sold_Month yrs_Old
## 1 47.5112 -122.257          1340       5650    2014         10      59
## 2 47.7210 -122.319          1690       7639    2014         12      63
## 3 47.7379 -122.233          2720       8062    2015          2      82
## 4 47.5208 -122.393          1360       5000    2014         12      49
## 5 47.6168 -122.045          1800       7503    2015          2      28
## 6 47.6561 -122.005          4760     101930    2014          5      13
##   renovated
## 1         0
## 2         1
## 3         0
## 4         0
## 5         0
## 6         0

Observation
Below New Columns have been introduced which may affect the price of the house.
1. Year and Month of Sold from Date
2. No. of years house old, subracting year of sold and year of Built
3. Renovated or not renovated (Categorical 0 and)

#detect_outliers(dfrModel$bedrooms)
#detect_outliers(dfrModel$sqft_living15)
lapply(dfrModel, FUN=detect_outliers)

## $price
## [1] 1146
## 
## $bedrooms
## [1] 546
## 
## $bathrooms
## [1] 571
## 
## $sqft_living
## [1] 572
## 
## $sqft_lot
## [1] 2425
## 
## $floors
## [1] 0
## 
## $waterfront
## [1] 163
## 
## $view
## [1] 2124
## 
## $condition
## [1] 30
## 
## $grade
## [1] 1911
## 
## $sqft_above
## [1] 611
## 
## $sqft_basement
## [1] 496
## 
## $yr_built
## [1] 0
## 
## $yr_renovated
## [1] 914
## 
## $zipcode
## [1] 0
## 
## $lat
## [1] 2
## 
## $long
## [1] 256
## 
## $sqft_living15
## [1] 544
## 
## $sqft_lot15
## [1] 2194
## 
## $Sold_Yr
## [1] 0
## 
## $Sold_Month
## [1] 0
## 
## $yrs_Old
## [1] 0
## 
## $renovated
## [1] 914

** Obsevations**
We can see that no. of Outliers are more.
For this model we are going with Outliers.

lapply(dfrModel, FUN=Graph_Boxplot)

## $price

## 
## $bedrooms

## 
## $bathrooms

## 
## $sqft_living

## 
## $sqft_lot

## 
## $floors

## 
## $waterfront

## 
## $view

## 
## $condition

## 
## $grade

## 
## $sqft_above

## 
## $sqft_basement

## 
## $yr_built

## 
## $yr_renovated

## 
## $zipcode

## 
## $lat

## 
## $long

## 
## $sqft_living15

## 
## $sqft_lot15

## 
## $Sold_Yr

## 
## $Sold_Month

## 
## $yrs_Old

## 
## $renovated

Observation
Here we see that many predictor variables numeric categoric variable. We should convert these to factor variables

dfrModel1<-dfrModel
dfrModel1$waterfront <- as.factor(dfrModel$waterfront)
levels(dfrModel1$waterfront) <- c("No-Waterfront", "Yes-Waterfront")
dfrModel1$view <- as.factor(dfrModel$view)
levels(dfrModel1$view) <- c("Bad View", "Average View", "Good View", "Very Good View", "Excellent View")
dfrModel1$condition <- as.factor(dfrModel$condition)
levels(dfrModel1$condition) <- c("Bad Condition", "Average Condition", "Good Condition", "Very Good Condition", "Excellent Condition")
dfrModel1$grade <- as.factor(dfrModel$grade)
head(dfrModel1)

##     price bedrooms bathrooms sqft_living sqft_lot floors    waterfront
## 1  221900        3      1.00        1180     5650      1 No-Waterfront
## 2  538000        3      2.25        2570     7242      2 No-Waterfront
## 3  180000        2      1.00         770    10000      1 No-Waterfront
## 4  604000        4      3.00        1960     5000      1 No-Waterfront
## 5  510000        3      2.00        1680     8080      1 No-Waterfront
## 6 1225000        4      4.50        5420   101930      1 No-Waterfront
##       view           condition grade sqft_above sqft_basement yr_built
## 1 Bad View      Good Condition     7       1180             0     1955
## 2 Bad View      Good Condition     7       2170           400     1951
## 3 Bad View      Good Condition     6        770             0     1933
## 4 Bad View Excellent Condition     7       1050           910     1965
## 5 Bad View      Good Condition     8       1680             0     1987
## 6 Bad View      Good Condition    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15 Sold_Yr
## 1            0   98178 47.5112 -122.257          1340       5650    2014
## 2         1991   98125 47.7210 -122.319          1690       7639    2014
## 3            0   98028 47.7379 -122.233          2720       8062    2015
## 4            0   98136 47.5208 -122.393          1360       5000    2014
## 5            0   98074 47.6168 -122.045          1800       7503    2015
## 6            0   98053 47.6561 -122.005          4760     101930    2014
##   Sold_Month yrs_Old renovated
## 1         10      59         0
## 2         12      63         1
## 3          2      82         0
## 4         12      49         0
## 5          2      28         0
## 6          5      13         0

Correlation

vctCorr = numeric(0)
for (i in names(dfrModel)){
cor.result <- cor(dfrModel$price, as.numeric(dfrModel[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr

##         price      bedrooms     bathrooms   sqft_living      sqft_lot 
##   1.000000000   0.308349598   0.525137505   0.702035052   0.089660860 
##        floors    waterfront          view     condition         grade 
##   0.256793884   0.266369434   0.397293487   0.036361791   0.667434253 
##    sqft_above sqft_basement      yr_built  yr_renovated       zipcode 
##   0.605567295   0.323816022   0.054011527   0.126433794  -0.053202851 
##           lat          long sqft_living15    sqft_lot15       Sold_Yr 
##   0.307003482   0.021626239   0.585378902   0.082447152   0.003576038 
##    Sold_Month       yrs_Old     renovated 
##  -0.010080809  -0.053950773   0.126092263

cor(dfrModel$sqft_living, dfrModel$sqft_living15)

## [1] 0.7564203

cor(dfrModel$sqft_lot, dfrModel$sqft_lot15)

## [1] 0.7185568

cor(dfrModel$sqft_above, dfrModel$sqft_basement)

## [1] -0.05194331

cor(dfrModel$sqft_above, dfrModel$sqft_living)

## [1] 0.8765966

cor(dfrModel$sqft_basement, dfrModel$sqft_living)

## [1] 0.435043

cor(dfrModel$bedrooms, dfrModel$bathrooms)

## [1] 0.5158836

cor (dfrModel$view, dfrModel$grade)

## [1] 0.2513206

** Observations**
As given above after checking the correlation between the Dependent (Price) variable and each independent variable, there are may variables for which correlation is very low.
As per Regression model Assumptions correlation should not be low to continue with the model, So we are removing all the columns for which correlation is less than 0.30.
Below are the columns which are going to remove,
sqft_lot, Floors, Waterfront, Condition, Yr_Built, Yr_Renovated, Zipcode, Long,sqft_lot15, Sold_Yr, Sold_Month, Yrs_Old, Renovate etc.

As well as we can see that correlation between sqft_living and sqft_living15 is high so both are dependent, so as per assumption of regression we can not take both variables, so we are only taking with whom correlation of price is more which is sqft_living.
Same for
correlation for sqft_living and sqft_above 0.8765 so need to drop sqft_above
correlation for bedrooms and bathrooms is 0.5158 so need to drop bedrooms

Data Cleaning 2

dfrModel <- select(dfrModel, -c(sqft_lot, floors, waterfront, condition, yr_built, yr_renovated, zipcode, long, sqft_lot15, Sold_Yr, Sold_Month, yrs_Old, renovated, sqft_living15, sqft_above,sqft_basement, bedrooms))

head(dfrModel)

##     price bathrooms sqft_living view grade     lat
## 1  221900      1.00        1180    0     7 47.5112
## 2  538000      2.25        2570    0     7 47.7210
## 3  180000      1.00         770    0     6 47.7379
## 4  604000      3.00        1960    0     7 47.5208
## 5  510000      2.00        1680    0     8 47.6168
## 6 1225000      4.50        5420    0    11 47.6561

Summary

#summary(dfrModel)
lapply(dfrModel, FUN=summary)

## $price
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   75000  321950  450000  540088  645000 7700000 
## 
## $bathrooms
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.750   2.250   2.115   2.500   8.000 
## 
## $sqft_living
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     290    1427    1910    2080    2550   13540 
## 
## $view
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2343  0.0000  4.0000 
## 
## $grade
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.000   7.000   7.657   8.000  13.000 
## 
## $lat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.16   47.47   47.57   47.56   47.68   47.78

#detect_outliers(dfrModel$bedrooms)
#detect_outliers(dfrModel$sqft_living15)
lapply(dfrModel, FUN=detect_outliers)

## $price
## [1] 1146
## 
## $bathrooms
## [1] 571
## 
## $sqft_living
## [1] 572
## 
## $view
## [1] 2124
## 
## $grade
## [1] 1911
## 
## $lat
## [1] 2

lapply(dfrModel, FUN=Graph_Boxplot)

## $price

## 
## $bathrooms

## 
## $sqft_living

## 
## $view

## 
## $grade

## 
## $lat

Observation
Outliers present in many features.
For this Model we are not removing Outliers.

head(dfrModel)

##     price bathrooms sqft_living view grade     lat
## 1  221900      1.00        1180    0     7 47.5112
## 2  538000      2.25        2570    0     7 47.7210
## 3  180000      1.00         770    0     6 47.7379
## 4  604000      3.00        1960    0     7 47.5208
## 5  510000      2.00        1680    0     8 47.6168
## 6 1225000      4.50        5420    0    11 47.6561

Visualize

dfrGraph <- gather(dfrModel, variable, value, -price)
head(dfrGraph)

##     price  variable value
## 1  221900 bathrooms  1.00
## 2  538000 bathrooms  2.25
## 3  180000 bathrooms  1.00
## 4  604000 bathrooms  3.00
## 5  510000 bathrooms  2.00
## 6 1225000 bathrooms  4.50

ggplot(dfrGraph) +
geom_jitter(aes(value,price, colour=variable)) + 
geom_smooth(aes(value,price, colour=variable), method=lm, se=FALSE) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Price With Other Features")

Observation
There is some impact of all the features with Price.

Find Best Multi Linear Model
Choose the best linear model by using step(). Choose a model by AIC in a Stepwise Algorithm
In statistics, stepwise regression is a method of fitting regression models in which the choice of predictive variables is carried out by an automatic procedure. In each step, a variable is considered for addition to or subtraction from the set of explanatory variables based on some prespecified criterion.
The Akaike information criterion (AIC) is a measure of the relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models. Hence, AIC provides a means for model selection.

#?step()
stpModel=step(lm(data=dfrModel, price~.), trace=0, steps=10000)
stpSummary <- summary(stpModel)
stpSummary

## 
## Call:
## lm(formula = price ~ bathrooms + sqft_living + view + grade + 
##     lat, data = dfrModel)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1209579  -111673   -15912    77148  4681081 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.227e+07  5.202e+05  -62.03  < 2e-16 ***
## bathrooms   -2.084e+04  3.056e+03   -6.82 9.32e-12 ***
## sqft_living  1.848e+02  2.988e+00   61.84  < 2e-16 ***
## view         9.879e+04  2.056e+03   48.05  < 2e-16 ***
## grade        8.225e+04  2.046e+03   40.21  < 2e-16 ***
## lat          6.690e+05  1.097e+04   61.01  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 221300 on 21607 degrees of freedom
## Multiple R-squared:  0.6367, Adjusted R-squared:  0.6367 
## F-statistic:  7575 on 5 and 21607 DF,  p-value: < 2.2e-16

Observation
Best results given by price ~ bathrooms + view + grade + lat + sqft_living.

Multiple R square value is also 0.6367 which is showing it as a average to good model.

Make Final Multi Linear Model

#x1 <- dfrModel$bedrooms
x2 <- dfrModel$bathrooms
x3 <- dfrModel$view
x4 <- dfrModel$grade
x5 <- dfrModel$lat
#x6 <- dfrModel$sqft_above
x7 <- dfrModel$sqft_living
y <- dfrModel$price
slmModel <- lm(y~x2+x3+x4+x5+x7, data=dfrModel)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)

## 
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x5 + x7, data = dfrModel)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1209579  -111673   -15912    77148  4681081 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.227e+07  5.202e+05  -62.03  < 2e-16 ***
## x2          -2.084e+04  3.056e+03   -6.82 9.32e-12 ***
## x3           9.879e+04  2.056e+03   48.05  < 2e-16 ***
## x4           8.225e+04  2.046e+03   40.21  < 2e-16 ***
## x5           6.690e+05  1.097e+04   61.01  < 2e-16 ***
## x7           1.848e+02  2.988e+00   61.84  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 221300 on 21607 degrees of freedom
## Multiple R-squared:  0.6367, Adjusted R-squared:  0.6367 
## F-statistic:  7575 on 5 and 21607 DF,  p-value: < 2.2e-16

Test Data

# find mpg of a person with weight 3.0
dfrTest <- data.frame(x2=c(1, 1.5, 2.25),x3=c(0, 1, 0), x4=c(7,10,7), x5=c(47.5112, 47.721, 47.721), x7=c(1180, 1300, 2570))
dfrTest

##     x2 x3 x4      x5   x7
## 1 1.00  0  7 47.5112 1180
## 2 1.50  1 10 47.7210 1300
## 3 2.25  0  7 47.7210 2570

#names(dfrTest) <- c("x1","x2","x3")
#dfrTest

Observation
Test Data successfully created.

Predict

result <-  predict(slmModel, dfrTest)
print(result)

##        1        2        3 
## 287196.8 784824.0 658320.8

** Observation**
1. Prediction is working fine but have some deviation from true values as r squared values was less.

Multi Linear Regression Assignment With Outliers

Monesh Sharma

July 23, 2017

Problem Definition

Data Location

Data Description

Summary