# Read the 50-startups CSV into a data.table
library(data.table)
startup_50 <- fread(
  "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\50_Startups.csv"
)
# Per-column summary statistics
summary(startup_50)
## R&D Spend Administration Marketing Spend State
## Min. : 0 Min. : 51283 Min. : 0 Length:50
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Class :character
## Median : 73051 Median :122700 Median :212716 Mode :character
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
# Sample variance of each numeric column
var(startup_50[["R&D Spend"]])
## [1] 2107017150
var(startup_50[["Administration"]])
## [1] 784997271
var(startup_50[["Marketing Spend"]])
## [1] 14954920097
var(startup_50[["Profit"]])
## [1] 1624588173
# Sample standard deviation of the same columns
sd(startup_50[["R&D Spend"]])
## [1] 45902.26
sd(startup_50[["Administration"]])
## [1] 28017.8
sd(startup_50[["Marketing Spend"]])
## [1] 122290.3
sd(startup_50[["Profit"]])
## [1] 40306.18
# Distinct states present in the data
unique(startup_50$State)
## [1] "New York" "California" "Florida"
# One 0/1 indicator column per state, created directly under its final name.
# Binding unnamed vectors with cbind() and renaming the auto-generated
# V2/V3/V4 columns afterwards depends on data.table's default naming and
# breaks as soon as the column layout changes.
startup_50[, `:=`(
  `New York` = ifelse(State == "New York", 1, 0),
  California = ifelse(State == "California", 1, 0),
  Florida = ifelse(State == "Florida", 1, 0)
)]
# Plotting the data as scatter-plot matrices
# plot(startup_50) # This line errors because the State column holds textual values
plot(startup_50[, !c("State")]) # Still contains the state dummies, which show no visible relationship
plot(startup_50[, !c("State", "New York", "California", "Florida")]) # State and dummy columns removed
library(corpcor)
# Partial correlations derived from the correlation matrix of the numeric columns
cor2pcor(cor(startup_50[, !c("State", "New York", "California", "Florida")]))
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.20852619 0.03890336 0.93477127
## [2,] 0.20852619 1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506 1.00000000 0.23707116
## [4,] 0.93477127 -0.07725021 0.23707116 1.00000000
# Column names after the state indicators were added
names(startup_50)
## [1] "R&D Spend" "Administration" "Marketing Spend" "State"
## [5] "Profit" "New York" "California" "Florida"
# Baseline model: Profit regressed on the three spend columns
Profit_Model <- lm(
  Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50
)
summary(Profit_Model)
##
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
## data = startup_50)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## `R&D Spend` 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## `Marketing Spend` 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# The p-values for Administration and Marketing Spend are greater than 0.05, so next we check for influential records.
library(car)
## Loading required package: carData
# Index plots of the influence diagnostics for the baseline model
influenceIndexPlot(Profit_Model)
# Label the 3 most extreme points. Point-labelling options are passed through
# `id = list(n = ...)`; the old `id.n` argument is not recognised by the
# installed car version and was forwarded to the base plotting functions,
# which only produced "not a graphical parameter" warnings.
# NOTE(review): the table recorded below was produced while `id.n` was being
# ignored; re-run to refresh it.
influencePlot(Profit_Model, id = list(n = 3))
## StudRes Hat CookD
## 46 2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Refit the baseline model without rows 49 and 50, the two records with the
# largest Cook's distance in the influence table
Profit_Model_Inf <- lm(
  Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50[-c(50, 49), ]
)
summary(Profit_Model_Inf)
##
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
## data = startup_50[-c(50, 49), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16252 -4983 -2042 6019 13631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.910e+04 5.917e+03 9.988 6.92e-13 ***
## `R&D Spend` 7.895e-01 3.635e-02 21.718 < 2e-16 ***
## Administration -6.335e-02 4.392e-02 -1.442 0.156
## `Marketing Spend` 1.689e-02 1.353e-02 1.249 0.218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7349 on 44 degrees of freedom
## Multiple R-squared: 0.9627, Adjusted R-squared: 0.9601
## F-statistic: 378.3 on 3 and 44 DF, p-value: < 2.2e-16
# Refit the baseline model on all rows
Profit_Model <- lm(
  Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50
)
# Check the storage type of the Marketing Spend column
class(startup_50[["Marketing Spend"]])
## [1] "numeric"
# Variance inflation factor of each predictor
vif(Profit_Model)
## `R&D Spend` Administration `Marketing Spend`
## 2.468903 1.175091 2.326773
summary(Profit_Model)
##
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
## data = startup_50)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33534 -4795 63 6606 17275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
## `R&D Spend` 8.057e-01 4.515e-02 17.846 < 2e-16 ***
## Administration -2.682e-02 5.103e-02 -0.526 0.602
## `Marketing Spend` 2.723e-02 1.645e-02 1.655 0.105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
## F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
## A VIF above 10 would indicate collinearity among the predictors; every VIF reported above is below 3
## Added-variable plots to check the relationship between each predictor and the output variable
avPlots(Profit_Model)
# Extended model with state indicators. Only two of the three dummies are
# included: together with the intercept, all three are perfectly collinear
# (exactly one of them is 1 in every row), so the third coefficient cannot be
# estimated. Florida is the reference level.
Profit_Model_Revised <- lm(
  Profit ~ `R&D Spend` + Administration + `Marketing Spend` +
    `New York` + California,
  data = startup_50
)
library(MASS)
# Stepwise model selection by AIC, starting from the extended model
stepAIC(Profit_Model_Revised)
## Start: AIC=920.87
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York` +
## California + Florida
##
##
## Step: AIC=920.87
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York` +
## California
##
## Df Sum of Sq RSS AIC
## - California 1 3.0984e+05 3.9206e+09 918.88
## - `New York` 1 4.6296e+05 3.9208e+09 918.88
## - Administration 1 2.3816e+07 3.9442e+09 919.17
## <none> 3.9203e+09 920.87
## - `Marketing Spend` 1 2.2071e+08 4.1410e+09 921.61
## - `R&D Spend` 1 2.6878e+10 3.0799e+10 1021.94
##
## Step: AIC=918.88
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York`
##
## Df Sum of Sq RSS AIC
## - `New York` 1 2.0682e+05 3.9209e+09 916.88
## - Administration 1 2.3662e+07 3.9443e+09 917.18
## <none> 3.9206e+09 918.88
## - `Marketing Spend` 1 2.3000e+08 4.1507e+09 919.73
## - `R&D Spend` 1 2.6901e+10 3.0821e+10 1019.97
##
## Step: AIC=916.88
## Profit ~ `R&D Spend` + Administration + `Marketing Spend`
##
## Df Sum of Sq RSS AIC
## - Administration 1 2.3539e+07 3.9444e+09 915.18
## <none> 3.9209e+09 916.88
## - `Marketing Spend` 1 2.3349e+08 4.1543e+09 917.77
## - `R&D Spend` 1 2.7147e+10 3.1068e+10 1018.37
##
## Step: AIC=915.18
## Profit ~ `R&D Spend` + `Marketing Spend`
##
## Df Sum of Sq RSS AIC
## <none> 3.9444e+09 915.18
## - `Marketing Spend` 1 3.1165e+08 4.2560e+09 916.98
## - `R&D Spend` 1 3.1149e+10 3.5094e+10 1022.46
##
## Call:
## lm(formula = Profit ~ `R&D Spend` + `Marketing Spend`, data = startup_50)
##
## Coefficients:
## (Intercept) `R&D Spend` `Marketing Spend`
## 4.698e+04 7.966e-01 2.991e-02
# Final profit model: the two predictors retained by the AIC-based selection
Profit_Model_Final <- lm(
  Profit ~ `R&D Spend` + `Marketing Spend`,
  data = startup_50
)
summary(Profit_Model_Final)
##
## Call:
## lm(formula = Profit ~ `R&D Spend` + `Marketing Spend`, data = startup_50)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33645 -4632 -414 6484 17097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
## `R&D Spend` 7.966e-01 4.135e-02 19.266 <2e-16 ***
## `Marketing Spend` 2.991e-02 1.552e-02 1.927 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
## F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Standard diagnostic plots for the fitted lm object
plot(Profit_Model_Final)
# Q-Q plot of the residuals with the 5 most extreme points labelled.
# Labelling options are passed through `id = list(n = ...)`; the old `id.n`
# argument is not recognised, so the request for 5 labels was silently dropped.
# NOTE(review): the output recorded below lists only two points because it
# was produced by the old call; re-run to refresh it.
qqPlot(Profit_Model_Final, id = list(n = 5))
## [1] 46 50
# The adjusted R-squared is 0.9483. R&D Spend is highly significant, while Marketing Spend is only marginally significant (p = 0.06).
# Read data from file
library(data.table)
Computer_Data <- fread(
  "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\Computer_Data.csv"
)
# Column names of the raw table
names(Computer_Data)
## [1] "V1" "price" "speed" "hd" "ram" "screen" "cd"
## [8] "multi" "premium" "ads" "trend"
# Column types and first values
str(Computer_Data)
## Classes 'data.table' and 'data.frame': 6259 obs. of 11 variables:
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : int 1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
## $ speed : int 25 33 25 25 33 66 25 50 50 50 ...
## $ hd : int 80 85 170 170 340 340 170 85 210 210 ...
## $ ram : int 4 2 4 8 16 16 4 2 8 4 ...
## $ screen : int 14 14 15 14 14 14 14 14 14 15 ...
## $ cd : chr "no" "no" "no" "no" ...
## $ multi : chr "no" "no" "no" "no" ...
## $ premium: chr "yes" "yes" "yes" "no" ...
## $ ads : int 94 94 94 94 94 94 94 94 94 94 ...
## $ trend : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Creating dummy variables: *_dummy1 is 1 for "yes", *_dummy2 is 1 for "no"
Computer_Data[, `:=`(
  cd_dummy1 = ifelse(cd == "yes", 1, 0),
  multi_dummy1 = ifelse(multi == "yes", 1, 0),
  premium_dummy1 = ifelse(premium == "yes", 1, 0),
  cd_dummy2 = ifelse(cd == "no", 1, 0),
  multi_dummy2 = ifelse(multi == "no", 1, 0),
  premium_dummy2 = ifelse(premium == "no", 1, 0)
)]
# Modelling table: drop the V1 index column and the original text columns
comp_data <- Computer_Data[, !c("V1", "cd", "multi", "premium")]
str(comp_data)
## Classes 'data.table' and 'data.frame': 6259 obs. of 13 variables:
## $ price : int 1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
## $ speed : int 25 33 25 25 33 66 25 50 50 50 ...
## $ hd : int 80 85 170 170 340 340 170 85 210 210 ...
## $ ram : int 4 2 4 8 16 16 4 2 8 4 ...
## $ screen : int 14 14 15 14 14 14 14 14 14 15 ...
## $ ads : int 94 94 94 94 94 94 94 94 94 94 ...
## $ trend : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cd_dummy1 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ multi_dummy1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ premium_dummy1: num 1 1 1 0 1 1 1 1 1 1 ...
## $ cd_dummy2 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ multi_dummy2 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ premium_dummy2: num 0 0 0 1 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Summary statistics for every column of the modelling table
summary(comp_data)
## price speed hd ram
## Min. : 949 Min. : 25.00 Min. : 80.0 Min. : 2.000
## 1st Qu.:1794 1st Qu.: 33.00 1st Qu.: 214.0 1st Qu.: 4.000
## Median :2144 Median : 50.00 Median : 340.0 Median : 8.000
## Mean :2220 Mean : 52.01 Mean : 416.6 Mean : 8.287
## 3rd Qu.:2595 3rd Qu.: 66.00 3rd Qu.: 528.0 3rd Qu.: 8.000
## Max. :5399 Max. :100.00 Max. :2100.0 Max. :32.000
## screen ads trend cd_dummy1
## Min. :14.00 Min. : 39.0 Min. : 1.00 Min. :0.0000
## 1st Qu.:14.00 1st Qu.:162.5 1st Qu.:10.00 1st Qu.:0.0000
## Median :14.00 Median :246.0 Median :16.00 Median :0.0000
## Mean :14.61 Mean :221.3 Mean :15.93 Mean :0.4646
## 3rd Qu.:15.00 3rd Qu.:275.0 3rd Qu.:21.50 3rd Qu.:1.0000
## Max. :17.00 Max. :339.0 Max. :35.00 Max. :1.0000
## multi_dummy1 premium_dummy1 cd_dummy2 multi_dummy2
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.1395 Mean :0.9022 Mean :0.5354 Mean :0.8605
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## premium_dummy2
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.09778
## 3rd Qu.:0.00000
## Max. :1.00000
colnames(comp_data)
## [1] "price" "speed" "hd" "ram"
## [5] "screen" "ads" "trend" "cd_dummy1"
## [9] "multi_dummy1" "premium_dummy1" "cd_dummy2" "multi_dummy2"
## [13] "premium_dummy2"
# No attach(comp_data): the model calls pass `data = comp_data` explicitly, so
# attaching only placed a copy of the columns on the search path, where they
# could mask or be masked by other objects.
# Price model on the numeric columns plus one "yes" indicator per feature;
# the complementary *_dummy2 columns are left out.
comp_model <- lm(
  price ~ speed + hd + ram + screen + ads + trend +
    cd_dummy1 + multi_dummy1 + premium_dummy1,
  data = comp_data
)
summary(comp_model)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend +
## cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1093.77 -174.24 -11.49 146.49 2001.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 307.98798 60.35341 5.103 3.44e-07 ***
## speed 9.32028 0.18506 50.364 < 2e-16 ***
## hd 0.78178 0.02761 28.311 < 2e-16 ***
## ram 48.25596 1.06608 45.265 < 2e-16 ***
## screen 123.08904 3.99950 30.776 < 2e-16 ***
## ads 0.65729 0.05132 12.809 < 2e-16 ***
## trend -51.84958 0.62871 -82.470 < 2e-16 ***
## cd_dummy1 60.91671 9.51559 6.402 1.65e-10 ***
## multi_dummy1 104.32382 11.41268 9.141 < 2e-16 ***
## premium_dummy1 -509.22473 12.34225 -41.259 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared: 0.7756, Adjusted R-squared: 0.7752
## F-statistic: 2399 on 9 and 6249 DF, p-value: < 2.2e-16
# Index plots of the influence diagnostics, labelling the 3 most extreme points.
# Labelling options are passed through `id = list(n = ...)`; the old `id.n`
# argument is not recognised by the installed car version and was forwarded to
# the base plotting functions, which produced a long run of
# "not a graphical parameter" warnings.
influenceIndexPlot(comp_model, id = list(n = 3))
# Added-variable plots for the price model
avPlots(comp_model)
library(MASS)
# Variance inflation factor of each predictor
vif(comp_model)
## speed hd ram screen ads
## 1.265364 4.207395 2.974628 1.081644 1.217218
## trend cd_dummy1 multi_dummy1 premium_dummy1
## 2.022790 1.859370 1.290568 1.109388
# Stepwise model selection by AIC, starting from the full price model
stepAIC(comp_model)
## Start: AIC=70336.65
## price ~ speed + hd + ram + screen + ads + trend + cd_dummy1 +
## multi_dummy1 + premium_dummy1
##
## Df Sum of Sq RSS AIC
## <none> 473783875 70337
## - cd_dummy1 1 3107211 476891087 70376
## - multi_dummy1 1 6335218 480119093 70418
## - ads 1 12439298 486223174 70497
## - hd 1 60768013 534551889 71090
## - screen 1 71812147 545596023 71218
## - premium_dummy1 1 129062420 602846296 71843
## - ram 1 155342777 629126653 72110
## - speed 1 192316497 666100373 72467
## - trend 1 515661043 989444918 74944
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend +
## cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
##
## Coefficients:
## (Intercept) speed hd ram
## 307.9880 9.3203 0.7818 48.2560
## screen ads trend cd_dummy1
## 123.0890 0.6573 -51.8496 60.9167
## multi_dummy1 premium_dummy1
## 104.3238 -509.2247
# Final price model: the AIC-based selection kept every predictor, so this
# repeats the full specification
comp_model_final <- lm(
  price ~ speed + hd + ram + screen + ads + trend +
    cd_dummy1 + multi_dummy1 + premium_dummy1,
  data = comp_data
)
summary(comp_model_final)
##
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend +
## cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1093.77 -174.24 -11.49 146.49 2001.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 307.98798 60.35341 5.103 3.44e-07 ***
## speed 9.32028 0.18506 50.364 < 2e-16 ***
## hd 0.78178 0.02761 28.311 < 2e-16 ***
## ram 48.25596 1.06608 45.265 < 2e-16 ***
## screen 123.08904 3.99950 30.776 < 2e-16 ***
## ads 0.65729 0.05132 12.809 < 2e-16 ***
## trend -51.84958 0.62871 -82.470 < 2e-16 ***
## cd_dummy1 60.91671 9.51559 6.402 1.65e-10 ***
## multi_dummy1 104.32382 11.41268 9.141 < 2e-16 ***
## premium_dummy1 -509.22473 12.34225 -41.259 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared: 0.7756, Adjusted R-squared: 0.7752
## F-statistic: 2399 on 9 and 6249 DF, p-value: < 2.2e-16
# Read the Toyota Corolla data and key the table by Id
ToyotaCorolla <- fread(
  "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\ToyotaCorolla.csv"
)
setkey(ToyotaCorolla, Id)
# Keep the response (Price) and the eight predictors used in the model
Corolla <- ToyotaCorolla[
  ,
  .(Price, Age_08_04, KM, HP, cc, Doors, Gears, Quarterly_Tax, Weight)
]
# Full model: Price regressed on all eight predictors
Corolla_Model <- lm(
  Price ~ Age_08_04 + KM + HP + cc + Doors + Gears + Quarterly_Tax + Weight,
  data = Corolla
)
summary(Corolla_Model)
##
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + cc + Doors + Gears +
## Quarterly_Tax + Weight, data = Corolla)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9366.4 -793.3 -21.3 799.7 6444.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.573e+03 1.411e+03 -3.949 8.24e-05 ***
## Age_08_04 -1.217e+02 2.616e+00 -46.512 < 2e-16 ***
## KM -2.082e-02 1.252e-03 -16.622 < 2e-16 ***
## HP 3.168e+01 2.818e+00 11.241 < 2e-16 ***
## cc -1.211e-01 9.009e-02 -1.344 0.17909
## Doors -1.617e+00 4.001e+01 -0.040 0.96777
## Gears 5.943e+02 1.971e+02 3.016 0.00261 **
## Quarterly_Tax 3.949e+00 1.310e+00 3.015 0.00262 **
## Weight 1.696e+01 1.068e+00 15.880 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1342 on 1427 degrees of freedom
## Multiple R-squared: 0.8638, Adjusted R-squared: 0.863
## F-statistic: 1131 on 8 and 1427 DF, p-value: < 2.2e-16
# Variance inflation factor of each predictor
vif(Corolla_Model)
## Age_08_04 KM HP cc Doors
## 1.884620 1.756905 1.419422 1.163894 1.156575
## Gears Quarterly_Tax Weight
## 1.098723 2.311431 2.516420
# Added-variable plots for the full Corolla model
avPlots(Corolla_Model)
# Stepwise model selection by AIC, starting from the full Corolla model
stepAIC(Corolla_Model)
## Start: AIC=20693.89
## Price ~ Age_08_04 + KM + HP + cc + Doors + Gears + Quarterly_Tax +
## Weight
##
## Df Sum of Sq RSS AIC
## - Doors 1 2943 2571786477 20692
## - cc 1 3256511 2575040045 20694
## <none> 2571783534 20694
## - Quarterly_Tax 1 16377633 2588161166 20701
## - Gears 1 16393629 2588177163 20701
## - HP 1 227730786 2799514319 20814
## - Weight 1 454465243 3026248777 20926
## - KM 1 497917334 3069700867 20946
## - Age_08_04 1 3898860600 6470644134 22017
##
## Step: AIC=20691.89
## Price ~ Age_08_04 + KM + HP + cc + Gears + Quarterly_Tax + Weight
##
## Df Sum of Sq RSS AIC
## - cc 1 3254209 2575040686 20692
## <none> 2571786477 20692
## - Quarterly_Tax 1 16503849 2588290326 20699
## - Gears 1 17093855 2588880332 20699
## - HP 1 228761929 2800548406 20812
## - Weight 1 484447009 3056233485 20938
## - KM 1 498427860 3070214337 20944
## - Age_08_04 1 3898877516 6470663993 22015
##
## Step: AIC=20691.7
## Price ~ Age_08_04 + KM + HP + Gears + Quarterly_Tax + Weight
##
## Df Sum of Sq RSS AIC
## <none> 2575040686 20692
## - Quarterly_Tax 1 14976762 2590017448 20698
## - Gears 1 17276597 2592317283 20699
## - HP 1 225684613 2800725299 20810
## - Weight 1 484245502 3059286188 20937
## - KM 1 506728527 3081769213 20948
## - Age_08_04 1 3902107988 6477148674 22014
##
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + Gears + Quarterly_Tax +
## Weight, data = Corolla)
##
## Coefficients:
## (Intercept) Age_08_04 KM HP Gears
## -5.478e+03 -1.217e+02 -2.094e-02 3.133e+01 5.990e+02
## Quarterly_Tax Weight
## 3.737e+00 1.673e+01
# Final Corolla model: the predictors kept by the AIC-based selection, plus
# cc on the log scale.
# NOTE(review): the stepwise selection dropped the untransformed cc term; it
# is re-entered here as log(cc) - confirm this is intentional.
Corolla_Model_final <- lm(
  Price ~ Age_08_04 + KM + HP + log(cc) + Gears + Quarterly_Tax + Weight,
  data = Corolla
)
summary(Corolla_Model_final)
##
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + log(cc) + Gears +
## Quarterly_Tax + Weight, data = Corolla)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10498.6 -763.2 -30.4 759.7 6611.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.288e+03 2.662e+03 3.114 0.00188 **
## Age_08_04 -1.211e+02 2.585e+00 -46.868 < 2e-16 ***
## KM -1.928e-02 1.263e-03 -15.262 < 2e-16 ***
## HP 3.677e+01 2.907e+00 12.649 < 2e-16 ***
## log(cc) -2.261e+03 3.726e+02 -6.067 1.67e-09 ***
## Gears 5.582e+02 1.912e+02 2.920 0.00356 **
## Quarterly_Tax 6.545e+00 1.361e+00 4.808 1.69e-06 ***
## Weight 1.870e+01 1.059e+00 17.658 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1326 on 1428 degrees of freedom
## Multiple R-squared: 0.867, Adjusted R-squared: 0.8664
## F-statistic: 1330 on 7 and 1428 DF, p-value: < 2.2e-16