Prepare a prediction model for profit of 50_startups data. Do transformations for getting better predictions of profit and make a table containing R^2 value for each prepared model.

# Loading the data into a variable
library(data.table)

# Pass the path through `file =` so fread() always treats it as a file name.
# The path contains spaces; handed to fread's auto-detecting first argument,
# a path with spaces that does not resolve to a file can be taken for a shell
# command instead of failing with a clear "file not found" error.
startup_50 <- fread(file = "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\50_Startups.csv")

# Getting Summary of data
summary(startup_50)
##    R&D Spend      Administration   Marketing Spend     State          
##  Min.   :     0   Min.   : 51283   Min.   :     0   Length:50         
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Class :character  
##  Median : 73051   Median :122700   Median :212716   Mode  :character  
##  Mean   : 73722   Mean   :121345   Mean   :211025                     
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                     
##  Max.   :165349   Max.   :182646   Max.   :471784                     
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
# Variance of each numeric column
var(startup_50[["R&D Spend"]])
## [1] 2107017150
var(startup_50[["Administration"]])
## [1] 784997271
var(startup_50[["Marketing Spend"]])
## [1] 14954920097
var(startup_50[["Profit"]])
## [1] 1624588173
# Standard deviation of each numeric column
sd(startup_50[["R&D Spend"]])
## [1] 45902.26
sd(startup_50[["Administration"]])
## [1] 28017.8
sd(startup_50[["Marketing Spend"]])
## [1] 122290.3
sd(startup_50[["Profit"]])
## [1] 40306.18

Checking which distinct states appear in the State column

unique(startup_50[["State"]])
## [1] "New York"   "California" "Florida"

Creating three dummy variables for State

# Add one 0/1 indicator column per state. Each column is named at creation,
# so nothing depends on the automatic V2/V3/V4 names that cbind() gives to
# unnamed vectors (those names shift if the input ever gains a column).
startup_50[, c("New York", "California", "Florida") := list(
  ifelse(State == "New York", 1, 0),
  ifelse(State == "California", 1, 0),
  ifelse(State == "Florida", 1, 0)
)]

# Plotting the data as scatter-plot matrices
# plot(startup_50) # This line gives an error because State holds textual values
plot(startup_50[, !c("State")]) # The dummy columns are still plotted here and show no visible relationship

plot(startup_50[, !c("State", "New York", "California", "Florida")]) # State and dummy columns removed

After inspecting the scatter plots, compute the partial correlations between the numeric variables

library(corpcor)
# Partial correlation matrix of the four numeric columns (State and its dummies excluded)
cor2pcor(cor(startup_50[, c("R&D Spend", "Administration", "Marketing Spend", "Profit")]))
##            [,1]        [,2]        [,3]        [,4]
## [1,] 1.00000000  0.20852619  0.03890336  0.93477127
## [2,] 0.20852619  1.00000000 -0.28192506 -0.07725021
## [3,] 0.03890336 -0.28192506  1.00000000  0.23707116
## [4,] 0.93477127 -0.07725021  0.23707116  1.00000000

Creating Model

# Column names available for the model formula
names(startup_50)
## [1] "R&D Spend"       "Administration"  "Marketing Spend" "State"          
## [5] "Profit"          "New York"        "California"      "Florida"
# Baseline model: Profit regressed on the three spend columns
Profit_Model <- lm(
  formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50
)

summary(Profit_Model)
## 
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`, 
##     data = startup_50)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.012e+04  6.572e+03   7.626 1.06e-09 ***
## `R&D Spend`        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration    -2.682e-02  5.103e-02  -0.526    0.602    
## `Marketing Spend`  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16

The p-values for Administration and Marketing Spend are greater than 0.05, so next we check for influential records

library(car)
## Loading required package: carData
# Diagnostic index plots for the baseline model
influenceIndexPlot(Profit_Model)

# car >= 3.0 takes the number of points to label as `id = list(n = ...)`.
# The former `id.n` argument is no longer recognised: it was passed through
# to the graphics functions, which rejected it with a "not a graphical
# parameter" warning on every call, and labelling fell back to the default.
# NOTE: the table printed below comes from that earlier run; re-run to refresh it.
influencePlot(Profit_Model, id = list(n = 3))

##       StudRes        Hat      CookD
## 46  2.0220730 0.08617007 0.09032342
## 47 -0.8268684 0.24060165 0.05453034
## 49 -1.6861241 0.21801940 0.19052744
## 50 -4.4961657 0.07477116 0.28808229
# Same specification as the baseline model, refitted without rows 49 and 50
Profit_Model_Inf <- lm(
  formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50[-c(50, 49), ]
)

summary(Profit_Model_Inf)
## 
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`, 
##     data = startup_50[-c(50, 49), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16252  -4983  -2042   6019  13631 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.910e+04  5.917e+03   9.988 6.92e-13 ***
## `R&D Spend`        7.895e-01  3.635e-02  21.718  < 2e-16 ***
## Administration    -6.335e-02  4.392e-02  -1.442    0.156    
## `Marketing Spend`  1.689e-02  1.353e-02   1.249    0.218    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7349 on 44 degrees of freedom
## Multiple R-squared:  0.9627, Adjusted R-squared:  0.9601 
## F-statistic: 378.3 on 3 and 44 DF,  p-value: < 2.2e-16

Variance inflation factors (VIF) to check for collinearity between the predictor variables

# Refit the baseline model on all rows before computing the VIFs
Profit_Model <- lm(
  formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`,
  data = startup_50
)
class(startup_50[["Marketing Spend"]])
## [1] "numeric"
vif(Profit_Model)
##       `R&D Spend`    Administration `Marketing Spend` 
##          2.468903          1.175091          2.326773
summary(Profit_Model)
## 
## Call:
## lm(formula = Profit ~ `R&D Spend` + Administration + `Marketing Spend`, 
##     data = startup_50)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33534  -4795     63   6606  17275 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.012e+04  6.572e+03   7.626 1.06e-09 ***
## `R&D Spend`        8.057e-01  4.515e-02  17.846  < 2e-16 ***
## Administration    -2.682e-02  5.103e-02  -0.526    0.602    
## `Marketing Spend`  2.723e-02  1.645e-02   1.655    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9232 on 46 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9475 
## F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16
## A VIF above 10 is taken here as the sign of problematic collinearity; the values printed above are all below 3
## Added-variable plots to check the relationship between each predictor and the output variable
avPlots(Profit_Model)

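Creating the final model: fit the full model including the state dummy variables, then use stepwise AIC selection to drop uninformative predictors (this removes the state dummies and Administration).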

# Full candidate model including the state indicators. Florida is left out as
# the reference level: with an intercept in the model the three indicators
# always sum to one, so the third is perfectly collinear with the other two
# and cannot be estimated. (The recorded run below, which still included it,
# dropped Florida in its first step at an unchanged AIC.)
Profit_Model_Revised <- lm(Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York` + California, data = startup_50)

library(MASS)

# Stepwise selection by AIC; prints every step and the selected model
stepAIC(Profit_Model_Revised)
## Start:  AIC=920.87
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York` + 
##     California + Florida
## 
## 
## Step:  AIC=920.87
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York` + 
##     California
## 
##                     Df  Sum of Sq        RSS     AIC
## - California         1 3.0984e+05 3.9206e+09  918.88
## - `New York`         1 4.6296e+05 3.9208e+09  918.88
## - Administration     1 2.3816e+07 3.9442e+09  919.17
## <none>                            3.9203e+09  920.87
## - `Marketing Spend`  1 2.2071e+08 4.1410e+09  921.61
## - `R&D Spend`        1 2.6878e+10 3.0799e+10 1021.94
## 
## Step:  AIC=918.88
## Profit ~ `R&D Spend` + Administration + `Marketing Spend` + `New York`
## 
##                     Df  Sum of Sq        RSS     AIC
## - `New York`         1 2.0682e+05 3.9209e+09  916.88
## - Administration     1 2.3662e+07 3.9443e+09  917.18
## <none>                            3.9206e+09  918.88
## - `Marketing Spend`  1 2.3000e+08 4.1507e+09  919.73
## - `R&D Spend`        1 2.6901e+10 3.0821e+10 1019.97
## 
## Step:  AIC=916.88
## Profit ~ `R&D Spend` + Administration + `Marketing Spend`
## 
##                     Df  Sum of Sq        RSS     AIC
## - Administration     1 2.3539e+07 3.9444e+09  915.18
## <none>                            3.9209e+09  916.88
## - `Marketing Spend`  1 2.3349e+08 4.1543e+09  917.77
## - `R&D Spend`        1 2.7147e+10 3.1068e+10 1018.37
## 
## Step:  AIC=915.18
## Profit ~ `R&D Spend` + `Marketing Spend`
## 
##                     Df  Sum of Sq        RSS     AIC
## <none>                            3.9444e+09  915.18
## - `Marketing Spend`  1 3.1165e+08 4.2560e+09  916.98
## - `R&D Spend`        1 3.1149e+10 3.5094e+10 1022.46
## 
## Call:
## lm(formula = Profit ~ `R&D Spend` + `Marketing Spend`, data = startup_50)
## 
## Coefficients:
##       (Intercept)        `R&D Spend`  `Marketing Spend`  
##         4.698e+04          7.966e-01          2.991e-02
# Final model: the two predictors retained by the stepwise selection
Profit_Model_Final <- lm(
  formula = Profit ~ `R&D Spend` + `Marketing Spend`,
  data = startup_50
)

summary(Profit_Model_Final)
## 
## Call:
## lm(formula = Profit ~ `R&D Spend` + `Marketing Spend`, data = startup_50)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.698e+04  2.690e+03  17.464   <2e-16 ***
## `R&D Spend`       7.966e-01  4.135e-02  19.266   <2e-16 ***
## `Marketing Spend` 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Standard lm diagnostic plots for the final model
plot(Profit_Model_Final)

# car >= 3.0 takes the number of points to label as `id = list(n = ...)`;
# the former `id.n` argument is no longer recognised. The indices printed
# below are from the earlier run, in which only two points were labelled.
qqPlot(Profit_Model_Final, id = list(n = 5))

## [1] 46 50

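The adjusted R-squared is 0.9483 (multiple R-squared 0.9505). R&D Spend is highly significant, while Marketing Spend is only marginally significant (p = 0.06, above the 0.05 threshold).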

Predict the sale price of the computers

# Read data from file
library(data.table)
# `file =` makes fread() treat the string strictly as a file path; the path
# contains spaces, which the auto-detecting first argument can otherwise
# mistake for a shell command when the file is not found.
Computer_Data <- fread(file = "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\Computer_Data.csv")

colnames(Computer_Data)
##  [1] "V1"      "price"   "speed"   "hd"      "ram"     "screen"  "cd"     
##  [8] "multi"   "premium" "ads"     "trend"
str(Computer_Data)
## Classes 'data.table' and 'data.frame':   6259 obs. of  11 variables:
##  $ V1     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price  : int  1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
##  $ speed  : int  25 33 25 25 33 66 25 50 50 50 ...
##  $ hd     : int  80 85 170 170 340 340 170 85 210 210 ...
##  $ ram    : int  4 2 4 8 16 16 4 2 8 4 ...
##  $ screen : int  14 14 15 14 14 14 14 14 14 15 ...
##  $ cd     : chr  "no" "no" "no" "no" ...
##  $ multi  : chr  "no" "no" "no" "no" ...
##  $ premium: chr  "yes" "yes" "yes" "no" ...
##  $ ads    : int  94 94 94 94 94 94 94 94 94 94 ...
##  $ trend  : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Build 0/1 indicator columns for the three yes/no fields:
# the *_dummy1 columns flag "yes", the *_dummy2 columns flag "no".
Computer_Data[, `:=`(
  cd_dummy1 = as.numeric(cd == "yes"),
  multi_dummy1 = as.numeric(multi == "yes"),
  premium_dummy1 = as.numeric(premium == "yes"),
  cd_dummy2 = as.numeric(cd == "no"),
  multi_dummy2 = as.numeric(multi == "no"),
  premium_dummy2 = as.numeric(premium == "no")
)]

# Modelling table: drop the row-index column and the original text fields
comp_data <- Computer_Data[, !c("V1", "cd", "multi", "premium")]

str(comp_data)
## Classes 'data.table' and 'data.frame':   6259 obs. of  13 variables:
##  $ price         : int  1499 1795 1595 1849 3295 3695 1720 1995 2225 2575 ...
##  $ speed         : int  25 33 25 25 33 66 25 50 50 50 ...
##  $ hd            : int  80 85 170 170 340 340 170 85 210 210 ...
##  $ ram           : int  4 2 4 8 16 16 4 2 8 4 ...
##  $ screen        : int  14 14 15 14 14 14 14 14 14 15 ...
##  $ ads           : int  94 94 94 94 94 94 94 94 94 94 ...
##  $ trend         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cd_dummy1     : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ multi_dummy1  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ premium_dummy1: num  1 1 1 0 1 1 1 1 1 1 ...
##  $ cd_dummy2     : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ multi_dummy2  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ premium_dummy2: num  0 0 0 1 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Per-column summary statistics (min, quartiles, median, mean, max) of the modelling table
summary(comp_data)
##      price          speed              hd              ram        
##  Min.   : 949   Min.   : 25.00   Min.   :  80.0   Min.   : 2.000  
##  1st Qu.:1794   1st Qu.: 33.00   1st Qu.: 214.0   1st Qu.: 4.000  
##  Median :2144   Median : 50.00   Median : 340.0   Median : 8.000  
##  Mean   :2220   Mean   : 52.01   Mean   : 416.6   Mean   : 8.287  
##  3rd Qu.:2595   3rd Qu.: 66.00   3rd Qu.: 528.0   3rd Qu.: 8.000  
##  Max.   :5399   Max.   :100.00   Max.   :2100.0   Max.   :32.000  
##      screen           ads            trend         cd_dummy1     
##  Min.   :14.00   Min.   : 39.0   Min.   : 1.00   Min.   :0.0000  
##  1st Qu.:14.00   1st Qu.:162.5   1st Qu.:10.00   1st Qu.:0.0000  
##  Median :14.00   Median :246.0   Median :16.00   Median :0.0000  
##  Mean   :14.61   Mean   :221.3   Mean   :15.93   Mean   :0.4646  
##  3rd Qu.:15.00   3rd Qu.:275.0   3rd Qu.:21.50   3rd Qu.:1.0000  
##  Max.   :17.00   Max.   :339.0   Max.   :35.00   Max.   :1.0000  
##   multi_dummy1    premium_dummy1     cd_dummy2       multi_dummy2   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.1395   Mean   :0.9022   Mean   :0.5354   Mean   :0.8605  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  premium_dummy2   
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.09778  
##  3rd Qu.:0.00000  
##  Max.   :1.00000

Creating first model

colnames(comp_data)
##  [1] "price"          "speed"          "hd"             "ram"           
##  [5] "screen"         "ads"            "trend"          "cd_dummy1"     
##  [9] "multi_dummy1"   "premium_dummy1" "cd_dummy2"      "multi_dummy2"  
## [13] "premium_dummy2"
# No attach(): lm() resolves every variable through `data = comp_data`, and
# attaching the table would only put copies of its columns on the search path,
# where they can mask or be masked by other objects.
# Only the *_dummy1 indicators enter the model; each *_dummy2 column is the
# exact complement of its *_dummy1 counterpart.
comp_model <- lm(price ~ speed+hd+ram+screen+ads+trend+cd_dummy1+multi_dummy1+premium_dummy1, data = comp_data)

summary(comp_model)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend + 
##     cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1093.77  -174.24   -11.49   146.49  2001.05 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     307.98798   60.35341   5.103 3.44e-07 ***
## speed             9.32028    0.18506  50.364  < 2e-16 ***
## hd                0.78178    0.02761  28.311  < 2e-16 ***
## ram              48.25596    1.06608  45.265  < 2e-16 ***
## screen          123.08904    3.99950  30.776  < 2e-16 ***
## ads               0.65729    0.05132  12.809  < 2e-16 ***
## trend           -51.84958    0.62871 -82.470  < 2e-16 ***
## cd_dummy1        60.91671    9.51559   6.402 1.65e-10 ***
## multi_dummy1    104.32382   11.41268   9.141  < 2e-16 ***
## premium_dummy1 -509.22473   12.34225 -41.259  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared:  0.7756, Adjusted R-squared:  0.7752 
## F-statistic:  2399 on 9 and 6249 DF,  p-value: < 2.2e-16
# Diagnostic index plots for comp_model, labelling the three most extreme
# points in each panel. car >= 3.0 takes this as `id = list(n = ...)`; the
# former `id.n` argument was passed through to the graphics functions, which
# rejected it with a "not a graphical parameter" warning on every draw call.
influenceIndexPlot(comp_model, id = list(n = 3))

# Added-variable plots for each predictor in comp_model
avPlots(comp_model)

# MASS is loaded for stepAIC(), which is called below
library(MASS)
# Variance inflation factors of the comp_model predictors
vif(comp_model)
##          speed             hd            ram         screen            ads 
##       1.265364       4.207395       2.974628       1.081644       1.217218 
##          trend      cd_dummy1   multi_dummy1 premium_dummy1 
##       2.022790       1.859370       1.290568       1.109388
# Stepwise selection by AIC, starting from comp_model; prints each step
stepAIC(comp_model)
## Start:  AIC=70336.65
## price ~ speed + hd + ram + screen + ads + trend + cd_dummy1 + 
##     multi_dummy1 + premium_dummy1
## 
##                  Df Sum of Sq       RSS   AIC
## <none>                        473783875 70337
## - cd_dummy1       1   3107211 476891087 70376
## - multi_dummy1    1   6335218 480119093 70418
## - ads             1  12439298 486223174 70497
## - hd              1  60768013 534551889 71090
## - screen          1  71812147 545596023 71218
## - premium_dummy1  1 129062420 602846296 71843
## - ram             1 155342777 629126653 72110
## - speed           1 192316497 666100373 72467
## - trend           1 515661043 989444918 74944
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend + 
##     cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
## 
## Coefficients:
##    (Intercept)           speed              hd             ram  
##       307.9880          9.3203          0.7818         48.2560  
##         screen             ads           trend       cd_dummy1  
##       123.0890          0.6573        -51.8496         60.9167  
##   multi_dummy1  premium_dummy1  
##       104.3238       -509.2247
# The stepwise selection kept every term, so the final model repeats the
# comp_model specification.
comp_model_final <- lm(
  formula = price ~ speed + hd + ram + screen + ads + trend +
    cd_dummy1 + multi_dummy1 + premium_dummy1,
  data = comp_data
)

summary(comp_model_final)
## 
## Call:
## lm(formula = price ~ speed + hd + ram + screen + ads + trend + 
##     cd_dummy1 + multi_dummy1 + premium_dummy1, data = comp_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1093.77  -174.24   -11.49   146.49  2001.05 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     307.98798   60.35341   5.103 3.44e-07 ***
## speed             9.32028    0.18506  50.364  < 2e-16 ***
## hd                0.78178    0.02761  28.311  < 2e-16 ***
## ram              48.25596    1.06608  45.265  < 2e-16 ***
## screen          123.08904    3.99950  30.776  < 2e-16 ***
## ads               0.65729    0.05132  12.809  < 2e-16 ***
## trend           -51.84958    0.62871 -82.470  < 2e-16 ***
## cd_dummy1        60.91671    9.51559   6.402 1.65e-10 ***
## multi_dummy1    104.32382   11.41268   9.141  < 2e-16 ***
## premium_dummy1 -509.22473   12.34225 -41.259  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.3 on 6249 degrees of freedom
## Multiple R-squared:  0.7756, Adjusted R-squared:  0.7752 
## F-statistic:  2399 on 9 and 6249 DF,  p-value: < 2.2e-16

Prepare a prediction model for predicting Price in the Toyota Corolla data.

# `file =` makes fread() treat the string strictly as a file path; the path
# contains spaces, which the auto-detecting first argument can otherwise
# mistake for a shell command when the file is not found.
ToyotaCorolla <- fread(file = "C:\\Users\\Pawan Srivastav\\Desktop\\Data Science\\Data Sets\\Data Sets\\Multilinear Regression\\ToyotaCorolla.csv")

# Key (and thereby sort) the table by its Id column
setkey(ToyotaCorolla, Id)

# Keep the response and the eight candidate predictors
Corolla <- ToyotaCorolla[, c('Price','Age_08_04','KM','HP','cc','Doors','Gears','Quarterly_Tax','Weight')]

# First model: Price regressed on all eight candidate predictors
Corolla_Model <- lm(Price ~ Age_08_04+KM+HP+cc+Doors+Gears+Quarterly_Tax+Weight,data = Corolla)

summary(Corolla_Model)
## 
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + cc + Doors + Gears + 
##     Quarterly_Tax + Weight, data = Corolla)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9366.4  -793.3   -21.3   799.7  6444.0 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.573e+03  1.411e+03  -3.949 8.24e-05 ***
## Age_08_04     -1.217e+02  2.616e+00 -46.512  < 2e-16 ***
## KM            -2.082e-02  1.252e-03 -16.622  < 2e-16 ***
## HP             3.168e+01  2.818e+00  11.241  < 2e-16 ***
## cc            -1.211e-01  9.009e-02  -1.344  0.17909    
## Doors         -1.617e+00  4.001e+01  -0.040  0.96777    
## Gears          5.943e+02  1.971e+02   3.016  0.00261 ** 
## Quarterly_Tax  3.949e+00  1.310e+00   3.015  0.00262 ** 
## Weight         1.696e+01  1.068e+00  15.880  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1342 on 1427 degrees of freedom
## Multiple R-squared:  0.8638, Adjusted R-squared:  0.863 
## F-statistic:  1131 on 8 and 1427 DF,  p-value: < 2.2e-16
# Variance inflation factors of the Corolla_Model predictors
vif(Corolla_Model)
##     Age_08_04            KM            HP            cc         Doors 
##      1.884620      1.756905      1.419422      1.163894      1.156575 
##         Gears Quarterly_Tax        Weight 
##      2.311431      2.516420
# Added-variable plots for each predictor in Corolla_Model
avPlots(Corolla_Model)

# Stepwise selection by AIC, starting from Corolla_Model; prints each step
stepAIC(Corolla_Model)
## Start:  AIC=20693.89
## Price ~ Age_08_04 + KM + HP + cc + Doors + Gears + Quarterly_Tax + 
##     Weight
## 
##                 Df  Sum of Sq        RSS   AIC
## - Doors          1       2943 2571786477 20692
## - cc             1    3256511 2575040045 20694
## <none>                        2571783534 20694
## - Quarterly_Tax  1   16377633 2588161166 20701
## - Gears          1   16393629 2588177163 20701
## - HP             1  227730786 2799514319 20814
## - Weight         1  454465243 3026248777 20926
## - KM             1  497917334 3069700867 20946
## - Age_08_04      1 3898860600 6470644134 22017
## 
## Step:  AIC=20691.89
## Price ~ Age_08_04 + KM + HP + cc + Gears + Quarterly_Tax + Weight
## 
##                 Df  Sum of Sq        RSS   AIC
## - cc             1    3254209 2575040686 20692
## <none>                        2571786477 20692
## - Quarterly_Tax  1   16503849 2588290326 20699
## - Gears          1   17093855 2588880332 20699
## - HP             1  228761929 2800548406 20812
## - Weight         1  484447009 3056233485 20938
## - KM             1  498427860 3070214337 20944
## - Age_08_04      1 3898877516 6470663993 22015
## 
## Step:  AIC=20691.7
## Price ~ Age_08_04 + KM + HP + Gears + Quarterly_Tax + Weight
## 
##                 Df  Sum of Sq        RSS   AIC
## <none>                        2575040686 20692
## - Quarterly_Tax  1   14976762 2590017448 20698
## - Gears          1   17276597 2592317283 20699
## - HP             1  225684613 2800725299 20810
## - Weight         1  484245502 3059286188 20937
## - KM             1  506728527 3081769213 20948
## - Age_08_04      1 3902107988 6477148674 22014
## 
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + Gears + Quarterly_Tax + 
##     Weight, data = Corolla)
## 
## Coefficients:
##   (Intercept)      Age_08_04             KM             HP          Gears  
##    -5.478e+03     -1.217e+02     -2.094e-02      3.133e+01      5.990e+02  
## Quarterly_Tax         Weight  
##     3.737e+00      1.673e+01
# Final model: the predictors retained by the stepwise selection, plus cc
# brought back in on the log scale.
Corolla_Model_final <- lm(
  formula = Price ~ Age_08_04 + KM + HP + log(cc) + Gears +
    Quarterly_Tax + Weight,
  data = Corolla
)

summary(Corolla_Model_final)
## 
## Call:
## lm(formula = Price ~ Age_08_04 + KM + HP + log(cc) + Gears + 
##     Quarterly_Tax + Weight, data = Corolla)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10498.6   -763.2    -30.4    759.7   6611.2 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.288e+03  2.662e+03   3.114  0.00188 ** 
## Age_08_04     -1.211e+02  2.585e+00 -46.868  < 2e-16 ***
## KM            -1.928e-02  1.263e-03 -15.262  < 2e-16 ***
## HP             3.677e+01  2.907e+00  12.649  < 2e-16 ***
## log(cc)       -2.261e+03  3.726e+02  -6.067 1.67e-09 ***
## Gears          5.582e+02  1.912e+02   2.920  0.00356 ** 
## Quarterly_Tax  6.545e+00  1.361e+00   4.808 1.69e-06 ***
## Weight         1.870e+01  1.059e+00  17.658  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1326 on 1428 degrees of freedom
## Multiple R-squared:  0.867,  Adjusted R-squared:  0.8664 
## F-statistic:  1330 on 7 and 1428 DF,  p-value: < 2.2e-16