# Startups problem: multiple linear regression

# Load the 50-startups data set and inspect its structure.
# NOTE(review): the path is an absolute, machine-specific Windows path, so the
# script only runs where this exact file exists.
startups <- read.csv("C:\\Users\\prakruthi\\Desktop\\datascience assignments\\multi linear regression\\50_Startups.csv")
# attach() is deliberately not used: every model in this script passes
# `data =` explicitly, and an attached copy would go stale as soon as
# `startups$State` is re-encoded further down.
class(startups)
## [1] "data.frame"
str(startups)
## 'data.frame':    50 obs. of  5 variables:
##  $ R.D.Spend      : num  165349 162598 153442 144372 142107 ...
##  $ Administration : num  136898 151378 101146 118672 91392 ...
##  $ Marketing.Spend: num  471784 443899 407935 383200 366168 ...
##  $ State          : Factor w/ 3 levels "California","Florida",..: 3 1 2 3 2 3 1 2 3 1 ...
##  $ Profit         : num  192262 191792 191050 182902 166188 ...
summary(startups)
##    R.D.Spend      Administration   Marketing.Spend         State   
##  Min.   :     0   Min.   : 51283   Min.   :     0   California:17  
##  1st Qu.: 39936   1st Qu.:103731   1st Qu.:129300   Florida   :16  
##  Median : 73051   Median :122700   Median :212716   New York  :17  
##  Mean   : 73722   Mean   :121345   Mean   :211025                  
##  3rd Qu.:101603   3rd Qu.:144842   3rd Qu.:299469                  
##  Max.   :165349   Max.   :182646   Max.   :471784                  
##      Profit      
##  Min.   : 14681  
##  1st Qu.: 90139  
##  Median :107978  
##  Mean   :112013  
##  3rd Qu.:139766  
##  Max.   :192262
# Open the spreadsheet viewer only in interactive sessions; View() is an
# interactive tool and is skipped when the script is run in batch mode.
if (interactive()) {
  View(startups)
}
# Recode State as integer level codes so that every column is numeric, which
# cor() on the whole data frame requires.
# NOTE(review): State is a nominal variable; the integer codes impose an
# arbitrary ordering in the regressions. Consider keeping a factor copy for
# lm(), which dummy-codes factors itself.
startups$State <- as.numeric(factor(startups$State))
if (interactive()) {
  View(startups)
}

# Scatterplot matrix of all variables.
plot(startups)

colnames(startups)
## [1] "R.D.Spend"       "Administration"  "Marketing.Spend" "State"          
## [5] "Profit"
# Baseline model: regress Profit on all four predictors, using every row.
m1 <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = startups
)

summary(m1)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33553  -4779     63   6595  17301 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.016e+04  7.322e+03   6.851 1.69e-08 ***
## R.D.Spend        8.058e-01  4.576e-02  17.609  < 2e-16 ***
## Administration  -2.683e-02  5.160e-02  -0.520    0.606    
## Marketing.Spend  2.723e-02  1.663e-02   1.637    0.109    
## State           -2.232e+01  1.610e+03  -0.014    0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9334 on 45 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9464 
## F-statistic: 217.2 on 4 and 45 DF,  p-value: < 2.2e-16
# Pairwise Pearson correlations between all (numeric) columns.
cor(x = startups, method = "pearson")
##                 R.D.Spend Administration Marketing.Spend      State
## R.D.Spend       1.0000000     0.24195525      0.72424813 0.10468511
## Administration  0.2419552     1.00000000     -0.03215388 0.01184720
## Marketing.Spend 0.7242481    -0.03215388      1.00000000 0.07766961
## State           0.1046851     0.01184720      0.07766961 1.00000000
## Profit          0.9729005     0.20071657      0.74776572 0.10179631
##                    Profit
## R.D.Spend       0.9729005
## Administration  0.2007166
## Marketing.Spend 0.7477657
## State           0.1017963
## Profit          1.0000000
# Identify influential observations in the baseline model.
# Attaching mvinfluence also attaches car, carData and heplots (see the
# messages below); influencePlot() itself comes from car.
library(mvinfluence)
## Loading required package: car
## Loading required package: carData
## Loading required package: heplots
car::influencePlot(m1)

##       StudRes        Hat      CookD
## 46  2.0508431 0.12502380 0.11220265
## 47 -0.8175683 0.24121705 0.04281347
## 49 -1.7072322 0.25478210 0.19116349
## 50 -4.5278887 0.09664062 0.30602820
# Refit without observation 50, the point with the largest studentized
# residual and Cook's distance in the influence table above.
# The trailing comma matters: `startups[-50]` indexes *columns*, and on a
# five-column data frame it drops nothing, so the model used to be refit on
# all 50 rows. The output recorded below was produced by that call (it is
# identical to m1's) and needs to be regenerated.
m2 <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = startups[-50, ]
)

summary(m2)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startups[-50])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33553  -4779     63   6595  17301 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.016e+04  7.322e+03   6.851 1.69e-08 ***
## R.D.Spend        8.058e-01  4.576e-02  17.609  < 2e-16 ***
## Administration  -2.683e-02  5.160e-02  -0.520    0.606    
## Marketing.Spend  2.723e-02  1.663e-02   1.637    0.109    
## State           -2.232e+01  1.610e+03  -0.014    0.989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9334 on 45 degrees of freedom
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9464 
## F-statistic: 217.2 on 4 and 45 DF,  p-value: < 2.2e-16
# Refit after dropping rows 50 and 49 (48 observations remain).
m3 <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = startups[-c(50, 49), ]
)

summary(m3)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startups[-c(50, 49), ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15944  -4787  -1904   6066  13646 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.960e+04  6.314e+03   9.438 4.84e-12 ***
## R.D.Spend        7.898e-01  3.677e-02  21.480  < 2e-16 ***
## Administration  -6.257e-02  4.451e-02  -1.406    0.167    
## Marketing.Spend  1.705e-02  1.369e-02   1.245    0.220    
## State           -3.267e+02  1.326e+03  -0.246    0.806    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7429 on 43 degrees of freedom
## Multiple R-squared:  0.9627, Adjusted R-squared:  0.9593 
## F-statistic: 277.7 on 4 and 43 DF,  p-value: < 2.2e-16
# Refit after dropping all four observations flagged by the influence plot
# (rows 50, 49, 47 and 46), leaving 46 observations.
m4 <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = startups[-c(50, 49, 47, 46), ]
)

summary(m4)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startups[-c(50, 49, 47, 46), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15606.7  -5244.0   -656.3   5402.7  13092.6 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.812e+04  6.110e+03   9.513 6.22e-12 ***
## R.D.Spend        7.730e-01  4.027e-02  19.198  < 2e-16 ***
## Administration  -5.203e-02  4.342e-02  -1.198   0.2376    
## Marketing.Spend  2.829e-02  1.458e-02   1.940   0.0593 .  
## State           -8.725e+02  1.306e+03  -0.668   0.5079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7137 on 41 degrees of freedom
## Multiple R-squared:  0.963,  Adjusted R-squared:  0.9594 
## F-statistic: 266.8 on 4 and 41 DF,  p-value: < 2.2e-16
# Multicollinearity check: variance inflation factors for the baseline model.
car::vif(m1)
##       R.D.Spend  Administration Marketing.Spend           State 
##        2.481178        1.175315        2.326780        1.011281
# Added-variable plots for each predictor of the baseline model.
car::avPlots(m1)

# Reduced model: drop State and Administration (neither is significant in the
# fits above) and refit without the four influential rows.
# The trailing comma is required so that the negative index removes *rows*;
# `startups[-c(50, 49, 47, 46)]` indexes columns and removed nothing, so the
# output recorded below (47 residual degrees of freedom) came from all 50 rows
# and needs to be regenerated.
m5 <- lm(
  Profit ~ R.D.Spend + Marketing.Spend,
  data = startups[-c(50, 49, 47, 46), ]
)
summary(m5)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = startups[-c(50, 
##     49, 47, 46)])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33645  -4632   -414   6484  17097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
## R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
## Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9161 on 47 degrees of freedom
## Multiple R-squared:  0.9505, Adjusted R-squared:  0.9483 
## F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# Final model: all four predictors, fitted without the influential rows
# 50, 49, 47 and 46 (the same specification as m4).
final <- lm(
  Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = startups[-c(50, 49, 47, 46), ]
)

summary(final)
## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
##     State, data = startups[-c(50, 49, 47, 46), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15606.7  -5244.0   -656.3   5402.7  13092.6 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.812e+04  6.110e+03   9.513 6.22e-12 ***
## R.D.Spend        7.730e-01  4.027e-02  19.198  < 2e-16 ***
## Administration  -5.203e-02  4.342e-02  -1.198   0.2376    
## Marketing.Spend  2.829e-02  1.458e-02   1.940   0.0593 .  
## State           -8.725e+02  1.306e+03  -0.668   0.5079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7137 on 41 degrees of freedom
## Multiple R-squared:  0.963,  Adjusted R-squared:  0.9594 
## F-statistic: 266.8 on 4 and 41 DF,  p-value: < 2.2e-16
# Predicted profit for the 46 rows the final model was trained on, placed
# next to the observed values for comparison.
# predict.lm() takes `newdata`, not `data`; the unknown `data` argument was
# silently ignored, so the call only returned the fitted values by accident.
# Passing the training rows explicitly keeps the result the same and makes
# the row count match the data frame built below.
train_rows <- startups[-c(50, 49, 47, 46), ]
prediction <- predict(final, newdata = train_rows)
pre <- data.frame(train_rows, prediction)
summary(pre)
##    R.D.Spend      Administration   Marketing.Spend      State      
##  Min.   :     0   Min.   : 51283   Min.   :     0   Min.   :1.000  
##  1st Qu.: 48693   1st Qu.:103731   1st Qu.:138616   1st Qu.:1.000  
##  Median : 75791   Median :124919   Median :221898   Median :2.000  
##  Mean   : 80070   Mean   :123012   Mean   :221893   Mean   :1.978  
##  3rd Qu.:111371   3rd Qu.:145417   3rd Qu.:302424   3rd Qu.:3.000  
##  Max.   :165349   Max.   :182646   Max.   :471784   Max.   :3.000  
##      Profit         prediction    
##  Min.   : 42560   Min.   : 50204  
##  1st Qu.: 96729   1st Qu.: 94997  
##  Median :109543   Median :114455  
##  Mean   :118171   Mean   :118171  
##  3rd Qu.:143591   3rd Qu.:143137  
##  Max.   :192262   Max.   :189550
# Inspect observed vs. predicted values in the data viewer; View() is an
# interactive tool, so it is skipped in batch runs.
if (interactive()) {
  View(pre)
}