# Load the startups data from its CSV file, then preview the first rows.
dataset <- read.csv(
  file = "G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 2 - Regression\\Section 5 - Multiple Linear Regression\\Multiple_Linear_Regression\\50_Startups.csv"
)
head(x = dataset)
# Missing-value check.
# Count the NA entries in each column of `dataset`, one column at a time.
# The line after each call is the console output recorded for it.
sum(is.na(dataset$R.D.Spend))
[1] 0
sum(is.na(dataset$Administration))
[1] 0
sum(is.na(dataset$Marketing.Spend))
[1] 0
sum(is.na(dataset$State))
[1] 0
sum(is.na(dataset$Profit))
[1] 0
# Every count is zero, so no column contains missing values
# and no missing-value handling is needed.
# Encode the State column as a factor with labels
# "New York" -> 1, "California" -> 2, "Florida" -> 3.
# Any value outside these three levels becomes NA.
dataset[["State"]] <- factor(
  dataset[["State"]],
  levels = c("New York", "California", "Florida"),
  labels = c(1, 2, 3)
)
# Split the data 80/20 into a training set and a test set.
# This step exists in the notebook source embedded at the end of this file
# but was missing from the script, which left training_set and test_set
# undefined at the point where they are printed and used for fitting.
library(caTools)
set.seed(123) # fixed seed so the split is reproducible
is_train <- sample.split(dataset$Profit, SplitRatio = 0.8)
training_set <- subset(dataset, is_train)
test_set <- subset(dataset, !is_train)
# Feature scaling: skipped.
# Print both subsets for inspection.
training_set
test_set
# Fit a multiple linear regression of Profit on all four predictors,
# using the training rows only.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = training_set
)
# Alternative: the dot shorthand regresses Profit on every other column.
# regressor <- lm(formula = Profit ~ ., data = training_set)
# Open both subsets in the data viewer, then report the fit.
View(training_set)
View(test_set)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
State, data = training_set)
Residuals:
Min 1Q Median 3Q Max
-33128 -4865 5 6098 18065
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.965e+04 7.637e+03 6.501 1.94e-07 ***
R.D.Spend 7.986e-01 5.604e-02 14.251 6.70e-16 ***
Administration -2.942e-02 5.828e-02 -0.505 0.617
Marketing.Spend 3.268e-02 2.127e-02 1.537 0.134
State2 1.213e+02 3.751e+03 0.032 0.974
State3 2.376e+02 4.127e+03 0.058 0.954
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9908 on 34 degrees of freedom
Multiple R-squared: 0.9499, Adjusted R-squared: 0.9425
F-statistic: 129 on 5 and 34 DF, p-value: < 2.2e-16
# Predict Profit for the test-set rows with the fitted model and print
# the predictions.
y_pred <- predict(object = regressor, newdata = test_set)
y_pred
4 5 8 11 16 20 21 24 31
173981.09 172655.64 160250.02 135513.90 146059.36 114151.03 117081.62 110671.31 98975.29
32
96867.03
# Backward elimination, step 1: start from the model with every predictor.
# The whole dataset (not just the training rows) is used so that each
# variable's significance is judged on all observations.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
  data = dataset
)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
State, data = dataset)
Residuals:
Min 1Q Median 3Q Max
-33504 -4736 90 6672 17338
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.008e+04 6.953e+03 7.204 5.76e-09 ***
R.D.Spend 8.060e-01 4.641e-02 17.369 < 2e-16 ***
Administration -2.700e-02 5.223e-02 -0.517 0.608
Marketing.Spend 2.698e-02 1.714e-02 1.574 0.123
State2 4.189e+01 3.256e+03 0.013 0.990
State3 2.407e+02 3.339e+03 0.072 0.943
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9439 on 44 degrees of freedom
Multiple R-squared: 0.9508, Adjusted R-squared: 0.9452
F-statistic: 169.9 on 5 and 44 DF, p-value: < 2.2e-16
# Backward elimination, step 2: State had the highest p-values in the
# previous summary, so refit without it.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = dataset
)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
data = dataset)
Residuals:
Min 1Q Median 3Q Max
-33534 -4795 63 6606 17275
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
Administration -2.682e-02 5.103e-02 -0.526 0.602
Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9232 on 46 degrees of freedom
Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
# Backward elimination, step 3: Administration had the highest p-value in
# the previous summary, so refit without it.
regressor <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = dataset
)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)
Residuals:
Min 1Q Median 3Q Max
-33645 -4632 -414 6484 17097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
# Backward elimination, step 4: Marketing.Spend had the highest remaining
# p-value in the previous summary, so refit with R.D.Spend alone.
regressor <- lm(
  formula = Profit ~ R.D.Spend,
  data = dataset
)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend, data = dataset)
Residuals:
Min 1Q Median 3Q Max
-34351 -4626 -375 6249 17188
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.903e+04 2.538e+03 19.32 <2e-16 ***
R.D.Spend 8.543e-01 2.931e-02 29.15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9416 on 48 degrees of freedom
Multiple R-squared: 0.9465, Adjusted R-squared: 0.9454
F-statistic: 849.8 on 1 and 48 DF, p-value: < 2.2e-16
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCiMgSW1wb3J0aW5nIHRoZSBkYXRhc2V0DQpkYXRhc2V0IDwtICByZWFkLmNzdigiRzpcXFJTdHVkaW9cXHVkZW15XFxtbFxcTWFjaGluZSBMZWFybmluZyBBWlxcUGFydCAyIC0gUmVncmVzc2lvblxcU2VjdGlvbiA1IC0gTXVsdGlwbGUgTGluZWFyIFJlZ3Jlc3Npb25cXE11bHRpcGxlX0xpbmVhcl9SZWdyZXNzaW9uXFw1MF9TdGFydHVwcy5jc3YiKQ0KaGVhZChkYXRhc2V0KQ0KDQpgYGANCg0KYGBge3J9DQojIHRha2luZyBjYXJlIG9mIG1pc3NpbmcgdmFsdWVzDQojIFRlc3QgZm9yIG1pc3NpbmcgdmFsdWVzDQpzdW0oaXMubmEoZGF0YXNldCRSLkQuU3BlbmQpKQ0Kc3VtKGlzLm5hKGRhdGFzZXQkQWRtaW5pc3RyYXRpb24pKQ0Kc3VtKGlzLm5hKGRhdGFzZXQkTWFya2V0aW5nLlNwZW5kKSkNCnN1bShpcy5uYShkYXRhc2V0JFN0YXRlKSkNCnN1bShpcy5uYShkYXRhc2V0JFByb2ZpdCkpDQoNCiMgaWYgYWxsIHplcm9zLCB0aGVuIHRoZXJlIGFyZSBubyBtaXNzaW5nIG51bWJlcnMuIA0KIyBubyBuZWVkIHRvIHdvcmsgd2l0aCBtaXNzDQoNCmBgYA0KDQpgYGB7cn0NCiMgY29udmVydCB2YXJpYWJsZXMgaW50byBmYWN0b3JzDQoNCmRhdGFzZXQkU3RhdGUgPC0gIGZhY3RvcihkYXRhc2V0JFN0YXRlLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gYygiTmV3IFlvcmsiLCAiQ2FsaWZvcm5pYSIsIkZsb3JpZGEiKSwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgIGxhYmVscyA9IGMoMSwyLDMpKQ0KDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KGNhVG9vbHMpDQpzZXQuc2VlZCgxMjMpDQpzcGxpdCA9IHNhbXBsZS5zcGxpdChkYXRhc2V0JFByb2ZpdCwgU3BsaXRSYXRpbyA9IDAuOCkNCg0KIyBjcmVhdGUgdHJhaW5pbmcgc2V0DQp0cmFpbmluZ19zZXQgPSBzdWJzZXQoZGF0YXNldCwgc3BsaXQgPT0gVFJVRSkNCnRlc3Rfc2V0ID0gc3Vic2V0KGRhdGFzZXQsIHNwbGl0ID09IEZBTFNFKQ0KDQojIGZlYXR1cmUgc2NhbGluZw0KIyBza2lwcGVkLg0KDQp0cmFpbmluZ19zZXQNCnRlc3Rfc2V0DQpgYGANCg0KYGBge3J9DQojIGZpdHRpbmcgbXVsaXRwbGUgbGluZWFyIHJlZ3Jlc3Npb24gdG8gdGhlIHRyYWluaW5nIHNldA0KcmVncmVzc29yID0gbG0oZm9ybXVsYSA9IFByb2ZpdCB+IFIuRC5TcGVuZCsgQWRtaW5pc3RyYXRpb24gKyBNYXJrZXRpbmcuU3BlbmQgKyBTdGF0ZSwgZGF0YSA9IHRyYWluaW5nX3NldCkNCiMgeW91IGNhbiBhbHNvIHVzZSB0aGUgZG90IA0KIyByZWdyZXNzb3IgPSBsbShmb3JtdWxhID0gUHJvZml0IH4gLiwgZGF0YT10cmFpbmluZ19zZXQpDQpWaWV3KHRyYWluaW5nX3NldCkNClZpZXcodGVzdF9zZXQpDQpzdW1tYXJ5KHJlZ3Jlc3NvcikNCmBgYA0KDQpgYGB7cn0NCiMgUHJlZGl0Y2luZyB0aGUgVGVzdCBzZXQgcmVzdWx0cw0KeV9wcmVkID0gcHJlZGljdChyZWdyZXNzb3Is
IG5ld2RhdGEgPSB0ZXN0X3NldCkNCnlfcHJlZA0KDQpgYGANCg0KDQoNCmBgYHtyfQ0KIyBCdWlsZGluZyB0aGUgb3B0aW1hbCBtb2RlbCB1c2luZyBiYWNrd2FyZCBlbGltaW5hdGlvbg0KcmVncmVzc29yID0gbG0oZm9ybXVsYSA9IFByb2ZpdCB+IFIuRC5TcGVuZCArIEFkbWluaXN0cmF0aW9uICsgTWFya2V0aW5nLlNwZW5kICsgU3RhdGUsIA0KICAgICAgICAgICAgICAgZGF0YSA9IGRhdGFzZXQpDQojIHdlIHVzZWQgZGF0YXNldCB0byBzZWUgdGhlIHdob2xlIGVmZmVjdCBhbmQgZmluZCB0aGUgbW9zdCBzaWduaWZpY2FudCB2YXJpYWJsZXMNCnN1bW1hcnkocmVncmVzc29yKQ0KYGBgDQpgYGB7cn0NCiMgcmVtb3ZlIHRoZSBoaWdoZXN0IFAgdmFsdWUgdG8gYnVpbGQgYSBuZXcgcmVncmVzc29yDQpyZWdyZXNzb3IgPSBsbShmb3JtdWxhID0gUHJvZml0IH4gUi5ELlNwZW5kICsgQWRtaW5pc3RyYXRpb24gKyBNYXJrZXRpbmcuU3BlbmQsIA0KICAgICAgICAgICAgICAgZGF0YSA9IGRhdGFzZXQpDQpzdW1tYXJ5KHJlZ3Jlc3NvcikNCmBgYA0KDQpgYGB7cn0NCiMgcmVtb3ZlIHRoZSBoaWdoZXN0IFAgdmFsdWUgdG8gYnVpbGQgYSBuZXcgcmVncmVzc29yDQpyZWdyZXNzb3IgPSBsbShmb3JtdWxhID0gUHJvZml0IH4gUi5ELlNwZW5kICArIE1hcmtldGluZy5TcGVuZCwgDQogICAgICAgICAgICAgICBkYXRhID0gZGF0YXNldCkNCnN1bW1hcnkocmVncmVzc29yKQ0KYGBgDQoNCmBgYHtyfQ0KIyByZW1vdmUgdGhlIGhpZ2hlc3QgUCB2YWx1ZSB0byBidWlsZCBhIG5ldyByZWdyZXNzb3INCnJlZ3Jlc3NvciA9IGxtKGZvcm11bGEgPSBQcm9maXQgfiBSLkQuU3BlbmQgLCANCiAgICAgICAgICAgICAgIGRhdGEgPSBkYXRhc2V0KQ0Kc3VtbWFyeShyZWdyZXNzb3IpDQoNCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg==