# Load the 50-startups data set and take a first look at it.
fifty.st <- read.csv("C:\\Users\\bdee\\Videos\\Data science\\Linear Regression files\\50_startups.csv")
# View() opens a spreadsheet-style viewer, which is only useful (and only
# reliably available) in an interactive session, so skip it otherwise.
if (interactive()) {
  View(fifty.st)
}
# NOTE(review): attach() is kept because later statements in this script refer
# to columns by bare name (e.g. mean(Profit)). The attached columns are a copy
# taken at this point and do not follow later changes to fifty.st; prefer
# `data = fifty.st` / with(fifty.st, ...) and drop this call once no bare
# references remain.
attach(fifty.st)
summary(fifty.st)
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 California:17
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 Florida :16
## Median : 73051 Median :122700 Median :212716 New York :17
## Mean : 73722 Mean :121345 Mean :211025
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469
## Max. :165349 Max. :182646 Max. :471784
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
# Keep only the predictor and the response used by the models in this script.
fifty.st <- fifty.st[, c("Marketing.Spend", "Profit")]

# Univariate summaries. Columns are read from the data frame itself rather
# than from the attach()ed copy, so they always reflect the current fifty.st.
mean(fifty.st$Profit)
## [1] 112012.6
mean(fifty.st$Marketing.Spend)
## [1] 211025.1
summary(fifty.st$Profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14681 90139 107978 112013 139766 192262
summary(fifty.st$Marketing.Spend)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 129300 212716 211025 299469 471784

# Normal Q-Q plots to eyeball the distribution of each variable.
qqnorm(fifty.st$Profit)

qqnorm(fifty.st$Marketing.Spend)

# Scatter plot of response against predictor; with() keeps the axis labels
# as the plain column names.
with(fifty.st, plot(Marketing.Spend, Profit))

# Pearson correlation between marketing spend and profit.
cor(fifty.st$Marketing.Spend, fifty.st$Profit)
## [1] 0.7477657
# Simple linear regression of Profit on Marketing.Spend.
# `data =` makes the model read its variables from fifty.st explicitly instead
# of relying on whatever happens to be attach()ed on the search path.
m1 <- lm(Profit ~ Marketing.Spend, data = fifty.st)
summary(m1)
##
## Call:
## lm(formula = Profit ~ Marketing.Spend, data = fifty.st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83739 -18802 4925 15879 64642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.000e+04 7.685e+03 7.808 4.29e-10 ***
## Marketing.Spend 2.465e-01 3.159e-02 7.803 4.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27040 on 48 degrees of freedom
## Multiple R-squared: 0.5592, Adjusted R-squared: 0.55
## F-statistic: 60.88 on 1 and 48 DF, p-value: 4.381e-10
# In-sample predictions from m1 for every row of fifty.st.
pv <- predict(m1, newdata = fifty.st)
# predict() on an lm returns a plain numeric vector ...
class(pv)
## [1] "numeric"
# ... so wrap it in a one-column data frame (column name "pv") and print it.
pv <- data.frame(pv = pv)
pv
## pv
## 1 176279.11
## 2 169406.45
## 3 160542.80
## 4 154446.65
## 5 150249.15
## 6 149434.09
## 7 91480.54
## 8 139825.96
## 9 136803.53
## 10 135169.09
## 11 116482.39
## 12 121555.41
## 13 121578.79
## 14 122275.16
## 15 123223.53
## 16 124520.73
## 17 125154.08
## 18 129646.61
## 19 132689.21
## 20 60003.55
## 21 133612.17
## 22 133876.58
## 23 134759.39
## 24 135116.63
## 25 94649.51
## 26 94005.71
## 27 93041.43
## 28 147048.97
## 29 89122.27
## 30 86408.80
## 31 82463.69
## 32 81745.75
## 33 71361.69
## 34 112902.29
## 35 111956.59
## 36 110655.28
## 37 109573.12
## 38 108563.27
## 39 105663.85
## 40 103133.75
## 41 102590.64
## 42 100538.88
## 43 96479.79
## 44 68761.27
## 45 66986.90
## 46 60472.79
## 47 133230.16
## 48 60003.55
## 49 60003.55
## 50 71136.87
# Put the predictions next to the observed data and save the combined table.
final <- cbind(fifty.st, pv)
# A relative file name: the CSV is written to the current working directory,
# which getwd() reports just below.
write.csv(x = final, file = "aug04.csv")
getwd()
## [1] "C:/Users/bdee/Desktop"
# Score new observations with m1.
test <- read.csv("C:\\Users\\bdee\\Videos\\Data science\\Linear Regression files\\newdata.csv")
# predict() looks for the model's predictor in `newdata` first and then in the
# calling environment. If `test` has no Marketing.Spend column it silently
# picks up the attach()ed training column instead and returns the training
# fits (the recorded run warned: "'newdata' had 4 rows but variables found
# have 50 rows"). Fail fast so that mistake cannot go unnoticed.
if (!"Marketing.Spend" %in% names(test)) {
  stop(
    "newdata.csv has no 'Marketing.Spend' column; found: ",
    paste(names(test), collapse = ", "),
    call. = FALSE
  )
}
pv1 <- predict(m1, newdata = test)
pv1
## NOTE: the output recorded earlier was the 50 training-set fits, not
## predictions for the rows of `test`; re-run to regenerate.
# Wrap the new predictions (not `pv`, which is already a data frame) in a
# data frame.
pv1 <- as.data.frame(pv1)
getwd()
## [1] "C:/Users/bdee/Desktop"
# Logarithmic transformations
# reg_log <- lm(Profit ~ log(Marketing.Spend))
# Log-response model: log(Profit) is linear in Marketing.Spend.
# `data =` ties the fit to fifty.st instead of the attach()ed globals.
reg_exp <- lm(log(Profit) ~ Marketing.Spend, data = fifty.st)
summary(reg_exp)
##
## Call:
## lm(formula = log(Profit) ~ Marketing.Spend, data = fifty.st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5289 -0.1353 0.0459 0.2052 0.7091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.101e+01 9.905e-02 111.147 < 2e-16 ***
## Marketing.Spend 2.527e-06 4.071e-07 6.207 1.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3485 on 48 degrees of freedom
## Multiple R-squared: 0.4453, Adjusted R-squared: 0.4337
## F-statistic: 38.53 on 1 and 48 DF, p-value: 1.208e-07
# Predict from the log-response model for the rows of `test`.
# Without a Marketing.Spend column in `test`, predict() falls back to the
# attach()ed 50-row training column (the recorded run warned: "'newdata' had
# 4 rows but variables found have 50 rows"), so check for it explicitly.
if (!"Marketing.Spend" %in% names(test)) {
  stop(
    "`test` has no 'Marketing.Spend' column; found: ",
    paste(names(test), collapse = ", "),
    call. = FALSE
  )
}
# The response was modelled as log(Profit), so these predictions are on the
# log scale; apply exp() to express them in profit units.
predict(reg_exp, newdata = test)
## NOTE: the output recorded earlier was the 50 training-set fits, not
## predictions for the rows of `test`; re-run to regenerate.
# Reciprocal-log predictor model: Profit against 1 / log(Marketing.Spend).
# Inside a formula `/` is a nesting operator, not division, so the bare
# `1/log(Marketing.Spend)` collapses to an intercept-only model. I() makes R
# evaluate the expression arithmetically.
# Caveat: the data contain rows with zero marketing spend; log(0) is -Inf and
# 1 / -Inf is 0, so those rows enter the fit with a predictor value of 0.
# A spend of exactly 1 would give 1 / 0 = Inf and make lm() fail.
reg_exl <- lm(Profit ~ I(1 / log(Marketing.Spend)), data = fifty.st)
summary(reg_exl)
## NOTE: the summary recorded earlier (intercept only, 49 residual degrees of
## freedom) came from the unprotected formula and no longer applies; re-run to
## regenerate.
# Square-root-response model: sqrt(Profit) is linear in Marketing.Spend.
# `data =` ties the fit to fifty.st instead of the attach()ed globals.
reg_exr <- lm(sqrt(Profit) ~ Marketing.Spend, data = fifty.st)
summary(reg_exr)
##
## Call:
## lm(formula = sqrt(Profit) ~ Marketing.Spend, data = fifty.st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.69 -27.51 8.49 27.84 102.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.475e+02 1.287e+01 19.229 < 2e-16 ***
## Marketing.Spend 3.835e-04 5.291e-05 7.247 3.07e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.3 on 48 degrees of freedom
## Multiple R-squared: 0.5225, Adjusted R-squared: 0.5126
## F-statistic: 52.53 on 1 and 48 DF, p-value: 3.073e-09
# Not run: square-root transform on the predictor instead of the response.
# reg_sqr <- lm(Profit ~ sqrt(Marketing.Spend), data = fifty.st)
# Reciprocal-log-response model: 1 / log(Profit) against Marketing.Spend.
# On the left-hand side of a formula the expression is evaluated
# arithmetically, so no I() is needed here (unlike on the right-hand side).
# `data =` ties the fit to fifty.st instead of the attach()ed globals.
reg_tr <- lm(1/log(Profit) ~ Marketing.Spend, data = fifty.st)
summary(reg_tr)
##
## Call:
## lm(formula = 1/log(Profit) ~ Marketing.Spend, data = fifty.st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0056117 -0.0015993 -0.0003705 0.0008909 0.0141699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.095e-02 8.371e-04 108.642 < 2e-16 ***
## Marketing.Spend -1.973e-08 3.441e-09 -5.734 6.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.002946 on 48 degrees of freedom
## Multiple R-squared: 0.4065, Adjusted R-squared: 0.3942
## F-statistic: 32.88 on 1 and 48 DF, p-value: 6.366e-07