# Reading the xlsx file from local disk
library(readxl)
Brokerage_Satisfaction <- read_excel(
  path = "C:/Users/lswa/Downloads/Brokerage Satisfaction.xlsx"
)
# Coerce the object returned by read_excel() into a plain data.frame and
# print its structure (column names, types and leading values).
brokerage_data <- as.data.frame(x = Brokerage_Satisfaction)
str(object = brokerage_data)
## 'data.frame': 13 obs. of 4 variables:
## $ Brokerage : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Satisfaction_with_Trade_Price : num 3.2 3.3 3.1 2.8 2.9 2.7 2.4 2.6 2.3 3.7 ...
## $ Satisfaction_with_Speed_of_Execution : num 3.1 3.1 3.3 3.5 3.2 3.8 3.7 2.6 2.7 3.9 ...
## $ Overall_Satisfaction_with_Electronic_Trades: num 3.2 3.2 4 3.7 3 2.7 3.4 2.7 2.3 4 ...
# Creating the regression model
# Multiple linear regression of overall satisfaction on the trade-price and
# speed-of-execution satisfaction scores; summary() prints the coefficient
# table, residual standard error, R-squared and overall F-test.
model <- lm(
  formula = Overall_Satisfaction_with_Electronic_Trades ~
    Satisfaction_with_Trade_Price + Satisfaction_with_Speed_of_Execution,
  data = brokerage_data
)
summary(object = model)
##
## Call:
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ Satisfaction_with_Trade_Price +
## Satisfaction_with_Speed_of_Execution, data = brokerage_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59269 -0.14822 -0.11176 0.06944 0.64298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6400 0.8730 -0.733 0.48035
## Satisfaction_with_Trade_Price 0.7703 0.1610 4.785 0.00074 ***
## Satisfaction_with_Speed_of_Execution 0.4876 0.2114 2.307 0.04376 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3596 on 10 degrees of freedom
## Multiple R-squared: 0.7196, Adjusted R-squared: 0.6635
## F-statistic: 12.83 on 2 and 10 DF, p-value: 0.001734
Based on the resulting p-values, there is likely a relationship between overall satisfaction and satisfaction with the speed of execution: the coefficient is statistically significant at the 5% level.
The p-value is 0.04376.
Based on the resulting p-values, there is also strong evidence of a relationship between overall satisfaction and satisfaction with trade price: the coefficient is statistically significant at the 0.1% level.
The p-value is 0.00074.
# Calculating the variation
# Recompute R-squared by hand from the sums of squares:
#   SSE = sum((fitted - observed)^2)        unexplained variation
#   SST = sum((observed - mean(observed))^2) total variation
#   SSR = sum((fitted - mean(observed))^2)   variation explained by the model
# The fitted values are stored in `predicted` rather than `pi`: assigning to
# `pi` masks R's built-in constant for everything that runs afterwards.
observed <- brokerage_data$Overall_Satisfaction_with_Electronic_Trades
observed_mean <- mean(observed)
predicted <- predict(object = model, newdata = brokerage_data)
SSE <- sum((predicted - observed)^2)
SST <- sum((observed - observed_mean)^2)
SSR <- sum((predicted - observed_mean)^2)
R_Squared <- SSR / SST
R_Squared
## [1] 0.7195709
# Obtaining the residuals
# Raw residuals of the fitted model, one per observation.
residuals(model)
## 1 2 3 4 5 6
## -0.136538690 -0.213570295 0.642976250 0.476554401 -0.154202205 -0.592688992
## 7 8 9 10 11 12
## 0.387164159 0.069442612 -0.148220904 -0.111763385 -0.004767449 -0.133717145
## 13
## -0.080668358
# Obtaining MSRes
# Residual mean square: the square of the residual standard error reported
# by summary(model).
MSRes <- summary(model)[["sigma"]]^2
MSRes
## [1] 0.1293425
# Obtaining standardized residuals
# Each raw residual divided by the residual standard error (sqrt of MSRes);
# no per-observation leverage adjustment is applied here.
StandardRes <- residuals(model) / summary(model)[["sigma"]]
StandardRes
## 1 2 3 4 5 6
## -0.37965142 -0.59384096 1.78782181 1.32507904 -0.42876555 -1.64799603
## 7 8 9 10 11 12
## 1.07652581 0.19308803 -0.41213430 -0.31076267 -0.01325609 -0.37180600
## 13
## -0.22430168
# Plotting the QQ Plot
# Normal quantile-quantile plot of the raw residuals, with a reference line
# added on top of it.
qqnorm(y = residuals(model), main = "QQ Plot")
qqline(y = residuals(model))
The average leverage of the datapoints is
# Average of the hat values (leverages) across all observations
mean(x = hatvalues(model = model))
## [1] 0.2307692
# Influence measure
# Per-observation diagnostics table (dfbeta columns, dffit, cov.r, cook.d,
# hat); the trailing `inf` column marks flagged observations with `*`.
influence.measures(model = model)
## Influence measures of
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ Satisfaction_with_Trade_Price + Satisfaction_with_Speed_of_Execution, data = brokerage_data) :
##
## dfb.1_ dfb.S__T dfb.S__S dffit cov.r cook.d hat inf
## 1 0.00638 -0.08170 0.02737 -0.14640 1.491 0.007808 0.1246
## 2 0.02878 -0.16249 0.04020 -0.25528 1.413 0.023141 0.1442
## 3 -0.21052 0.39845 0.10745 0.76957 0.407 0.140945 0.1058
## 4 -0.16744 0.07823 0.22451 0.48840 0.795 0.071135 0.0987
## 5 -0.00734 -0.03172 0.01153 -0.12929 1.408 0.006067 0.0832
## 6 0.53722 -0.06929 -0.72909 -0.97249 0.502 0.234811 0.1761
## 7 -0.12883 -0.14633 0.30863 0.51164 1.043 0.083660 0.1547
## 8 0.09676 -0.02703 -0.09563 0.11749 1.773 0.005088 0.2378
## 9 -0.22567 0.12412 0.18572 -0.25542 1.688 0.023622 0.2406
## 10 0.30144 -0.26306 -0.23571 -0.36375 2.369 0.048142 0.4509 *
## 11 -0.00862 0.00297 0.00831 -0.00986 1.961 0.000036 0.3007 *
## 12 -0.03593 -0.03966 0.05476 -0.13245 1.471 0.006397 0.1100
## 13 -0.12621 0.67685 -0.30976 -0.83233 5.644 0.250904 0.7727 *
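Observing the hat column of the Influence Measures table above, datapoint 13 (hat = 0.7727) has high leverage, well above twice the average leverage (2 × 0.2308 ≈ 0.4615); datapoint 10 (hat = 0.4509) is the next highest and falls just below that cutoff. Datapoint 6 has low leverage (hat = 0.1761) but the largest |dffit| and the second-largest Cook's distance, so it is better described as influential than as high leverage.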
# Cooks Distance
# Cook's distance for every observation in the fitted model.
cooks.distance(model = model)
## 1 2 3 4 5 6
## 7.807911e-03 2.314099e-02 1.409451e-01 7.113529e-02 6.067034e-03 2.348107e-01
## 7 8 9 10 11 12
## 8.365990e-02 5.087726e-03 2.362204e-02 4.814212e-02 3.601908e-05 6.396672e-03
## 13
## 2.509036e-01
# Draw the default diagnostic plots for the fitted lm object
plot(x = model)
## Question 3
# Regression models with different scenarios
# Four hypothetical brokerages to score with the fitted model; column names
# match the predictors in the model formula so predict() can look them up.
# NOTE(review): rows 1 and 4 are identical (speed 2, price 4) — confirm
# whether the fourth scenario was meant to use different values.
new_scenario <- data.frame(
  Satisfaction_with_Speed_of_Execution = c(2, 3, 3, 2),
  Satisfaction_with_Trade_Price = c(4, 5, 4, 4)
)
new_scenario
## Satisfaction_with_Speed_of_Execution Satisfaction_with_Trade_Price
## 1 2 4
## 2 3 5
## 3 3 4
## 4 2 4
# Predicted overall satisfaction for each row of new_scenario
prediction <- predict(
  object = model,
  newdata = new_scenario,
  type = "response"
)
prediction
## 1 2 3 4
## 3.416450 4.674349 3.904033 3.416450
Maximum leverage of the datapoints is
# Largest hat value (leverage) among the observations used to fit the model
max(hatvalues(model = model))
## [1] 0.7727345
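To check for extrapolation, the leverage of each new scenario is computed below and compared with the maximum leverage of the observed datapoints; a scenario whose leverage exceeds that maximum lies outside the region covered by the data.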
# Leverage of a new point x0 is h0 = x0' (X'X)^-1 x0, where X is the design
# matrix of the fitted model. A scenario whose leverage exceeds the largest
# leverage of the observed data is treated as an extrapolation.
#
# The scenario rows are expanded through the model's own terms so that their
# columns line up with the design matrix: (Intercept),
# Satisfaction_with_Trade_Price, Satisfaction_with_Speed_of_Execution.
# Hand-typed vectors such as c(1, 2, 4) follow the column order of
# new_scenario (speed first) and therefore put the speed score in the
# trade-price column.
design <- model.matrix(model)
xtx_inverse <- solve(crossprod(design))
scenario_design <- model.matrix(
  delete.response(terms(model)),
  data = new_scenario
)
# Row-wise quadratic form: element i equals x_i' (X'X)^-1 x_i
scenario_leverage <- rowSums((scenario_design %*% xtx_inverse) * scenario_design)
names(scenario_leverage) <- paste0(
  "scenario_",
  letters[seq_len(nrow(new_scenario))]
)
scenario_leverage
# TRUE marks a scenario whose leverage exceeds every observed leverage
scenario_leverage > max(hatvalues(model))
# NOTE(review): re-run this chunk to record the leverages; values obtained
# from the hand-typed vectors used the swapped column order.
Scenario B would be an extrapolation, since its leverage exceeds the maximum leverage of the observed datapoints.