Homework 2

Question 1

# Import the brokerage satisfaction workbook from disk
library(readxl)
Brokerage_Satisfaction <- read_excel(
  path = "C:/Users/lswa/Downloads/Brokerage Satisfaction.xlsx"
)

# Coerce the imported sheet to a plain data.frame and inspect its structure
brokerage_data <- as.data.frame(x = Brokerage_Satisfaction)
str(object = brokerage_data)

## 'data.frame':    13 obs. of  4 variables:
##  $ Brokerage                                  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Satisfaction_with_Trade_Price              : num  3.2 3.3 3.1 2.8 2.9 2.7 2.4 2.6 2.3 3.7 ...
##  $ Satisfaction_with_Speed_of_Execution       : num  3.1 3.1 3.3 3.5 3.2 3.8 3.7 2.6 2.7 3.9 ...
##  $ Overall_Satisfaction_with_Electronic_Trades: num  3.2 3.2 4 3.7 3 2.7 3.4 2.7 2.3 4 ...

# Fit overall satisfaction as a linear function of the trade-price and
# speed-of-execution satisfaction scores, then print the fit summary
model <- lm(
  formula = Overall_Satisfaction_with_Electronic_Trades ~
    Satisfaction_with_Trade_Price + Satisfaction_with_Speed_of_Execution,
  data = brokerage_data
)
summary(model)

## 
## Call:
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ Satisfaction_with_Trade_Price + 
##     Satisfaction_with_Speed_of_Execution, data = brokerage_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59269 -0.14822 -0.11176  0.06944  0.64298 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           -0.6400     0.8730  -0.733  0.48035    
## Satisfaction_with_Trade_Price          0.7703     0.1610   4.785  0.00074 ***
## Satisfaction_with_Speed_of_Execution   0.4876     0.2114   2.307  0.04376 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3596 on 10 degrees of freedom
## Multiple R-squared:  0.7196, Adjusted R-squared:  0.6635 
## F-statistic: 12.83 on 2 and 10 DF,  p-value: 0.001734

Based on the resulting p-value, there is evidence at the 5% significance level of a relationship between overall satisfaction and satisfaction with the speed of execution.

The p-value is 0.04376.

Based on the resulting p-value, there is also strong evidence of a relationship between overall satisfaction and satisfaction with the trade price.

The p-value is 0.00074.

# Calculating the variation
# Decompose the variation of the response into the part explained by the
# model (SSR) and the part left in the residuals (SSE), then recover
# R-squared as SSR / SST.
observed <- brokerage_data$Overall_Satisfaction_with_Electronic_Trades
observed_mean <- mean(observed)

# Stored as `fitted_values` rather than `pi` so that R's built-in constant
# `pi` is not masked for the rest of the session.
fitted_values <- predict(object = model, newdata = brokerage_data)

# Unexplained variation: squared distance between fitted and observed values
SSE <- sum((fitted_values - observed)^2)

# Total variation of the response around its mean
SST <- sum((observed - observed_mean)^2)

# Explained variation: squared distance between fitted values and the mean
SSR <- sum((fitted_values - observed_mean)^2)

R_Squared <- SSR / SST
R_Squared

## [1] 0.7195709

# Print the raw residuals stored on the fitted model, one per observation
raw_residuals <- model$residuals
raw_residuals

##            1            2            3            4            5            6 
## -0.136538690 -0.213570295  0.642976250  0.476554401 -0.154202205 -0.592688992 
##            7            8            9           10           11           12 
##  0.387164159  0.069442612 -0.148220904 -0.111763385 -0.004767449 -0.133717145 
##           13 
## -0.080668358

# Residual mean square: the square of the residual standard error
# reported by summary(model)
MSRes <- summary(model)[["sigma"]]^2
MSRes

## [1] 0.1293425

# Standardized residuals: each raw residual divided by the residual
# standard error of the fit
residual_se <- summary(model)[["sigma"]]
StandardRes <- model$residuals / residual_se
StandardRes

##           1           2           3           4           5           6 
## -0.37965142 -0.59384096  1.78782181  1.32507904 -0.42876555 -1.64799603 
##           7           8           9          10          11          12 
##  1.07652581  0.19308803 -0.41213430 -0.31076267 -0.01325609 -0.37180600 
##          13 
## -0.22430168

# Normal Q-Q plot of the raw residuals, with a reference line overlaid
qq_residuals <- model$residuals
qqnorm(y = qq_residuals, main = "QQ Plot")
qqline(y = qq_residuals)

Question 2

The average leverage of the datapoints is

# Leverage of every observation (diagonal of the hat matrix), averaged
leverage <- hatvalues(model)
mean(leverage)

## [1] 0.2307692

# Table of influence diagnostics (DFBETAS, DFFITS, covariance ratio,
# Cook's distance, hat values) for every observation
influence_summary <- influence.measures(model = model)
influence_summary

## Influence measures of
##   lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ Satisfaction_with_Trade_Price +      Satisfaction_with_Speed_of_Execution, data = brokerage_data) :
## 
##      dfb.1_ dfb.S__T dfb.S__S    dffit cov.r   cook.d    hat inf
## 1   0.00638 -0.08170  0.02737 -0.14640 1.491 0.007808 0.1246    
## 2   0.02878 -0.16249  0.04020 -0.25528 1.413 0.023141 0.1442    
## 3  -0.21052  0.39845  0.10745  0.76957 0.407 0.140945 0.1058    
## 4  -0.16744  0.07823  0.22451  0.48840 0.795 0.071135 0.0987    
## 5  -0.00734 -0.03172  0.01153 -0.12929 1.408 0.006067 0.0832    
## 6   0.53722 -0.06929 -0.72909 -0.97249 0.502 0.234811 0.1761    
## 7  -0.12883 -0.14633  0.30863  0.51164 1.043 0.083660 0.1547    
## 8   0.09676 -0.02703 -0.09563  0.11749 1.773 0.005088 0.2378    
## 9  -0.22567  0.12412  0.18572 -0.25542 1.688 0.023622 0.2406    
## 10  0.30144 -0.26306 -0.23571 -0.36375 2.369 0.048142 0.4509   *
## 11 -0.00862  0.00297  0.00831 -0.00986 1.961 0.000036 0.3007   *
## 12 -0.03593 -0.03966  0.05476 -0.13245 1.471 0.006397 0.1100    
## 13 -0.12621  0.67685 -0.30976 -0.83233 5.644 0.250904 0.7727   *

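Observing the `hat` column of the influence measures table above, datapoints 13 (0.7727) and 10 (0.4509) have the highest leverage, well above the average leverage of 0.2308. Datapoint 6 (hat = 0.1761) is below the average leverage, although it has one of the largest Cook's distances (0.2348).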

# Cook's distance for every observation of the fitted model
cooks_d <- cooks.distance(model = model)
cooks_d

##            1            2            3            4            5            6 
## 7.807911e-03 2.314099e-02 1.409451e-01 7.113529e-02 6.067034e-03 2.348107e-01 
##            7            8            9           10           11           12 
## 8.365990e-02 5.087726e-03 2.362204e-02 4.814212e-02 3.601908e-05 6.396672e-03 
##           13 
## 2.509036e-01

# Draw the default diagnostic plots for the fitted linear model
plot(x = model)

Question 3

# Predictor values for the four scenarios to be scored, one row per scenario
# NOTE(review): rows 1 and 4 are identical (speed 2, price 4) - confirm that
# the fourth scenario is intended to repeat the first.
speed_of_execution <- c(2, 3, 3, 2)
trade_price <- c(4, 5, 4, 4)
new_scenario <- data.frame(
  Satisfaction_with_Speed_of_Execution = speed_of_execution,
  Satisfaction_with_Trade_Price = trade_price
)
new_scenario

##   Satisfaction_with_Speed_of_Execution Satisfaction_with_Trade_Price
## 1                                    2                             4
## 2                                    3                             5
## 3                                    3                             4
## 4                                    2                             4

# Predicted overall satisfaction for each row of new_scenario
prediction <- predict(
  object = model,
  newdata = new_scenario,
  type = "response"
)
prediction

##        1        2        3        4 
## 3.416450 4.674349 3.904033 3.416450

Question 4

Maximum leverage of the datapoints is

# Largest leverage (hat value) among the observed datapoints
leverage <- hatvalues(model)
max(leverage)

## [1] 0.7727345

Question 5

To determine whether a prediction is an extrapolation, the leverage of each new scenario is computed below and compared with the maximum leverage of the observed datapoints (0.7727).

# Design matrix of the fitted model. Its column names define which value of a
# scenario belongs to which coefficient. (Named `design` rather than `matrix`
# so that base::matrix is not shadowed.)
design <- model.matrix(model)

# (X'X)^-1 is the same for every scenario, so it is inverted only once.
xtx_inv <- solve(crossprod(design))

# Build a scenario as a named vector so that each value is tied to its
# predictor explicitly instead of relying on position.
new_point <- function(trade_price, speed_of_execution) {
  c(
    "(Intercept)" = 1,
    Satisfaction_with_Trade_Price = trade_price,
    Satisfaction_with_Speed_of_Execution = speed_of_execution
  )
}

# Leverage h = x0' (X'X)^-1 x0 of a new point, returned as a 1 x 1 matrix.
# `x0` must be a named vector with exactly one entry per design-matrix column;
# it is reordered to the column order of `design` before the product is taken.
scenario_leverage <- function(x0) {
  stopifnot(setequal(names(x0), colnames(design)))
  x0 <- unname(x0[colnames(design)])
  t(x0) %*% xtx_inv %*% x0
}

# NOTE(review): the values below are kept exactly as originally supplied, i.e.
# second element = trade price, third element = speed of execution. Question 3
# `new_scenario` pairs the same numbers the other way round (speed 2, 3, 3, 2
# with price 4, 5, 4, 4) and its fourth row differs from scenario d - confirm
# which pairing the assignment intends and re-run if these need swapping.
scenario_a <- new_point(trade_price = 2, speed_of_execution = 4)
scenario_leverage(scenario_a)

##          [,1]
## [1,] 0.333138

scenario_b <- new_point(trade_price = 3, speed_of_execution = 5)
scenario_leverage(scenario_b)

##          [,1]
## [1,] 1.169531

scenario_c <- new_point(trade_price = 3, speed_of_execution = 4)
scenario_leverage(scenario_c)

##           [,1]
## [1,] 0.2942763

scenario_d <- new_point(trade_price = 2, speed_of_execution = 3)
scenario_leverage(scenario_d)

##           [,1]
## [1,] 0.2214316

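Scenario B is an extrapolation: its leverage (1.1695) exceeds the maximum leverage of the observed datapoints (0.7727), whereas the leverages of scenarios A, C, and D are all below that maximum.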

Homework 2

Lynx

2022-09-21

Question 1

Question 2

Question 4

Question 5