Exercise 1

library(readxl)
# Read the survey workbook and print it as a base data.frame.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where the workbook sits in exactly this location.
brokerage_path <- "/Users/macuser/Downloads/Brokerage Satisfaction.xlsx"
brokerage <- read_excel(brokerage_path)
as.data.frame(brokerage)
##                              Brokerage Satisfaction_with_Trade_Price
## 1                      Scottrade, Inc.                           3.2
## 2                       Charles Schwab                           3.3
## 3          Fidelity Brokerage Services                           3.1
## 4                        TD Ameritrade                           2.8
## 5                    E*Trade Financial                           2.9
## 6                         (Not listed)                           2.4
## 7          Vanguard Brokerage Services                           2.7
## 8              USAA Brokerage Services                           2.4
## 9                          Thinkorswim                           2.6
## 10             Wells Fargo Investments                           2.3
## 11                 Interactive Brokers                           3.7
## 12                           Zecco.com                           2.5
## 13                Firstrade Securities                           3.0
## 14 Banc of America Investment Services                           1.0
##    Satisfaction_with_Speed_of_Execution
## 1                                   3.1
## 2                                   3.1
## 3                                   3.3
## 4                                   3.5
## 5                                   3.2
## 6                                   3.2
## 7                                   3.8
## 8                                   3.7
## 9                                   2.6
## 10                                  2.7
## 11                                  3.9
## 12                                  2.5
## 13                                  3.0
## 14                                  4.0
##    Overall_Satisfaction_with_Electronic_Trades
## 1                                          3.2
## 2                                          3.2
## 3                                          4.0
## 4                                          3.7
## 5                                          3.0
## 6                                          2.7
## 7                                          2.7
## 8                                          3.4
## 9                                          2.7
## 10                                         2.3
## 11                                         4.0
## 12                                         2.5
## 13                                         3.0
## 14                                         2.0
# List the column names of the imported data.
colnames(brokerage)
## [1] "  Brokerage"                                
## [2] "Satisfaction_with_Trade_Price"              
## [3] "Satisfaction_with_Speed_of_Execution"       
## [4] "Overall_Satisfaction_with_Electronic_Trades"
# Summarize every column of the imported data.
summary(brokerage)
##    Brokerage        Satisfaction_with_Trade_Price
##  Length:14          Min.   :1.000                
##  Class :character   1st Qu.:2.425                
##  Mode  :character   Median :2.750                
##                     Mean   :2.707                
##                     3rd Qu.:3.075                
##                     Max.   :3.700                
##  Satisfaction_with_Speed_of_Execution
##  Min.   :2.500                       
##  1st Qu.:3.025                       
##  Median :3.200                       
##  Mean   :3.257                       
##  3rd Qu.:3.650                       
##  Max.   :4.000                       
##  Overall_Satisfaction_with_Electronic_Trades
##  Min.   :2.000                              
##  1st Qu.:2.700                              
##  Median :3.000                              
##  Mean   :3.029                              
##  3rd Qu.:3.350                              
##  Max.   :4.000
# Regress overall satisfaction on the two component satisfaction scores.
# The formula is kept inline so that the `Call:` echoed by summary() still
# shows the full model specification.
brokerage_lm1 <- lm(
  Overall_Satisfaction_with_Electronic_Trades ~
    Satisfaction_with_Trade_Price + Satisfaction_with_Speed_of_Execution,
  data = brokerage
)
summary(brokerage_lm1)
## 
## Call:
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ Satisfaction_with_Trade_Price + 
##     Satisfaction_with_Speed_of_Execution, data = brokerage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58886 -0.13863 -0.09120  0.05781  0.64613 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           -0.6633     0.8248  -0.804 0.438318    
## Satisfaction_with_Trade_Price          0.7746     0.1521   5.093 0.000348 ***
## Satisfaction_with_Speed_of_Execution   0.4897     0.2016   2.429 0.033469 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3435 on 11 degrees of freedom
## Multiple R-squared:  0.7256, Adjusted R-squared:  0.6757 
## F-statistic: 14.54 on 2 and 11 DF,  p-value: 0.0008157

Based on the results of the regression model, there is a statistically significant relationship between Overall_Satisfaction_with_Electronic_Trades and Satisfaction_with_Speed_of_Execution (t = 2.429, p = 0.033 < 0.05).

# Fitted values for the training rows. Stored under a descriptive name rather
# than `pi`, which would mask R's built-in constant pi for the rest of the
# session.
fitted_satisfaction <- predict(object = brokerage_lm1, newdata = brokerage)

# Observed response and its mean, shared by both sums of squares below.
observed_satisfaction <- brokerage$Overall_Satisfaction_with_Electronic_Trades
mean_satisfaction <- mean(observed_satisfaction)

# SST: total sum of squares (observed values around their mean).
SST <- sum((observed_satisfaction - mean_satisfaction)^2)
SST
## [1] 4.728571
# SSR: regression sum of squares (fitted values around the observed mean).
SSR <- sum((fitted_satisfaction - mean_satisfaction)^2)
SSR
## [1] 3.430819
# R-squared: share of the total variation reproduced by the fitted values.
R_Squared <- SSR / SST
R_Squared
## [1] 0.7255508

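The regression model explains about 72.6% of the variation in Overall_Satisfaction_with_Electronic_Trades (R-squared = SSR/SST = 0.7256), which matches the Multiple R-squared reported by summary().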

# Normal Q-Q plot of the model residuals, with a reference line added on top.
model_residuals <- brokerage_lm1$residuals
qqnorm(model_residuals)
qqline(model_residuals)

Based on the Q-Q plot, the residuals do not appear to be normally distributed, although most of the residuals seem to align with the reference line that represents the normal distribution. However, data points 3, 4, and 7 deviate noticeably from that line.

Exercise 2

# Average leverage: mean of the hat values of the fitted model.
leverage <- hatvalues(brokerage_lm1)
mean(leverage)
## [1] 0.2142857

The average leverage of the data points in the Brokerage Satisfaction Excel file equals 0.2142857, i.e. (p + 1)/n = 3/14 for p = 2 predictors and n = 14 observations.

# Leverage (hat value) of each observation used to fit the model.
hatvalues(brokerage_lm1)
##          1          2          3          4          5          6          7 
## 0.12226809 0.14248379 0.10348052 0.09498002 0.07909281 0.09225511 0.17268548 
##          8          9         10         11         12         13         14 
## 0.14817354 0.22725402 0.22639250 0.45080022 0.28805237 0.10586879 0.74621276
# Flag observations whose leverage exceeds twice the average leverage.
row_leverage <- hatvalues(brokerage_lm1)
row_leverage > 2 * mean(row_leverage)
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
##    14 
##  TRUE

Data points 11 and 14 are considered high-leverage points because their leverages (0.4508 and 0.7462) exceed twice the average leverage (2 × 0.2143 = 0.4286).

# Cook's distance of each observation used to fit the model.
cooks.distance(brokerage_lm1)
##            1            2            3            4            5            6 
## 0.0079790547 0.0243407712 0.1518660850 0.0756708677 0.0059271795 0.0012425789 
##            7            8            9           10           11           12 
## 0.2471814307 0.0888808107 0.0062442356 0.0210623352 0.0533835032 0.0000111262 
##           13           14 
## 0.0062752299 0.1601935254
# Compare every Cook's distance against the cutoff of 1.
cooks_cutoff <- 1
cooks.distance(brokerage_lm1) > cooks_cutoff
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14 
## FALSE
# Draw the default diagnostic plots for the fitted model.
plot(brokerage_lm1)

As we can see, none of the data points has a Cook’s distance greater than 1. Moreover, the fourth diagnostic plot (Residuals vs Leverage, with Cook’s distance contours) confirms that none of the data points falls outside the allowed range. Because Cook’s distance measures how strongly a single observation influences the fitted model, this means none of the data points should be considered an influential outlier.

Exercise 3

# New observation (a): the leading 1 multiplies the intercept, followed by
# Satisfaction_with_Trade_Price = 4 and Satisfaction_with_Speed_of_Execution = 2.
x_newa <- c(1, 4, 2)
# Fitted coefficients as a plain, unnamed numeric vector
# (intercept, trade price, speed of execution).
coefficients <- as.vector(coef(brokerage_lm1))
coefficients
## [1] -0.6633279  0.7746098  0.4896684
# Point prediction: inputs (as a row vector) times the coefficient vector.
t(x_newa) %*% coefficients
##          [,1]
## [1,] 3.414448

For Satisfaction_with_Trade_Price = 4 and Satisfaction_with_Speed_of_Execution = 2, the predicted Overall_Satisfaction_with_Electronic_Trades equals 3.414448.

# New observation (b): intercept term, trade price = 5, speed of execution = 3.
x_newb <- c(1, 5, 3)
# Point prediction for (b).
t(x_newb) %*% coefficients
##          [,1]
## [1,] 4.678726

For Satisfaction_with_Trade_Price = 5 and Satisfaction_with_Speed_of_Execution = 3, the predicted Overall_Satisfaction_with_Electronic_Trades equals 4.678726.

# New observation (c): intercept term, trade price = 4, speed of execution = 3.
x_newc <- c(1, 4, 3)
# Point prediction for (c).
t(x_newc) %*% coefficients
##          [,1]
## [1,] 3.904117

For Satisfaction_with_Trade_Price = 4 and Satisfaction_with_Speed_of_Execution = 3, the predicted Overall_Satisfaction_with_Electronic_Trades equals 3.904117.

# New observation (d): intercept term, trade price = 3, speed of execution = 2.
x_newd <- c(1, 3, 2)
# Point prediction for (d).
t(x_newd) %*% coefficients
##          [,1]
## [1,] 2.639838

For Satisfaction_with_Trade_Price = 3 and Satisfaction_with_Speed_of_Execution = 2, the predicted Overall_Satisfaction_with_Electronic_Trades equals 2.639838.

Exercise 4

# Largest leverage among the observations used to fit the model.
max(hatvalues(brokerage_lm1))
## [1] 0.7462128

The maximum leverage of our data points is equal to the highest leverage of a single data point in the training dataset. Hence, the maximum leverage equals 0.7462128 (data point 14).

Exercise 5

# Design matrix of the fitted model. Assigned with `<-` (not `=`), the
# idiomatic R assignment operator.
X <- model.matrix(brokerage_lm1)
# (X'X)^{-1}, computed once instead of being re-solved for the printed value
# and again for the comparison.
XtX_inv <- solve(t(X) %*% X)
# Leverage of new observation (a): x' (X'X)^{-1} x, kept as a 1x1 matrix.
leverage_a <- t(x_newa) %*% XtX_inv %*% x_newa
leverage_a
##           [,1]
## [1,] 0.8323367
# TRUE when the new point's leverage exceeds the largest training leverage.
leverage_a > max(hatvalues(brokerage_lm1))
##      [,1]
## [1,] TRUE

The new data point is a point of extrapolation, since its leverage (0.8323) is higher than the maximum leverage of the dataset (0.7462).

# Leverage of new observation (b): x' (X'X)^{-1} x, kept as a 1x1 matrix.
leverage_b <- t(x_newb) %*% solve(t(X) %*% X) %*% x_newb
leverage_b
##         [,1]
## [1,] 1.08481
# TRUE when the new point's leverage exceeds the largest training leverage.
leverage_b > max(hatvalues(brokerage_lm1))
##      [,1]
## [1,] TRUE

The new data point is a point of extrapolation, since its leverage (1.0848) is higher than the maximum leverage of the dataset (0.7462).

# Leverage of new observation (c): x' (X'X)^{-1} x, kept as a 1x1 matrix.
leverage_c <- t(x_newc) %*% solve(t(X) %*% X) %*% x_newc
leverage_c
##           [,1]
## [1,] 0.3992325
# TRUE when the new point's leverage exceeds the largest training leverage.
leverage_c > max(hatvalues(brokerage_lm1))
##       [,1]
## [1,] FALSE

The new data point is not a point of extrapolation, since its leverage (0.3992) is lower than the maximum leverage of the dataset (0.7462).

# Leverage of new observation (d): x' (X'X)^{-1} x, kept as a 1x1 matrix.
leverage_d <- t(x_newd) %*% solve(t(X) %*% X) %*% x_newd
leverage_d
##           [,1]
## [1,] 0.6074396
# TRUE when the new point's leverage exceeds the largest training leverage.
leverage_d > max(hatvalues(brokerage_lm1))
##       [,1]
## [1,] FALSE

The new data point is not a point of extrapolation, since its leverage (0.6074) is lower than the maximum leverage of the dataset (0.7462).