Week 2
Use the Brokerage Satisfaction excel file to answer the following questions in R. Create an R Markdown file to answer the questions, and then “knit” your file to create an HTML document. Your HTML document should contain both textual explanations of your answers, as well as all R code needed to support your work. Submit both your HTML document and original R Markdown file (i.e., your Rmd file).
brokerage <- read.csv("C:/Users/raze1/OneDrive/Desktop/UIndy/MSDA 621-50/Homework/HWK 2/BrokerageSatisfaction.csv")
brokerage
## X..Brokerage Satisfaction_with_Trade_Price
## 1 Scottrade, Inc. 3.2
## 2 Charles Schwab 3.3
## 3 Fidelity Brokerage Services 3.1
## 4 TD Ameritrade 2.8
## 5 E*Trade Financial 2.9
## 6 (Not listed) 2.4
## 7 Vanguard Brokerage Services 2.7
## 8 USAA Brokerage Services 2.4
## 9 Thinkorswim 2.6
## 10 Wells Fargo Investments 2.3
## 11 Interactive Brokers 3.7
## 12 Zecco.com 2.5
## 13 Firstrade Securities 3.0
## 14 Banc of America Investment Services 1.0
## Satisfaction_with_Speed_of_Execution
## 1 3.1
## 2 3.1
## 3 3.3
## 4 3.5
## 5 3.2
## 6 3.2
## 7 3.8
## 8 3.7
## 9 2.6
## 10 2.7
## 11 3.9
## 12 2.5
## 13 3.0
## 14 4.0
## Overall_Satisfaction_with_Electronic_Trades
## 1 3.2
## 2 3.2
## 3 4.0
## 4 3.7
## 5 3.0
## 6 2.7
## 7 2.7
## 8 3.4
## 9 2.7
## 10 2.3
## 11 4.0
## 12 2.5
## 13 3.0
## 14 2.0
summary(brokerage)
## X..Brokerage Satisfaction_with_Trade_Price
## Length:14 Min. :1.000
## Class :character 1st Qu.:2.425
## Mode :character Median :2.750
## Mean :2.707
## 3rd Qu.:3.075
## Max. :3.700
## Satisfaction_with_Speed_of_Execution
## Min. :2.500
## 1st Qu.:3.025
## Median :3.200
## Mean :3.257
## 3rd Qu.:3.650
## Max. :4.000
## Overall_Satisfaction_with_Electronic_Trades
## Min. :2.000
## 1st Qu.:2.700
## Median :3.000
## Mean :3.029
## 3rd Qu.:3.350
## Max. :4.000
sample_brokerage <- sample(nrow(brokerage), nrow(brokerage)*1)
brokerage_train <- brokerage[sample_brokerage,]
brokerage_test <- brokerage[-sample_brokerage]
brokerage_train
## X..Brokerage Satisfaction_with_Trade_Price
## 5 E*Trade Financial 2.9
## 1 Scottrade, Inc. 3.2
## 7 Vanguard Brokerage Services 2.7
## 12 Zecco.com 2.5
## 11 Interactive Brokers 3.7
## 10 Wells Fargo Investments 2.3
## 6 (Not listed) 2.4
## 14 Banc of America Investment Services 1.0
## 8 USAA Brokerage Services 2.4
## 9 Thinkorswim 2.6
## 13 Firstrade Securities 3.0
## 3 Fidelity Brokerage Services 3.1
## 4 TD Ameritrade 2.8
## 2 Charles Schwab 3.3
## Satisfaction_with_Speed_of_Execution
## 5 3.2
## 1 3.1
## 7 3.8
## 12 2.5
## 11 3.9
## 10 2.7
## 6 3.2
## 14 4.0
## 8 3.7
## 9 2.6
## 13 3.0
## 3 3.3
## 4 3.5
## 2 3.1
## Overall_Satisfaction_with_Electronic_Trades
## 5 3.0
## 1 3.2
## 7 2.7
## 12 2.5
## 11 4.0
## 10 2.3
## 6 2.7
## 14 2.0
## 8 3.4
## 9 2.7
## 13 3.0
## 3 4.0
## 4 3.7
## 2 3.2
colnames(brokerage_train)
## [1] "X..Brokerage"
## [2] "Satisfaction_with_Trade_Price"
## [3] "Satisfaction_with_Speed_of_Execution"
## [4] "Overall_Satisfaction_with_Electronic_Trades"
brokerage_train$X..Brokerage <- NULL
brokerage_train
## Satisfaction_with_Trade_Price Satisfaction_with_Speed_of_Execution
## 5 2.9 3.2
## 1 3.2 3.1
## 7 2.7 3.8
## 12 2.5 2.5
## 11 3.7 3.9
## 10 2.3 2.7
## 6 2.4 3.2
## 14 1.0 4.0
## 8 2.4 3.7
## 9 2.6 2.6
## 13 3.0 3.0
## 3 3.1 3.3
## 4 2.8 3.5
## 2 3.3 3.1
## Overall_Satisfaction_with_Electronic_Trades
## 5 3.0
## 1 3.2
## 7 2.7
## 12 2.5
## 11 4.0
## 10 2.3
## 6 2.7
## 14 2.0
## 8 3.4
## 9 2.7
## 13 3.0
## 3 4.0
## 4 3.7
## 2 3.2
brokerage_model <- lm(Overall_Satisfaction_with_Electronic_Trades ~ ., data = brokerage_train)
summary(brokerage_model)
##
## Call:
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ .,
## data = brokerage_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58886 -0.13863 -0.09120 0.05781 0.64613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6633 0.8248 -0.804 0.438318
## Satisfaction_with_Trade_Price 0.7746 0.1521 5.093 0.000348 ***
## Satisfaction_with_Speed_of_Execution 0.4897 0.2016 2.429 0.033469 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3435 on 11 degrees of freedom
## Multiple R-squared: 0.7256, Adjusted R-squared: 0.6757
## F-statistic: 14.54 on 2 and 11 DF, p-value: 0.0008157
print(brokerage_model)
##
## Call:
## lm(formula = Overall_Satisfaction_with_Electronic_Trades ~ .,
## data = brokerage_train)
##
## Coefficients:
## (Intercept) Satisfaction_with_Trade_Price
## -0.6633 0.7746
## Satisfaction_with_Speed_of_Execution
## 0.4897
Yes, but it is a weak relationship.
0.033469
Yes, it is a strong relationship
000348
The Adjusted R-squared is 0.6757, so approximately 67.57% is explained by the model.
brokerage_model$residuals
## 5 1 7 12 11 10
## -0.149979422 -0.133395541 -0.588858465 0.002632365 -0.112435149 -0.140379336
## 6 14 8 9 13 3
## -0.062674498 -0.069955392 0.392491325 0.076204545 -0.129506736 0.646131773
## 4 2
## 0.480581057 -0.210856526
BSRes = summary(brokerage_model)$sigma^2
BSRes
## [1] 0.1179775
standardized_res = brokerage_model$residuals/summary(brokerage_model)$sigma
standardized_res
## 5 1 7 12 11 10
## -0.436648567 -0.388366425 -1.714396556 0.007663841 -0.327342551 -0.408698974
## 6 14 8 9 13 3
## -0.182469896 -0.203667419 1.142695261 0.221861138 -0.377044597 1.881141485
## 4 2
## 1.399158810 -0.613885550
mean(standardized_res)
## [1] -2.0383e-17
var(standardized_res)
## [1] 0.8461538
plot(brokerage_model)
qqnorm(brokerage_model$residuals, main = "Brokerage_Model")
qqline(brokerage_model$residuals)
The residuals are not normally distributed; per the reading,
the residuals would be considered heavily tailed.
hatvalues(brokerage_model)
## 5 1 7 12 11 10 6
## 0.07909281 0.12226809 0.17268548 0.28805237 0.45080022 0.22639250 0.09225511
## 14 8 9 13 3 4 2
## 0.74621276 0.14817354 0.22725402 0.10586879 0.10348052 0.09498002 0.14248379
mean(hatvalues(brokerage_model))
## [1] 0.2142857
nrow(brokerage_test)
## [1] 14
3/14
## [1] 0.2142857
hatvalues(brokerage_model)
## 5 1 7 12 11 10 6
## 0.07909281 0.12226809 0.17268548 0.28805237 0.45080022 0.22639250 0.09225511
## 14 8 9 13 3 4 2
## 0.74621276 0.14817354 0.22725402 0.10586879 0.10348052 0.09498002 0.14248379
Data points 11 (0.45080022) and 14 (0.74621276) have high leverage.
cooks.distance(brokerage_model)
## 5 1 7 12 11 10
## 0.0059271795 0.0079790547 0.2471814307 0.0000111262 0.0533835032 0.0210623352
## 6 14 8 9 13 3
## 0.0012425789 0.1601935254 0.0888808107 0.0062442356 0.0062752299 0.1518660850
## 4 2
## 0.0756708677 0.0243407712
plot(brokerage_model)
“Cook’s Distance is a measure used to determine outliers
that are so unusual, we need to give them special attention. This is the
case when Cook’s Distance is bigger than 1.” None of that data points,
when Cook’s Distance is calculated, is greater than 1 and thus do not
need to be treated differently or removed.
predict(brokerage_model, data.frame(Satisfaction_with_Trade_Price = 4, Satisfaction_with_Speed_of_Execution=2))
## 1
## 3.414448
predict(brokerage_model, data.frame(Satisfaction_with_Trade_Price = 5, Satisfaction_with_Speed_of_Execution=3))
## 1
## 4.678726
predict(brokerage_model, data.frame(Satisfaction_with_Trade_Price = 4, Satisfaction_with_Speed_of_Execution=3))
## 1
## 3.904117
predict(brokerage_model, data.frame(Satisfaction_with_Trade_Price = 3, Satisfaction_with_Speed_of_Execution=2))
## 1
## 2.639838
X = model.matrix(brokerage_model)
X
## (Intercept) Satisfaction_with_Trade_Price
## 5 1 2.9
## 1 1 3.2
## 7 1 2.7
## 12 1 2.5
## 11 1 3.7
## 10 1 2.3
## 6 1 2.4
## 14 1 1.0
## 8 1 2.4
## 9 1 2.6
## 13 1 3.0
## 3 1 3.1
## 4 1 2.8
## 2 1 3.3
## Satisfaction_with_Speed_of_Execution
## 5 3.2
## 1 3.1
## 7 3.8
## 12 2.5
## 11 3.9
## 10 2.7
## 6 3.2
## 14 4.0
## 8 3.7
## 9 2.6
## 13 3.0
## 3 3.3
## 4 3.5
## 2 3.1
## attr(,"assign")
## [1] 0 1 2
max(hatvalues(brokerage_model))
## [1] 0.7462128
x_newa = c(1, 4, 2)
t(x_newa)%*%solve(t(X)%*%X)%*%x_newa
## [,1]
## [1,] 0.8323367
Yes, 0.8323367 > 0.7462128
x_newb = c(1, 5, 3)
t(x_newb)%*%solve(t(X)%*%X)%*%x_newb
## [,1]
## [1,] 1.08481
Yes, 1.08481 > 0.7462128
x_newc = c(1, 4, 3)
t(x_newc)%*%solve(t(X)%*%X)%*%x_newc
## [,1]
## [1,] 0.3992325
No, 0.3992325 < 0.7462128
x_newd = c(1, 3, 2)
t(x_newd)%*%solve(t(X)%*%X)%*%x_newd
## [,1]
## [1,] 0.6074396
No, 0.6074396 < 0.7462128