###setwd("c:/Users/Michael/DROPBOX/priv/CUNY/MSDS/201902-Spring/DATA606-Jason/Homework")require(lmSupport) ## Note: the "S" is capitalized in the package name## Loading required package: lmSupport
## Warning: package 'lmSupport' was built under R version 3.5.3
modelAssumptions(sbux_model,"LINEAR")##
## Call:
## lm(formula = carb ~ calories, data = starbucks)
##
## Coefficients:
## (Intercept) calories
## 8.94356 0.10603
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = Model)
##
## Value p-value Decision
## Global Stat 1.807940 0.77103 Assumptions acceptable.
## Skewness 0.017401 0.89505 Assumptions acceptable.
## Kurtosis 0.511569 0.47446 Assumptions acceptable.
## Link Function 1.189958 0.27534 Assumptions acceptable.
## Heteroscedasticity 0.089013 0.76544 Assumptions acceptable.
shapiro.test(sbux_model$residuals)##
## Shapiro-Wilk normality test
##
## data: sbux_model$residuals
## W = 0.990507, p-value = 0.84255
ks.test(sbux_model$residuals,"pnorm",0,sd(sbux_model$residuals))## Warning in ks.test(sbux_model$residuals, "pnorm", 0, sd(sbux_model$residuals)): ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: sbux_model$residuals
## D = 0.0605478, p-value = 0.94034
## alternative hypothesis: two-sided
require(nortest)## Loading required package: nortest
ad.test(sbux_model$residuals)##
## Anderson-Darling normality test
##
## data: sbux_model$residuals
## A = 0.265636, p-value = 0.68343
require(tseries)## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.5.3
jarque.bera.test(sbux_model$residuals)##
## Jarque Bera Test
##
## data: sbux_model$residuals
## X-squared = 0.52897, df = 2, p-value = 0.7676
require(olsrr)## Loading required package: olsrr
## Warning: package 'olsrr' was built under R version 3.5.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(sbux_model)##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------
## Response : carb
## Variables: fitted values of carb
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 5.0494237
## Prob > Chi2 = 0.02463413
Exercise 7.15 introduces data on shoulder girth and height of a group of individuals.
The mean shoulder girth is 107.20 cm with a standard deviation of 10.37 cm.
The mean height is 171.14 cm with a standard deviation of 9.41 cm.
The correlation between height and shoulder girth is 0.67.
# look at the exact data set referenced
data("bdims")
#summary(bdims)
p726_shogi_mean <- mean(bdims$sho.gi)
p726_shogi_mean## [1] 108.19507
p726_shogi_sd <- sd(bdims$sho.gi)
p726_shogi_sd## [1] 10.374834
p726_hgt_mean <- mean(bdims$hgt)
p726_hgt_mean## [1] 171.14379
p726_hgt_sd <- sd(bdims$hgt)
p726_hgt_sd## [1] 9.4072052
p726_correl <- cor(bdims$sho.gi, bdims$hgt)
p726_slope <- p726_hgt_sd / p726_shogi_sd * p726_correl
p726_slope## [1] 0.60364419
b1 <- p726_slope
p726_intercept <- p726_hgt_mean - b1 * p726_shogi_mean
p726_intercept## [1] 105.83246
\[{y - \bar{y} = b_1{(x - \bar{x})}}\] \[{y - HeightMean = b_1{(x - shoulderGirthMean)}}\] \[{y - HeightMean = b_1{(x - shoulderGirthMean)}} = b_1*x-b_1*shoulderGirthMean\] \[{y = b_1x + (HeightMean-b_1shoulderGirthMean)}\]
###
mod <- lm(bdims$hgt ~ bdims$sho.gi)
summary(mod)##
## Call:
## lm(formula = bdims$hgt ~ bdims$sho.gi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.22968 -4.79756 -0.11418 4.78854 21.09789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.832462 3.272451 32.340 < 0.00000000000000022 ***
## bdims$sho.gi 0.603644 0.030108 20.049 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.0265 on 505 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4421
## F-statistic: 401.97 on 1 and 505 DF, p-value: < 0.000000000000000222
\[ height = 105.832462 + .603644 * ShoulderGirth \]
b1 = 9.41 / 10.37 * 0.67
b1## [1] 0.60797493
b0 = 171.14 -b1 * 107.20
b0## [1] 105.96509
\[ Height = 105.965 + .60797 * ShoulderGirth \]
summary(mod)$r.squared## [1] 0.44320349
predict_exact <- 105.832462 + .603644*100
predict_exact## [1] 166.19686
predict_texterror <- 105.965 + .60797*100
predict_texterror## [1] 166.762
plot(bdims$hgt ~ bdims$sho.gi)
abline(mod, col="red")
title(main="Body Dimensions: shoulder girth (cm) vs. height (cm)")The following regression output is for predicting the heart weight (in g) of cats from their body weight (in kg). The coefficients are estimated using a dataset of 144 domestic cats.
# look at the exact data set referenced
data("cats")
summary(cats)## Sex Bwt Hwt
## F:47 Min. :2.0000 Min. : 6.300
## M:97 1st Qu.:2.3000 1st Qu.: 8.950
## Median :2.7000 Median :10.100
## Mean :2.7236 Mean :10.631
## 3rd Qu.:3.0250 3rd Qu.:12.125
## Max. :3.9000 Max. :20.500
catmodel <- lm(cats$Hwt ~cats$Bwt)
summary(catmodel)##
## Call:
## lm(formula = cats$Hwt ~ cats$Bwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.56937 -0.96341 -0.09212 1.04255 5.12382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.35666 0.69228 -0.5152 0.6072
## cats$Bwt 4.03406 0.25026 16.1194 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.4524 on 142 degrees of freedom
## Multiple R-squared: 0.64662, Adjusted R-squared: 0.64413
## F-statistic: 259.83 on 1 and 142 DF, p-value: < 0.000000000000000222
anova(catmodel)## Analysis of Variance Table
##
## Response: cats$Hwt
## Df Sum Sq Mean Sq F value Pr(>F)
## cats$Bwt 1 548.092 548.092 259.835 < 0.000000000000000222 ***
## Residuals 142 299.533 2.109
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot the data
plot(cats$Hwt ~cats$Bwt, col="blue")
abline(catmodel, col="red")
title(main = "Cat body weight (kg) vs. heart weight (g)")\[ HeartWeight(g) = 4.034 * BodyWeight(kg) - 0.357\]
p730_R2 <- .6466
p730_correlation = sqrt(p730_R2)
p730_correlation## [1] 0.80411442
cor(cats$Bwt,cats$Hwt)## [1] 0.80412742
# look at the exact data set referenced
data("prof.evaltns.beauty.public")
summary(prof.evaltns.beauty.public)## tenured profnumber minority age beautyf2upper beautyflowerdiv beautyfupperdiv beautym2upper
## Min. :0.00000 Min. : 1.000 Min. :0.00000 Min. :29.000 Min. : 1.0000 Min. :1.0000 Min. :1.0000 Min. :1.0000
## 1st Qu.:0.00000 1st Qu.:20.000 1st Qu.:0.00000 1st Qu.:42.000 1st Qu.: 4.0000 1st Qu.:2.0000 1st Qu.:4.0000 1st Qu.:4.0000
## Median :1.00000 Median :44.000 Median :0.00000 Median :48.000 Median : 5.0000 Median :4.0000 Median :5.0000 Median :5.0000
## Mean :0.54644 Mean :45.434 Mean :0.13823 Mean :48.365 Mean : 5.2138 Mean :3.9633 Mean :5.0194 Mean :4.7516
## 3rd Qu.:1.00000 3rd Qu.:70.500 3rd Qu.:0.00000 3rd Qu.:57.000 3rd Qu.: 6.0000 3rd Qu.:5.0000 3rd Qu.:7.0000 3rd Qu.:6.0000
## Max. :1.00000 Max. :94.000 Max. :1.00000 Max. :73.000 Max. :10.0000 Max. :8.0000 Max. :9.0000 Max. :9.0000
## beautymlowerdiv beautymupperdiv btystdave btystdf2u btystdfl btystdfu btystdm2u
## Min. :1.0000 Min. :1.0000 Min. :-1.538843 Min. :-2.096532 Min. :-1.668032 Min. :-2.029827 Min. :-2.340983
## 1st Qu.:2.0000 1st Qu.:3.0000 1st Qu.:-0.744618 1st Qu.:-0.665002 1st Qu.:-1.136523 1st Qu.:-0.577006 1st Qu.:-0.527364
## Median :3.0000 Median :4.0000 Median :-0.156363 Median :-0.187825 Median :-0.073507 Median :-0.092733 Median : 0.077175
## Mean :3.4125 Mean :4.1469 Mean :-0.088349 Mean :-0.085794 Mean :-0.093022 Mean :-0.080182 Mean :-0.072980
## 3rd Qu.:5.0000 3rd Qu.:5.0000 3rd Qu.: 0.457253 3rd Qu.: 0.289352 3rd Qu.: 0.458002 3rd Qu.: 0.875814 3rd Qu.: 0.681715
## Max. :7.0000 Max. :9.0000 Max. : 1.881674 Max. : 2.198059 Max. : 2.052527 Max. : 1.844361 Max. : 2.495334
## btystdml btystdmu class1 class2 class3 class4 class5
## Min. :-1.487607 Min. :-1.57312 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000
## 1st Qu.:-0.900065 1st Qu.:-0.65465 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
## Median :-0.312523 Median :-0.19542 Median :0.000000 Median :0.0000000 Median :0.000000 Median :0.000000 Median :0.0000000
## Mean :-0.070146 Mean :-0.12797 Mean :0.010799 Mean :0.0043197 Mean :0.017279 Mean :0.041037 Mean :0.0086393
## 3rd Qu.: 0.862562 3rd Qu.: 0.26381 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000000
## Max. : 2.037647 Max. : 2.10074 Max. :1.000000 Max. :1.0000000 Max. :1.000000 Max. :1.000000 Max. :1.0000000
## class6 class7 class8 class9 class10 class11 class12
## Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median :0.000000 Median :0.0000000 Median :0.0000000 Median :0.000000 Median :0.000000 Median :0.0000000 Median :0.0000000
## Mean :0.012959 Mean :0.0086393 Mean :0.0043197 Mean :0.017279 Mean :0.010799 Mean :0.0043197 Mean :0.0064795
## 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :1.000000 Max. :1.0000000 Max. :1.0000000 Max. :1.000000 Max. :1.000000 Max. :1.0000000 Max. :1.0000000
## class13 class14 class15 class16 class17 class18 class19
## Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000
## Median :0.0000000 Median :0.0000000 Median :0.0000000 Median :0.0000000 Median :0.000000 Median :0.0000000 Median :0.000000
## Mean :0.0064795 Mean :0.0064795 Mean :0.0043197 Mean :0.0086393 Mean :0.015119 Mean :0.0086393 Mean :0.012959
## 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.000000
## Max. :1.0000000 Max. :1.0000000 Max. :1.0000000 Max. :1.0000000 Max. :1.000000 Max. :1.0000000 Max. :1.000000
## class20 class21 class22 class23 class24 class25 class26
## Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median :0.000000 Median :0.000000 Median :0.000000 Median :0.000000 Median :0.0000000 Median :0.0000000 Median :0.0000000
## Mean :0.010799 Mean :0.030238 Mean :0.023758 Mean :0.010799 Mean :0.0064795 Mean :0.0064795 Mean :0.0064795
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :1.000000 Max. :1.000000 Max. :1.000000 Max. :1.000000 Max. :1.0000000 Max. :1.0000000 Max. :1.0000000
## class27 class28 class29 class30 courseevaluation didevaluation female
## Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :2.1000 Min. : 5.000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:3.6000 1st Qu.: 15.000 1st Qu.:0.00000
## Median :0.0000000 Median :0.0000000 Median :0.0000000 Median :0.000000 Median :4.0000 Median : 23.000 Median :0.00000
## Mean :0.0043197 Mean :0.0086393 Mean :0.0043197 Mean :0.017279 Mean :3.9983 Mean : 36.624 Mean :0.42117
## 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:4.4000 3rd Qu.: 40.000 3rd Qu.:1.00000
## Max. :1.0000000 Max. :1.0000000 Max. :1.0000000 Max. :1.000000 Max. :5.0000 Max. :380.000 Max. :1.00000
## formal fulldept lower multipleclass nonenglish onecredit percentevaluating
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. : 10.417
## 1st Qu.:0.00000 1st Qu.:1.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.: 62.696
## Median :0.00000 Median :1.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.000000 Median : 76.923
## Mean :0.16631 Mean :0.89417 Mean :0.33909 Mean :0.33909 Mean :0.060475 Mean :0.058315 Mean : 74.428
## 3rd Qu.:0.00000 3rd Qu.:1.00000 3rd Qu.:1.00000 3rd Qu.:1.00000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.: 87.249
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.000000 Max. :100.000
## profevaluation students tenuretrack blkandwhite btystdvariance btystdavepos btystdaveneg
## Min. :2.3000 Min. : 8.000 Min. :0.0000 Min. :0.00000 Min. :0.085029 Min. :0.00000 Min. :-1.53884
## 1st Qu.:3.8000 1st Qu.: 19.000 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.828371 1st Qu.:0.00000 1st Qu.:-0.74462
## Median :4.3000 Median : 29.000 Median :1.0000 Median :0.00000 Median :1.565791 Median :0.00000 Median :-0.15636
## Mean :4.1747 Mean : 55.177 Mean :0.7797 Mean :0.16847 Mean :1.842626 Mean :0.28241 Mean :-0.37076
## 3rd Qu.:4.6000 3rd Qu.: 60.000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.682287 3rd Qu.:0.45725 3rd Qu.: 0.00000
## Max. :5.0000 Max. :581.000 Max. :1.0000 Max. :1.00000 Max. :5.791667 Max. :1.88167 Max. : 0.00000
\[b_1 = \frac{y-\bar{y}}{x-\bar{x}}=\frac{4.010-3.9983}{0-(-0.0883)}=\frac{0.0117}{0.0883} = 0.133\]
print("Beauty:")## [1] "Beauty:"
beauty = prof.evaltns.beauty.public$btystdave
t(t(summary(beauty)))## [,1]
## Min. -1.538843
## 1st Qu. -0.744618
## Median -0.156363
## Mean -0.088349
## 3rd Qu. 0.457253
## Max. 1.881674
print("Evaluations:")## [1] "Evaluations:"
eval = prof.evaltns.beauty.public$courseevaluation
t(t(summary(eval)))## [,1]
## Min. 2.1000
## 1st Qu. 3.6000
## Median 4.0000
## Mean 3.9983
## 3rd Qu. 4.4000
## Max. 5.0000
ratingmodel <- lm(eval~beauty)
summary(ratingmodel)##
## Call:
## lm(formula = eval ~ beauty)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80015 -0.36304 0.07254 0.40207 1.10373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.010023 0.025508 157.2052 < 0.00000000000000022 ***
## beauty 0.133001 0.032178 4.1334 0.00004247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.54545 on 461 degrees of freedom
## Multiple R-squared: 0.035736, Adjusted R-squared: 0.033644
## F-statistic: 17.085 on 1 and 461 DF, p-value: 0.000042471
plot(eval~beauty, col="blue")
abline(ratingmodel, col="red")
title(main="impact of instructor beauty on students' evaluations of the instructor")require(lmSupport) ## Note: the "S" is capitalized in the package name
modelAssumptions(ratingmodel,"LINEAR")##
## Call:
## lm(formula = eval ~ beauty)
##
## Coefficients:
## (Intercept) beauty
## 4.010 0.133
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = Model)
##
## Value p-value Decision
## Global Stat 17.54617 0.001513318 Assumptions NOT satisfied!
## Skewness 15.85720 0.000068306 Assumptions NOT satisfied!
## Kurtosis 0.54822 0.459045812 Assumptions acceptable.
## Link Function 0.92552 0.336029551 Assumptions acceptable.
## Heteroscedasticity 0.21522 0.642705825 Assumptions acceptable.
shapiro.test(ratingmodel$residuals)##
## Shapiro-Wilk normality test
##
## data: ratingmodel$residuals
## W = 0.981504, p-value = 0.000012576
ks.test(ratingmodel$residuals,"pnorm",0,sd(sbux_model$residuals))## Warning in ks.test(ratingmodel$residuals, "pnorm", 0, sd(sbux_model$residuals)): ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: ratingmodel$residuals
## D = 0.463993, p-value < 0.000000000000000222
## alternative hypothesis: two-sided
require(nortest)
ad.test(ratingmodel$residuals)##
## Anderson-Darling normality test
##
## data: ratingmodel$residuals
## A = 1.99014, p-value = 0.00004483
require(tseries)
jarque.bera.test(ratingmodel$residuals)##
## Jarque Bera Test
##
## data: ratingmodel$residuals
## X-squared = 16.4054, df = 2, p-value = 0.00027391
require(olsrr)
ols_test_breusch_pagan(ratingmodel)##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------
## Response : eval
## Variables: fitted values of eval
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 0.93015634
## Prob > Chi2 = 0.3348223
plot(ratingmodel$residuals ~ beauty, col="blue")
abline(h = 0, lty = 3, col="red", lwd=2) # adds a horizontal dashed line at y = 0
title(main="Instructor Beauty rating vs. residual of student evaluation\n(actual minus projected)")hist(ratingmodel$residuals, col="lightblue")qqnorm(ratingmodel$residuals)
qqline(ratingmodel$residuals) # adds diagonal line to the normal prob plotFinally, the “Order of Data Collection” plot (which I am uncertain how to reproduce) does not appear to show any pattern or bias in regard to the impact of such sequence on the corresponding residuals.