Original Authors: Shulin Cao
This material has been created using the following resources:
https://www.statmethods.net/stats/index.html
This section describes basic statistics. It includes code for obtaining descriptive statistics and crosstabulations (including tests of independence), correlations (pearson), t-tests, nonparametric tests of group differences (Mann Whitney U, Wilcoxon Signed Rank), multiple linear regression (including diagnostics, cross-validation and variable selection), analysis of variance (including ANCOVA).
First, let’s load dataset that we will use (I will only load the packages when needed).
library(car)
data(mtcars)
If you want to derive the average values for all different variables.
sapply(mtcars, mean, na.rm=TRUE)
## mpg cyl disp hp drat wt
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250
## qsec vs am gear carb
## 17.848750 0.437500 0.406250 3.687500 2.812500
# mean,median,25th and 75th quartiles,min,max
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(mtcars)
## mtcars
##
## 11 Variables 32 Observations
## ---------------------------------------------------------------------------
## mpg
## n missing distinct Info Mean Gmd .05 .10
## 32 0 25 0.999 20.09 6.796 12.00 14.34
## .25 .50 .75 .90 .95
## 15.43 19.20 22.80 30.09 31.30
##
## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
## ---------------------------------------------------------------------------
## cyl
## n missing distinct Info Mean Gmd
## 32 0 3 0.866 6.188 1.948
##
## Value 4 6 8
## Frequency 11 7 14
## Proportion 0.344 0.219 0.438
## ---------------------------------------------------------------------------
## disp
## n missing distinct Info Mean Gmd .05 .10
## 32 0 27 0.999 230.7 142.5 77.35 80.61
## .25 .50 .75 .90 .95
## 120.83 196.30 326.00 396.00 449.00
##
## lowest : 71.1 75.7 78.7 79.0 95.1, highest: 360.0 400.0 440.0 460.0 472.0
## ---------------------------------------------------------------------------
## hp
## n missing distinct Info Mean Gmd .05 .10
## 32 0 22 0.997 146.7 77.04 63.65 66.00
## .25 .50 .75 .90 .95
## 96.50 123.00 180.00 243.50 253.55
##
## lowest : 52 62 65 66 91, highest: 215 230 245 264 335
## ---------------------------------------------------------------------------
## drat
## n missing distinct Info Mean Gmd .05 .10
## 32 0 22 0.997 3.597 0.6099 2.853 3.007
## .25 .50 .75 .90 .95
## 3.080 3.695 3.920 4.209 4.314
##
## lowest : 2.76 2.93 3.00 3.07 3.08, highest: 4.08 4.11 4.22 4.43 4.93
## ---------------------------------------------------------------------------
## wt
## n missing distinct Info Mean Gmd .05 .10
## 32 0 29 0.999 3.217 1.089 1.736 1.956
## .25 .50 .75 .90 .95
## 2.581 3.325 3.610 4.048 5.293
##
## lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
## ---------------------------------------------------------------------------
## qsec
## n missing distinct Info Mean Gmd .05 .10
## 32 0 30 1 17.85 2.009 15.05 15.53
## .25 .50 .75 .90 .95
## 16.89 17.71 18.90 19.99 20.10
##
## lowest : 14.50 14.60 15.41 15.50 15.84, highest: 19.90 20.00 20.01 20.22 22.90
## ---------------------------------------------------------------------------
## vs
## n missing distinct Info Sum Mean Gmd
## 32 0 2 0.739 14 0.4375 0.5081
##
## ---------------------------------------------------------------------------
## am
## n missing distinct Info Sum Mean Gmd
## 32 0 2 0.724 13 0.4062 0.498
##
## ---------------------------------------------------------------------------
## gear
## n missing distinct Info Mean Gmd
## 32 0 3 0.841 3.688 0.7863
##
## Value 3 4 5
## Frequency 15 12 5
## Proportion 0.469 0.375 0.156
## ---------------------------------------------------------------------------
## carb
## n missing distinct Info Mean Gmd
## 32 0 6 0.929 2.812 1.718
##
## Value 1 2 3 4 6 8
## Frequency 7 10 3 10 1 1
## Proportion 0.219 0.312 0.094 0.312 0.031 0.031
## ---------------------------------------------------------------------------
# n, nmiss, unique, mean, 5,10,25,50,75,90,95th percentiles
# 5 lowest and 5 highest scores
library(pastecs)
stat.desc(mtcars)
## mpg cyl disp hp
## nbr.val 32.0000000 32.0000000 3.200000e+01 32.0000000
## nbr.null 0.0000000 0.0000000 0.000000e+00 0.0000000
## nbr.na 0.0000000 0.0000000 0.000000e+00 0.0000000
## min 10.4000000 4.0000000 7.110000e+01 52.0000000
## max 33.9000000 8.0000000 4.720000e+02 335.0000000
## range 23.5000000 4.0000000 4.009000e+02 283.0000000
## sum 642.9000000 198.0000000 7.383100e+03 4694.0000000
## median 19.2000000 6.0000000 1.963000e+02 123.0000000
## mean 20.0906250 6.1875000 2.307219e+02 146.6875000
## SE.mean 1.0654240 0.3157093 2.190947e+01 12.1203173
## CI.mean.0.95 2.1729465 0.6438934 4.468466e+01 24.7195501
## var 36.3241028 3.1895161 1.536080e+04 4700.8669355
## std.dev 6.0269481 1.7859216 1.239387e+02 68.5628685
## coef.var 0.2999881 0.2886338 5.371779e-01 0.4674077
## drat wt qsec vs am
## nbr.val 32.00000000 32.0000000 32.0000000 32.00000000 32.00000000
## nbr.null 0.00000000 0.0000000 0.0000000 18.00000000 19.00000000
## nbr.na 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000
## min 2.76000000 1.5130000 14.5000000 0.00000000 0.00000000
## max 4.93000000 5.4240000 22.9000000 1.00000000 1.00000000
## range 2.17000000 3.9110000 8.4000000 1.00000000 1.00000000
## sum 115.09000000 102.9520000 571.1600000 14.00000000 13.00000000
## median 3.69500000 3.3250000 17.7100000 0.00000000 0.00000000
## mean 3.59656250 3.2172500 17.8487500 0.43750000 0.40625000
## SE.mean 0.09451874 0.1729685 0.3158899 0.08909831 0.08820997
## CI.mean.0.95 0.19277224 0.3527715 0.6442617 0.18171719 0.17990541
## var 0.28588135 0.9573790 3.1931661 0.25403226 0.24899194
## std.dev 0.53467874 0.9784574 1.7869432 0.50401613 0.49899092
## coef.var 0.14866382 0.3041285 0.1001159 1.15203687 1.22828533
## gear carb
## nbr.val 32.0000000 32.0000000
## nbr.null 0.0000000 0.0000000
## nbr.na 0.0000000 0.0000000
## min 3.0000000 1.0000000
## max 5.0000000 8.0000000
## range 2.0000000 7.0000000
## sum 118.0000000 90.0000000
## median 4.0000000 2.0000000
## mean 3.6875000 2.8125000
## SE.mean 0.1304266 0.2855297
## CI.mean.0.95 0.2660067 0.5823417
## var 0.5443548 2.6088710
## std.dev 0.7378041 1.6152000
## coef.var 0.2000825 0.5742933
# nbr.val, nbr.null, nbr.na, min max, range, sum,
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
describeBy(mtcars, mtcars$cyl)
##
## Descriptive statistics by group
## group: 4
## vars n mean sd median trimmed mad min max range skew
## mpg 1 11 26.66 4.51 26.00 26.44 6.52 21.40 33.90 12.50 0.26
## cyl 2 11 4.00 0.00 4.00 4.00 0.00 4.00 4.00 0.00 NaN
## disp 3 11 105.14 26.87 108.00 104.30 43.00 71.10 146.70 75.60 0.12
## hp 4 11 82.64 20.93 91.00 82.67 32.62 52.00 113.00 61.00 0.01
## drat 5 11 4.07 0.37 4.08 4.02 0.34 3.69 4.93 1.24 1.00
## wt 6 11 2.29 0.57 2.20 2.27 0.54 1.51 3.19 1.68 0.30
## qsec 7 11 19.14 1.68 18.90 18.99 1.48 16.70 22.90 6.20 0.55
## vs 8 11 0.91 0.30 1.00 1.00 0.00 0.00 1.00 1.00 -2.47
## am 9 11 0.73 0.47 1.00 0.78 0.00 0.00 1.00 1.00 -0.88
## gear 10 11 4.09 0.54 4.00 4.11 0.00 3.00 5.00 2.00 0.11
## carb 11 11 1.55 0.52 2.00 1.56 0.00 1.00 2.00 1.00 -0.16
## kurtosis se
## mpg -1.65 1.36
## cyl NaN 0.00
## disp -1.64 8.10
## hp -1.71 6.31
## drat 0.12 0.11
## wt -1.36 0.17
## qsec -0.02 0.51
## vs 4.52 0.09
## am -1.31 0.14
## gear -0.01 0.16
## carb -2.15 0.16
## --------------------------------------------------------
## group: 6
## vars n mean sd median trimmed mad min max range skew
## mpg 1 7 19.74 1.45 19.70 19.74 1.93 17.80 21.40 3.60 -0.16
## cyl 2 7 6.00 0.00 6.00 6.00 0.00 6.00 6.00 0.00 NaN
## disp 3 7 183.31 41.56 167.60 183.31 11.27 145.00 258.00 113.00 0.80
## hp 4 7 122.29 24.26 110.00 122.29 7.41 105.00 175.00 70.00 1.36
## drat 5 7 3.59 0.48 3.90 3.59 0.03 2.76 3.92 1.16 -0.74
## wt 6 7 3.12 0.36 3.21 3.12 0.36 2.62 3.46 0.84 -0.22
## qsec 7 7 17.98 1.71 18.30 17.98 1.90 15.50 20.22 4.72 -0.12
## vs 8 7 0.57 0.53 1.00 0.57 0.00 0.00 1.00 1.00 -0.23
## am 9 7 0.43 0.53 0.00 0.43 0.00 0.00 1.00 1.00 0.23
## gear 10 7 3.86 0.69 4.00 3.86 0.00 3.00 5.00 2.00 0.11
## carb 11 7 3.43 1.81 4.00 3.43 0.00 1.00 6.00 5.00 -0.26
## kurtosis se
## mpg -1.91 0.55
## cyl NaN 0.00
## disp -1.23 15.71
## hp 0.25 9.17
## drat -1.40 0.18
## wt -1.98 0.13
## qsec -1.75 0.65
## vs -2.20 0.20
## am -2.20 0.20
## gear -1.24 0.26
## carb -1.50 0.69
## --------------------------------------------------------
## group: 8
## vars n mean sd median trimmed mad min max range skew
## mpg 1 14 15.10 2.56 15.20 15.15 1.56 10.40 19.20 8.80 -0.36
## cyl 2 14 8.00 0.00 8.00 8.00 0.00 8.00 8.00 0.00 NaN
## disp 3 14 353.10 67.77 350.50 349.63 73.39 275.80 472.00 196.20 0.45
## hp 4 14 209.21 50.98 192.50 203.67 44.48 150.00 335.00 185.00 0.91
## drat 5 14 3.23 0.37 3.12 3.19 0.16 2.76 4.22 1.46 1.34
## wt 6 14 4.00 0.76 3.75 3.95 0.41 3.17 5.42 2.25 0.99
## qsec 7 14 16.77 1.20 17.18 16.86 0.79 14.50 18.00 3.50 -0.80
## vs 8 14 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 NaN
## am 9 14 0.14 0.36 0.00 0.08 0.00 0.00 1.00 1.00 1.83
## gear 10 14 3.29 0.73 3.00 3.17 0.00 3.00 5.00 2.00 1.83
## carb 11 14 3.50 1.56 3.50 3.25 0.74 2.00 8.00 6.00 1.48
## kurtosis se
## mpg -0.57 0.68
## cyl NaN 0.00
## disp -1.26 18.11
## hp 0.09 13.62
## drat 1.08 0.10
## wt -0.71 0.20
## qsec -0.92 0.32
## vs NaN 0.00
## am 1.45 0.10
## gear 1.45 0.19
## carb 2.24 0.42
Now take a look at the correlation matrix
#mtcars is a data frame
rcorr(as.matrix(mtcars), type = 'pearson')
## mpg cyl disp hp drat wt qsec vs am gear carb
## mpg 1.00 -0.85 -0.85 -0.78 0.68 -0.87 0.42 0.66 0.60 0.48 -0.55
## cyl -0.85 1.00 0.90 0.83 -0.70 0.78 -0.59 -0.81 -0.52 -0.49 0.53
## disp -0.85 0.90 1.00 0.79 -0.71 0.89 -0.43 -0.71 -0.59 -0.56 0.39
## hp -0.78 0.83 0.79 1.00 -0.45 0.66 -0.71 -0.72 -0.24 -0.13 0.75
## drat 0.68 -0.70 -0.71 -0.45 1.00 -0.71 0.09 0.44 0.71 0.70 -0.09
## wt -0.87 0.78 0.89 0.66 -0.71 1.00 -0.17 -0.55 -0.69 -0.58 0.43
## qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1.00 0.74 -0.23 -0.21 -0.66
## vs 0.66 -0.81 -0.71 -0.72 0.44 -0.55 0.74 1.00 0.17 0.21 -0.57
## am 0.60 -0.52 -0.59 -0.24 0.71 -0.69 -0.23 0.17 1.00 0.79 0.06
## gear 0.48 -0.49 -0.56 -0.13 0.70 -0.58 -0.21 0.21 0.79 1.00 0.27
## carb -0.55 0.53 0.39 0.75 -0.09 0.43 -0.66 -0.57 0.06 0.27 1.00
##
## n= 32
##
##
## P
## mpg cyl disp hp drat wt qsec vs am gear
## mpg 0.0000 0.0000 0.0000 0.0000 0.0000 0.0171 0.0000 0.0003 0.0054
## cyl 0.0000 0.0000 0.0000 0.0000 0.0000 0.0004 0.0000 0.0022 0.0042
## disp 0.0000 0.0000 0.0000 0.0000 0.0000 0.0131 0.0000 0.0004 0.0010
## hp 0.0000 0.0000 0.0000 0.0100 0.0000 0.0000 0.0000 0.1798 0.4930
## drat 0.0000 0.0000 0.0000 0.0100 0.0000 0.6196 0.0117 0.0000 0.0000
## wt 0.0000 0.0000 0.0000 0.0000 0.0000 0.3389 0.0010 0.0000 0.0005
## qsec 0.0171 0.0004 0.0131 0.0000 0.6196 0.3389 0.0000 0.2057 0.2425
## vs 0.0000 0.0000 0.0000 0.0000 0.0117 0.0010 0.0000 0.3570 0.2579
## am 0.0003 0.0022 0.0004 0.1798 0.0000 0.0000 0.2057 0.3570 0.0000
## gear 0.0054 0.0042 0.0010 0.4930 0.0000 0.0005 0.2425 0.2579 0.0000
## carb 0.0011 0.0019 0.0253 0.0000 0.6212 0.0146 0.0000 0.0007 0.7545 0.1290
## carb
## mpg 0.0011
## cyl 0.0019
## disp 0.0253
## hp 0.0000
## drat 0.6212
## wt 0.0146
## qsec 0.0000
## vs 0.0007
## am 0.7545
## gear 0.1290
## carb
# Correlation matrix from mtcars
# with mpg, cyl, and disp as rows
# and hp, drat, and wt as columns
x <- mtcars[1:3]
y <- mtcars[4:6]
cor(x, y)
## hp drat wt
## mpg -0.7761684 0.6811719 -0.8676594
## cyl 0.8324475 -0.6999381 0.7824958
## disp 0.7909486 -0.7102139 0.8879799
Visualize it
# First Correlogram Example
par(mfrow = c(1,2))
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
corrgram(mtcars, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Car Milage Data in PC2/PC1 Order")
corrgram(mtcars, order=NULL, lower.panel=panel.shade,
upper.panel=NULL, text.panel=panel.txt,
main="Car Milage Data (unsorted)")
t.test(mtcars[1,],mtcars[15,])
##
## Welch Two Sample t-test
##
## data: mtcars[1, ] and mtcars[15, ]
## t = -0.76849, df = 12.595, p-value = 0.4564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -138.7776 66.1267
## sample estimates:
## mean of x mean of y
## 29.90727 66.23273
# Boxplot of MPG by Car Cylinders
boxplot(mpg~cyl,data=mtcars, main="Car Milage Data",
xlab="Number of Cylinders", ylab="Miles Per Gallon")
wilcox.test(as.numeric(mtcars[3,1:7]),as.numeric(mtcars[4,1:7]))
##
## Wilcoxon rank sum test
##
## data: as.numeric(mtcars[3, 1:7]) and as.numeric(mtcars[4, 1:7])
## W = 22, p-value = 0.8048
## alternative hypothesis: true location shift is not equal to 0
For the wilcox.test you can use the alternative=“less” or alternative=“greater” option to specify a one tailed test.
fit <- lm(mpg ~ cyl + disp + hp, data=mtcars)
summary(fit) # show results
##
## Call:
## lm(formula = mpg ~ cyl + disp + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0889 -2.0845 -0.7745 1.3972 6.9183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.18492 2.59078 13.195 1.54e-13 ***
## cyl -1.22742 0.79728 -1.540 0.1349
## disp -0.01884 0.01040 -1.811 0.0809 .
## hp -0.01468 0.01465 -1.002 0.3250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.055 on 28 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.743
## F-statistic: 30.88 on 3 and 28 DF, p-value: 5.054e-09
# Other useful functions
coefficients(fit) # model coefficients
## (Intercept) cyl disp hp
## 34.18491917 -1.22741994 -0.01883809 -0.01467933
confint(fit, level=0.95) # CIs for model parameters
## 2.5 % 97.5 %
## (Intercept) 28.87795186 39.491886473
## cyl -2.86056643 0.405726550
## disp -0.04014908 0.002472913
## hp -0.04469028 0.015331608
fitted(fit) # predicted values
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 22.19158 22.19158 25.87555
## Hornet 4 Drive Hornet Sportabout Valiant
## 20.34545 15.01497 21.04050
## Duster 360 Merc 240D Merc 230
## 13.98741 25.60157 25.22830
## Merc 280 Merc 280C Merc 450SE
## 21.85758 21.85758 16.52774
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 16.52774 16.52774 12.46472
## Lincoln Continental Chrysler Imperial Fiat 128
## 12.54398 12.70056 26.82385
## Honda Civic Toyota Corolla Toyota Corona
## 27.08587 26.98169 25.58889
## Dodge Challenger AMC Javelin Camaro Z28
## 16.17315 16.43688 14.17579
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 14.26144 26.81819 25.67320
## Lotus Europa Ford Pantera L Ferrari Dino
## 25.82497 13.87805 21.51999
## Maserati Bora Volvo 142E
## 13.77772 25.39578
residuals(fit) # residuals
## Mazda RX4 Mazda RX4 Wag Datsun 710
## -1.1915791 -1.1915791 -3.0755481
## Hornet 4 Drive Hornet Sportabout Valiant
## 1.0545533 3.6850347 -2.9405002
## Duster 360 Merc 240D Merc 230
## 0.3125881 -1.2015735 -2.4283002
## Merc 280 Merc 280C Merc 450SE
## -2.6575782 -4.0575782 -0.1277355
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 0.7722645 -1.3277355 -2.0647197
## Lincoln Continental Chrysler Imperial Fiat 128
## -2.1439834 1.9994449 5.5761540
## Honda Civic Toyota Corolla Toyota Corona
## 3.3141291 6.9183052 -4.0888899
## Dodge Challenger AMC Javelin Camaro Z28
## -0.6731483 -1.2368815 -0.8757928
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 4.9385581 0.4818054 0.3268017
## Lotus Europa Ford Pantera L Ferrari Dino
## 4.5750273 1.9219527 -1.8199936
## Maserati Bora Volvo 142E
## 1.2222812 -3.9957836
anova(fit) # anova table
## Analysis of Variance Table
##
## Response: mpg
## Df Sum Sq Mean Sq F value Pr(>F)
## cyl 1 817.71 817.71 87.6000 4.064e-10 ***
## disp 1 37.59 37.59 4.0274 0.05451 .
## hp 1 9.37 9.37 1.0039 0.32495
## Residuals 28 261.37 9.33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov(fit) # covariance matrix for model parameters
## (Intercept) cyl disp hp
## (Intercept) 6.712128488 -1.786156956 0.0156477359 0.0069613946
## cyl -1.786156956 0.635649515 -0.0059597991 -0.0052619916
## disp 0.015647736 -0.005959799 0.0001082368 -0.0000255242
## hp 0.006961395 -0.005261992 -0.0000255242 0.0002146479
influence(fit) # regression diagnostics
## $hat
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 0.08371177 0.08371177 0.08677819
## Hornet 4 Drive Hornet Sportabout Valiant
## 0.07747090 0.09010123 0.06249839
## Duster 360 Merc 240D Merc 230
## 0.10139193 0.12140928 0.10822416
## Merc 280 Merc 280C Merc 450SE
## 0.06444896 0.06444896 0.12342137
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 0.12342137 0.12342137 0.25364222
## Lincoln Continental Chrysler Imperial Fiat 128
## 0.21591194 0.17244911 0.08406747
## Honda Civic Toyota Corolla Toyota Corona
## 0.09510678 0.08734285 0.09416712
## Dodge Challenger AMC Javelin Camaro Z28
## 0.13318562 0.13977761 0.10109200
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 0.12980375 0.08398124 0.09038548
## Lotus Europa Ford Pantera L Ferrari Dino
## 0.10957458 0.14390165 0.13601470
## Maserati Bora Volvo 142E
## 0.50782507 0.10731118
##
## $coefficients
## (Intercept) cyl disp hp
## Mazda RX4 0.102454442 -0.069009533 7.802728e-04 7.081534e-04
## Mazda RX4 Wag 0.102454442 -0.069009533 7.802728e-04 7.081534e-04
## Datsun 710 -0.687248826 0.135865644 -4.056553e-04 -1.125318e-03
## Hornet 4 Drive 0.097729396 -0.010863012 6.130778e-04 -9.287958e-04
## Hornet Sportabout -0.314853040 0.100943673 1.070713e-03 -2.932848e-03
## Valiant -0.082952123 -0.045118188 -5.249102e-04 2.626077e-03
## Duster 360 -0.008884086 -0.005056034 2.538403e-05 3.080168e-04
## Merc 240D -0.336186118 0.065065786 -8.943525e-04 6.626352e-04
## Merc 230 -0.709446663 0.169946192 -1.348713e-03 -7.908746e-04
## Merc 280 0.160069525 -0.116142305 1.555058e-03 7.567420e-04
## Merc 280C 0.244393414 -0.177325537 2.374255e-03 1.155390e-03
## Merc 450SE 0.031353120 -0.011055039 1.057363e-04 5.522279e-05
## Merc 450SL -0.189555071 0.066836690 -6.392618e-04 -3.338666e-04
## Merc 450SLC 0.325897375 -0.114910679 1.099067e-03 5.740087e-04
## Cadillac Fleetwood -0.366207285 0.175650002 -4.097056e-03 9.421632e-04
## Lincoln Continental -0.327355475 0.168080733 -3.594388e-03 2.127717e-04
## Chrysler Imperial 0.235277687 -0.138094810 2.516619e-03 7.774901e-04
## Fiat 128 0.820736114 -0.039057443 -8.855576e-04 -1.257790e-03
## Honda Civic 0.437090217 0.012422146 -5.199411e-04 -1.905676e-03
## Toyota Corolla 0.919710241 -0.007576702 -1.749934e-03 -1.582915e-03
## Toyota Corona -1.026162442 0.227154791 -1.127657e-03 -1.774138e-03
## Dodge Challenger 0.129526705 -0.051124265 1.197964e-04 9.196216e-04
## AMC Javelin 0.273567946 -0.107510823 4.552200e-04 1.647672e-03
## Camaro Z28 0.041214716 0.007940562 4.187445e-05 -8.893376e-04
## Pontiac Firebird -0.060669138 -0.003483099 4.132616e-03 -4.730556e-03
## Fiat X1-9 0.071173221 -0.003475171 -7.467942e-05 -1.091001e-04
## Porsche 914-2 0.080187211 -0.016910434 9.648001e-05 9.144197e-05
## Lotus Europa 1.014017003 -0.222890263 -4.306393e-04 4.261015e-03
## Ford Pantera L -0.059395765 -0.043775612 -1.870934e-04 3.023983e-03
## Ferrari Dino 0.116815634 -0.054773909 2.004697e-03 -2.087826e-03
## Maserati Bora -0.142358380 -0.068540510 -2.128887e-03 7.739178e-03
## Volvo 142E -1.064371890 0.258101556 -1.018045e-03 -2.983384e-03
##
## $sigma
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 3.102089 3.102089 3.049053
## Hornet 4 Drive Hornet Sportabout Valiant
## 3.104142 3.021192 3.055938
## Duster 360 Merc 240D Merc 230
## 3.110678 3.101529 3.071717
## Merc 280 Merc 280C Merc 450SE
## 3.066063 3.004757 3.111215
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 3.107273 3.099332 3.077141
## Lincoln Continental Chrysler Imperial Fiat 128
## 3.076235 3.082438 2.902247
## Honda Civic Toyota Corolla Toyota Corona
## 3.038223 2.781725 2.999458
## Dodge Challenger AMC Javelin Camaro Z28
## 3.108213 3.100722 3.106243
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 2.939778 3.109817 3.110627
## Lotus Europa Ford Pantera L Ferrari Dino
## 2.968119 3.085537 3.088422
## Maserati Bora Volvo 142E
## 3.093206 3.002985
##
## $wt.res
## Mazda RX4 Mazda RX4 Wag Datsun 710
## -1.1915791 -1.1915791 -3.0755481
## Hornet 4 Drive Hornet Sportabout Valiant
## 1.0545533 3.6850347 -2.9405002
## Duster 360 Merc 240D Merc 230
## 0.3125881 -1.2015735 -2.4283002
## Merc 280 Merc 280C Merc 450SE
## -2.6575782 -4.0575782 -0.1277355
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 0.7722645 -1.3277355 -2.0647197
## Lincoln Continental Chrysler Imperial Fiat 128
## -2.1439834 1.9994449 5.5761540
## Honda Civic Toyota Corolla Toyota Corona
## 3.3141291 6.9183052 -4.0888899
## Dodge Challenger AMC Javelin Camaro Z28
## -0.6731483 -1.2368815 -0.8757928
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 4.9385581 0.4818054 0.3268017
## Lotus Europa Ford Pantera L Ferrari Dino
## 4.5750273 1.9219527 -1.8199936
## Maserati Bora Volvo 142E
## 1.2222812 -3.9957836
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(fit)
# compare models
fit1 <- lm(mpg ~ cyl + disp + hp + drat, data=mtcars)
anova(fit1, fit)
## Analysis of Variance Table
##
## Model 1: mpg ~ cyl + disp + hp + drat
## Model 2: mpg ~ cyl + disp + hp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27 244.90
## 2 28 261.37 -1 -16.467 1.8155 0.189
# Stepwise Regression
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:DAAG':
##
## hills
fit <- lm(mpg ~ cyl + disp + hp, data=mtcars)
step <- stepAIC(fit, direction="both")
## Start: AIC=75.2
## mpg ~ cyl + disp + hp
##
## Df Sum of Sq RSS AIC
## - hp 1 9.37 271 74.3
## <none> 261 75.2
## - cyl 1 22.12 284 75.8
## - disp 1 30.61 292 76.7
##
## Step: AIC=74.3
## mpg ~ cyl + disp
##
## Df Sum of Sq RSS AIC
## <none> 271 74.3
## + hp 1 9.4 261 75.2
## - disp 1 37.6 308 76.5
## - cyl 1 46.4 317 77.4
step$anova # display results
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## mpg ~ cyl + disp + hp
##
## Final Model:
## mpg ~ cyl + disp
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 28 261 75.2
## 2 - hp 1 9.37 29 271 74.3
# All Subsets Regression
library(leaps)
leaps<-regsubsets(mpg ~ cyl + disp + hp, data=mtcars,nbest=10)
# view results
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(mpg ~ cyl + disp + hp, data = mtcars, nbest = 10)
## 3 Variables (and intercept)
## Forced in Forced out
## cyl FALSE FALSE
## disp FALSE FALSE
## hp FALSE FALSE
## 10 subsets of each size up to 3
## Selection Algorithm: exhaustive
## cyl disp hp
## 1 ( 1 ) "*" " " " "
## 1 ( 2 ) " " "*" " "
## 1 ( 3 ) " " " " "*"
## 2 ( 1 ) "*" "*" " "
## 2 ( 2 ) " " "*" "*"
## 2 ( 3 ) "*" " " "*"
## 3 ( 1 ) "*" "*" "*"
# plot a table of models showing variables in each model.
# models are ordered by the selection statistic.
plot(leaps,scale="r2")
# plot statistic by subset size
plot(booteval.relimp(boot,sort=TRUE)) # plot result
## Warning in rev(variances[[p]]) - variances[[p + 1]]: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Assessing Outliers
outlierTest(fit) # Bonferonni p-value for most extreme obs
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## Toyota Corolla 2.6 0.0148 0.474
qqPlot(fit, main="QQ Plot") #qq plot for studentized resid
## Fiat 128 Toyota Corolla
## 18 20
leveragePlots(fit) # leverage plots
Alternatively, you can use avPlots(fit) to achieve the same results. ### Outliers
# Influential Observations
# added variable plots
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(mtcars)-length(fit$coefficients)-2))
plot(fit, which=4, cook.levels=cutoff)
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 4.34, Df = 1, p = 0.04
# plot studentized residuals vs. fitted values
spreadLevelPlot(fit)
##
## Suggested power transformation: 0.144
# Evaluate Collinearity
vif(fit) # variance inflation factors
## cyl disp hp
## 6.73 5.52 3.35
sqrt(vif(fit)) > 2 # problem?
## cyl disp hp
## TRUE TRUE FALSE
# One Way Anova (Completely Randomized Design)
fit_an <- aov(mpg ~ disp, data=mtcars)
# Randomized Block Design (B is the blocking factor, cyl is the blocking)
fit_bl <- aov(mpg ~ disp+cyl, data=mtcars)
layout(matrix(c(1,2,3,4),2,2)) # optional layout
plot(fit_bl) # diagnostic plots
# Detect Outliers in the MTCARS Data
library(mvoutlier)
## Loading required package: sgeostat
## sROC 0.1-2 loaded
outliers <- aq.plot(mtcars[c("mpg","disp","hp","drat","wt","qsec")])
## Projection to the first and second robust principal components.
## Proportion of total variation (explained variance): 0.974
outliers # show list of outliers
## $outliers
## Mazda RX4 Mazda RX4 Wag Datsun 710
## FALSE FALSE FALSE
## Hornet 4 Drive Hornet Sportabout Valiant
## FALSE FALSE FALSE
## Duster 360 Merc 240D Merc 230
## TRUE TRUE TRUE
## Merc 280 Merc 280C Merc 450SE
## FALSE FALSE FALSE
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## FALSE FALSE FALSE
## Lincoln Continental Chrysler Imperial Fiat 128
## FALSE FALSE TRUE
## Honda Civic Toyota Corolla Toyota Corona
## FALSE FALSE FALSE
## Dodge Challenger AMC Javelin Camaro Z28
## FALSE FALSE TRUE
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## FALSE FALSE FALSE
## Lotus Europa Ford Pantera L Ferrari Dino
## FALSE TRUE FALSE
## Maserati Bora Volvo 142E
## TRUE FALSE
There are many other statistical analysis including nomality check and multivariate analysis. Due to the limitation of the time, please refer to other resources listed from Quick-R presented by datacamp.