library("MASS")
data(cats)
str(cats)
## 'data.frame': 144 obs. of 3 variables:
## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.000 Min. : 6.30
## M:97 1st Qu.:2.300 1st Qu.: 8.95
## Median :2.700 Median :10.10
## Mean :2.724 Mean :10.63
## 3rd Qu.:3.025 3rd Qu.:12.12
## Max. :3.900 Max. :20.50
with(cats, plot(Bwt, Hwt))
title(main="Heart Weight (g) vs. Body Weight (kg)\nof Domestic Cats")
with(cats, plot(Hwt ~ Bwt)) # or...
plot(Hwt ~ Bwt, data=cats)
####The scatterplot shows a fairly strong and reasonably linear relationship between the two variables. A Pearson product-moment correlation coef????cient can be calculate using the cor() function (which will fail if there are missing values).
with(cats, cor(Bwt, Hwt))
## [1] 0.8041274
with(cats, cor(Bwt, Hwt))^2
## [1] 0.6466209
with(cats, cor.test(Bwt, Hwt)) # cor.test(~Bwt + Hwt, data=cats) also works
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7375682 0.8552122
## sample estimates:
## cor
## 0.8041274
with(cats, cor.test(Bwt, Hwt, alternative="greater", conf.level=.8))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is greater than 0
## 80 percent confidence interval:
## 0.7776141 1.0000000
## sample estimates:
## cor
## 0.8041274
cor.test(~ Bwt + Hwt, data=cats) # output not shown
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7375682 0.8552122
## sample estimates:
## cor
## 0.8041274
cor.test(~Bwt + Hwt, data=cats, subset=(Sex=="F"))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 4.2152, df = 45, p-value = 0.0001186
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2890452 0.7106399
## sample estimates:
## cor
## 0.5320497
with(cats, plot(Bwt, Hwt, type="n", las=1, xlab="Body Weight in kg", ylab="Heart Weight in g", main="Heart Weight vs. Body Weight of Cats"))
with(cats, points(Bwt[Sex=="F"], Hwt[Sex=="F"], pch=16, col="red"))
with(cats, points(Bwt[Sex=="M"], Hwt[Sex=="M"], pch=17, col="blue"))
rm(cats) # if you haven't already
data(cement) # also in the MASS library
str(cement)
## 'data.frame': 13 obs. of 5 variables:
## $ x1: int 7 1 11 11 7 11 3 1 2 21 ...
## $ x2: int 26 29 56 31 52 55 71 31 54 47 ...
## $ x3: int 6 15 8 8 6 9 17 22 18 4 ...
## $ x4: int 60 52 20 47 33 22 6 44 22 26 ...
## $ y : num 78.5 74.3 104.3 87.6 95.9 ...
cor(cement)
## x1 x2 x3 x4 y
## x1 1.0000000 0.2285795 -0.8241338 -0.2454451 0.7307175
## x2 0.2285795 1.0000000 -0.1392424 -0.9729550 0.8162526
## x3 -0.8241338 -0.1392424 1.0000000 0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550 0.0295370 1.0000000 -0.8213050
## y 0.7307175 0.8162526 -0.5346707 -0.8213050 1.0000000
cov(cement)
## x1 x2 x3 x4 y
## x1 34.60256 20.92308 -31.051282 -24.166667 64.66346
## x2 20.92308 242.14103 -13.878205 -253.416667 191.07949
## x3 -31.05128 -13.87821 41.025641 3.166667 -51.51923
## x4 -24.16667 -253.41667 3.166667 280.166667 -206.80833
## y 64.66346 191.07949 -51.519231 -206.808333 226.31359
cov.matr = cov(cement)
cov2cor(cov.matr)
## x1 x2 x3 x4 y
## x1 1.0000000 0.2285795 -0.8241338 -0.2454451 0.7307175
## x2 0.2285795 1.0000000 -0.1392424 -0.9729550 0.8162526
## x3 -0.8241338 -0.1392424 1.0000000 0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550 0.0295370 1.0000000 -0.8213050
## y 0.7307175 0.8162526 -0.5346707 -0.8213050 1.0000000
pairs(cement)
plot(cement)
**The dnominal data: distinguishes between different categories, but there is no implied order between categories.
**The ordinal data: distinguished by the fact that there is some kind of natural order between elements but no indication of the relative size difference/refers to quantities that have a natural ordering
**The numeric: –>interval data: is like ordinal except we can say the intervals between each value are equally split
–>ratio data: this is data where all kinds of mathematical operations are allowed, in particular the ability to multiply and divide ###What does it mean when determining associations (correlation) between variables?
ls()
## [1] "cement" "cov.matr"
rm(cement, cov.matr) # clean up first
coach1 = c(1,2,3,4,5,6,7,8,9,10)
coach2 = c(4,8,1,5,9,2,10,7,3,6)
cor(coach1, coach2, method="spearman") # do not capitalize "spearman"
## [1] 0.1272727
cor.test(coach1, coach2, method="spearman")
##
## Spearman's rank correlation rho
##
## data: coach1 and coach2
## S = 144, p-value = 0.7329
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1272727
cor(coach1, coach2, method="kendall")
## [1] 0.1111111
cor.test(coach1, coach2, method="kendall")
##
## Kendall's rank correlation tau
##
## data: coach1 and coach2
## T = 25, p-value = 0.7275
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.1111111
ls()
## [1] "coach1" "coach2"
rm(coach1,coach2)
data(cats)
cats[12,2] = NA
cats[101,3] = NA
cats[132,2:3] = NA
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.000 Min. : 6.30
## M:97 1st Qu.:2.300 1st Qu.: 8.85
## Median :2.700 Median :10.10
## Mean :2.723 Mean :10.62
## 3rd Qu.:3.000 3rd Qu.:12.07
## Max. :3.900 Max. :20.50
## NA's :2 NA's :2
with(cats, cor(Bwt, Hwt))
## [1] NA
To see what work and what not
with(cats, cor(Bwt, Hwt))
## [1] NA
with(cats, cov(Bwt, Hwt))
## [1] NA
with(cats, cor.test(Bwt, Hwt))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.092, df = 139, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7400416 0.8576152
## sample estimates:
## cor
## 0.8066677
with(cats, plot(Bwt, Hwt))
To get cor() and cov() to work with missing values, you set the “use=” option
with(cats, cor(Bwt, Hwt, use="pairwise"))
## [1] 0.8066677
rm(cats) # get rid of the messed up version
data(cats) # still in the "MASS" library
Getting the regression equation for our “cats” is simple enough.
lm(Hwt ~ Bwt, data=cats)
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Coefficients:
## (Intercept) Bwt
## -0.3567 4.0341
from above the coefficients shows the slope and the y-intercept So the regression equation is: Hwt-hat = 4.0341 (Bwt) - 0.3567.
lm.out = lm(Hwt ~ Bwt, data=cats) # name the output anything you like
lm.out
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Coefficients:
## (Intercept) Bwt
## -0.3567 4.0341
summary(lm.out) # gives a lot more info
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5694 -0.9634 -0.0921 1.0426 5.1238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3567 0.6923 -0.515 0.607
## Bwt 4.0341 0.2503 16.119 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared: 0.6466, Adjusted R-squared: 0.6441
## F-statistic: 259.8 on 1 and 142 DF, p-value: < 2.2e-16
options(show.signif.stars=F)
anova(lm.out) # shows an ANOVA table
## Analysis of Variance Table
##
## Response: Hwt
## Df Sum Sq Mean Sq F value Pr(>F)
## Bwt 1 548.09 548.09 259.83 < 2.2e-16
## Residuals 142 299.53 2.11
The “anova” used to Compute analysis of variance (or deviance) tables for one or more fitted model objects.
plot(Hwt ~ Bwt, data=cats, main="Kitty Cat Plot")
abline(lm.out, col="red") ## the abline function is to Add Straight Lines to a Plot
par(mfrow=c(2,2)) # partition the graphics device
plot(lm.out)
####Case 144
cats[144,]
## Sex Bwt Hwt
## 144 M 3.9 20.5
lm.out$fitted[144]
## 144
## 15.37618
lm.out$residuals[144]
## 144
## 5.123818
This cat had the largest body weight and the largest heart weight in the data set. The observed value of “Hwt” was 20.5g,but the fi????tted value was only 15.4g, giving a residual of 5.1g. The residual standard error (from the model output above) was 1.452, so converting this to a standardized residual gives 5.124/1.452 = 3.53, a substantial value to say the least! A commonlyused measure of in????uence is Cook’s Distance, which can be visualized for all the cases in the model as follows.
par(mfrow=c(1,1))
plot(cooks.distance(lm.out))
lm.without144 = lm(Hwt ~ Bwt, data=cats, subset=(Hwt<20.5))
lm.without144
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats, subset = (Hwt < 20.5))
##
## Coefficients:
## (Intercept) Bwt
## 0.118 3.846
rlm(Hwt ~ Bwt, data=cats)
## Call:
## rlm(formula = Hwt ~ Bwt, data = cats)
## Converged in 5 iterations
##
## Coefficients:
## (Intercept) Bwt
## -0.1361777 3.9380535
##
## Degrees of freedom: 144 total; 142 residual
## Scale estimate: 1.52
plot(Hwt ~ Bwt, data=cats)
lines(lowess(cats$Hwt ~ cats$Bwt), col="red")
### the lowess() function will take a formula interface but will not use
### the data= option; an oversight in the programming, I suspect the other method is (close the graphics device first)...
scatter.smooth(cats$Hwt ~ cats$Bwt) # same programming flaw here
scatter.smooth(cats$Hwt ~ cats$Bwt, pch=16, cex=.6)
A non parametric method it make no assumption on the population distribution or sample size.
The technique of (weighted) smoothing to Plot and add a smooth curve computed by loess to a scatter plot