Load the Kids198 data frame from the Stat2Data package
library(Stat2Data)
data("Kids198")
Loading libraries
library(ggplot2)
library(HH)
## Loading required package: lattice
## Loading required package: grid
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
## Loading required package: gridExtra
A quick look at the data
# Preview the first six rows to check the column names and how they are coded.
head(Kids198, n = 6L)
## Height Weight Age Sex Race
## 1 67.8 166 210 0 1
## 2 63.0 93 144 1 0
## 3 50.1 54 119 0 0
## 4 55.7 69 130 1 0
## 5 63.2 115 157 0 0
## 6 48.8 52 102 0 0
# Scatterplot of weight against age. Sex is coded 0/1 and is used directly as
# the plotting symbol: pch = 0 is an open square, pch = 1 an open circle.
plot(Weight ~ Age, data = Kids198, pch = Sex)
Subsetting the data frame by sex (Sex == 0 is male, Sex == 1 is female)
# Split the rows by the 0/1 Sex indicator. which() discards any NA
# comparisons, matching what subset() does.
male <- Kids198[which(Kids198$Sex == 0), ]
female <- Kids198[which(Kids198$Sex == 1), ]
Fitting a regression model for the females
# Simple linear regression of weight on age, fitted to the female subset only.
regmodelf <- lm(Weight ~ Age, data = female)
# Coefficient table, residual standard error, R-squared and overall F test.
summary(regmodelf)
##
## Call:
## lm(formula = Weight ~ Age, data = female)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.381 -11.849 -4.341 10.351 58.581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.84197 7.90251 -0.233 0.816
## Age 0.62749 0.04937 12.710 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.48 on 100 degrees of freedom
## Multiple R-squared: 0.6176, Adjusted R-squared: 0.6138
## F-statistic: 161.5 on 1 and 100 DF, p-value: < 2.2e-16
# ANOVA table for the female-only fit: splits the variation in Weight into the
# Age term and the residuals.
anova(regmodelf)
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 49365 49365 161.53 < 2.2e-16 ***
## Residuals 100 30560 306
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the female-only fit with HH::ci.plot(), which (per the HH documentation)
# draws the data, the fitted line, and confidence and prediction interval bands.
ci.plot(regmodelf)
Fitting a regression model for the males
# Simple linear regression of weight on age, fitted to the male subset only.
regmodelm <- lm(Weight ~ Age, data = male)
# Coefficient table, residual standard error, R-squared and overall F test.
summary(regmodelm)
##
## Call:
## lm(formula = Weight ~ Age, data = male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.884 -12.918 -1.792 10.148 47.373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.69254 10.87366 -3.099 0.00257 **
## Age 0.90871 0.06635 13.696 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.85 on 94 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6626
## F-statistic: 187.6 on 1 and 94 DF, p-value: < 2.2e-16
# ANOVA table for the male-only fit: splits the variation in Weight into the
# Age term and the residuals.
anova(regmodelm)
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 81525 81525 187.58 < 2.2e-16 ***
## Residuals 94 40853 435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the male-only fit with HH::ci.plot(), which (per the HH documentation)
# draws the data, the fitted line, and confidence and prediction interval bands.
ci.plot(regmodelm)
Creating a scatterplot with different symbols and two different regression lines
# Redraw the scatterplot (plotting symbol taken from Sex) and overlay the two
# separately fitted regression lines.
plot(Weight ~ Age, data = Kids198, pch = Sex)
abline(reg = regmodelm, col = "blue")  # male-only fit
abline(reg = regmodelf, col = "pink")  # female-only fit
A multiple regression model for both males and females (separate intercepts and slopes via an Age-by-Sex interaction)
# Full model on all children. Sex * Age expands to Age + Sex + Age:Sex, so the
# fit has a separate intercept and a separate Age slope for each sex.
regmodel <- lm(Weight ~ Age + Sex + Sex * Age, data = Kids198)
# Coefficient table; the Age:Sex row is the difference between the two slopes.
summary(regmodel)
##
## Call:
## lm(formula = Weight ~ Age + Sex + Sex * Age, data = Kids198)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.884 -12.055 -2.782 10.185 58.581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.69254 10.00727 -3.367 0.000917 ***
## Age 0.90871 0.06106 14.882 < 2e-16 ***
## Sex 31.85057 13.24269 2.405 0.017106 *
## Age:Sex -0.28122 0.08164 -3.445 0.000700 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.19 on 194 degrees of freedom
## Multiple R-squared: 0.6683, Adjusted R-squared: 0.6631
## F-statistic: 130.3 on 3 and 194 DF, p-value: < 2.2e-16
# Sequential ANOVA table for the interaction model: terms enter in the order
# Age, Sex, Age:Sex.
anova(regmodel)
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 131450 131450 357.092 < 2.2e-16 ***
## Sex 1 8046 8046 21.858 5.488e-06 ***
## Age:Sex 1 4368 4368 11.866 0.0007004 ***
## Residuals 194 71414 368
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 95% confidence intervals for every coefficient of the interaction model.
confint(regmodel, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -53.4295458 -13.9555301
## Age 0.7882792 1.0291404
## Sex 5.7324387 57.9686922
## Age:Sex -0.4422327 -0.1202112
A multiple regression model with the same slopes (parallel lines model)
# Parallel-lines model: one common Age slope, with Sex shifting the intercept.
parallel <- lm(Weight ~ Age + Sex, data = Kids198)
# Coefficient table, residual standard error, R-squared and overall F test.
summary(parallel)
##
## Call:
## lm(formula = Weight ~ Age + Sex, data = Kids198)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.311 -12.577 -1.933 10.024 58.727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.40655 6.98870 -1.203 0.23
## Age 0.75138 0.04164 18.043 < 2e-16 ***
## Sex -12.78433 2.80963 -4.550 9.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.71 on 195 degrees of freedom
## Multiple R-squared: 0.648, Adjusted R-squared: 0.6444
## F-statistic: 179.5 on 2 and 195 DF, p-value: < 2.2e-16
# Sequential ANOVA table for the parallel-lines model: Age first, then Sex.
anova(parallel)
## Analysis of Variance Table
##
## Response: Weight
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 131450 131450 338.243 < 2.2e-16 ***
## Sex 1 8046 8046 20.704 9.412e-06 ***
## Residuals 195 75782 389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Nested-model F test: compares the parallel-lines fit with the interaction
# model to see whether the extra Age:Sex term reduces the residual sum of squares.
anova(parallel, regmodel)
## Analysis of Variance Table
##
## Model 1: Weight ~ Age + Sex
## Model 2: Weight ~ Age + Sex + Sex * Age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 195 75782
## 2 194 71414 1 4368.2 11.866 0.0007004 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interesting plots and scatterplot smoothing
# qplot() is deprecated since ggplot2 3.4.0, so the plots are built with
# ggplot() instead.
# Sex is stored as a 0/1 number; factor() makes it a discrete colour scale with
# two legend entries rather than a continuous gradient.
ggplot(Kids198, aes(x = Age, y = Weight, colour = factor(Sex))) +
  geom_point() +
  labs(colour = "Sex")
# One scatterplot panel per value of Sex.
ggplot(Kids198, aes(x = Age, y = Weight)) +
  geom_point() +
  facet_wrap(~Sex)
# Same panels with a scatterplot smoother added; geom_smooth() chooses its
# default smoothing method and reports it in a message.
ggplot(Kids198, aes(x = Age, y = Weight)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Sex)
## `geom_smooth()` using method = 'loess'