# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 3 ~
# ~ Regression Analysis ~
# ~ Height and Weight of Kids ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Data on Age, Weight, and Height of Children #
# recorded for a group of school children. #
# From Lewis and Taylor (1967).
# In this example, the weights of school children are modeled as a function of
# their heights and ages. Modeling is performed separately for boys and girls.
# Data Set: htwtkids
# Data Dictionary:
# (1) age age (number; years)
# (2) height height (number; inches)
# (3) weight Weight (number; pounds)
# (4) sex sex (character; "f", "m")
# Download and load the data files
download.file("http://www.duke.edu/~sgrambow/crp245data/htwtkids.RData",
destfile = "htwtkids.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("htwtkids.RData")
#
# Summary and Visualization of Data
#
# separate into females and males
htwt.female <- subset(htwtkids,sex=="f")
htwt.male <- subset(htwtkids,sex=="m")
htwt.male <- htwt.male[,-1]
# quick analysis of males
# summary for Weight
# smallest values
sort(unique(htwt.male$weight), decreasing = FALSE)[1:5]
## [1] 70.0 71.5 72.0 73.5 75.0
# largest values
sort(unique(htwt.male$weight), decreasing = TRUE)[1:5]
## [1] 171.5 150.5 150.0 147.0 140.0
# descriptive statistics
summary(htwt.male$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 70.00 86.62 104.50 103.45 113.62 171.50
sd(htwt.male$weight)
## [1] 19.96806
# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$weight)
## [1] 70.0 86.5 104.5 114.0 171.5
# Visualization of Y -- Histogram
hist(htwt.male$weight, # histogram
col="blue", # column color
border="black",
prob = TRUE, # show densities instead of frequencies
xlab = "Weight (pounds)",
main = "Histogram w Density Plot")
lines(density(htwt.male$weight), # density plot
lwd = 2, # thickness of line
col = "red")

# summary for X1 -- age -- in years
# smallest values
sort(unique(htwt.male$age), decreasing = FALSE)[1:5]
## [1] 11.58 11.67 11.75 11.83 11.92
# largest values
sort(unique(htwt.male$age), decreasing = TRUE)[1:5]
## [1] 20.83 17.17 16.92 16.67 16.33
# descriptive statistics
summary(htwt.male$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.58 12.50 13.50 13.70 14.67 20.83
sd(htwt.male$age)
## [1] 1.56311
# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$age)
## [1] 11.58 12.50 13.50 14.67 20.83
# Visualization of Y vs X
hist(htwt.male$age, # histogram
col="blue", # column color
border="black",
prob = TRUE, # show densities instead of frequencies
xlab = "Age in Years",
main = "Histogram w Density Plot")
lines(density(htwt.male$age), # density plot
lwd = 2, # thickness of line
col = "red")

cor.test(htwt.male$age,htwt.male$weight)
##
## Pearson's product-moment correlation
##
## data: htwt.male$age and htwt.male$weight
## t = 11.492, df = 124, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6213075 0.7934140
## sample estimates:
## cor
## 0.7181702
# Scatterplot with Y
# link to web page about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot
library(car)
## Loading required package: carData
scatterplot(weight ~ age, data=htwt.male,
xlab="Age", ylab="weight",
main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line
# -- the smoothed conditional spread
# -- the non-parametric regression smootH
# summary for X2 -- height -- height in cm
# smallest values
sort(unique(htwt.male$height), decreasing = FALSE)[1:5]
## [1] 50.5 52.5 53.3 55.0 56.0
# largest values
sort(unique(htwt.male$height), decreasing = TRUE)[1:5]
## [1] 72.0 71.0 69.8 69.5 69.0
# descriptive statistics
summary(htwt.male$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.50 59.00 61.90 62.10 65.72 72.00
sd(htwt.male$height)
## [1] 4.276689
# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$height)
## [1] 50.5 59.0 61.9 65.8 72.0
# Visualization of Y vs X
hist(htwt.male$height, # histogram
col="blue", # column color
border="black",
prob = TRUE, # show densities instead of frequencies
xlab = "height in Inches",
main = "Histogram w Density Plot")
lines(density(htwt.male$height), # density plot
lwd = 2, # thickness of line
col = "red")

# Pearson correlation between height and weight
cor.test(htwt.male$height,htwt.male$weight)
##
## Pearson's product-moment correlation
##
## data: htwt.male$height and htwt.male$weight
## t = 14.361, df = 124, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7140535 0.8479619
## sample estimates:
## cor
## 0.7902623
# Scatterplot with Y
# link to web pheight about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot
library(car)
scatterplot(weight ~ height, data=htwt.male,
xlab="height", ylab="weight",
main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line
# -- the smoothed conditional spread
# -- the non-parametric regression smootH
# SCATTERPLOT MATRIX WEIGHT, AGE, HEIGHT
# includes pearson correlations
# just change the data frame name in the
# pairs command
# Correlation panel
upper.panel<-function(x, y){
points(x,y, pch=19)
r <- round(cor(x, y), digits=2)
txt <- paste0("R = ", r)
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
text(0.5, 0.9, txt)}
# Create the plots
pairs(htwt.male, lower.panel = NULL,
upper.panel = upper.panel)

#
# REGRESSION MODELS
#
# Simple Linear Regression
# Model mvc (Y) ~ age (X1)
ufit1 <- lm(htwt.male$weight ~ htwt.male$age)
summary(ufit1)
##
## Call:
## lm(formula = htwt.male$weight ~ htwt.male$age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.306 -10.406 -0.650 7.236 36.868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.2812 11.0106 -2.024 0.0452 *
## htwt.male$age 9.1743 0.7983 11.492 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.95 on 124 degrees of freedom
## Multiple R-squared: 0.5158, Adjusted R-squared: 0.5119
## F-statistic: 132.1 on 1 and 124 DF, p-value: < 2.2e-16
# Simple Linear Regression
# Model weight (Y) ~ height (X2)
ufit2 <- lm(htwt.male$weight ~ htwt.male$height)
summary(ufit2)
##
## Call:
## lm(formula = htwt.male$weight ~ htwt.male$height)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.913 -8.146 -2.059 6.824 48.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -125.6981 15.9936 -7.859 1.59e-12 ***
## htwt.male$height 3.6898 0.2569 14.361 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.29 on 124 degrees of freedom
## Multiple R-squared: 0.6245, Adjusted R-squared: 0.6215
## F-statistic: 206.2 on 1 and 124 DF, p-value: < 2.2e-16
# Multiple Linear Regression
# Model: weight (Y) ~ age (X1) + height (X2)
mfit <- lm(weight ~ age + height, data=htwt.male)
summary(mfit)
##
## Call:
## lm(formula = weight ~ age + height, data = htwt.male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.405 -8.301 -1.660 7.251 38.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -113.713 15.588 -7.295 3.2e-11 ***
## age 3.701 1.007 3.676 0.000353 ***
## height 2.680 0.368 7.283 3.4e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.71 on 123 degrees of freedom
## Multiple R-squared: 0.6617, Adjusted R-squared: 0.6562
## F-statistic: 120.3 on 2 and 123 DF, p-value: < 2.2e-16
# We can also make a plot of observed vs predicted
plot(predict(mfit),htwt.male$weight,
xlab="predicted",ylab="actual")
# we can use abline(a,b)
# which intercept/slope form
# a=0 is y-intercept -- indicating (0,0)
# b=1 is slope of 1 --
# so abline(a=0,b=1) is the line y=x
# if observed = predicted, all points should be on this line
abline(a=0,b=1)

# Now Check Model Diagnostics
# Model (mfit): mvc (Y) ~ age (X1) + height (X2)
plot(mfit)



# Additional Diagnostics focused
# on measures of influence
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers

# Cook's D Chart
ols_plot_cooksd_chart(mfit)

# DFBETAs Panel
ols_plot_dfbetas(mfit)

# DFFITs Plot
ols_plot_dffits(mfit)

# Standardized Residual Chart
ols_plot_resid_stand(mfit)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(mfit)

# collinearity diagnostics
# Need car package for multiollinearity function vif()
#install.packages('car')
library(car)
# Evaluate multicollinearity
vif(mfit) # variance inflation factors
## age height
## 2.258248 2.258248
sqrt(vif(mfit)) > 2 # if TRUE there may be a problem
## age height
## FALSE FALSE
# Remove one of the collinear variables to fix this issue!
# End of Program