module2_day3_htwtkids

# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Module 2 Day 3 ~
# ~ Regression Analysis ~
# ~ Height and Weight of Kids   ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

#
#  Data on Age, Weight, and Height of Children   #
#  recorded for a group of school children.      #
#  From Lewis and Taylor (1967).
# In this example, the weights of school children are modeled as a function of 
# their heights and ages. Modeling is performed separately for boys and girls.

# Data Set: htwtkids

# Data Dictionary: 
# (1) age      age (number; years) 
# (2) height   height (number; inches)
# (3) weight   Weight (number; pounds)
# (4) sex      sex (character; "f", "m")


# Download and load the data files 
download.file("http://www.duke.edu/~sgrambow/crp245data/htwtkids.RData",
              destfile = "htwtkids.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("htwtkids.RData")
#
# Summary and Visualization of Data
#
# separate into females and males
htwt.female <- subset(htwtkids,sex=="f") 
htwt.male <- subset(htwtkids,sex=="m") 
htwt.male <- htwt.male[,-1]

# quick analysis of males
# summary for Weight

# smallest values
sort(unique(htwt.male$weight), decreasing = FALSE)[1:5]

## [1] 70.0 71.5 72.0 73.5 75.0

# largest values
sort(unique(htwt.male$weight), decreasing = TRUE)[1:5]

## [1] 171.5 150.5 150.0 147.0 140.0

# descriptive statistics
summary(htwt.male$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   70.00   86.62  104.50  103.45  113.62  171.50

sd(htwt.male$weight)

## [1] 19.96806

# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$weight)

## [1]  70.0  86.5 104.5 114.0 171.5

# Visualization of Y -- Histogram
hist(htwt.male$weight, # histogram
     col="blue", # column color
     border="black",
     prob = TRUE, # show densities instead of frequencies
     xlab = "Weight (pounds)",
     main = "Histogram w Density Plot")
lines(density(htwt.male$weight), # density plot
      lwd = 2, # thickness of line
      col = "red")

# summary for X1 -- age -- in years
# smallest values
sort(unique(htwt.male$age), decreasing = FALSE)[1:5]

## [1] 11.58 11.67 11.75 11.83 11.92

# largest values
sort(unique(htwt.male$age), decreasing = TRUE)[1:5]

## [1] 20.83 17.17 16.92 16.67 16.33

# descriptive statistics
summary(htwt.male$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.58   12.50   13.50   13.70   14.67   20.83

sd(htwt.male$age)

## [1] 1.56311

# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$age)

## [1] 11.58 12.50 13.50 14.67 20.83

# Visualization of Y vs X
hist(htwt.male$age, # histogram
     col="blue", # column color
     border="black",
     prob = TRUE, # show densities instead of frequencies
     xlab = "Age in Years",
     main = "Histogram w Density Plot")
lines(density(htwt.male$age), # density plot
      lwd = 2, # thickness of line
      col = "red")

cor.test(htwt.male$age,htwt.male$weight)

## 
##  Pearson's product-moment correlation
## 
## data:  htwt.male$age and htwt.male$weight
## t = 11.492, df = 124, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6213075 0.7934140
## sample estimates:
##       cor 
## 0.7181702

# Scatterplot with Y
# link to web page about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot 
library(car)

## Loading required package: carData

scatterplot(weight ~ age, data=htwt.male,
            xlab="Age", ylab="weight",
            main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line 
# -- the smoothed conditional spread 
# -- the non-parametric regression smootH


# summary for X2 -- height -- height in cm
# smallest values
sort(unique(htwt.male$height), decreasing = FALSE)[1:5]

## [1] 50.5 52.5 53.3 55.0 56.0

# largest values
sort(unique(htwt.male$height), decreasing = TRUE)[1:5]

## [1] 72.0 71.0 69.8 69.5 69.0

# descriptive statistics
summary(htwt.male$height)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.50   59.00   61.90   62.10   65.72   72.00

sd(htwt.male$height)

## [1] 4.276689

# fivenum() generates min, Q1, median, Q3, max
fivenum(htwt.male$height)

## [1] 50.5 59.0 61.9 65.8 72.0

# Visualization of Y vs X
hist(htwt.male$height, # histogram
     col="blue", # column color
     border="black",
     prob = TRUE, # show densities instead of frequencies
     xlab = "height in Inches",
     main = "Histogram w Density Plot")
lines(density(htwt.male$height), # density plot
      lwd = 2, # thickness of line
      col = "red")

# Pearson correlation between height and weight
cor.test(htwt.male$height,htwt.male$weight)

## 
##  Pearson's product-moment correlation
## 
## data:  htwt.male$height and htwt.male$weight
## t = 14.361, df = 124, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7140535 0.8479619
## sample estimates:
##       cor 
## 0.7902623

# Scatterplot with Y
# link to web pheight about scatterplots
# http://www.sthda.com/english/wiki/scatter-plots-r-base-graphs
# Enhanced Scatterplot 
library(car)
scatterplot(weight ~ height, data=htwt.male,
            xlab="height", ylab="weight",
            main=" Scatter Plot")

# the plot contains
# -- the points
# -- the regression line 
# -- the smoothed conditional spread 
# -- the non-parametric regression smootH

# SCATTERPLOT MATRIX WEIGHT, AGE, HEIGHT
# includes pearson correlations
# just change the data frame name in the
# pairs command
# Correlation panel
upper.panel<-function(x, y){
  points(x,y, pch=19)
  r <- round(cor(x, y), digits=2)
  txt <- paste0("R = ", r)
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  text(0.5, 0.9, txt)}
# Create the plots
pairs(htwt.male, lower.panel = NULL, 
      upper.panel = upper.panel)

#
# REGRESSION MODELS              
#

# Simple Linear Regression 
# Model mvc (Y) ~ age (X1)
ufit1 <- lm(htwt.male$weight ~ htwt.male$age)
summary(ufit1)

## 
## Call:
## lm(formula = htwt.male$weight ~ htwt.male$age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.306 -10.406  -0.650   7.236  36.868 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -22.2812    11.0106  -2.024   0.0452 *  
## htwt.male$age   9.1743     0.7983  11.492   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.95 on 124 degrees of freedom
## Multiple R-squared:  0.5158, Adjusted R-squared:  0.5119 
## F-statistic: 132.1 on 1 and 124 DF,  p-value: < 2.2e-16

# Simple Linear Regression 
# Model weight (Y) ~ height (X2)
ufit2 <- lm(htwt.male$weight ~ htwt.male$height)
summary(ufit2)

## 
## Call:
## lm(formula = htwt.male$weight ~ htwt.male$height)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.913  -8.146  -2.059   6.824  48.139 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -125.6981    15.9936  -7.859 1.59e-12 ***
## htwt.male$height    3.6898     0.2569  14.361  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.29 on 124 degrees of freedom
## Multiple R-squared:  0.6245, Adjusted R-squared:  0.6215 
## F-statistic: 206.2 on 1 and 124 DF,  p-value: < 2.2e-16

# Multiple Linear Regression
# Model:  weight (Y) ~ age (X1) + height (X2)
mfit <- lm(weight ~ age + height, data=htwt.male)
summary(mfit)

## 
## Call:
## lm(formula = weight ~ age + height, data = htwt.male)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.405  -8.301  -1.660   7.251  38.296 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -113.713     15.588  -7.295  3.2e-11 ***
## age            3.701      1.007   3.676 0.000353 ***
## height         2.680      0.368   7.283  3.4e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.71 on 123 degrees of freedom
## Multiple R-squared:  0.6617, Adjusted R-squared:  0.6562 
## F-statistic: 120.3 on 2 and 123 DF,  p-value: < 2.2e-16

# We can also make a plot of observed vs predicted
plot(predict(mfit),htwt.male$weight,
     xlab="predicted",ylab="actual")
# we can use abline(a,b)
# which intercept/slope form
# a=0 is y-intercept -- indicating (0,0)
# b=1 is slope of 1 -- 
# so abline(a=0,b=1) is the line y=x 
# if observed = predicted, all points should be on this line
abline(a=0,b=1)

# Now Check Model Diagnostics
# Model (mfit):  mvc (Y) ~ age (X1) + height (X2)
plot(mfit)

# Additional Diagnostics focused
# on measures of influence
library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

# Cook's D Chart 
ols_plot_cooksd_chart(mfit)

# DFBETAs Panel
ols_plot_dfbetas(mfit)

# DFFITs Plot
ols_plot_dffits(mfit)

# Standardized Residual Chart
ols_plot_resid_stand(mfit)

# Studentized Residuals vs Leverage Plot
ols_plot_resid_lev(mfit)

# collinearity diagnostics
# Need car package for multiollinearity function vif()
#install.packages('car')
library(car)
# Evaluate multicollinearity
vif(mfit) # variance inflation factors

##      age   height 
## 2.258248 2.258248

sqrt(vif(mfit)) > 2 #  if TRUE there may be a problem

##    age height 
##  FALSE  FALSE

# Remove one of the collinear variables to fix this issue!
# End of Program

module2_day3_htwtkids_Rscript.R

sgrambow

2020-02-20