Coursera Statistics One Week 3 Assignment 3

Write a script in R to run correlation and multiple regression analyses. Use the data in “DAA.03.txt”. The file contains
fictional data from 245 adults. The THREE (3) variables of interest are (1) physical endurance, (2) age, and (3) number of years
engaged in an active sport.

From your R output, report the TEN (10) values listed in the table below. Round to TWO (2) significant digits (for example, if
the correlation is .456 then write .46).

library(psych)
library(ltm)
## Loading required package: MASS
## Loading required package: msm
## Loading required package: mvtnorm
## Loading required package: polycor
## Loading required package: sfsmisc
## Attaching package: 'polycor'
## The following object(s) are masked from 'package:psych':
## 
## polyserial
## Attaching package: 'ltm'
## The following object(s) are masked from 'package:psych':
## 
## factor.scores
library(gclus)
## Loading required package: cluster

#
# |------------------------------------------------------------------------------------------|
# | I N I T I A L I Z A T I O N |
# |------------------------------------------------------------------------------------------|
# Load a whitespace-delimited data file that has a header row.
#
# Args:
#   fileStr:    name of the file to read; resolved against workDirStr unless
#               it is an absolute path.
#   workDirStr: directory that contains the data file.
#
# Returns:
#   The data frame produced by read.table(fileStr, header = TRUE).
Init <- function(fileStr, workDirStr = "C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-nonsource") {
    # setwd() returns the directory that was current before the switch; put it
    # back on exit (even on error) so that loading data does not leave the
    # session in a different working directory.
    oldDirStr <- setwd(workDirStr)
    on.exit(setwd(oldDirStr), add = TRUE)
    # TRUE rather than T: T is an ordinary variable and can be reassigned.
    read.table(fileStr, header = TRUE)
}

#
# |------------------------------------------------------------------------------------------|
# | M A I N P R O C E D U R E |
# |------------------------------------------------------------------------------------------|

# --- Init loading raw data: read "DAA.03.txt" through Init(), using its
# default working directory, into the data frame rawDfr
rawDfr <- Init("DAA.03.txt")

# --- Count of raw data (number of rows read from the file)
nrow(rawDfr)
## [1] 245

# --- Names of header (column names taken from the file's header row)
names(rawDfr)
## [1] "pid"         "age"         "activeyears" "endurance"

# --- Remove column 1 (`pid`) so that only the analysis variables remain.
# A negative index keeps every other column and always returns a data frame.
rawDfr <- rawDfr[-1]

# --- Layout: one row per variable, two panels per row (histogram, then
# Q-Q plot), filled row by row
layout(matrix(seq_len(2 * ncol(rawDfr)), ncol(rawDfr), 2, byrow = TRUE))

# --- Custom Plot
for (nameStr in names(rawDfr)) {
    valueNum <- rawDfr[[nameStr]]
    # --- Check for normality p>0.05 is normal (Shapiro-Wilk test)
    p <- shapiro.test(valueNum)$p.value
    # Two-line title: variable name on top, Shapiro p-value underneath
    titleStr <- c(paste("RAW", nameStr), paste("Shapiro p=", prettyNum(p, digits = 2)))
    # prob = TRUE puts the histogram on the density scale so that the kernel
    # density curve drawn by lines() overlays it directly
    hist(valueNum, prob = TRUE, main = titleStr, xlab = nameStr)
    lines(density(valueNum))
    # --- Check for normality (normal Q-Q plot in the right-hand panel)
    qqnorm(valueNum)
}

plot of chunk unnamed-chunk-1


# --- Descriptive statistics for every column of rawDfr (describe() comes
# from library psych, loaded at the top of the script)
describe(rawDfr)
##             var   n  mean    sd median trimmed   mad min max range skew
## age           1 245 49.18 10.11     48   49.11 10.38  20  82    62 0.15
## activeyears   2 245 10.67  4.78     11   10.56  4.45   0  26    26 0.27
## endurance     3 245 26.53 10.82     27   26.39 10.38   0  55    55 0.11
##             kurtosis   se
## age            -0.08 0.65
## activeyears     0.23 0.31
## endurance      -0.30 0.69

# --- Scatterplot and Correlation Analysis (library gclus and ltm)
# Scatterplot matrix; panel colours come from dmat.color() applied to the
# absolute pairwise correlations of the variables
absCorMat <- abs(cor(rawDfr))
cpairs(
    rawDfr,
    panel.colors = dmat.color(absCorMat),
    gap = 0.5,
    main = "RAW Variables Ordered and Colored by Correlations"
)

plot of chunk unnamed-chunk-1

# --- Correlation matrix: pairwise correlations between all columns of rawDfr
# (cor() computes Pearson correlations unless another method is requested)
cor(rawDfr)
##                 age activeyears endurance
## age          1.0000      0.2827   -0.1259
## activeyears  0.2827      1.0000    0.3365
## endurance   -0.1259      0.3365    1.0000
# --- Perform correlation test for matrix (library ltm).
# The null hypothesis for each pair is that the correlation is zero (the
# variables are not correlated). If the p-value is less than the alpha level,
# the null hypothesis is rejected: p < 0.05 is read as "correlated".
rcor.test(rawDfr)
## 
##             age    activeyears endurance
## age          *****  0.283      -0.126   
## activeyears <0.001  *****       0.337   
## endurance    0.049 <0.001       *****   
## 
## upper diagonal part contains correlation coefficient estimates 
## lower diagonal part contains corresponding p-values

# --- Simple Regression (unstandardized) Y = endurance; X = age;
# The slope is in raw units: change in endurance per unit of age.
# NOTE(review): the formula refers to the columns as rawDfr$..., so the
# summary prints those literal terms. lm(endurance ~ age, data = rawDfr) is
# the more usual form, but it would change the printed call and labels.
raw1Lm <- lm(rawDfr$endurance ~ rawDfr$age)
summary(raw1Lm)
## 
## Call:
## lm(formula = rawDfr$endurance ~ rawDfr$age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.073  -7.633   0.097   6.771  30.870 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33.1567     3.4203    9.69   <2e-16 ***
## rawDfr$age   -0.1347     0.0681   -1.98    0.049 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 10.8 on 243 degrees of freedom
## Multiple R-squared: 0.0158,  Adjusted R-squared: 0.0118 
## F-statistic: 3.91 on 1 and 243 DF,  p-value: 0.0491
# --- Simple Regression (standardized)
# Both variables are passed through scale() before fitting, so the slope is a
# standardized coefficient; with a single predictor it matches the
# endurance/age entry of the correlation matrix.
sraw1Lm <- lm(scale(rawDfr$endurance) ~ scale(rawDfr$age))
summary(sraw1Lm)
## 
## Call:
## lm(formula = scale(rawDfr$endurance) ~ scale(rawDfr$age))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.317 -0.706  0.009  0.626  2.853 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -1.98e-17   6.35e-02    0.00    1.000  
## scale(rawDfr$age) -1.26e-01   6.36e-02   -1.98    0.049 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.994 on 243 degrees of freedom
## Multiple R-squared: 0.0158,  Adjusted R-squared: 0.0118 
## F-statistic: 3.91 on 1 and 243 DF,  p-value: 0.0491

# --- Simple Regression (unstandardized) Y = endurance; X = activeyears;
# The slope is in raw units: change in endurance per unit of activeyears.
raw2Lm <- lm(rawDfr$endurance ~ rawDfr$activeyears)
summary(raw2Lm)
## 
## Call:
## lm(formula = rawDfr$endurance ~ rawDfr$activeyears)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.730  -7.067   0.558   5.745  31.083 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          18.392      1.600   11.50  < 2e-16 ***
## rawDfr$activeyears    0.762      0.137    5.57  6.7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 10.2 on 243 degrees of freedom
## Multiple R-squared: 0.113,   Adjusted R-squared: 0.11 
## F-statistic:   31 on 1 and 243 DF,  p-value: 6.7e-08
# --- Simple Regression (standardized)
# Both variables are passed through scale() before fitting, so the slope is a
# standardized coefficient; with a single predictor it matches the
# endurance/activeyears entry of the correlation matrix.
sraw2Lm <- lm(scale(rawDfr$endurance) ~ scale(rawDfr$activeyears))
summary(sraw2Lm)
## 
## Call:
## lm(formula = scale(rawDfr$endurance) ~ scale(rawDfr$activeyears))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1933 -0.6532  0.0516  0.5310  2.8730 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -3.67e-17   6.03e-02    0.00        1    
## scale(rawDfr$activeyears)  3.37e-01   6.04e-02    5.57  6.7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.944 on 243 degrees of freedom
## Multiple R-squared: 0.113,   Adjusted R-squared: 0.11 
## F-statistic:   31 on 1 and 243 DF,  p-value: 6.7e-08

# --- Multiple Regression (unstandardized) Y = endurance; X1 = age; X2 =
# activeyears;
# Both predictors enter the same model, so each slope is estimated with the
# other predictor also in the model, and is expressed in raw units.
raw3Lm <- lm(rawDfr$endurance ~ rawDfr$age + rawDfr$activeyears)
summary(raw3Lm)
## 
## Call:
## lm(formula = rawDfr$endurance ~ rawDfr$age + rawDfr$activeyears)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21.80  -6.90   0.57   5.63  27.23 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         29.3952     3.2054    9.17  < 2e-16 ***
## rawDfr$age          -0.2571     0.0655   -3.93  0.00011 ***
## rawDfr$activeyears   0.9163     0.1386    6.61  2.4e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.92 on 242 degrees of freedom
## Multiple R-squared: 0.166,   Adjusted R-squared: 0.159 
## F-statistic: 24.1 on 2 and 242 DF,  p-value: 2.75e-10
# --- Multiple Regression (standardized)
# Outcome and both predictors are passed through scale() before fitting, so
# the two slopes are standardized coefficients and can be compared with each
# other directly.
sraw3Lm <- lm(scale(rawDfr$endurance) ~ scale(rawDfr$age) + scale(rawDfr$activeyears))
summary(sraw3Lm)
## 
## Call:
## lm(formula = scale(rawDfr$endurance) ~ scale(rawDfr$age) + scale(rawDfr$activeyears))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0149 -0.6381  0.0527  0.5206  2.5166 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -6.40e-17   5.86e-02    0.00  1.00000    
## scale(rawDfr$age)         -2.40e-01   6.12e-02   -3.93  0.00011 ***
## scale(rawDfr$activeyears)  4.04e-01   6.12e-02    6.61  2.4e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.917 on 242 degrees of freedom
## Multiple R-squared: 0.166,   Adjusted R-squared: 0.159 
## F-statistic: 24.1 on 2 and 242 DF,  p-value: 2.75e-10

#
# |------------------------------------------------------------------------------------------|
# | E N D O F S C R I P T |
# |------------------------------------------------------------------------------------------|