#clean up the sample dataset
#set it as a data frame
st <- as.data.frame(state.x77)
#get rid of spaces in names - changes the column name numbered in bracket
colnames(st)[4] = "Life.Exp"
colnames(st)[6] = "HS.Grad"
#add a population density variable as a new column
st$Density = st$Population *1000/st$Area
#look at your data
summary(st)
## Population Income Illiteracy Life.Exp
## Min. : 365 Min. :3098 Min. :0.500 Min. :67.96
## 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625 1st Qu.:70.12
## Median : 2838 Median :4519 Median :0.950 Median :70.67
## Mean : 4246 Mean :4436 Mean :1.170 Mean :70.88
## 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575 3rd Qu.:71.89
## Max. :21198 Max. :6315 Max. :2.800 Max. :73.60
## Murder HS.Grad Frost Area
## Min. : 1.400 Min. :37.80 Min. : 0.00 Min. : 1049
## 1st Qu.: 4.350 1st Qu.:48.05 1st Qu.: 66.25 1st Qu.: 36985
## Median : 6.850 Median :53.25 Median :114.50 Median : 54277
## Mean : 7.378 Mean :53.11 Mean :104.46 Mean : 70736
## 3rd Qu.:10.675 3rd Qu.:59.15 3rd Qu.:139.75 3rd Qu.: 81162
## Max. :15.100 Max. :67.30 Max. :188.00 Max. :566432
## Density
## Min. : 0.6444
## 1st Qu.: 25.3352
## Median : 73.0154
## Mean :149.2245
## 3rd Qu.:144.2828
## Max. :975.0033
#generate correlation matrix
cor(st)
## Population Income Illiteracy Life.Exp Murder
## Population 1.00000000 0.2082276 0.107622373 -0.06805195 0.3436428
## Income 0.20822756 1.0000000 -0.437075186 0.34025534 -0.2300776
## Illiteracy 0.10762237 -0.4370752 1.000000000 -0.58847793 0.7029752
## Life.Exp -0.06805195 0.3402553 -0.588477926 1.00000000 -0.7808458
## Murder 0.34364275 -0.2300776 0.702975199 -0.78084575 1.0000000
## HS.Grad -0.09848975 0.6199323 -0.657188609 0.58221620 -0.4879710
## Frost -0.33215245 0.2262822 -0.671946968 0.26206801 -0.5388834
## Area 0.02254384 0.3633154 0.077261132 -0.10733194 0.2283902
## Density 0.24622789 0.3299683 0.009274348 0.09106176 -0.1850352
## HS.Grad Frost Area Density
## Population -0.09848975 -0.332152454 0.02254384 0.246227888
## Income 0.61993232 0.226282179 0.36331544 0.329968277
## Illiteracy -0.65718861 -0.671946968 0.07726113 0.009274348
## Life.Exp 0.58221620 0.262068011 -0.10733194 0.091061763
## Murder -0.48797102 -0.538883437 0.22839021 -0.185035233
## HS.Grad 1.00000000 0.366779702 0.33354187 -0.088367214
## Frost 0.36677970 1.000000000 0.05922910 0.002276734
## Area 0.33354187 0.059229102 1.00000000 -0.341388515
## Density -0.08836721 0.002276734 -0.34138851 1.000000000
#rounding
round(cor(st), 3)
## Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## Population 1.000 0.208 0.108 -0.068 0.344 -0.098 -0.332
## Income 0.208 1.000 -0.437 0.340 -0.230 0.620 0.226
## Illiteracy 0.108 -0.437 1.000 -0.588 0.703 -0.657 -0.672
## Life.Exp -0.068 0.340 -0.588 1.000 -0.781 0.582 0.262
## Murder 0.344 -0.230 0.703 -0.781 1.000 -0.488 -0.539
## HS.Grad -0.098 0.620 -0.657 0.582 -0.488 1.000 0.367
## Frost -0.332 0.226 -0.672 0.262 -0.539 0.367 1.000
## Area 0.023 0.363 0.077 -0.107 0.228 0.334 0.059
## Density 0.246 0.330 0.009 0.091 -0.185 -0.088 0.002
## Area Density
## Population 0.023 0.246
## Income 0.363 0.330
## Illiteracy 0.077 0.009
## Life.Exp -0.107 0.091
## Murder 0.228 -0.185
## HS.Grad 0.334 -0.088
## Frost 0.059 0.002
## Area 1.000 -0.341
## Density -0.341 1.000
#look at your correlations
plot(st)
### optional - look at outliers and distribution
par(mfrow=c(3,3), oma = c(0,0,2,0))
for(i in 1:9) qqnorm(st[,i], main=colnames(st)[i])
mtext("Figure 1: Q-Q Norm Plots of Variables",
outer = TRUE, cex = 1, font = 2)
####The Maximal or Full Model
#hide ugly significance stars
options(show.signif.stars = FALSE)
#show the variables that will be compared
names(st)
## [1] "Population" "Income" "Illiteracy" "Life.Exp" "Murder"
## [6] "HS.Grad" "Frost" "Area" "Density"
# create the linear additive model
model1 <- lm(Life.Exp ~ ., data =st)
#look at details
summary(model1)
##
## Call:
## lm(formula = Life.Exp ~ ., data = st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47514 -0.45887 -0.06352 0.59362 1.21823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.995e+01 1.843e+00 37.956 < 2e-16
## Population 6.480e-05 3.001e-05 2.159 0.0367
## Income 2.701e-04 3.087e-04 0.875 0.3867
## Illiteracy 3.029e-01 4.024e-01 0.753 0.4559
## Murder -3.286e-01 4.941e-02 -6.652 5.12e-08
## HS.Grad 4.291e-02 2.332e-02 1.840 0.0730
## Frost -4.580e-03 3.189e-03 -1.436 0.1585
## Area -1.558e-06 1.914e-06 -0.814 0.4205
## Density -1.105e-03 7.312e-04 -1.511 0.1385
##
## Residual standard error: 0.7337 on 41 degrees of freedom
## Multiple R-squared: 0.7501, Adjusted R-squared: 0.7013
## F-statistic: 15.38 on 8 and 41 DF, p-value: 3.787e-10
#sequential testing summary
summary.aov(model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Population 1 0.409 0.409 0.760 0.38849
## Income 1 11.595 11.595 21.541 3.53e-05
## Illiteracy 1 19.421 19.421 36.081 4.23e-07
## Murder 1 27.429 27.429 50.959 1.05e-08
## HS.Grad 1 4.099 4.099 7.615 0.00861
## Frost 1 2.049 2.049 3.806 0.05792
## Area 1 0.001 0.001 0.002 0.96438
## Density 1 1.229 1.229 2.283 0.13847
## Residuals 41 22.068 0.538
#create the new model that throws out one predictor at a time
model2 = update(model1, .~. -Area)
#print the summary
model2.info <- summary(model2$model)
model2.info
## Life.Exp Population Income Illiteracy
## Min. :67.96 Min. : 365 Min. :3098 Min. :0.500
## 1st Qu.:70.12 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625
## Median :70.67 Median : 2838 Median :4519 Median :0.950
## Mean :70.88 Mean : 4246 Mean :4436 Mean :1.170
## 3rd Qu.:71.89 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575
## Max. :73.60 Max. :21198 Max. :6315 Max. :2.800
## Murder HS.Grad Frost Density
## Min. : 1.400 Min. :37.80 Min. : 0.00 Min. : 0.6444
## 1st Qu.: 4.350 1st Qu.:48.05 1st Qu.: 66.25 1st Qu.: 25.3352
## Median : 6.850 Median :53.25 Median :114.50 Median : 73.0154
## Mean : 7.378 Mean :53.11 Mean :104.46 Mean :149.2245
## 3rd Qu.:10.675 3rd Qu.:59.15 3rd Qu.:139.75 3rd Qu.:144.2828
## Max. :15.100 Max. :67.30 Max. :188.00 Max. :975.0033
?summary.lm
#compare the old and new model using ANOVA - reduced model goes first
anova.1.2<- anova(model2, model1)
anova.1.2$`Pr(>F)`
## [1] NA 0.4205086
#no significant differences - look at the rest of the factors
model3 = update(model2, .~. -Illiteracy)
summary(model3)
##
## Call:
## lm(formula = Life.Exp ~ Population + Income + Murder + HS.Grad +
## Frost + Density, data = st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49555 -0.41246 -0.05336 0.58399 1.50535
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.131e+01 1.042e+00 68.420 < 2e-16
## Population 5.811e-05 2.753e-05 2.110 0.0407
## Income 1.324e-04 2.636e-04 0.502 0.6181
## Murder -3.208e-01 4.054e-02 -7.912 6.32e-10
## HS.Grad 3.499e-02 2.122e-02 1.649 0.1065
## Frost -6.191e-03 2.465e-03 -2.512 0.0158
## Density -7.324e-04 5.978e-04 -1.225 0.2272
##
## Residual standard error: 0.7236 on 43 degrees of freedom
## Multiple R-squared: 0.745, Adjusted R-squared: 0.7094
## F-statistic: 20.94 on 6 and 43 DF, p-value: 2.632e-11
#some differences - change in R^2
model4 = update(model3, .~. -Income)
summary(model4)
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost +
## Density, data = st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.56877 -0.40951 -0.04554 0.57362 1.54752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.142e+01 1.011e+00 70.665 < 2e-16
## Population 6.083e-05 2.676e-05 2.273 0.02796
## Murder -3.160e-01 3.910e-02 -8.083 3.07e-10
## HS.Grad 4.233e-02 1.525e-02 2.776 0.00805
## Frost -5.999e-03 2.414e-03 -2.485 0.01682
## Density -5.864e-04 5.178e-04 -1.132 0.26360
##
## Residual standard error: 0.7174 on 44 degrees of freedom
## Multiple R-squared: 0.7435, Adjusted R-squared: 0.7144
## F-statistic: 25.51 on 5 and 44 DF, p-value: 5.524e-12
#removing income had very little effect - continue with density
model5 = update(model4, .~. -Density)
summary(model5)
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16
## Population 5.014e-05 2.512e-05 1.996 0.05201
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297
## Frost -5.943e-03 2.421e-03 -2.455 0.01802
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
#compare new model to model4 and original model1
anova.5.4<- anova(model5, model4)
anova.5.4$`Pr(>F)`
## [1] NA 0.2636006
#test 3-way interactions, specified by the carat - can go up to 4 ways
model.int = update(model5, .~.^3)
anova(model5, model.int)
## Analysis of Variance Table
##
## Model 1: Life.Exp ~ Population + Murder + HS.Grad + Frost
## Model 2: Life.Exp ~ Population + Murder + HS.Grad + Frost + Population:Murder +
## Population:HS.Grad + Population:Frost + Murder:HS.Grad +
## Murder:Frost + HS.Grad:Frost + Population:Murder:HS.Grad +
## Population:Murder:Frost + Population:HS.Grad:Frost + Murder:HS.Grad:Frost
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 23.308
## 2 35 15.869 10 7.4395 1.6409 0.1356
#create the model without population
model6 = update(model5, .~. -Population)
summary(model6)
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost, data = st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5015 -0.5391 0.1014 0.5921 1.2268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.036379 0.983262 72.246 < 2e-16
## Murder -0.283065 0.036731 -7.706 8.04e-10
## HS.Grad 0.049949 0.015201 3.286 0.00195
## Frost -0.006912 0.002447 -2.824 0.00699
##
## Residual standard error: 0.7427 on 46 degrees of freedom
## Multiple R-squared: 0.7127, Adjusted R-squared: 0.6939
## F-statistic: 38.03 on 3 and 46 DF, p-value: 1.634e-12
#use step() function to automate all of this stuff
step(model1, direction="backward")
## Start: AIC=-22.89
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area + Density
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.3050 22.373 -24.208
## - Area 1 0.3564 22.425 -24.093
## - Income 1 0.4120 22.480 -23.969
## <none> 22.068 -22.894
## - Frost 1 1.1102 23.178 -22.440
## - Density 1 1.2288 23.297 -22.185
## - HS.Grad 1 1.8225 23.891 -20.926
## - Population 1 2.5095 24.578 -19.509
## - Murder 1 23.8173 45.886 11.707
##
## Step: AIC=-24.21
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost + Area +
## Density
##
## Df Sum of Sq RSS AIC
## - Area 1 0.1427 22.516 -25.890
## - Income 1 0.2316 22.605 -25.693
## <none> 22.373 -24.208
## - Density 1 0.9286 23.302 -24.174
## - HS.Grad 1 1.5218 23.895 -22.918
## - Population 1 2.2047 24.578 -21.509
## - Frost 1 3.1324 25.506 -19.656
## - Murder 1 26.7071 49.080 13.072
##
## Step: AIC=-25.89
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost + Density
##
## Df Sum of Sq RSS AIC
## - Income 1 0.132 22.648 -27.598
## - Density 1 0.786 23.302 -26.174
## <none> 22.516 -25.890
## - HS.Grad 1 1.424 23.940 -24.824
## - Population 1 2.332 24.848 -22.962
## - Frost 1 3.304 25.820 -21.043
## - Murder 1 32.779 55.295 17.033
##
## Step: AIC=-27.6
## Life.Exp ~ Population + Murder + HS.Grad + Frost + Density
##
## Df Sum of Sq RSS AIC
## - Density 1 0.660 23.308 -28.161
## <none> 22.648 -27.598
## - Population 1 2.659 25.307 -24.046
## - Frost 1 3.179 25.827 -23.030
## - HS.Grad 1 3.966 26.614 -21.529
## - Murder 1 33.626 56.274 15.910
##
## Step: AIC=-28.16
## Life.Exp ~ Population + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = st)
##
## Coefficients:
## (Intercept) Population Murder HS.Grad Frost
## 7.103e+01 5.014e-05 -3.001e-01 4.658e-02 -5.943e-03
#major difference is that the population term was kept
#direction can be backward forward or both
#confidence interval function?!
confint(model5, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 6.910798e+01 72.9462729104
## Population -4.543308e-07 0.0001007343
## Murder -3.738840e-01 -0.2264135705
## HS.Grad 1.671901e-02 0.0764454870
## Frost -1.081918e-02 -0.0010673977
#predictions that can be fed into a vector
predict(model5, list(Population=2000, Murder=10.5,
HS.Grad=48, Frost=100))
## 1
## 69.61746
#graph minus Cook's distance plot
par(mfrow=c(2,2))
plot(model5)
par(mfrow=c(1,1))
#plot(model15, (1,2,3, or 5))
#3 ways to do get coefficients
#coef(model5)
#model5$coefficients
model5[[1]]
## (Intercept) Population Murder HS.Grad Frost
## 7.102713e+01 5.013998e-05 -3.001488e-01 4.658225e-02 -5.943290e-03
#ways to look at the residuals, or unexplained variation
#model5$resid - just gives the values
sort(model5$resid) #sorts them in ascending order
## Maine South Carolina Delaware West Virginia Washington
## -1.47095411 -1.10109172 -1.06646884 -0.96982588 -0.96272426
## Pennsylvania Mississippi Arizona Montana New Jersey
## -0.95045527 -0.91535384 -0.86415671 -0.84024805 -0.66612086
## Massachusetts Wyoming Alaska New Hampshire Nevada
## -0.61105391 -0.58678863 -0.54740399 -0.49635615 -0.49482393
## Louisiana Maryland Oregon Ohio Georgia
## -0.39044846 -0.29851996 -0.28445333 -0.26548767 -0.09694227
## California New York North Carolina Virginia Illinois
## -0.08564599 -0.07937149 -0.07624179 -0.06691392 -0.05244160
## Indiana Florida South Dakota Rhode Island Iowa
## -0.02158526 0.04460505 0.06839119 0.13992982 0.16347124
## Oklahoma New Mexico Idaho Nebraska Connecticut
## 0.26139958 0.28880945 0.37010714 0.42967691 0.44541028
## Wisconsin Alabama Vermont Missouri Tennessee
## 0.47004324 0.56888134 0.57865019 0.58389969 0.64416651
## Kansas Minnesota Michigan Utah Kentucky
## 0.67648037 0.69440380 0.76106640 0.84246817 0.85582067
## North Dakota Texas Colorado Arkansas Hawaii
## 0.90350550 0.92114057 0.95645816 1.08626119 1.50683146
#beta/standardized coefficients
coef(model5)
## (Intercept) Population Murder HS.Grad Frost
## 7.102713e+01 5.013998e-05 -3.001488e-01 4.658225e-02 -5.943290e-03
#standard deviaion of variable
sdexpl <- c(sd(st$Population), sd(st$Murder), sd(st$HS.Grad), sd(st$Frost))
#standard deviation of response variable
sdresp <- sd(st$Life.Exp)
#equation for beta coefficient
coef(model5)[2:5]*sdexpl/sdresp
## Population Murder HS.Grad Frost
## 0.1667540 -0.8253997 0.2802790 -0.2301391
#partial correlations
library(ggm)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
##
## Attaching package: 'ggm'
## The following object is masked from 'package:igraph':
##
## pa
#pcor()
#a new model with another dataset
rm(list=ls()) #clears workspace
#load new dataset
data(airquality)
#remove NA values in the dataset
airquality <- na.omit(airquality)
#create the new model
lm.out <- lm(Ozone ~Solar.R + Wind + Temp + Month, data = airquality)
coef(lm.out)
## (Intercept) Solar.R Wind Temp Month
## -58.05383883 0.04959683 -3.31650940 1.87087379 -2.99162786
confint(lm.out)
## 2.5 % 97.5 %
## (Intercept) -1.035964e+02 -12.51131722
## Solar.R 3.088529e-03 0.09610512
## Wind -4.596849e+00 -2.03616955
## Temp 1.328372e+00 2.41337586
## Month -5.997092e+00 0.01383629
#perform predictions
(prediction <- c(1,200,11,80,6) * coef(lm.out)) #parentheses will print it!
## (Intercept) Solar.R Wind Temp Month
## -58.053839 9.919365 -36.481603 149.669903 -17.949767
sum(prediction)
## [1] 47.10406
#need confidence intervals - prediction for mean response - "conf" = CIs
predict(lm.out, list(Solar.R=200, Wind=11, Temp=80, Month=6), interval="conf")
## fit lwr upr
## 1 47.10406 41.10419 53.10393
#prediction for 1 new case - "pred" is predict - "level" sets the CI
predict(lm.out, list(Solar.R=200, Wind=11, Temp=80, Month=6), interval="pred")
## fit lwr upr
## 1 47.10406 5.235759 88.97236
#determines the intervals
intervals <- confint(lm.out)
#transpose the matrix
t(intervals)
## (Intercept) Solar.R Wind Temp Month
## 2.5 % -103.59636 0.003088529 -4.596849 1.328372 -5.99709200
## 97.5 % -12.51132 0.096105125 -2.036170 2.413376 0.01383629
#supposed new values
values <- c(1,200,11,80,6)
#add together the predictions
sum(colMeans(t(intervals)) * values)
## [1] 47.10406
#transposes the CIs into a matrix
t(intervals) %*% matrix(values)
## [,1]
## 2.5 % -83.25681
## 97.5 % 177.46493
#read the file
jellydata <- read.csv("jellyfish.csv")
df <- as.data.frame(jellydata)
#create row names (not sure if we'll need this)
row.names(df) <- df$Species
#get rid of column with names
df2 <- df[,2:12]
par(mfrow=c(3,4), oma = c(0,0,2,0))
for(i in 1:11) qqnorm(df2[,i], main=colnames(df2)[i])
mtext("Figure 1: Q-Q Plots of Maximal Model Variable Measurements", outer = TRUE, cex = 1, font = 2)
correlations <- round(cor(df2), 3)
#look at your correlations
par(mfrow=c(1,1), oma = c(0,0,2,0))
plot(df2)
mtext("Figure 2: Correlations of Maximal Model", outer = TRUE, cex = 1, font = 2)
#hide ugly significance stars
options(show.signif.stars = FALSE)
#show the variables that will be compared
# create the linear additive model
model1 <- lm(WakeKE ~ ., data =df2)
sum1 <- summary(model1)
#manual remove max tentacle length
model2 <- update(model1, .~. -Max.Tentacle.Length)
sum2 <- summary(model2) #adjusted R^2 increase to .666
anova.2.1<- anova(model2, model1)
pvalue2 <- anova.2.1$`Pr(>F)`
#manual remove stomach volume
model3 <- update(model2, .~. -Stomach.Vol)
sum3 <- summary(model3) #adjusted R^2 increase to .678
anova.3.1<- anova(model3, model1)
pvalue3 <- anova.3.1$`Pr(>F)`
#manual remove avg tent length
model4 <- update(model3, .~. -Avg.Tentacle.Length)
sum4<- summary(model4) #adjusted R^2 increase to .691
anova.4.1<- anova(model4, model1)
pvalue4<- anova.4.1$`Pr(>F)`
#manual remove body mass
model5 <- update(model4, .~. -Body.Mass)
sum5 <- summary(model5) #This decreased R^2, but not by much
#stepwise regression to create minimal model
anova.5.1<- anova(model5, model1)
pvalue5 <- anova.5.1$`Pr(>F)`
model6 <-step(model1, direction="backward")
## Start: AIC=22.18
## WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume +
## Max.Tentacle.Length + Avg.Tentacle.Length + Stomach.Vol
##
## Df Sum of Sq RSS AIC
## - Max.Tentacle.Length 1 0.0847 26.269 20.267
## - Stomach.Vol 1 0.6742 26.858 20.844
## - Avg.Tentacle.Length 1 0.7997 26.984 20.966
## <none> 26.184 22.183
## - Body.Mass 1 2.2426 28.427 22.320
## - Relaxed.Contr.Ratio 1 3.2042 29.388 23.185
## - Mesoglea.Thickness 1 5.4015 31.585 25.060
## - Bell.Volume 1 7.4270 33.611 26.676
## - Bell.AspectRatio 1 8.7397 34.924 27.672
## - Bell.Height 1 12.3423 38.526 30.224
## - Bell.Diam.Contracted 1 17.5052 43.689 33.494
##
## Step: AIC=20.27
## WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume +
## Avg.Tentacle.Length + Stomach.Vol
##
## Df Sum of Sq RSS AIC
## - Stomach.Vol 1 0.6252 26.894 18.879
## - Avg.Tentacle.Length 1 0.9422 27.211 19.183
## <none> 26.269 20.267
## - Body.Mass 1 2.1939 28.463 20.353
## - Relaxed.Contr.Ratio 1 3.4342 29.703 21.462
## - Mesoglea.Thickness 1 5.5721 31.841 23.269
## - Bell.Volume 1 7.9540 34.223 25.145
## - Bell.Height 1 12.2638 38.532 28.229
## - Bell.AspectRatio 1 12.8757 39.144 28.638
## - Bell.Diam.Contracted 1 17.4430 43.712 31.508
##
## Step: AIC=18.88
## WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume +
## Avg.Tentacle.Length
##
## Df Sum of Sq RSS AIC
## - Avg.Tentacle.Length 1 0.4723 27.366 17.331
## <none> 26.894 18.879
## - Body.Mass 1 2.6643 29.558 19.335
## - Relaxed.Contr.Ratio 1 3.7123 30.606 20.241
## - Mesoglea.Thickness 1 5.2623 32.156 21.525
## - Bell.Volume 1 7.4273 34.321 23.219
## - Bell.AspectRatio 1 12.3465 39.240 26.702
## - Bell.Height 1 13.3170 40.211 27.337
## - Bell.Diam.Contracted 1 17.3420 44.236 29.817
##
## Step: AIC=17.33
## WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume
##
## Df Sum of Sq RSS AIC
## <none> 27.366 17.331
## - Body.Mass 1 2.9917 30.358 18.029
## - Relaxed.Contr.Ratio 1 4.4566 31.823 19.254
## - Mesoglea.Thickness 1 4.8887 32.255 19.605
## - Bell.Volume 1 11.4400 38.806 24.413
## - Bell.AspectRatio 1 11.8753 39.242 24.703
## - Bell.Height 1 13.5480 40.914 25.788
## - Bell.Diam.Contracted 1 16.9900 44.356 27.888
summary(model6)
##
## Call:
## lm(formula = WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio +
## Bell.Height + Bell.AspectRatio + Mesoglea.Thickness + Body.Mass +
## Bell.Volume, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.86623 -0.52339 -0.02426 0.50402 2.41820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.410377 0.829896 0.494 0.62694
## Bell.Diam.Contracted 0.004245 0.001270 3.343 0.00362
## Relaxed.Contr.Ratio 0.140877 0.082283 1.712 0.10405
## Bell.Height -0.053410 0.017892 -2.985 0.00794
## Bell.AspectRatio 0.062502 0.022364 2.795 0.01197
## Mesoglea.Thickness 0.006102 0.003403 1.793 0.08976
## Body.Mass -0.018111 0.012911 -1.403 0.17770
## Bell.Volume 0.009322 0.003398 2.743 0.01337
##
## Residual standard error: 1.233 on 18 degrees of freedom
## Multiple R-squared: 0.7773, Adjusted R-squared: 0.6907
## F-statistic: 8.976 on 7 and 18 DF, p-value: 8.743e-05
#compare new model to model4 and original model1
anova(model5, model1)
## Analysis of Variance Table
##
## Model 1: WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Bell.Volume
## Model 2: WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume +
## Max.Tentacle.Length + Avg.Tentacle.Length + Stomach.Vol
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19 30.358
## 2 15 26.184 4 4.1739 0.5978 0.6699
anova(model5, model6)
## Analysis of Variance Table
##
## Model 1: WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Bell.Volume
## Model 2: WakeKE ~ Bell.Diam.Contracted + Relaxed.Contr.Ratio + Bell.Height +
## Bell.AspectRatio + Mesoglea.Thickness + Body.Mass + Bell.Volume
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19 30.358
## 2 18 27.366 1 2.9917 1.9678 0.1777
Model | Multiple R2 | Adjusted R2 | ANOVA p-value (Compared to Maximal Model) |
---|---|---|---|
Maximal Model | 0.7869 | 0.6449 | NA |
- Max.Tentacle.Length | 0.7863 | 0.666 | 0.8286494 |
- Stomach.Vol | 0.7812 | 0.6782 | 0.8182151 |
- Avg.Tentacle.Length | 0.7773 | 0.6907 | 0.8770069 |
- Body.Mass (Minimal Model) | 0.753 | 0.675 | 0.6698507 |
Stepwise | 0.7773 | 0.6907 | 0.8770069 |
When determining the Minimal Adequate Model, the primary calculation to use is the R2, particularly the adjusted value. This is because the R2 value of a model indicates the amount of variance that can be explained by the model. Thus, a higher R2 means that model will have a more precise prediction of the variable at hand. Removing variables, if they do not have an effect, can increase the amount of variance explained by the model. Here, 4 variables were removed (as listed in Table 1). For each removal, it is apparent that the adjusted R2 is going up, indicating that predictive power is not being lost when removing certain variables. The high analysis of variance (ANOVA) p-values indicate that there is not a large statistical difference between the maximal model and the reduced model. Removing Body Mass was a judgement call, as the R2 value went down slightly, but the p-value was so high that removing it did not seem to affect the model. Thus, Body Mass was the last variable removed, creating the Minimal Adequate Model.
modelmin <- df2[ -c(7 ,9:11) ]
plot(modelmin)
par(mfrow=c(2,4))
for(i in 1:7) qqnorm(modelmin[,i], main=colnames(modelmin)[i])
mtext("Figure 3: Q-Q Plots of Minimal Model Variable Measurements", outer = TRUE, cex = 1, font = 2)
par(mfrow= c(1,2), oma=c(0,0,2,0))
corrplot(cor(df2), method="shade",
shade.col=NA, tl.col="black", tl.srt=45,
type="full",
title = "Maximal Model",
tl.cex= 0.5)
corrplot(cor(modelmin), method="shade",
shade.col=NA, tl.col="black", tl.srt=45,
type="full",
title = "Minimal Model",
tl.cex = 0.5)
mtext("Figure 4: Correlations of Maximal and Minimal Model", outer = TRUE, cex = 1, font = 2)
#do I want to potentially change the names of the headers in this graph to make it look less garbagey?
Testing <- combn(x = modelmin, m = 2, FUN = alldata.fun)
visual <- matrix(Testing, ncol=9, byrow = T)
colnames(visual) <- c("Pearson Coefficient", "CIs", "p-value","Spearman Coefficient", "CIs", "p-value", "MIC", "CIs", "p-value")
rowtitles.fun <- function(combination){
attributeA <- colnames(combination[1])
attributeB <- colnames(combination[2])
header <-paste(attributeA, attributeB, sep=" || ")
return(list(header))
}
table.rownames <- combn(x = modelmin, m = 2, FUN = rowtitles.fun)
row.names(visual) <- table.rownames
kable(visual, caption = "Table 2: Correlations of the Minimal Adequate Model")
Table: Table 2: Correlations of the Minimal Adequate Model
Pearson Coefficient CIs p-value Spearman Coefficient CIs p-value MIC CIs p-value
Based on the table above, it seems that the greatest amount of correlation exists between Wake Kinetic Energy and Bell Aspect Ratio, as the Spearman correlation and MIC both have CIs that indicate a statistical difference. This is even true for the Pearson Correlation, even though this is not the best measure of correlation due to the correlation not being linear. However, it seems that there might be a source of collinearity, as Bell Height and Mesoglea Thickness are also highly correlated. It is unclear if this is a source of harmful collinearity, or if it will not affect the predictive modeling.
model.pred <- na.omit(modelmin)
compare <- lm(WakeKE ~., data=model.pred)
coef.com <- coef(compare)
#make predictions with random values
prediction <- c(1, 400, 6, 50, 5, 200, 120) * coef(compare)
predicted <- sum(prediction)
pred.conf <- predict(compare, list(Bell.Diam.Contracted= 400, Relaxed.Contr.Ratio= 6, Bell.Height=50, Bell.AspectRatio= 5, Mesoglea.Thickness= 200, Bell.Volume =120), interval="conf")
pred.pred <- predict(compare, list(Bell.Diam.Contracted= 400, Relaxed.Contr.Ratio= 6, Bell.Height=50, Bell.AspectRatio= 5, Mesoglea.Thickness= 200, Bell.Volume =120), interval="pred")
intervals <- confint(compare)
t(intervals)
## (Intercept) Bell.Diam.Contracted Relaxed.Contr.Ratio Bell.Height
## 2.5 % -1.695835 0.001339437 -0.06971819 -0.09348444
## 97.5 % 1.529994 0.006755309 0.21440528 -0.01689854
## Bell.AspectRatio Mesoglea.Thickness Bell.Volume
## 2.5 % 0.03116887 -0.001584095 0.003088564
## 97.5 % 0.11904838 0.012964804 0.017396729
values = c(1, 400, 6, 50, 5, 200, 120)
sum(colMeans(t(intervals)) * values)
## [1] 1.953247
orignal.CI <-t(intervals) %*% matrix(values)
The predicted value for the conditions stated above (Bell.Diam.Contracted= 400, Relaxed.Contr.Ratio= 6, Bell.Height=50, Bell.AspectRatio= 5, Mesoglea.Thickness= 200, Bell.Volume =120) is :1.9532471. The confidence intervals for the mean response are 1.310553, 2.5959413, while for a single case they are -0.7693521, 4.6758464. These numbers indicate the accuracy of the prediction. As is to be expected, the CIs for the mean response are much higher than those of a single case. In fact, the lower limit of the single case negative, which isnt a possible measurement since Kinetic Energy is a scalar quantity. Thus, this model requires further exploration. Indeed the, original confidence intervals given are quite large.
modelg <- glmulti(WakeKE ~ ., level= 2, crit = "bic", method = "g", plotty = F, report = F, marginality = F, data = df2)
## TASK: Genetic algorithm in the candidate set.
## Initialization...
## Algorithm started...
## Improvements in best and average IC have bebingo en below the specified goals.
## Algorithm is declared to have converged.
## Completed.
sumg <- summary(modelg)
par(mfrow=c(1,3), oma=c(2,2,2,2))
plot(modelg, type = "p")
plot(modelg, type = "w")
plot(modelg, type = "s", ps=6)
Unfortunately, the glmulti iterations do not produce the same results every time, so one cannot conclude much about this model. From the current iteration, it seems that tentacle measurements are highly useful, though both the stepwise and manual regression show that this is not the case. This package might be useful with larger datasets, such as genetic datasets, but currently, not many conclusions can be determined from this data.
alldata.fun <- function(combination){
attributeA <- combination[,1]
attributeB <- combination[,2]
#PEARSON STUFF
deals=10000
#pearson CI
pearson <-round(cor(attributeA, attributeB, method="pearson"), 3)
boot.corp = rep(NA,deals)
indices = c(1:length(attributeA))
for (i in 1:deals) {
bootindices = sample(indices, length(indices), replace = T)
boot.corp[i] = cor(attributeA[bootindices],
attributeB[bootindices], method = "pearson")
}
CIP <- round(sort(boot.corp)[c(.025*deals,.975*deals)],2)
#pearson p value
boot.nhp = rep(NA, deals)
indices = c(1:length(attributeA))
for (i in 1:deals){
bootindices = sample(indices, length(indices), replace = F)
boot.nhp[i] = cor(attributeA[bootindices], attributeB,
method = "pearson")
}
HP <- sum(boot.nhp > abs(pearson))
LP <- sum(boot.nhp < -abs(pearson))
pvaluep <- round((HP+LP)/deals, 2)
#SPEARMAN STUFF
#spearman CI
spearman <- round(cor(attributeA, attributeB, method="spearman"), 3)
boot.cors = rep(NA,deals)
indices = c(1:length(attributeA))
for (i in 1:deals) {
bootindices = sample(indices, length(indices), replace = T)
boot.cors[i] = cor(attributeA[bootindices],
attributeB[bootindices], method = "spearman")
}
CIS <- round(sort(boot.cors)[c(.025*deals,.975*deals)],2)
#spearman p value
boot.nhs = rep(NA, deals)
indices = c(1:length(attributeA))
for (i in 1:deals){
bootindices = sample(indices, length(indices), replace = F)
boot.nhs[i] = cor(attributeA[bootindices], attributeB,
method = "spearman")
}
HP <- sum(boot.nhs > abs(spearman))
LP <- sum(boot.nhs < -abs(spearman))
pvalues <- round((HP+LP)/deals, 2)
#MIC STUFF
MIC <- round(mine(attributeA, attributeB)$MIC, 3)
#MIC confidence intervals
boot.mic = rep(NA,deals)
indices = c(1:length(attributeA))
for (i in 1:deals) {
bootindices = sample(indices, length(indices), replace = T)
boot.mic[i] = mine(attributeA[bootindices],
attributeB[bootindices])$MIC
}
CIMIC <- round(sort(boot.mic)[c(.025*deals,.975*deals)],2)
#MIC p value
boot.micnh = rep(NA, deals)
indices = c(1:length(attributeA))
for (i in 1:deals){
bootindices = sample(indices, length(indices), replace = F)
boot.micnh[i] = mine(attributeA[bootindices], attributeB)$MIC
}
HP <- sum(boot.mic > abs(MIC))
pvaluemic = round((HP)/deals,2)
return(list(pearson, CIP, pvaluep, spearman, CIS, pvalues, MIC, CIMIC, pvaluemic))
}