##Set working directory and load the packages for this lab.
setwd("~/Desktop/University of Utah PhD /Course Work/Fall 2022 Semester/GEOG6000_Data Analysis/lab06")
library(ggplot2)
library(dplyr)
library(nlme)
library(tidyr)
library(lme4)
library(lattice)
library(mgcv) ##for generalized additive models
library(plotly) ##for interactive figures in html
library(ggpubr) ##additional package for building figures
mathach = read.csv("../datafiles/MathAch.csv")
mathach$sector = factor(mathach$sector, levels = c("Public", "Catholic")) #Convert sector variable to a factor (remember a factor is another term for category - it can be used for character strings and integer data)
mses = tapply(mathach$ses, mathach$school, mean) ##created a mean SES score by school
print(mses)
mathach$meanses = mses[as.character(mathach$school)] ##creates a new column of mean ses scores by the nnumber of students in each school??? as.character function?
mathach$centered.ses = mathach$ses - mathach$meanses ##creates a centered SES score by student based on their school
##Quick Plot (ggplot) to create the boxplot
math.sector.plot = qplot(sector, mathach, data = mathach, fill = sector,
geom = 'boxplot',
xlab = "Sector",
ylab = "Math Achievement Score",
main = "Math Achievement by School Type")
ses.plot = ggplot(data = mathach, aes(x = sector, y = ses)) +
geom_boxplot(aes(fill = sector)) +
xlab("Sector") +
ylab("Socioeconomic Status") +
labs(title = "Socioeconomic Status by School Type",
subtitle = "By David Leydet")
figure.comb = ggarrange(math.sector.plot, ses.plot,
labels = c("A", "B"),
ncol = 2, nrow = 1)
figure.comb
## Building a hierarchical model to explain the relationship between math score and SES in these two school sectors (Public and Catholic), but wish to account for variability between schools (individually?).
math.lme1 = lme(mathach ~ meanses * centered.ses + sector*centered.ses,
random = ~ centered.ses | school,
data = mathach)
summary(math.lme1)
## Linear mixed-effects model fit by REML
## Data: mathach
## AIC BIC logLik
## 46523.66 46592.45 -23251.83
##
## Random effects:
## Formula: ~centered.ses | school
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 1.5426647 (Intr)
## centered.ses 0.3178223 0.391
## Residual 6.0598002
##
## Fixed effects: mathach ~ meanses * centered.ses + sector * centered.ses
## Value Std.Error DF t-value p-value
## (Intercept) 12.127931 0.1992966 7022 60.85368 0e+00
## meanses 5.332879 0.3691772 157 14.44531 0e+00
## centered.ses 2.945042 0.1555901 7022 18.92821 0e+00
## sectorCatholic 1.226577 0.3062807 157 4.00475 1e-04
## meanses:centered.ses 1.039250 0.2988784 7022 3.47717 5e-04
## centered.ses:sectorCatholic -1.642680 0.2397641 7022 -6.85124 0e+00
## Correlation:
## (Intr) meanss cntrd. sctrCt mnss:.
## meanses 0.256
## centered.ses 0.075 0.019
## sectorCatholic -0.699 -0.356 -0.053
## meanses:centered.ses 0.019 0.074 0.293 -0.026
## centered.ses:sectorCatholic -0.052 -0.027 -0.696 0.077 -0.351
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -3.15926374 -0.72319836 0.01704118 0.75443272 2.95823363
##
## Number of Observations: 7185
## Number of Groups: 160
VarCorr(math.lme1)
## school = pdLogChol(centered.ses)
## Variance StdDev Corr
## (Intercept) 2.379814 1.5426647 (Intr)
## centered.ses 0.101011 0.3178223 0.391
## Residual 36.721178 6.0598002
math.lme2 = update(math.lme1, random = ~ 1 | school) ##removes the variation of the slope of the centered ses as a random effect across schools. This uses a single slope for all schools by updating the random effect part of the model to only include the intercept
anova(math.lme1, math.lme2) ##Model comparison
## Model df AIC BIC logLik Test L.Ratio p-value
## math.lme1 1 10 46523.66 46592.45 -23251.83
## math.lme2 2 8 46520.79 46575.82 -23252.39 1 vs 2 1.124096 0.57
## Remove the random effect between schools.
math.lme3 = update(math.lme1, random = ~ centered.ses - 1 | school)
anova(math.lme1, math.lme3)
## Model df AIC BIC logLik Test L.Ratio p-value
## math.lme1 1 10 46523.66 46592.45 -23251.83
## math.lme3 2 8 46740.23 46795.26 -23362.11 1 vs 2 220.5634 <.0001
sppint = read.csv("../datafiles/speciesIntro2.csv")
str(sppint)
## 'data.frame': 60 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ species : int 1 2 3 4 5 6 7 8 9 10 ...
## $ location: chr "australia" "australia" "australia" "australia" ...
## $ success : int 2 7 2 2 5 4 1 2 2 1 ...
## $ failure : int 10 5 10 10 7 8 11 10 10 11 ...
##Data cleanup
##Need to create a proportion of success to failure by species.
sppint$total = sppint$success + sppint$failure
sppint$prop = sppint$success / sppint$total
sppint.bplot = ggplot(data = sppint, aes(x = location, y = prop)) +
geom_boxplot(aes(fill = location))+
labs(title = "Species Success Probability") +
xlab("Location") +
ylab("Probability")
sppint.bplot
##Use glmer from the lme4 package to build this model. This is for fitting generalized linear mixed-effects models
spp.glm1 = glmer(prop ~ location + (1 | species), ##Syntax 1 | species means to vary the intercept by species groups.
family = binomial, #use this because it is proportional data? 0,1?
weights = total, #because we are modeling proportions this argument specifies the total number of trials for each species
data = sppint)
summary(spp.glm1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: prop ~ location + (1 | species)
## Data: sppint
## Weights: total
##
## AIC BIC logLik deviance df.resid
## 278.7 289.2 -134.4 268.7 55
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3293 -0.3833 -0.1695 0.4073 1.8452
##
## Random effects:
## Groups Name Variance Std.Dev.
## species (Intercept) 0.5085 0.7131
## Number of obs: 60, groups: species, 60
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.61548 0.24793 -2.482 0.0130 *
## locationaustralia -0.64880 0.35861 -1.809 0.0704 .
## locationcanada 0.06737 0.34824 0.193 0.8466
## locationchina 0.52650 0.34764 1.514 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) lctnst lctncn
## locatinstrl -0.687
## locationcnd -0.710 0.490
## locationchn -0.714 0.490 0.506
##Odds and probability conversion from log-odds
argprob = exp(-0.615) / (1 + exp(-0.615))
ausprob = exp(-0.64880 ) / (1 + exp(-0.64880))
canprob = exp(0.06737 ) / (1 + exp(0.06737))
chiprob = exp(0.52650 ) / (1 + exp(0.52650))
prob.df = data.frame(argprob, ausprob, canprob, chiprob)
prob.df
## argprob ausprob canprob chiprob
## 1 0.3509195 0.34326 0.5168361 0.6286664
##Model the probability as a function of the intercept, with a random effect by species. This subtracts out the location
spp.glm2 = glmer(prop ~ 1 + (1 | species),
data = sppint,
family = binomial,
weights = total)
anova(spp.glm1, spp.glm2) ##Comparing the two models (glm1 = with location; glm2 = without location)
## Data: sppint
## Models:
## spp.glm2: prop ~ 1 + (1 | species)
## spp.glm1: prop ~ location + (1 | species)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## spp.glm2 2 283.07 287.26 -139.53 279.07
## spp.glm1 5 278.70 289.18 -134.35 268.70 10.365 3 0.01571 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Generalized Additive Models (GAMs) use methods to fit the model to the data locally, using splines to account for non-linear relationships. They also include the possibility to specify the distribution family and link function for the dependent variable.
ozone = read.csv("../datafiles/ozone.csv")
##Building the GAM
ozone.gam = gam(ozone ~ s(temp) + s(wind),
data = ozone) #GAM created with smoothing splines for temp and wind - the s() argument
summary(ozone.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## ozone ~ s(temp) + s(wind)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.099 1.748 24.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(temp) 4.096 5.068 13.61 <2e-16 ***
## s(wind) 3.059 3.836 12.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.694 Deviance explained = 71.4%
## GCV = 366.09 Scale est. = 339.2 n = 111
plot(ozone.gam,
resid = TRUE, ##plots the residuals instead of just showing the model line
pch = 16, ##filled in circle symbols
col = "blue",
ylab = "Ozone",
xlab = "Temperature",
main = "Ozone as a Function of Temperature",
select = 1) ##selects the parameter...in this case temperature first.
plot(ozone.gam,
resid = TRUE, ##plots the residuals instead of just showing the model line
pch = 17, ##filled in circle symbols
col = "purple",
ylab = "Ozone",
xlab = "Wind Speed",
main = "Ozone as a Function of Wind Speed",
select = 2) ##selects the parameter...in this case wind speed.
##Perspective Plot
vis.gam(ozone.gam, color = "cm",
theta = 230, ##horizontal viewing angle
phi = 15) ##vertical viewing angle
##Creating a time series plot
##Create the index variable
time.idx = seq(1, length(ozone$ozone))
plot(time.idx, ozone$ozone,
type = "l",
col = "blue",
xlab = "Time",
ylab = "Ozone Concentration (ppb)",
main = "Ozone Time Series")
lines(fitted(ozone.gam), col = "red", lty = 2) ##put the fitted model line over the data. The fitted argument extracts the fitted model values
legend("topleft",
legend = c("Observed", "Estimated"), ##What text to write in the legend
lty = 1, ##line type
col = c("blue", "red")) ##colors for the legend lines
newclim = data.frame(temp = 85, wind = 5)
ozone.pred = predict(ozone.gam, newdata = newclim, se.fit = TRUE)
print(ozone.pred)
## $fit
## 1
## 84.38856
##
## $se.fit
## 1
## 4.7692
##Wind speed increase to 15 m/s
newclim2 = data.frame(temp = 85, wind = 15)
predict(ozone.gam, newdata = newclim2, se.fit = TRUE)
## $fit
## 1
## 45.02983
##
## $se.fit
## 1
## 5.27401
##Read in the data
gapdata = read.csv("../datafiles/gapminderData5.csv")
##Estimate mean life expectancy by country
mean.lifeexp = tapply(gapdata$lifeExp, gapdata$country, mean)
mean.lifeexp
## Afghanistan Albania Algeria
## 37.47883 68.43292 59.03017
## Angola Argentina Australia
## 37.88350 69.06042 74.66292
## Austria Bahrain Bangladesh
## 73.10325 65.60567 49.83408
## Belgium Benin Bolivia
## 73.64175 48.77992 52.50458
## Bosnia and Herzegovina Botswana Brazil
## 67.70783 54.59750 62.23950
## Bulgaria Burkina Faso Burundi
## 69.74375 44.69400 44.81733
## Cambodia Cameroon Canada
## 47.90275 48.12850 74.90275
## Central African Republic Chad Chile
## 43.86692 46.77358 67.43092
## China Colombia Comoros
## 61.78514 63.89775 52.38175
## Congo, Dem. Rep. Congo, Rep. Costa Rica
## 44.54375 52.50192 70.18142
## Cote d'Ivoire Croatia Cuba
## 48.43617 70.05592 71.04508
## Czech Republic Denmark Djibouti
## 71.51050 74.37017 46.38075
## Dominican Republic Ecuador Egypt
## 61.55450 62.81683 56.24300
## El Salvador Equatorial Guinea Eritrea
## 59.63333 42.96000 45.99925
## Ethiopia Finland France
## 44.47575 72.99192 74.34892
## Gabon Gambia Germany
## 51.22050 44.40058 73.44442
## Ghana Greece Guatemala
## 52.34067 73.73317 56.72942
## Guinea Guinea-Bissau Haiti
## 43.23983 39.21025 50.16525
## Honduras Hong Kong, China Hungary
## 57.92083 73.49283 69.39317
## Iceland India Indonesia
## 76.51142 53.16608 54.33575
## Iran Iraq Ireland
## 58.63658 56.58175 73.01725
## Israel Italy Jamaica
## 73.64583 74.01383 68.74933
## Japan Jordan Kenya
## 74.82692 59.78642 52.68100
## Korea, Dem. Rep. Korea, Rep. Kuwait
## 63.60733 65.00100 68.92233
## Lebanon Lesotho Liberia
## 65.86567 50.00708 42.47625
## Libya Madagascar Malawi
## 59.30417 47.77058 43.35158
## Malaysia Mali Mauritania
## 64.27958 43.41350 52.30208
## Mauritius Mexico Mongolia
## 64.95325 65.40883 55.89033
## Montenegro Morocco Mozambique
## 70.29917 57.60883 40.37950
## Myanmar Namibia Nepal
## 53.32167 53.49133 48.98633
## Netherlands New Zealand Nicaragua
## 75.64850 73.98950 58.34942
## Niger Nigeria Norway
## 44.55867 43.58133 75.84300
## Oman Pakistan Panama
## 58.44267 54.88225 67.80175
## Paraguay Peru Philippines
## 66.80908 58.85933 60.96725
## Poland Portugal Puerto Rico
## 70.17692 70.41983 72.73933
## Reunion Romania Rwanda
## 66.64425 68.29067 41.48158
## Sao Tome and Principe Saudi Arabia Senegal
## 57.89633 58.67875 50.62592
## Serbia Sierra Leone Singapore
## 68.55100 36.76917 71.22025
## Slovak Republic Slovenia Somalia
## 70.69608 71.60075 40.98867
## South Africa Spain Sri Lanka
## 53.99317 74.20342 66.52608
## Sudan Swaziland Sweden
## 48.40050 49.00242 76.17700
## Switzerland Syria Taiwan
## 75.56508 61.34617 70.33667
## Tanzania Thailand Togo
## 47.91233 62.20025 51.49875
## Trinidad and Tobago Tunisia Turkey
## 66.82800 60.72100 59.69642
## Uganda United Kingdom United States
## 47.61883 73.92258 73.47850
## Uruguay Venezuela Vietnam
## 70.78158 66.58067 57.47950
## West Bank and Gaza Yemen, Rep. Zambia
## 60.32867 46.78042 45.99633
## Zimbabwe
## 52.66317
gapdata$year.factor = as.factor(gapdata$year)
gapdata$country.factor = as.factor(gapdata$country)
##paste("country:", country) syntax allows the country to pop up when using ggplotly to plot this figure**
life.exp.plot1 = ggplot(data = gapdata, aes(x = year.factor, y = lifeExp, text = paste("country:", country), color = continent)) +
geom_point(alpha = 0.6) +
labs(title = "Life Expectancy by Country",
subtitle = "By Dave Leydet") +
xlab("Year") +
ylab("Life Expectancy (Years)")
ggplotly(life.exp.plot1)
#Trying to color by country is too much. This displays a warning message to change the bin width.
#facet_wrap by continent to break out the data a bit. After some minor code modification the plotly tool will display the country by hovering over it.
life.exp.plot2 = life.exp.plot1 +
facet_wrap( ~ continent) +
theme(axis.text.x = element_text(angle = 75, hjust = 1))
ggplotly(life.exp.plot2)
afg.sub = subset(gapdata, country == "Afghanistan") ##subset the data to make an Afghanistan data frame
chad.sub = subset(gapdata, country == "Chad")
life.line = ggplot(data = afg.sub, aes(x = year, y = lifeExp)) +
geom_line(aes(color = "lightgreen")) +
geom_line(data = chad.sub, aes(color = "indianred1")) +
labs(title = "Chad and Afghanistan Life Expectancy") +
xlab("Year") +
ylab("Life Expectancy") +
guides(colour = guide_legend(title = "Country")) +
##changes the legend title
scale_color_discrete(labels = c('Chad', 'Afganistan'))
##changes the legend values
life.line
## Center the year data
gapdata$year.cen = gapdata$year - 1952
## Linear Model Build
life.exp.lm1 = lm(lifeExp ~ year.cen, data = gapdata)
summary(life.exp.lm1)
##
## Call:
## lm(formula = lifeExp ~ year.cen, data = gapdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.949 -9.651 1.697 10.335 22.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.51208 0.53000 95.31 <2e-16 ***
## year.cen 0.32590 0.01632 19.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared: 0.1898, Adjusted R-squared: 0.1893
## F-statistic: 398.6 on 1 and 1702 DF, p-value: < 2.2e-16
life.exp.coun.lme = lme(lifeExp ~ year.cen,
random = ~ 1 | country,
data = gapdata,
method = "ML")
summary(life.exp.coun.lme)
## Linear mixed-effects model fit by maximum likelihood
## Data: gapdata
## AIC BIC logLik
## 9867.217 9888.98 -4929.609
##
## Random effects:
## Formula: ~1 | country
## (Intercept) Residual
## StdDev: 11.0577 3.583049
##
## Fixed effects: lifeExp ~ year.cen
## Value Std.Error DF t-value p-value
## (Intercept) 50.51208 0.9427505 1561 53.57948 0
## year.cen 0.32590 0.0050318 1561 64.76851 0
## Correlation:
## (Intr)
## year.cen -0.147
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -6.1714060 -0.4909584 0.1218769 0.6031545 2.4201698
##
## Number of Observations: 1704
## Number of Groups: 142
var.cor.coef = VarCorr(life.exp.coun.lme)
var.cor.coef
## country = pdLogChol(1)
## Variance StdDev
## (Intercept) 122.27279 11.057703
## Residual 12.83824 3.583049
##Calculate the percentage of the variance explained by the intercept varying by country
coef.1.int = as.numeric(var.cor.coef[1,1])
coef.2.res = as.numeric(var.cor.coef[2,1])
total.var = coef.1.int + coef.2.res
int.percentage = (coef.1.int / total.var) * 100
int.percentage
## [1] 90.49801
res.percentage = (coef.2.res / total.var) * 100
res.percentage
## [1] 9.501993
life.exp.coun.year.lme = lme(lifeExp ~ year.cen,
random = ~year.cen | country,
data = gapdata,
method = "ML")
summary(life.exp.coun.year.lme)
## Linear mixed-effects model fit by maximum likelihood
## Data: gapdata
## AIC BIC logLik
## 8742.214 8774.858 -4365.107
##
## Random effects:
## Formula: ~year.cen | country
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 12.2589967 (Intr)
## year.cen 0.1576987 -0.434
## Residual 2.1807917
##
## Fixed effects: lifeExp ~ year.cen
## Value Std.Error DF t-value p-value
## (Intercept) 50.51208 1.0341480 1561 48.84415 0
## year.cen 0.32590 0.0135911 1561 23.97920 0
## Correlation:
## (Intr)
## year.cen -0.439
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -8.08295473 -0.34870697 0.02684297 0.39762726 4.67325406
##
## Number of Observations: 1704
## Number of Groups: 142
#Note - Discuss the output with others to ensure my understanding of this output.
##Random effects coefficients:
# Amount of variation across countries and years? This is why we run the intra-class correlation (ICC)?
##Fixed effects coefficients:
##Intercept: ~50.5 (grand mean intercept; average life expectancy across countries and years)
##year.cen: ~0.33 (change in life expectancy for every year across countries)
##ICC exploration for this model (Not required, just curious).
var.cor.coef2 = VarCorr(life.exp.coun.year.lme)
var.cor.coef2
## country = pdLogChol(year.cen)
## Variance StdDev Corr
## (Intercept) 150.28300021 12.2589967 (Intr)
## year.cen 0.02486888 0.1576987 -0.434
## Residual 4.75585250 2.1807917
coun.year.lme.total.var = as.numeric(var.cor.coef2[1,1]) + as.numeric(var.cor.coef2[2,1]) + as.numeric(var.cor.coef2[3,1]) #Intercept + year.cen + residual
year.cen.ICC = (as.numeric(var.cor.coef2[2,1]) / coun.year.lme.total.var) *100
year.cen.ICC
## [1] 0.01603785
##Based on this, it is not worth having the slopes vary by year. Double check with the AIC run to see if this matches the output and conclusion from there.
AIC(life.exp.lm1, life.exp.coun.lme, life.exp.coun.year.lme)
## df AIC
## life.exp.lm1 3 13201.732
## life.exp.coun.lme 4 9867.217
## life.exp.coun.year.lme 6 8742.214