# Juan Buzzetti ID 9485651
#### R Assignment 2 - Part 1 ####
#### Question 1 ####
# Import dataset:
wced <- read.csv("/Users/Buzz/Desktop/HENV665Q/Lab7/wced.csv", header = T)
names(wced)
## [1] "City" "Year" "Population"
## [4] "Emission_total" "Emissions_percapita" "Region"
attach(wced)
View(wced)
# Total emissions for each region:
plot(Region, Emission_total, xlab = "Region", ylab = "Total Emissions - Millions T.CO2",
main = "CO2 Emissions by region")
# Total emissions by population:
plot(Population, Emission_total, xlab = "Population", ylab = "Per Capita - T.CO2 per person",
main = "CO2 Emissions by population")
# Total emissions by population (different colours for regions):
plot(Population, Emission_total, pch = 1, col = c("blue", "red", "green", "yellow",
"black")[Region], xlab = "Population", ylab = "CO2 Emissions", main = "CO2 Emissions by population")
# Legend:
legend(14300000, 53, c("Africa", "Asia", "Europe", "N. America", "S. America"),
pch = c(1), cex = 1, col = c("blue", "red", "green", "yellow", "black"))
#### Question 2 ####
# Data subset excluding S. American and African cities:
wced_new <- subset(wced, Region != "S. America" & Region != "Africa")
attach(wced_new)
## The following object(s) are masked from 'wced':
##
## City, Emission_total, Emissions_percapita, Population, Region,
## Year
View(wced_new)
# Mean for total emissions per region:
tapply(Emission_total, Region, mean)
## Africa Asia Europe N. America S. America
## NA 109.12 24.38 59.25 NA
# Standard deviation for total emissions per region:
tapply(Emission_total, Region, sd)
## Africa Asia Europe N. America S. America
## NA 66.72 20.86 48.93 NA
# Mean for per capita emissions per region:
tapply(Emissions_percapita, Region, mean)
## Africa Asia Europe N. America S. America
## NA 8.745 9.300 14.704 NA
# Standard deviation for per capita emissions per region:
tapply(Emissions_percapita, Region, sd)
## Africa Asia Europe N. America S. America
## NA 3.356 5.544 4.210 NA
#### Question 3 ####
# Model for total emissions vs region:
modeltotal <- aov(Emission_total ~ Region)
# Model for per capital emissions vs region:
modelpercapita <- aov(Emissions_percapita ~ Region)
# ANOVA test:
modelET.aov <- aov(Emission_total ~ Region)
summary(modelET.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 2 35040 17520 12.8 9.4e-05 ***
## Residuals 30 40973 1366
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(modelET.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Emission_total ~ Region)
##
## $Region
## diff lwr upr p adj
## Europe-Asia -84.74 -126.70 -42.781 0.0001
## N. America-Asia -49.87 -105.04 5.298 0.0826
## N. America-Europe 34.87 -10.27 80.010 0.1549
# Very low P-value indicating that there is a strong correlation between
# regions and CO2 total emissions.
modelPC.aov <- aov(Emissions_percapita ~ Region)
summary(modelPC.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 2 131 65.4 2.54 0.096 .
## Residuals 30 773 25.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(modelPC.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Emissions_percapita ~ Region)
##
## $Region
## diff lwr upr p adj
## Europe-Asia 0.5545 -5.2074 6.316 0.9695
## N. America-Asia 5.9590 -1.6165 13.535 0.1453
## N. America-Europe 5.4045 -0.7937 11.603 0.0968
# P-value indicates that emissions per capita and region are correlated
# but in this case the relationship is much weaker.
#### QUESTION 4 ####
# Model Emission_total:
par(mfrow = c(2, 2))
plot(modelET.aov)
# Data does not meet the assumptions for ANOVA between Emission_total and
# Region: 1) Residuals vs fitted: the variance is not constant; 2) Normal
# Q-Q: point values do not fall into the straight correlation line; 3)
# Scale-location: no constant variance; 4) Residual vs Leverage: some
# values have greater influence on the paramenter estimates.
# Model Emissions_percapita:
par(mfrow = c(2, 2))
plot(modelPC.aov)
# Data do not meet the assumptions for ANOVA between Emissions_percapita
# and Region 1) Residuals vs fitted: the variance slightly not constant;
# 2) Normal Q-Q: point values form a straight correlation line, however;
# 3) Scale-location: no constant variance; 4) Residual vs Leverage:values
# have greater influence on the paramenter estimates.
# In order to determine whether the assumptions of our model are true, we
# should run post-hoc test.
#### Question 5 ####
# Linear regression test:
modelETP <- lm(Emission_total ~ Population)
summary(modelETP)
##
## Call:
## lm(formula = Emission_total ~ Population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.99 -3.33 1.57 10.43 44.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65e-02 5.29e+00 0.02 0.99
## Population 9.22e-06 7.80e-07 11.82 5.1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.1 on 31 degrees of freedom
## Multiple R-squared: 0.818, Adjusted R-squared: 0.813
## F-statistic: 140 on 1 and 31 DF, p-value: 5.13e-13
# Very small P-value indicates strong correlation between total emissions
# and population. R-square = 0.8184 tells that the model explains almost
# 82% of the variance in the data.
modelEPP <- lm(Emissions_percapita ~ Population)
summary(modelEPP)
##
## Call:
## lm(formula = Emissions_percapita ~ Population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.183 -3.286 -0.373 1.910 19.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.09e+01 1.34e+00 8.12 3.6e-09 ***
## Population -1.71e-07 1.97e-07 -0.87 0.39
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.33 on 31 degrees of freedom
## Multiple R-squared: 0.0236, Adjusted R-squared: -0.00787
## F-statistic: 0.75 on 1 and 31 DF, p-value: 0.393
# P-value indicates a very low correlation between Emissions per capita
# and population. R-square = 0.02 tells that there is almost no variance
# in the data.
#### Question 6 ####
# Testin interaction:
modelINT <- lm(Emission_total ~ Population + Region + Population:Region)
summary(modelINT)
##
## Call:
## lm(formula = Emission_total ~ Population + Region + Population:Region)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.66 -5.02 -1.66 9.82 32.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.20e+01 2.54e+01 -1.65 0.111
## Population 1.24e-05 1.98e-06 6.25 1.1e-06 ***
## RegionEurope 4.74e+01 2.62e+01 1.81 0.081 .
## RegionN. America 4.49e+01 2.92e+01 1.54 0.136
## Population:RegionEurope -5.76e-06 2.53e-06 -2.28 0.031 *
## Population:RegionN. America -8.12e-07 3.05e-06 -0.27 0.792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.5 on 27 degrees of freedom
## Multiple R-squared: 0.865, Adjusted R-squared: 0.84
## F-statistic: 34.7 on 5 and 27 DF, p-value: 6.13e-11
# In this model P indicates that there is a strong relationship total
# emissions and population.When compared to the results obtained in
# questions 3 and 5, this model confirms the correlation between total
# emissions and population
#### R Assignment 2 - Part 2 ####
#### Question 1 ####
# import dataset:
tiger <- read.table("/Users/Buzz/Desktop/HENV665Q/tiger.txt", header = T)
names(tiger)
## [1] "MUM_ID" "Birth_year" "LRS" "Birth_loc" "longevity"
attach(tiger)
View(tiger)
#### Question 2 ####
# Plotting relationships between explantatory and response variables:
pairs(LRS ~ Birth_year + Birth_loc + longevity)
# There are potential relatioships between: a) LRS and longevity b) LRS
# and Birth_year c) longevity and Birth_year d) longevity and
# Birth_location
#### Question 3 ####
# Multimple regression model:
model <- lm(LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity)
summary.lm(model)
##
## Call:
## lm(formula = LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.17 -4.31 -1.67 2.60 19.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -809.8521 1073.8280 -0.75 0.45
## Birth_year 0.4107 0.5427 0.76 0.45
## Birth_loczoo_2 -0.1527 2.9816 -0.05 0.96
## Birth_loczoo_3 0.6771 3.0690 0.22 0.83
## Birth_loczoo_4 -2.2137 3.0516 -0.73 0.47
## longevity 104.3113 80.3638 1.30 0.20
## Birth_year:longevity -0.0525 0.0406 -1.29 0.20
##
## Residual standard error: 7.41 on 47 degrees of freedom
## Multiple R-squared: 0.181, Adjusted R-squared: 0.0764
## F-statistic: 1.73 on 6 and 47 DF, p-value: 0.135
#### Question 4 ####
# Simplifications of the model and comparisons between models.
# Removing interaction term:
model1 <- update(model, . ~ . - Birth_year:longevity)
summary.lm(model1)
##
## Call:
## lm(formula = LRS ~ Birth_year + Birth_loc + longevity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.24 -4.66 -1.50 1.96 21.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 529.328 281.936 1.88 0.067 .
## Birth_year -0.266 0.143 -1.87 0.068 .
## Birth_loczoo_2 0.722 2.924 0.25 0.806
## Birth_loczoo_3 1.108 3.072 0.36 0.720
## Birth_loczoo_4 -1.747 3.051 -0.57 0.570
## longevity 0.498 0.259 1.93 0.060 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.46 on 48 degrees of freedom
## Multiple R-squared: 0.152, Adjusted R-squared: 0.0635
## F-statistic: 1.72 on 5 and 48 DF, p-value: 0.148
anova(model, model1)
## Analysis of Variance Table
##
## Model 1: LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity
## Model 2: LRS ~ Birth_year + Birth_loc + longevity
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 47 2579
## 2 48 2671 -1 -91.6 1.67 0.2
# Removing variable Birth_loc:
model2 <- update(model1, . ~ . - Birth_loc)
summary.lm(model2)
##
## Call:
## lm(formula = LRS ~ Birth_year + longevity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.85 -4.13 -1.00 2.37 21.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 508.696 266.390 1.91 0.062 .
## Birth_year -0.255 0.134 -1.90 0.063 .
## longevity 0.450 0.248 1.81 0.076 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.32 on 51 degrees of freedom
## Multiple R-squared: 0.133, Adjusted R-squared: 0.0987
## F-statistic: 3.9 on 2 and 51 DF, p-value: 0.0265
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: LRS ~ Birth_year + Birth_loc + longevity
## Model 2: LRS ~ Birth_year + longevity
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 2671
## 2 51 2731 -3 -60.5 0.36 0.78
# Removing variable longevity:
model3 <- update(model2, . ~ . - longevity)
summary.lm(model3)
##
## Call:
## lm(formula = LRS ~ Birth_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.53 -4.87 -2.60 3.89 22.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 571.979 269.821 2.12 0.039 *
## Birth_year -0.284 0.136 -2.08 0.042 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.48 on 52 degrees of freedom
## Multiple R-squared: 0.0769, Adjusted R-squared: 0.0591
## F-statistic: 4.33 on 1 and 52 DF, p-value: 0.0424
anova(model2, model3)
## Analysis of Variance Table
##
## Model 1: LRS ~ Birth_year + longevity
## Model 2: LRS ~ Birth_year
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 51 2731
## 2 52 2907 -1 -176 3.28 0.076 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The simplest model is model3 (LRS ~ Birth_year) because it is the model
# showing the smaller p-values, which makes it more powerful.
#### Question 5 ####
# Using AIC for comparing different versions of the same model:
AIC(model, model1, model2, model3)
## df AIC
## model 8 378.0
## model1 7 377.9
## model2 4 373.1
## model3 3 374.5
# The lowest AIC value suggests that the most powerful model is model2
# (LRS ~ Birth_year + longevity). ANOVA had revealed model3 as the best
# model.
#### Question 6 ####
# Using STEP for simplifying the model:
step(model)
## Start: AIC=222.8
## LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity
##
## Df Sum of Sq RSS AIC
## - Birth_loc 3 57.1 2636 218
## - Birth_year:longevity 1 91.6 2671 223
## <none> 2579 223
##
## Step: AIC=218
## LRS ~ Birth_year + longevity + Birth_year:longevity
##
## Df Sum of Sq RSS AIC
## - Birth_year:longevity 1 94.9 2731 218
## <none> 2636 218
##
## Step: AIC=217.9
## LRS ~ Birth_year + longevity
##
## Df Sum of Sq RSS AIC
## <none> 2731 218
## - longevity 1 176 2907 219
## - Birth_year 1 193 2924 220
##
## Call:
## lm(formula = LRS ~ Birth_year + longevity)
##
## Coefficients:
## (Intercept) Birth_year longevity
## 508.696 -0.255 0.450
# STEP also reveals that model2 (LRS ~ Birth_year + longevity) is the best
# because it has the lowest AIC value.
#### Question 7 ####
# Inspecting residuals:
hist(resid(model2), xlab = "Residuals", main = "Histogram of residuals")
# Residuals show that this is a non-normal distribution.
#### Question 8 ####
# The model is probably good but the analysis conducted was not the
# appropriate. It would have been more convenient to test the model by
# using nonparametric tests such as Spearman R, Kendall Tau or Gamma.