# Juan Buzzetti ID 9485651

#### R Assignment 2 - Part 1 ####


#### Question 1 ####

# Import dataset:
wced <- read.csv("/Users/Buzz/Desktop/HENV665Q/Lab7/wced.csv", header = T)
names(wced)

## [1] "City"                "Year"                "Population"         
## [4] "Emission_total"      "Emissions_percapita" "Region"

attach(wced)
View(wced)

# Total emissions for each region:
plot(Region, Emission_total, xlab = "Region", ylab = "Total Emissions - Millions T.CO2", 
    main = "CO2 Emissions by region")

plot of chunk unnamed-chunk-1


# Total emissions by population:
plot(Population, Emission_total, xlab = "Population", ylab = "Per Capita - T.CO2 per person", 
    main = "CO2 Emissions by population")

plot of chunk unnamed-chunk-1


# Total emissions by population (different colours for regions):
plot(Population, Emission_total, pch = 1, col = c("blue", "red", "green", "yellow", 
    "black")[Region], xlab = "Population", ylab = "CO2 Emissions", main = "CO2 Emissions by population")

# Legend:
legend(14300000, 53, c("Africa", "Asia", "Europe", "N. America", "S. America"), 
    pch = c(1), cex = 1, col = c("blue", "red", "green", "yellow", "black"))

plot of chunk unnamed-chunk-1



#### Question 2 ####

# Data subset excluding S. American and African cities:
wced_new <- subset(wced, Region != "S. America" & Region != "Africa")
attach(wced_new)

## The following object(s) are masked from 'wced':
## 
##     City, Emission_total, Emissions_percapita, Population, Region,
##     Year

View(wced_new)

# Mean for total emissions per region:
tapply(Emission_total, Region, mean)

##     Africa       Asia     Europe N. America S. America 
##         NA     109.12      24.38      59.25         NA


# Standard deviation for total emissions per region:
tapply(Emission_total, Region, sd)

##     Africa       Asia     Europe N. America S. America 
##         NA      66.72      20.86      48.93         NA


# Mean for per capita emissions per region:
tapply(Emissions_percapita, Region, mean)

##     Africa       Asia     Europe N. America S. America 
##         NA      8.745      9.300     14.704         NA


# Standard deviation for per capita emissions per region:
tapply(Emissions_percapita, Region, sd)

##     Africa       Asia     Europe N. America S. America 
##         NA      3.356      5.544      4.210         NA



#### Question 3 ####

# Model for total emissions vs region:
modeltotal <- aov(Emission_total ~ Region)

# Model for per capital emissions vs region:
modelpercapita <- aov(Emissions_percapita ~ Region)

# ANOVA test:
modelET.aov <- aov(Emission_total ~ Region)
summary(modelET.aov)

##             Df Sum Sq Mean Sq F value  Pr(>F)    
## Region       2  35040   17520    12.8 9.4e-05 ***
## Residuals   30  40973    1366                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(modelET.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Emission_total ~ Region)
## 
## $Region
##                     diff     lwr     upr  p adj
## Europe-Asia       -84.74 -126.70 -42.781 0.0001
## N. America-Asia   -49.87 -105.04   5.298 0.0826
## N. America-Europe  34.87  -10.27  80.010 0.1549


# Very low P-value indicating that there is a strong correlation between
# regions and CO2 total emissions.

modelPC.aov <- aov(Emissions_percapita ~ Region)
summary(modelPC.aov)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## Region       2    131    65.4    2.54  0.096 .
## Residuals   30    773    25.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(modelPC.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Emissions_percapita ~ Region)
## 
## $Region
##                     diff     lwr    upr  p adj
## Europe-Asia       0.5545 -5.2074  6.316 0.9695
## N. America-Asia   5.9590 -1.6165 13.535 0.1453
## N. America-Europe 5.4045 -0.7937 11.603 0.0968


# P-value indicates that emissions per capita and region are correlated
# but in this case the relationship is much weaker.


#### QUESTION 4 ####

# Model Emission_total:
par(mfrow = c(2, 2))
plot(modelET.aov)

plot of chunk unnamed-chunk-1


# Data does not meet the assumptions for ANOVA between Emission_total and
# Region: 1) Residuals vs fitted: the variance is not constant; 2) Normal
# Q-Q: point values do not fall into the straight correlation line; 3)
# Scale-location: no constant variance; 4) Residual vs Leverage: some
# values have greater influence on the paramenter estimates.

# Model Emissions_percapita:
par(mfrow = c(2, 2))
plot(modelPC.aov)

plot of chunk unnamed-chunk-1


# Data do not meet the assumptions for ANOVA between Emissions_percapita
# and Region 1) Residuals vs fitted: the variance slightly not constant;
# 2) Normal Q-Q: point values form a straight correlation line, however;
# 3) Scale-location: no constant variance; 4) Residual vs Leverage:values
# have greater influence on the paramenter estimates.

# In order to determine whether the assumptions of our model are true, we
# should run post-hoc test.


#### Question 5 ####

# Linear regression test:
modelETP <- lm(Emission_total ~ Population)
summary(modelETP)

## 
## Call:
## lm(formula = Emission_total ~ Population)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54.99  -3.33   1.57  10.43  44.51 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.65e-02   5.29e+00    0.02     0.99    
## Population  9.22e-06   7.80e-07   11.82  5.1e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 21.1 on 31 degrees of freedom
## Multiple R-squared: 0.818,   Adjusted R-squared: 0.813 
## F-statistic:  140 on 1 and 31 DF,  p-value: 5.13e-13


# Very small P-value indicates strong correlation between total emissions
# and population.  R-square = 0.8184 tells that the model explains almost
# 82% of the variance in the data.

modelEPP <- lm(Emissions_percapita ~ Population)
summary(modelEPP)

## 
## Call:
## lm(formula = Emissions_percapita ~ Population)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.183 -3.286 -0.373  1.910 19.021 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.09e+01   1.34e+00    8.12  3.6e-09 ***
## Population  -1.71e-07   1.97e-07   -0.87     0.39    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 5.33 on 31 degrees of freedom
## Multiple R-squared: 0.0236,  Adjusted R-squared: -0.00787 
## F-statistic: 0.75 on 1 and 31 DF,  p-value: 0.393


# P-value indicates a very low correlation between Emissions per capita
# and population.  R-square = 0.02 tells that there is almost no variance
# in the data.


#### Question 6 ####


# Testin interaction:
modelINT <- lm(Emission_total ~ Population + Region + Population:Region)
summary(modelINT)

## 
## Call:
## lm(formula = Emission_total ~ Population + Region + Population:Region)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.66  -5.02  -1.66   9.82  32.48 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -4.20e+01   2.54e+01   -1.65    0.111    
## Population                   1.24e-05   1.98e-06    6.25  1.1e-06 ***
## RegionEurope                 4.74e+01   2.62e+01    1.81    0.081 .  
## RegionN. America             4.49e+01   2.92e+01    1.54    0.136    
## Population:RegionEurope     -5.76e-06   2.53e-06   -2.28    0.031 *  
## Population:RegionN. America -8.12e-07   3.05e-06   -0.27    0.792    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 19.5 on 27 degrees of freedom
## Multiple R-squared: 0.865,   Adjusted R-squared: 0.84 
## F-statistic: 34.7 on 5 and 27 DF,  p-value: 6.13e-11


# In this model P indicates that there is a strong relationship total
# emissions and population.When compared to the results obtained in
# questions 3 and 5, this model confirms the correlation between total
# emissions and population



#### R Assignment 2 - Part 2 ####


#### Question 1 ####

# import dataset:
tiger <- read.table("/Users/Buzz/Desktop/HENV665Q/tiger.txt", header = T)
names(tiger)

## [1] "MUM_ID"     "Birth_year" "LRS"        "Birth_loc"  "longevity"

attach(tiger)
View(tiger)

#### Question 2 ####

# Plotting relationships between explantatory and response variables:
pairs(LRS ~ Birth_year + Birth_loc + longevity)

plot of chunk unnamed-chunk-1


# There are potential relatioships between: a) LRS and longevity b) LRS
# and Birth_year c) longevity and Birth_year d) longevity and
# Birth_location

#### Question 3 ####

# Multimple regression model:
model <- lm(LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity)
summary.lm(model)

## 
## Call:
## lm(formula = LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11.17  -4.31  -1.67   2.60  19.82 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)          -809.8521  1073.8280   -0.75     0.45
## Birth_year              0.4107     0.5427    0.76     0.45
## Birth_loczoo_2         -0.1527     2.9816   -0.05     0.96
## Birth_loczoo_3          0.6771     3.0690    0.22     0.83
## Birth_loczoo_4         -2.2137     3.0516   -0.73     0.47
## longevity             104.3113    80.3638    1.30     0.20
## Birth_year:longevity   -0.0525     0.0406   -1.29     0.20
## 
## Residual standard error: 7.41 on 47 degrees of freedom
## Multiple R-squared: 0.181,   Adjusted R-squared: 0.0764 
## F-statistic: 1.73 on 6 and 47 DF,  p-value: 0.135



#### Question 4 ####

# Simplifications of the model and comparisons between models.

# Removing interaction term:
model1 <- update(model, . ~ . - Birth_year:longevity)
summary.lm(model1)

## 
## Call:
## lm(formula = LRS ~ Birth_year + Birth_loc + longevity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10.24  -4.66  -1.50   1.96  21.60 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     529.328    281.936    1.88    0.067 .
## Birth_year       -0.266      0.143   -1.87    0.068 .
## Birth_loczoo_2    0.722      2.924    0.25    0.806  
## Birth_loczoo_3    1.108      3.072    0.36    0.720  
## Birth_loczoo_4   -1.747      3.051   -0.57    0.570  
## longevity         0.498      0.259    1.93    0.060 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 7.46 on 48 degrees of freedom
## Multiple R-squared: 0.152,   Adjusted R-squared: 0.0635 
## F-statistic: 1.72 on 5 and 48 DF,  p-value: 0.148

anova(model, model1)

## Analysis of Variance Table
## 
## Model 1: LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity
## Model 2: LRS ~ Birth_year + Birth_loc + longevity
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     47 2579                         
## 2     48 2671 -1     -91.6 1.67    0.2


# Removing variable Birth_loc:
model2 <- update(model1, . ~ . - Birth_loc)
summary.lm(model2)

## 
## Call:
## lm(formula = LRS ~ Birth_year + longevity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11.85  -4.13  -1.00   2.37  21.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  508.696    266.390    1.91    0.062 .
## Birth_year    -0.255      0.134   -1.90    0.063 .
## longevity      0.450      0.248    1.81    0.076 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 7.32 on 51 degrees of freedom
## Multiple R-squared: 0.133,   Adjusted R-squared: 0.0987 
## F-statistic:  3.9 on 2 and 51 DF,  p-value: 0.0265

anova(model1, model2)

## Analysis of Variance Table
## 
## Model 1: LRS ~ Birth_year + Birth_loc + longevity
## Model 2: LRS ~ Birth_year + longevity
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     48 2671                         
## 2     51 2731 -3     -60.5 0.36   0.78


# Removing variable longevity:
model3 <- update(model2, . ~ . - longevity)
summary.lm(model3)

## 
## Call:
## lm(formula = LRS ~ Birth_year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10.53  -4.87  -2.60   3.89  22.90 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  571.979    269.821    2.12    0.039 *
## Birth_year    -0.284      0.136   -2.08    0.042 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 7.48 on 52 degrees of freedom
## Multiple R-squared: 0.0769,  Adjusted R-squared: 0.0591 
## F-statistic: 4.33 on 1 and 52 DF,  p-value: 0.0424

anova(model2, model3)

## Analysis of Variance Table
## 
## Model 1: LRS ~ Birth_year + longevity
## Model 2: LRS ~ Birth_year
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     51 2731                           
## 2     52 2907 -1      -176 3.28  0.076 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# The simplest model is model3 (LRS ~ Birth_year) because it is the model
# showing the smaller p-values, which makes it more powerful.


#### Question 5 ####

# Using AIC for comparing different versions of the same model:
AIC(model, model1, model2, model3)

##        df   AIC
## model   8 378.0
## model1  7 377.9
## model2  4 373.1
## model3  3 374.5


# The lowest AIC value suggests that the most powerful model is model2
# (LRS ~ Birth_year + longevity). ANOVA had revealed model3 as the best
# model.

#### Question 6 ####

# Using STEP for simplifying the model:
step(model)

## Start:  AIC=222.8
## LRS ~ Birth_year + Birth_loc + longevity + Birth_year:longevity
## 
##                        Df Sum of Sq  RSS AIC
## - Birth_loc             3      57.1 2636 218
## - Birth_year:longevity  1      91.6 2671 223
## <none>                              2579 223
## 
## Step:  AIC=218
## LRS ~ Birth_year + longevity + Birth_year:longevity
## 
##                        Df Sum of Sq  RSS AIC
## - Birth_year:longevity  1      94.9 2731 218
## <none>                              2636 218
## 
## Step:  AIC=217.9
## LRS ~ Birth_year + longevity
## 
##              Df Sum of Sq  RSS AIC
## <none>                    2731 218
## - longevity   1       176 2907 219
## - Birth_year  1       193 2924 220

## 
## Call:
## lm(formula = LRS ~ Birth_year + longevity)
## 
## Coefficients:
## (Intercept)   Birth_year    longevity  
##     508.696       -0.255        0.450


# STEP also reveals that model2 (LRS ~ Birth_year + longevity) is the best
# because it has the lowest AIC value.


#### Question 7 ####

# Inspecting residuals:
hist(resid(model2), xlab = "Residuals", main = "Histogram of residuals")

plot of chunk unnamed-chunk-1


# Residuals show that this is a non-normal distribution.

#### Question 8 ####

# The model is probably good but the analysis conducted was not the
# appropriate. It would have been more convenient to test the model by
# using nonparametric tests such as Spearman R, Kendall Tau or Gamma.