#this is to set a working directory
setwd("C:\\Users\\melvo\\Documents\\CCMF\\Quantitive methods\\Coursework\\Meat and Happiness")
#this checks the working directory
getwd()
## [1] "C:/Users/melvo/Documents/CCMF/Quantitive methods/Coursework/Meat and Happiness"
#selecting your dataset
meathap <- read.csv("meathappy_plot2017.csv")
Relationship between the meat consumption of each country in 2017 against their happiness scores.
library(ggplot2)
ggplot(meathap, aes(x = meat, y = happiness)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("Meat Consumption kg/cap")+ylab("Happiness Score in Cantril Ladder") + ggtitle("Relationship between meat consumption and happiness 2017")
## `geom_smooth()` using formula 'y ~ x'
reg1=lm(happiness~meat,meathap)
coeftest(reg1,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7311081 0.1580845 29.9277 < 2.2e-16 ***
## meat 0.0150685 0.0029554 5.0987 1.107e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Relationship between GDP per capita (gdppc) against the happiness scores
library(ggplot2)
ggplot(meathap, aes(x = gdppc, y = happiness)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("GDP/cap USD")+ylab("Happiness Score") + ggtitle("Relationship between GDP per capita and happiness 2017")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
reg2=lm(happiness~gdppc,meathap)
coeftest(reg2,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.1365e+00 1.0939e-01 46.954 < 2.2e-16 ***
## gdppc 2.2409e-05 5.8678e-06 3.819 0.0002023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
However, due to decreasing marginal effect of the happiness outcome as the GDP per capita increases instead log(gdppc) will be used:
library(ggplot2)
ggplot(meathap, aes(x = log(gdppc), y = happiness)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("log(GDP/cap)(log(USD))")+ylab("Happiness Score") + ggtitle("Relationship between log(GDP per capita) and happiness 2017")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
reg2a=lm(happiness~log(gdppc),meathap)
coeftest(reg2a,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.419899 0.572576 4.2263 4.311e-05 ***
## log(gdppc) 0.351304 0.066715 5.2658 5.275e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now using log(gdppc) as a control variable.
reg3=lm(happiness~meat+log(gdppc),meathap)
coeftest(reg3,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7245866 0.7711490 3.5332 0.0005615 ***
## meat 0.0030575 0.0046892 0.6520 0.5154864
## log(gdppc) 0.2990311 0.1091662 2.7392 0.0069843 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
6 regions: Europe, Africa, Asia & Pacific, South America, North America, Middle East
Used binary variables for each region to run the regression with all of them taken into account. So for example for eu the value is 1 for in europe and 0 for not in europe, and the same for all 6 of eu af apac me na sa
reg4=lm(happiness~meat+eu+af+apac+me+na,meathap)
coeftest(reg4,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.2496568 0.3388963 15.4905 < 2.2e-16 ***
## meat 0.0102612 0.0037059 2.7689 0.006429 **
## eu -0.2149728 0.2918793 -0.7365 0.462716
## af -0.7559756 0.3230240 -2.3403 0.020756 *
## apac -0.1506666 0.3306295 -0.4557 0.649352
## me -0.0237506 0.3819590 -0.0622 0.950512
## na 0.1070271 0.3356181 0.3189 0.750306
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Adding log(gdppc) as a control too:
reg4=lm(happiness~meat+log(gdppc)+eu+af+apac+me+na,meathap)
coeftest(reg4,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4381852 1.0111228 3.4004 0.0008924 ***
## meat 0.0011048 0.0048216 0.2291 0.8191269
## log(gdppc) 0.2741746 0.1301484 2.1066 0.0370568 *
## eu -0.4623548 0.3009354 -1.5364 0.1268540
## af -0.7348681 0.3436026 -2.1387 0.0343142 *
## apac -0.2524802 0.3584238 -0.7044 0.4824228
## me -0.1829090 0.3779904 -0.4839 0.6292658
## na -0.0151453 0.3307707 -0.0458 0.9635490
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
South america is the default category here.
Urbanpercent is the percentage of the total population of a country who live in an urban setting.
library(ggplot2)
ggplot(meathap, aes(x = urbanpercent, y = happiness)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("Percentage of urbanisation of total population")+ylab("Happiness Score in Cantril Ladder") + ggtitle("Relationship between urbanisation and happiness 2017")
## `geom_smooth()` using formula 'y ~ x'
library(ggplot2)
ggplot(meathap, aes(x = urbanpercent, y = meat)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("Percentage of urbanisation of total population")+ylab("Meat Consumption kg/cap") + ggtitle("Relationship between urbanisation and meat consumption 2017")
## `geom_smooth()` using formula 'y ~ x'
regression of urbanisation’s impact on happiness
reg5=lm(happiness~urbanpercent,meathap)
coeftest(reg5,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3432815 0.2917059 14.8892 < 2.2e-16 ***
## urbanpercent 0.0184056 0.0045641 4.0327 9.082e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
regression of urbanisation’s impact on meat consumption
reg6=lm(meat~urbanpercent,meathap)
coeftest(reg6,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.858103 4.430669 -2.4507 0.01551 *
## urbanpercent 0.976342 0.079427 12.2923 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
using urbanisation as a control variable
reg7=lm(happiness~meat + urbanpercent,meathap)
coeftest(reg7,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4668139 0.2870274 15.5623 < 2e-16 ***
## meat 0.0113770 0.0036764 3.0946 0.00239 **
## urbanpercent 0.0072978 0.0056542 1.2907 0.19899
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
both log(gdppc) and urbanisation as controls
reg8=lm(happiness~meat + log(gdppc) + urbanpercent,meathap)
coeftest(reg8,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6815620 0.7863098 3.4103 0.0008562 ***
## meat 0.0031959 0.0047724 0.6697 0.5042183
## log(gdppc) 0.3126308 0.1264809 2.4718 0.0146890 *
## urbanpercent -0.0013470 0.0068961 -0.1953 0.8454287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The human development index takes into account many factors for a country such as standard of living, knowledge, life expectancy
library(ggplot2)
ggplot(meathap, aes(x = HDI, y = happiness)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("HDI")+ylab("Happiness Score in Cantril Ladder") + ggtitle("Relationship between HDI and happiness 2017")
## `geom_smooth()` using formula 'y ~ x'
library(ggplot2)
ggplot(meathap, aes(x = HDI, y = meat)) +
geom_smooth(method = "lm", se = FALSE) +
geom_point()+theme_minimal()+
xlab("HDI")+ylab("Meat Consumption kg/cap") + ggtitle("Relationship between HDI and meat consumption 2017")
## `geom_smooth()` using formula 'y ~ x'
regression of HDI and happiness
reg9=lm(happiness~HDI,meathap)
coeftest(reg9,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.07902 0.41082 7.4949 7.242e-12 ***
## HDI 3.28897 0.57725 5.6977 7.048e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
regression of HDI and meat consumption
reg10=lm(meat~HDI,meathap)
coeftest(reg10,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -65.1164 7.0269 -9.2667 3.436e-16 ***
## HDI 156.7815 9.8314 15.9470 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
using hdi as a control
reg11=lm(happiness~meat + log(gdppc) + HDI,meathap)
coeftest(reg11,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9139639 0.8254903 3.5300 0.0005689 ***
## meat 0.0024587 0.0047539 0.5172 0.6058598
## log(gdppc) 0.1479905 0.2032424 0.7281 0.4677842
## HDI 1.5858034 1.6505222 0.9608 0.3383760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
log(gdppc) and urbanisation and hdi as controls
reg12=lm(happiness~meat + log(gdppc)+ urbanpercent + HDI,meathap)
coeftest(reg12,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8729870 0.8474960 3.3900 0.0009186 ***
## meat 0.0025905 0.0048375 0.5355 0.5931871
## log(gdppc) 0.1612179 0.2177280 0.7405 0.4603191
## urbanpercent -0.0012650 0.0069927 -0.1809 0.8567152
## HDI 1.5810209 1.6671631 0.9483 0.3446686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now looking at using log(gdppc) and urbanisation as controls together with the dummy variables for each region. HDI not included due to multicolinearity issues covered below in VIF test.
reg_final=lm(happiness~meat+log(gdppc)+urbanpercent+eu+af+apac+me+na,meathap)
coeftest(reg_final,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2906650 1.0126624 3.2495 0.001472 **
## meat 0.0015903 0.0048544 0.3276 0.743738
## log(gdppc) 0.3315532 0.1501815 2.2077 0.029020 *
## urbanpercent -0.0050643 0.0076356 -0.6632 0.508348
## eu -0.5529239 0.3193726 -1.7313 0.085774 .
## af -0.7891548 0.3613715 -2.1838 0.030773 *
## apac -0.3574374 0.3833506 -0.9324 0.352857
## me -0.2206237 0.3859988 -0.5716 0.568603
## na -0.0727212 0.3471850 -0.2095 0.834417
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Determining whether it is necessary to robust standard errors via a coeftest() with vcov=vcovHC.
bptest(reg_final)
##
## studentized Breusch-Pagan test
##
## data: reg_final
## BP = 9.6104, df = 8, p-value = 0.2934
The H0 of the Breusch Pagan test is that no heteroscedasticity is present. If the pvalue <0.05 we can reject the null however here that is not the case and therefore we cannot reject the null hypothesis and heteroscedasticity is not present
Using vif to test for imperfect multicolinearity between x variables. First a full regression including HDI as a control variable is shown.
reg_final_HDI=lm(happiness~meat+log(gdppc)+HDI+urbanpercent+eu+af+apac+me+na,meathap)
coeftest(reg_final_HDI,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2783660 1.0272585 3.1914 0.001779 **
## meat 0.0014681 0.0049479 0.2967 0.767164
## log(gdppc) 0.2686708 0.2250785 1.1937 0.234795
## HDI 0.7914173 2.2002882 0.3597 0.719669
## urbanpercent -0.0053223 0.0077316 -0.6884 0.492453
## eu -0.5759524 0.3319270 -1.7352 0.085098 .
## af -0.7422288 0.4044310 -1.8352 0.068773 .
## apac -0.3671534 0.3866063 -0.9497 0.344049
## me -0.2208000 0.3882839 -0.5687 0.570578
## na -0.0571791 0.3548496 -0.1611 0.872238
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now demonstrating the multicolinearity problem between HDI and log(gdppc).
vif(reg_final_HDI)
## meat log(gdppc) HDI urbanpercent eu af
## 3.501166 14.498592 15.623224 3.542206 4.520735 4.897289
## apac me na
## 2.872000 2.564835 2.129123
vif(reg_final)
## meat log(gdppc) urbanpercent eu af apac
## 3.487242 5.854780 3.510446 4.402332 4.445861 2.861018
## me na
## 2.564832 2.109459
As log(gdppc) and HDI are both >10 there is a multicolinearity problem between the two. Therefore we removed HDI as a control variable and all x variables have vif<10 now. Hence, why HDI is not included in our final regressions.
Using data from 2010 to 2017 for all variables. However, for certain countries happiness data was not available for every year, hence the need to remove NA values from the panel dataset.
panelmeathap <- read.csv("meathappy_plot.csv", as.is=TRUE, na.strings=c(NA, "NA", " NA"))
removing the NA values from the panel data
panelmeathap2 <- panelmeathap[complete.cases(panelmeathap[ ,c("meat", "happiness", "gdppc", "urbanpercent", "HDI", "wworkpercent")]), ]
removing countries with less than 3 years worth of data in order to have a time series was at least 3 years for all countries in the panel regression
panelmeathap3 = panelmeathap2[panelmeathap2$name != "Central African Republic" & panelmeathap2$name != "Jamaica" & panelmeathap2$name != "Lao People's Democratic Republic" & panelmeathap2$name != "Lesotho" & panelmeathap2$name != "Oman" & panelmeathap2$name != "Mozambique" & panelmeathap2$name != "Sri Lanka" & panelmeathap2$name != "Trinidad and Tobago"& panelmeathap2$name != "Gambia"& panelmeathap2$name != "Iceland"& panelmeathap2$name != "Morocco"& panelmeathap2$name != "Namibia"& panelmeathap2$name != "Nigeria", ]
The ordinary least squares (ols) regression used to compare the fixed effects plm regression to in the pFtest.
#OLS regression
ols=lm(happiness~meat+log(gdppc)+urbanpercent+eu+af+apac+me+na,panelmeathap3)
summary(ols)
##
## Call:
## lm(formula = happiness ~ meat + log(gdppc) + urbanpercent + eu +
## af + apac + me + na, data = panelmeathap3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72734 -0.41828 0.01015 0.44564 3.06278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.048315 0.232832 4.502 7.54e-06 ***
## meat 0.001991 0.001409 1.413 0.15806
## log(gdppc) 0.597172 0.036789 16.232 < 2e-16 ***
## urbanpercent -0.005207 0.001817 -2.867 0.00424 **
## eu -0.764378 0.092149 -8.295 3.63e-16 ***
## af -0.856208 0.098869 -8.660 < 2e-16 ***
## apac -0.528375 0.102489 -5.155 3.07e-07 ***
## me -0.461010 0.100300 -4.596 4.87e-06 ***
## na 0.154802 0.108681 1.424 0.15466
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.685 on 964 degrees of freedom
## Multiple R-squared: 0.638, Adjusted R-squared: 0.635
## F-statistic: 212.4 on 8 and 964 DF, p-value: < 2.2e-16
#renaming a column heading
colnames(panelmeathap3)[1] <- gsub('^...','',colnames(panelmeathap3)[1])
fixed effects plm:
fixed <- plm(happiness~meat+log(gdppc)+urbanpercent, data=panelmeathap3,
index=c("location","year"),
model="within", effect="twoways")
coeftest(fixed,vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## meat 0.0108945 0.0043433 2.5083 0.01232 *
## log(gdppc) 0.2270784 0.2005704 1.1322 0.25789
## urbanpercent 0.0037451 0.0036471 1.0269 0.30477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test to demonstrate that a fixed effects plm is more effective than OLS for panel data
pFtest(fixed, ols)
##
## F test for twoways effects
##
## data: happiness ~ meat + log(gdppc) + urbanpercent
## F = 6.1013, df1 = 128, df2 = 836, p-value < 2.2e-16
## alternative hypothesis: significant effects
pFtest is a test that shows a p values <0.05 showing that the fixed effects model is a much better choice.
Using the Hausman test to determine whether to used a fixed or random effects plm.
random <- plm(happiness~meat+log(gdppc)+urbanpercent, data=panelmeathap3,
index=c("location","year"),
model="random")
coeftest(random, vcov=vcovHC)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2364657 0.3879970 3.1868 0.001485 **
## meat 0.0066635 0.0030271 2.2013 0.027953 *
## log(gdppc) 0.4220301 0.0638544 6.6093 6.367e-11 ***
## urbanpercent 0.0034853 0.0029692 1.1738 0.240757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Hausman test
phtest(fixed, random)
##
## Hausman Test
##
## data: happiness ~ meat + log(gdppc) + urbanpercent
## chisq = 2.9657, df = 3, p-value = 0.3969
## alternative hypothesis: one model is inconsistent
As the p-value of the hausman test is <0.05 we can use the fixed effects model.
Using a Dickey-Fuller test to test for stationarity of the time series data.
panel.set <- pdata.frame(panelmeathap3, index = c("location", "year"))
adf.test(panel.set$happiness, k=2)
## Warning in adf.test(panel.set$happiness, k = 2): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: panel.set$happiness
## Dickey-Fuller = -9.324, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary
P-value is < 0.05 and therefore the data is stationary and it is not necessary to first difference the time series.