# LOAD IN DATA (You need to change the file path)
dat2 = readxl::read_excel("//Users//mwilandhlovu//Desktop//ECONS2000//World_data.xlsx", sheet = "Data")
dat2
## # A tibble: 1,302 × 3
## `Country Name` `Series Name` `2020`
## <chr> <chr> <chr>
## 1 Afghanistan Total greenhouse gas emissions (kt of CO2 equivalent) 31119.0…
## 2 Afghanistan GDP per capita (current US$) 516.866…
## 3 Afghanistan Population, total 38972230
## 4 Afghanistan Government expenditure on education, total (% of GDP) 2.86085…
## 5 Afghanistan Population density (people per sq. km of land area) 59.7522…
## 6 Afghanistan Urban population (% of total population) 26.026
## 7 Albania Total greenhouse gas emissions (kt of CO2 equivalent) 8304.29…
## 8 Albania GDP per capita (current US$) 5343.03…
## 9 Albania Population, total 2837849
## 10 Albania Government expenditure on education, total (% of GDP) 3.09999…
## # ℹ 1,292 more rows
# CONVERT DATA FROM LONG TO WIDE FORM
datw = spread(dat2, "Series Name", "2020")
datw
## # A tibble: 217 × 7
## `Country Name` `GDP per capita (current US$)` Government expenditure o…¹
## <chr> <chr> <chr>
## 1 Afghanistan 516.86679735172572 2.8608593940734899
## 2 Albania 5343.0377039955974 3.0999999046325701
## 3 Algeria 3354.1573026511446 7.04239749908447
## 4 American Samoa 15501.526337439651 ..
## 5 Andorra 37207.222000009482 ..
## 6 Angola 1502.9507541451751 2.41519999504089
## 7 Antigua and Barbuda 15284.772383537815 3.4531199932098402
## 8 Argentina 8496.4281567550042 5.0160498619079599
## 9 Armenia 4505.8677455263787 2.7056701183319101
## 10 Aruba 24487.863569428024 ..
## # ℹ 207 more rows
## # ℹ abbreviated name: ¹`Government expenditure on education, total (% of GDP)`
## # ℹ 4 more variables:
## # `Population density (people per sq. km of land area)` <chr>,
## # `Population, total` <chr>,
## # `Total greenhouse gas emissions (kt of CO2 equivalent)` <chr>,
## # `Urban population (% of total population)` <chr>
# RENAME VARIABLES
datw = rename(datw, GHG = "Total greenhouse gas emissions (kt of CO2 equivalent)",
GDPpc = "GDP per capita (current US$)",
Pop = "Population, total",
EDU = "Government expenditure on education, total (% of GDP)",
PopDen = "Population density (people per sq. km of land area)",
UrbPop = "Urban population (% of total population)")
# DROP MISSING OBSERVATIONS
datw[datw==".."]=NA
datw=na.omit(datw)
# CHANGE DATA TYPE FROM CHARACTER INTO NUMERIC
class(datw$GHG)="double"
class(datw$GDPpc)="double"
class(datw$Pop)="double"
class(datw$EDU)="double"
class(datw$PopDen)="double"
class(datw$UrbPop)="double"
# DISPLAY 1ST 6 ROWS
head(datw)
## # A tibble: 6 × 7
## `Country Name` GDPpc EDU PopDen Pop GHG UrbPop
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 517. 2.86 59.8 38972230 31119. 26.0
## 2 Albania 5343. 3.10 104. 2837849 8304. 62.1
## 3 Algeria 3354. 7.04 18.2 43451666 266703. 73.7
## 4 Angola 1503. 2.42 26.8 33428486 70781. 66.8
## 5 Antigua and Barbuda 15285. 3.45 211. 92664 1204. 24.4
## 6 Argentina 8496. 5.02 16.6 45376763 361433. 92.1
# DIMENSION OF DATA
c(nrow(datw),ncol(datw))
## [1] 150 7
# CREATE GHG PER CAPITA
datw$GHGpc = datw$GHG/datw$Pop*10^6
##Scatter Plot of greenhouse gas emissions against GDP per capita
datw$GHGpc<-datw$GHG*1000000
datw$GHGpc<-datw$GHGpc/datw$Pop
plot(datw$GHGpc,datw$GDPpc, main = "Scatter Plot of GHG per captia against
GDP per capita",
xlab = "GDP per Capita", ylab = "GHG per captia (KG)",
pch = 1, frame = FALSE)
##Consturction of the 90% Confidence interval of GHGpc
#Sample mean of GHGpc
x1=datw$GHGpc
xbar=sum(x1/length(x1))
xbar
## [1] 6130.624
#Sample variance of GHGpc
xvar= sum((x1-xbar)^2)/(length(x1)-1)
xvar
## [1] 41592842
#Sample standard deviation of GHGpc
sdx=sqrt(xvar)
sdx
## [1] 6449.251
#Critial values 90% CI of GHGpc
qt(0.05,length(x1-1))
## [1] -1.655076
qt(0.95,length(x1-1))
## [1] 1.655076
#Construction of 90% CI of GHGpc
xbar+qt(0.05,length(x1)-1)*sqrt(xvar^2/length(x1))
## [1] -5614809
xbar+qt(0.95,length(x1)-1)*sqrt(xvar^2/length(x1))
## [1] 5627070
Interpretation of the CI
The confidence interval means we are 90% sure that the population mean will fall between the values of -5614809 and 5627070. We this assume is based on this particular sample set.However if a different sample set were to be used with different sample means, variance and standard deviations then the estimated values would change.
GDPpc<-datw$GDPpc
Pop<-datw$Pop
EDU<-datw$EDU
PopDen<-datw$PopDen
Urbpop<-datw$UrbPop
GDPpc2<-((GDPpc)^2) #GDP squared
mrl<-lm(GHGpc~GDPpc+Pop+EDU+PopDen+Urbpop+GDPpc2,data = datw)
summary(mrl)
##
## Call:
## lm(formula = GHGpc ~ GDPpc + Pop + EDU + PopDen + Urbpop + GDPpc2,
## data = datw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8908.5 -2637.8 -909.4 944.1 28124.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.422e+02 1.562e+03 0.603 0.54726
## GDPpc 2.943e-01 6.354e-02 4.631 8.1e-06 ***
## Pop 6.694e-07 2.608e-06 0.257 0.79778
## EDU -3.147e+02 2.269e+02 -1.387 0.16747
## PopDen -1.388e-01 6.448e-01 -0.215 0.82992
## Urbpop 6.444e+01 2.423e+01 2.659 0.00873 **
## GDPpc2 -2.367e-06 7.123e-07 -3.323 0.00113 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5243 on 143 degrees of freedom
## Multiple R-squared: 0.3657, Adjusted R-squared: 0.3391
## F-statistic: 13.74 on 6 and 143 DF, p-value: 2.724e-12
Estimated sample regression equation \[ Yi=b0+b1GDPpc+b2Pop+b3EDU+b4Popden+b5Urbpop+b6GDP^2+ei \] Estimated sample regression equation for GHG emissions per capita (in kg) \[ GHGpc = 942.2+0.2943GDPpc+0.0000006694Pop-314.7395EDU-0.1387601Popden+64.4396Urbpop-0.00000367GDPpc^2+ei \]
##Interpretation on R-squared and Standard error of regression R squared equation: \[ R^2=SSE/SST \] In multiple linear regression r squared must be adjusted to be properly interpreted. The adjusted r squared accounts for the multiple added variables that would affect to the r squared value. In this model the adjusted R squared is 0.3391 suggests 33% of the variation in GHG per captia is explained by the explanatory variables (DP per capita, GDP per capita squared, government expenditure on education, population density, and the share of population living in urban areas are the explanatory Values).
This also means that there is still 67% of the variation in GHGpc that is not accounted for. The magnitude of the data compared to the R squared also tell to the goodness of fit.
The standard error of regression measure the average amount the observed data deviates from the predicted values. In this model the standard error is 5243, meaning on average the actual data is 5243 unit away from the predicted value. Generally the higher to number is, the less precise the model’s predicts values are. Hence suggesting that this model is not a good indication of the data.
When put together, the adjusted r squared and the standard error suggest that the model is not a good fit for the data. This also suggests that there may be other important variables that need to be included to make the model more accurate.
##Interpretation of estimated coefficients EDU and PoPDen
Each slope coefficient variable represents how much dependent variable changes when associated variable increases by 1 unit holding other variables constant.
summary(mrl)$coefficients[4,1]#the slope of EDU pulled from the regression table
## [1] -314.7395
summary(mrl)$coefficients[5,1]#the slope of Popden pulled from the regression
## [1] -0.1387601
\[ \displaystyle \frac{\partial GHGpc}{\partial EDU} = -314.7395 \] The slope coefficient for EDU is -314.7395. This indicates that, holding all other variables constant, a one-unit increase in education expenditure (EDU) is associated with a decrease in GHGpc (greenhouse gas emissions per capita) by 314.7395 units. The coefficient is significantly larger than zero meaning the predicted values are most likely not very accurate.
\[ \displaystyle \frac{\partial GHGpc}{\partial PopDen} = -0.1387601 \] The slope coefficient for PopDen is -0.1387601. This means that, holding all other variables constant, a one-unit increase in population density (PopDen) is associated with a decrease in GHGpc by approximately 0.1387601 units.
##Test significance of EDU coefficient
Step 1: state the null hypothesis
The null is the true population coefficient for EDU is less than or equal to 0. \[ H_0: \beta_1 EDU <0 \] The alternative Hypothesis is that coefficient for EDU is greater 0 \[ H_A : \beta_1 EDU >0 \] Step 2: Find the critical values
qt(.05,length(datw$EDU)-2)
## [1] -1.655215
qt(.95,length(datw$EDU)-2)
## [1] 1.655215
Step 3: T test statistics
\[ t. test = b1-0/se(b_1) \]
seEDU=summary(mrl)$coefficients[2,4]
EDU_estimate <-summary(mrl)$coefficients[1,4]
sum(EDU_estimate-0)/(seEDU*(EDU_estimate))
## [1] 123469.6
Step 4: 123469.6 is significantly bigger than the critical value. So we must reject the null.
##Construction of the 99% confidence interval of UrbPop coefficient
# critical values of 99% CI for Urbpop coefficient
qt(0.005,150-2)
## [1] -2.609456
qt(0.995,150-2)
## [1] 2.609456
#(se) is the standard error of Urbpop
se1<-summary(mrl)$coefficients[6,2]
se1
## [1] 24.23331
#b1 is the estimate of Urbpop (slope)
b1<-summary(mrl)$coefficients[6,1]
b1
## [1] 64.4396
# manual calculation
#P(-2.609228<t<2.609228)=0.99
#P(b1-1.976*(se)) < B1 < b1+1.976*(se))=0.99
#P(64.4396-2.609228*24.23331 <B1< 64.4396+2.609228*24.23331)=0.99
#Lower bound of 95% CI
b1+qt(0.005,150-2)*se1
## [1] 1.203826
#upper bound of 95% CI
b1+qt(0.995,150-2)*se1
## [1] 127.6754
#P(1.203826 < B1 < 127.6754)=0.99
\[P(1.203826 < B1 < 127.6754)=0.99 \] Based on the data and statistical analysis, we can say with 99% confidence that the true value of the parameter falls within the range from 1.203826 to 127.6754. This interval provides a range of values for that we believe captures the true effect or relationship represented by this parameter in the population. We cannot say with certainty that is exactly any specific value within this interval, but we are reasonably confident that it falls within this range.
confint(mrl,level = 0.99)
## 0.5 % 99.5 %
## (Intercept) -3.134830e+03 5.019151e+03
## GDPpc 1.283811e-01 4.601495e-01
## Pop -6.138157e-06 7.476868e-06
## EDU -9.069668e+02 2.774878e+02
## PopDen -1.822128e+00 1.544608e+00
## Urbpop 1.174972e+00 1.277042e+02
## GDPpc2 -4.226187e-06 -5.070697e-07
##Plot predicted CO2 over the range of GDP
# Create a sequence of GDP values from 0 to 120,000 in $1,000 increments
GDP_values <- seq(0, 120000, by = 1000)
# Get the sample mean values of PopDen and UrbPop from your dataset
PopDen<-datw$PopDen
Urbpop<-datw$UrbPop
PopDen_mean <- mean(datw$PopDen)
UrbPop_mean <- mean(datw$UrbPop)
# Define the coefficients of regression model
b0 <- 9.422e+02
beta_GDP <- 2.943e-01
beta_Pop <- 6.694e-07
beta_EDU <- -3.147e+02
beta_PopDen <- -1.388e-01
beta_Urbpop <- 6.444e+01
beta_GDP2 <- -2.367e-06
#Initialize an empty vector to store predicted GHGpc values
predicted_GHGpc <- vector()
# Loop through GDP values and calculate predicted GHGpc
for (gdp in GDPpc) {
predicted <- b0+ beta_GDP * gdp + beta_Pop * mean(datw$Pop) +
beta_EDU * mean(datw$EDU) + beta_PopDen * PopDen_mean +
beta_Urbpop * UrbPop_mean + beta_GDP2 * (gdp^2)
predicted_GHGpc <- c(predicted_GHGpc, predicted)
}
# Create a data frame with GDP and predicted GHGpc values
predicted_data <- data.frame(GDP = GDPpc, Predicted_GHGpc = predicted_GHGpc)
predicted_data
## GDP Predicted_GHGpc
## 1 516.8668 3466.870
## 2 5343.0377 4820.271
## 3 3354.1573 4275.887
## 4 1502.9508 3752.360
## 5 15284.7724 7260.708
## 6 8496.4282 5645.015
## 7 4505.8677 4593.408
## 8 51722.0690 12205.060
## 9 48809.2269 12040.943
## 10 4229.9106 4517.900
## 11 23998.2680 9014.883
## 12 23433.1872 8912.022
## 13 2233.3059 3960.844
## 14 16643.8066 7557.963
## 15 6542.8645 5139.624
## 16 45517.9038 11807.168
## 17 5266.8762 4799.769
## 18 1237.9493 3676.089
## 19 3009.9255 4179.765
## 20 3068.8126 4196.248
## 21 5875.0706 4962.721
## 22 10153.4766 6059.535
## 23 833.2443 3558.969
## 24 216.8274 3379.089
## 25 3223.7356 4239.535
## 26 1577.9117 3773.874
## 27 1539.1305 3762.747
## 28 43349.6779 11625.145
## 29 435.4692 3443.098
## 30 643.7722 3503.869
## 31 10408.7191 6122.230
## 32 5304.2891 4809.844
## 33 524.6667 3469.146
## 34 1838.4481 3848.443
## 35 12179.2567 6548.636
## 36 2349.0699 3993.658
## 37 14236.5352 7025.460
## 38 9499.5902 5897.514
## 39 28035.9844 9705.878
## 40 22992.8794 8830.825
## 41 60915.4244 12459.597
## 42 7003.4699 5260.411
## 43 7167.9149 5303.292
## 44 5645.1993 4901.338
## 45 3571.5569 4336.304
## 46 3961.7266 4444.174
## 47 23595.2437 8941.676
## 48 3372.9046 4281.106
## 49 4864.1060 4690.893
## 50 49169.7193 12063.432
## 51 39055.2829 11198.937
## 52 6680.0827 5175.713
## 53 704.0305 3521.411
## 54 4255.7430 4524.984
## 55 46772.8254 11902.352
## 56 17658.9473 7774.295
## 57 4609.8973 4621.780
## 58 1073.6593 3628.638
## 59 710.2581 3523.223
## 60 1283.1412 3689.120
## 61 2354.1215 3995.089
## 62 16125.6094 7445.652
## 63 58813.7970 12436.687
## 64 1913.2197 3869.785
## 65 3895.6182 4425.947
## 66 2746.4195 4105.806
## 67 85420.1909 11183.477
## 68 44846.7916 11753.206
## 69 31918.6935 10297.553
## 70 4897.2648 4699.885
## 71 39986.9286 11298.816
## 72 3987.6502 4451.315
## 73 9121.6364 5802.941
## 74 1936.2508 3876.353
## 75 24297.7011 9068.776
## 76 1182.5215 3660.094
## 77 2593.3551 4062.693
## 78 18207.1396 7889.089
## 79 5599.9575 4889.228
## 80 939.5042 3589.795
## 81 597.5297 3490.396
## 82 117370.4969 5250.128
## 83 462.4042 3450.968
## 84 622.1846 3497.581
## 85 10160.8293 6061.345
## 86 7282.3729 5333.062
## 87 822.9061 3555.967
## 88 29196.9777 9890.278
## 89 5567.9727 4880.660
## 90 1868.4682 3857.015
## 91 9005.4620 5773.736
## 92 4376.2425 4557.985
## 93 4041.1741 4466.050
## 94 3258.1050 4249.122
## 95 454.0624 3448.531
## 96 4252.0418 4523.969
## 97 10124.7005 6052.448
## 98 1139.1899 3647.580
## 99 52162.5701 12226.383
## 100 41760.5948 11477.608
## 101 1876.6074 3859.338
## 102 564.8220 3480.860
## 103 68340.0181 12373.118
## 104 1322.3148 3700.407
## 105 13293.3332 6809.337
## 106 2446.0844 4021.108
## 107 5353.3480 4823.044
## 108 6063.6266 5012.885
## 109 3224.4228 4239.726
## 110 15816.8204 7378.122
## 111 22242.4064 8690.315
## 112 52315.6601 12233.578
## 113 13047.4577 6752.306
## 114 10194.4414 6069.618
## 115 773.8206 3541.706
## 116 4042.7227 4466.476
## 117 2161.3102 3940.405
## 118 20398.0610 8333.674
## 119 1492.4759 3749.351
## 120 12020.0219 6510.894
## 121 493.4323 3460.029
## 122 61274.0065 12461.418
## 123 19551.6212 8164.607
## 124 25545.2410 9288.745
## 125 2222.4621 3957.767
## 126 5741.6412 4927.122
## 127 26959.6754 9529.228
## 128 18566.0178 7963.469
## 129 8458.0751 5635.267
## 130 4796.5333 4672.551
## 131 52837.9040 12257.288
## 132 85656.3227 11157.352
## 133 852.3302 3564.510
## 134 1104.1644 3637.458
## 135 7001.7854 5259.972
## 136 1660.3083 3797.492
## 137 875.2454 3571.160
## 138 4605.9709 4620.710
## 139 13871.7982 6942.384
## 140 8561.0643 5661.428
## 141 846.8812 3562.928
## 142 3751.7373 4386.208
## 143 37629.1742 11038.089
## 144 40318.4169 11333.363
## 145 63528.6343 12458.920
## 146 15650.4994 7341.562
## 147 1759.3075 3825.826
## 148 2917.7568 4153.933
## 149 3586.3473 4340.406
## 150 956.8317 3594.817
# Plot the predicted values
library(ggplot2)
ggplot(predicted_data, aes(x = GDP, y = Predicted_GHGpc)) +
geom_line() +
labs(x = "GDP", y = "Predicted GHGpc") +
ggtitle("Predicted GHGpc vs. GDP") +
theme_minimal()
The shape of the relationship in the plot will likely exhibit a
curvilinear or U-shaped pattern due to the inclusion of the quadratic
term for GDPpc (GDPpc2 GDPpc2) in the regression model. Initially, as
GDP per capita increases, GHG emissions per capita may rise, but at some
point, further increases in GDP per capita might lead to a slower rate
of increase in GHG emissions per capita or even a decrease. This
curvature is a result of the quadratic term capturing potential
diminishing returns or saturation effects associated with economic
development and environmental impact.
\[ GHGpc=β0+β1×GDPpc+β2×Pop+β3×EDU+β4×PopDen+β5×Urbpop+β6×GDPpc^2+ϵ \]
Formulate the null and alternative hypotheses
Step 1: find the null hypotethsis Null Hypothesis (HO): The true population coefficient for EDU is equal or greater than zero.
Alternative Hypothesis (H1) The true population coefficient fo EDU is less than zero.
\[ H_0:\beta EDU = 0 \] \[ H_1:\beta EDU < 0 \]
Step 2: find the critcal value