library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplR)
# ---- Assemble the 2014 country-level data set ------------------------------
# Every source table is reduced to a `country` key plus a single measure,
# the tables are joined with merge() (default arguments, i.e. only countries
# present in every table survive), and rows with any NA are dropped.

# Region lookup: keep columns 1 and 6 of the file; column 1 is the join key.
regions <- read.csv("Country_Regions.csv")[, c(1, 6)]
names(regions)[1] <- "country"

# Share of CO2 emissions: the table has a Year column, so keep only 2014 rows.
annualShareCO2 <- read.csv("annual-share-of-co2-emissions.csv")
names(annualShareCO2) <- c("Entity", "Code", "Year", "Share")
share2014 <- annualShareCO2 %>%
  filter(Year == 2014) %>%
  select(country = Entity, CO2share = Share)

# The remaining tables expose the 2014 values in a column named X2014;
# select it together with the key and rename it in the same step.
popGrowth <- read.csv("population_growth_annual_percent.csv")
pop2014 <- select(popGrowth, country, popGrowthFactor = X2014)

GDPperCap <- read.csv("income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
GDP2014 <- select(GDPperCap, country, GDP = X2014)

highTechExports <- read.csv("high_technology_exports_percent_of_manufactured_exports.csv")
highTech2014 <- select(highTechExports, country, highTechExports = X2014)

CO2perPerson <- read.csv("co2_emissions_tonnes_per_person.csv")
CO22014 <- select(CO2perPerson, country, CO2pp = X2014)

# Fold merge() over the tables from left to right, then discard incomplete rows.
sourceTables <- list(regions, share2014, pop2014, GDP2014, highTech2014, CO22014)
GlobalFactors <- drop_na(Reduce(merge, sourceTables))
# Simple linear regression of per-capita CO2 emissions on population growth.
GlobalFactorsMod <- lm(CO2pp ~ popGrowthFactor, data = GlobalFactors)
#summary(GlobalFactorsMod)

# Scatter plot with the fitted regression line.
# The predictor goes on the x axis and the response on the y axis so that the
# model's intercept and slope describe the line actually drawn. geom_abline()
# takes `intercept` (not `yintercept`); the misspelled argument was ignored.
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp)) +
  geom_point() +
  geom_abline(
    intercept = coef(GlobalFactorsMod)[["(Intercept)"]],
    slope = coef(GlobalFactorsMod)[["popGrowthFactor"]]
  )
Because the p-value for the t-statistic is above 0.05, there does not seem to be a significant linear relationship between popGrowthFactor and CO2pp. We can see this in the scatter plot as well.
# Display the contrast (indicator) coding R assigns to the region factor.
contrasts(GlobalFactors[["region"]])
## Africa Americas Asia Europe Oceania
## 0 0 0 0 0
## Africa 1 0 0 0 0
## Americas 0 1 0 0 0
## Asia 0 0 1 0 0
## Europe 0 0 0 1 0
## Oceania 0 0 0 0 1
# Model with region as the only (categorical) predictor of per-capita CO2.
GlobalFactorsMod1 <- lm(formula = CO2pp ~ region, data = GlobalFactors)
summary(object = GlobalFactorsMod1)
##
## Call:
## lm(formula = CO2pp ~ region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.217 -2.794 -1.101 0.778 36.899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.371 1.227 1.117 0.26624
## regionAmericas 3.183 1.874 1.699 0.09219 .
## regionAsia 7.130 1.706 4.180 5.81e-05 ***
## regionEurope 5.121 1.680 3.048 0.00287 **
## regionOceania 5.093 2.920 1.744 0.08394 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.492 on 112 degrees of freedom
## Multiple R-squared: 0.1454, Adjusted R-squared: 0.1148
## F-statistic: 4.763 on 4 and 112 DF, p-value: 0.001382
# Analysis-of-variance table for the region-only model.
anova(object = GlobalFactorsMod1)
## Analysis of Variance Table
##
## Response: CO2pp
## Df Sum Sq Mean Sq F value Pr(>F)
## region 4 802.8 200.710 4.7627 0.001382 **
## Residuals 112 4719.9 42.142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Box plot of per-capita CO2 for each region, filled by region.
ggplot(
  data = GlobalFactors,
  mapping = aes(x = region, y = CO2pp, fill = region)
) +
  geom_boxplot()
# Additive model: a common popGrowthFactor slope plus a separate offset per region.
GlobalFactorsMod2 <- lm(
  formula = CO2pp ~ popGrowthFactor + region,
  data = GlobalFactors
)
summary(object = GlobalFactorsMod2)
##
## Call:
## lm(formula = CO2pp ~ popGrowthFactor + region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.857 -2.792 -1.268 1.188 31.316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.5031 1.7214 -1.454 0.14874
## popGrowthFactor 1.6250 0.5248 3.096 0.00248 **
## regionAmericas 5.4147 1.9446 2.785 0.00630 **
## regionAsia 7.8774 1.6616 4.741 6.36e-06 ***
## regionEurope 8.7487 1.9985 4.378 2.73e-05 ***
## regionOceania 7.3962 2.9112 2.541 0.01245 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.256 on 111 degrees of freedom
## Multiple R-squared: 0.2133, Adjusted R-squared: 0.1779
## F-statistic: 6.02 on 5 and 111 DF, p-value: 5.75e-05
# Scatter plot with the parallel fitted lines of GlobalFactorsMod2, one per region.
# The line parameters are read from the fitted model instead of being retyped
# from the printed summary, so they are unrounded and follow the data.
mod2Coef <- coef(GlobalFactorsMod2)
mod2Regions <- GlobalFactorsMod2$xlevels[["region"]]
# The first level is the baseline (offset 0); every other level adds its own
# `region<level>` coefficient to the intercept. The slope is shared.
mod2Offsets <- c(0, unname(mod2Coef[paste0("region", mod2Regions[-1])]))
mod2Lines <- data.frame(
  region = mod2Regions,
  intercept = mod2Coef[["(Intercept)"]] + mod2Offsets,
  slope = mod2Coef[["popGrowthFactor"]]
)
# Mapping colour to region puts the lines on the same colour scale as the
# points, so each line can be matched to its region in the legend.
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  geom_abline(
    data = mod2Lines,
    mapping = aes(intercept = intercept, slope = slope, color = region)
  )
# Interaction model: both the intercept and the popGrowthFactor slope may
# differ between regions.
GlobalFactorsMod3 <- lm(
  formula = CO2pp ~ popGrowthFactor * region,
  data = GlobalFactors
)
summary(object = GlobalFactorsMod3)
##
## Call:
## lm(formula = CO2pp ~ popGrowthFactor * region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1125 -2.5210 -0.5697 0.9494 28.5846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7095 3.8850 1.470 0.1446
## popGrowthFactor -1.8200 1.5584 -1.168 0.2455
## regionAmericas 3.4369 5.0200 0.685 0.4951
## regionAsia -1.8647 4.2056 -0.443 0.6584
## regionEurope 0.5755 4.0319 0.143 0.8868
## regionOceania -8.6828 7.3300 -1.185 0.2388
## popGrowthFactor:regionAmericas -2.7244 3.2616 -0.835 0.4054
## popGrowthFactor:regionAsia 4.2399 1.6744 2.532 0.0128 *
## popGrowthFactor:regionEurope 3.1851 1.9631 1.622 0.1076
## popGrowthFactor:regionOceania 11.5853 6.1114 1.896 0.0607 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.013 on 107 degrees of freedom
## Multiple R-squared: 0.2994, Adjusted R-squared: 0.2405
## F-statistic: 5.081 on 9 and 107 DF, p-value: 1.028e-05
# Scatter plot with the region-specific fitted lines of GlobalFactorsMod3.
# Intercepts and slopes are derived from the fitted model instead of being
# retyped from the printed summary, so they are unrounded and follow the data.
mod3Coef <- coef(GlobalFactorsMod3)
mod3Regions <- GlobalFactorsMod3$xlevels[["region"]]
mod3NonBase <- mod3Regions[-1]
# The first level is the baseline (no adjustment). Every other level adds its
# `region<level>` term to the intercept and its `popGrowthFactor:region<level>`
# interaction term to the slope.
mod3Lines <- data.frame(
  region = mod3Regions,
  intercept = mod3Coef[["(Intercept)"]] +
    c(0, unname(mod3Coef[paste0("region", mod3NonBase)])),
  slope = mod3Coef[["popGrowthFactor"]] +
    c(0, unname(mod3Coef[paste0("popGrowthFactor:region", mod3NonBase)]))
)
# Mapping colour to region puts the lines on the same colour scale as the
# points, so each line can be matched to its region in the legend.
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  geom_abline(
    data = mod3Lines,
    mapping = aes(intercept = intercept, slope = slope, color = region)
  )
The slopes of the fitted lines from the model that includes the interaction with region differ noticeably from those of the model without it, as evidenced by the lines on the graph. This makes sense and is supported by the p-values we see (most clearly for the popGrowthFactor:regionAsia term, p = 0.0128): the relationship between the population growth factor and CO2 per person depends on the geographic region the country is in.
Sales: Numeric; unit sales (in thousands) at each location.
Price: Numeric; the price of each car seat at each location.
Urban: Categorical (Yes/No); indicates whether the store is in an urban (Yes) or rural (No) area.
US: Categorical (Yes/No); indicates whether the store is in the US (Yes) or not (No).
#summary(Carseats)
# attach() is retained so that any later code relying on the attached
# Carseats columns keeps working; the model below no longer depends on it.
attach(Carseats)
# Multiple regression of Sales on Price, Urban, and US.
# `data` is passed explicitly so the formula's variables are always taken from
# Carseats rather than from whatever object of the same name happens to come
# first on the search path.
m1 <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(m1)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
Price: Because the p-value of the t-statistic is low, there seems to be a relationship between price and sales. This relationship is negative.
Urban: There does not seem to be a linear relationship between sales and urban location. The p-value (0.936) is much larger than 0.05.
US: Because the p-value of the t-statistic is low, there seems to be a relationship between sales and whether the store is located in the US. This relationship is positive.
Sales = 13.04 - 0.05 Price - 0.02 UrbanYes + 1.20 USYes, where UrbanYes and USYes equal 1 when the store is urban / in the US and 0 otherwise.
Because their p-values are below 0.05, we can reject the null hypothesis (that the coefficient is zero) for Price and US.
# Reduced model: Sales regressed on Price and US only (Urban omitted).
# `data` is passed explicitly so the fit does not depend on Carseats having
# been attached to the search path earlier in the session.
m2 <- lm(Sales ~ Price + US, data = Carseats)
summary(m2)
##
## Call:
## lm(formula = Sales ~ Price + US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
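Both models have an R-squared of 0.2393, meaning each explains only about 24% of the variance in Sales, so neither fits the data especially well. Dropping Urban costs nothing in fit: the reduced model has a slightly higher adjusted R-squared (0.2354 vs. 0.2335) and a slightly lower residual standard error (2.469 vs. 2.472).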
# 95% confidence intervals for the coefficients of the full model m1.
confint(m1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.76359670 14.32334118
## Price -0.06476419 -0.04415351
## UrbanYes -0.55597316 0.51214085
## USYes 0.69130419 1.70984121
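The 95% confidence intervals for US (0.69 to 1.71) and Urban (-0.56 to 0.51) are much wider than the interval for Price (-0.065 to -0.044), reflecting the larger standard errors of those coefficients; note that the variables are measured on different scales. The intervals for Price and US exclude zero, which agrees with their significant t-tests, while the interval for Urban contains zero. Overall, these confidence intervals are relatively narrow.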