Part I A. response = CO2pp; categorical = region; numeric = popGrowthFactor
Units: popGrowthFactor = annual population growth rate (%) for the year, for countries globally
region = “Africa”, “Americas”, “Asia”, “Europe”, “Oceania”
CO2pp= Carbon dioxide emissions in metric tonnes per person globally
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplR)
# Country-to-region lookup: keep only columns 1 and 6 of the regions file and
# call the first one "country" so it can serve as the merge key.
regions <- read.csv("Country_Regions.csv")[, c(1, 6)]
names(regions)[1] <- "country"

# Each country's annual share of CO2 emissions; give the four columns short
# names, then keep the 2014 rows as (country, CO2share).
annualShareCO2 <- read.csv("annual-share-of-co2-emissions.csv", header = TRUE)
names(annualShareCO2) <- c("Entity", "Code", "Year", "Share")
share2014 <- annualShareCO2 %>%
  filter(Year == 2014) %>%
  select(country = Entity, CO2share = Share)
# The four indicator files below are wide tables (one row per country, one
# X<year> column per year). For each, keep the country and its 2014 value and
# rename the value column in the same select() call.

# Annual population growth (%).
popGrowth <- read.csv("population_growth_annual_percent.csv", header = TRUE)
pop2014 <- popGrowth %>%
  select(country, popGrowthFactor = X2014)

# GDP per capita (PPP, inflation adjusted).
GDPperCap <- read.csv("income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
GDP2014 <- GDPperCap %>%
  select(country, GDP = X2014)

# High-technology exports (% of manufactured exports).
highTechExports <- read.csv("high_technology_exports_percent_of_manufactured_exports (2).csv")
highTech2014 <- highTechExports %>%
  select(country, highTechExports = X2014)

# CO2 emissions (tonnes per person).
CO2perPerson <- read.csv("co2_emissions_tonnes_per_person.csv")
CO22014 <- CO2perPerson %>%
  select(country, CO2pp = X2014)
# Combine the region lookup with the five 2014 tables by successive merge()
# calls on their shared column(s), left to right, then drop every row that has
# a missing value in any column.
GlobalFactors <- Reduce(
  merge,
  list(regions, share2014, pop2014, GDP2014, highTech2014, CO22014)
)
GlobalFactors <- drop_na(GlobalFactors)
# Simple linear regression of CO2 per person on population growth alone.
GlobalFactorsMod <- lm(CO2pp ~ popGrowthFactor, data = GlobalFactors)
summary(GlobalFactorsMod)
##
## Call:
## lm(formula = CO2pp ~ popGrowthFactor, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.723 -4.403 -1.444 1.956 37.715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6839 0.8781 5.334 4.87e-07 ***
## popGrowthFactor 0.5598 0.4528 1.236 0.219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.884 on 115 degrees of freedom
## Multiple R-squared: 0.01312, Adjusted R-squared: 0.004538
## F-statistic: 1.529 on 1 and 115 DF, p-value: 0.2188
Because the p-value for the t-statistic (0.219) is above 0.05, there does not seem to be a significant linear relationship between popGrowthFactor and CO2pp.
# Show the dummy (treatment) coding R uses for the region factor.
contrasts(GlobalFactors[["region"]])
## Africa Americas Asia Europe Oceania
## 0 0 0 0 0
## Africa 1 0 0 0 0
## Americas 0 1 0 0 0
## Asia 0 0 1 0 0
## Europe 0 0 0 1 0
## Oceania 0 0 0 0 1
# Regression of CO2 per person on region only; each region coefficient is the
# difference from the baseline region's mean.
GlobalFactorsMod1 <- lm(CO2pp ~ region, data = GlobalFactors)
summary(GlobalFactorsMod1)
##
## Call:
## lm(formula = CO2pp ~ region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.217 -2.794 -1.101 0.778 36.899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.371 1.227 1.117 0.26624
## regionAmericas 3.183 1.874 1.699 0.09219 .
## regionAsia 7.130 1.706 4.180 5.81e-05 ***
## regionEurope 5.121 1.680 3.048 0.00287 **
## regionOceania 5.093 2.920 1.744 0.08394 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.492 on 112 degrees of freedom
## Multiple R-squared: 0.1454, Adjusted R-squared: 0.1148
## F-statistic: 4.763 on 4 and 112 DF, p-value: 0.001382
# ANOVA table for the region-only model: overall F-test for the region term.
anova(GlobalFactorsMod1)
## Analysis of Variance Table
##
## Response: CO2pp
## Df Sum Sq Mean Sq F value Pr(>F)
## region 4 802.8 200.710 4.7627 0.001382 **
## Residuals 112 4719.9 42.142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplots of CO2 per person, one box per region, filled by region.
ggplot(
  data = GlobalFactors,
  mapping = aes(x = region, y = CO2pp, fill = region)
) +
  geom_boxplot()
Because the p-value for the F-statistic (0.001382) is below 0.05, it seems that mean CO2 per person is significantly different across the identified regions.
# Additive model: a common popGrowthFactor slope with a separate intercept
# shift for each region.
GlobalFactorsMod2 <- lm(CO2pp ~ popGrowthFactor + region, data = GlobalFactors)
summary(GlobalFactorsMod2)
##
## Call:
## lm(formula = CO2pp ~ popGrowthFactor + region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.857 -2.792 -1.268 1.188 31.316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.5031 1.7214 -1.454 0.14874
## popGrowthFactor 1.6250 0.5248 3.096 0.00248 **
## regionAmericas 5.4147 1.9446 2.785 0.00630 **
## regionAsia 7.8774 1.6616 4.741 6.36e-06 ***
## regionEurope 8.7487 1.9985 4.378 2.73e-05 ***
## regionOceania 7.3962 2.9112 2.541 0.01245 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.256 on 111 degrees of freedom
## Multiple R-squared: 0.2133, Adjusted R-squared: 0.1779
## F-statistic: 6.02 on 5 and 111 DF, p-value: 5.75e-05
# Scatterplot of CO2pp against popGrowthFactor with the fitted lines of the
# additive model (GlobalFactorsMod2): one line per region, all sharing the
# same slope.
#
# The intercepts and slope are read from the fitted model instead of being
# typed in from the printed summary, so the lines are not rounded and stay
# correct if the data or the model change.
mod2_coefs <- coef(GlobalFactorsMod2)
mod2_regions <- GlobalFactorsMod2$xlevels$region
# The first level is the baseline (offset 0); every other level has a
# "region<level>" coefficient that shifts the intercept.
mod2_offsets <- c(0, unname(mod2_coefs[paste0("region", mod2_regions[-1])]))
mod2_lines <- data.frame(
  region = mod2_regions,
  intercept = mod2_coefs[["(Intercept)"]] + mod2_offsets,
  slope = mod2_coefs[["popGrowthFactor"]]
)
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  # Mapping color to region gives each line the same legend color as its
  # region's points.
  geom_abline(
    data = mod2_lines,
    aes(intercept = intercept, slope = slope, color = region)
  )
# Interaction model: each region gets its own intercept and its own
# popGrowthFactor slope.
GlobalFactorsMod3 <- lm(CO2pp ~ popGrowthFactor * region, data = GlobalFactors)
summary(GlobalFactorsMod3)
##
## Call:
## lm(formula = CO2pp ~ popGrowthFactor * region, data = GlobalFactors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1125 -2.5210 -0.5697 0.9494 28.5846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7095 3.8850 1.470 0.1446
## popGrowthFactor -1.8200 1.5584 -1.168 0.2455
## regionAmericas 3.4369 5.0200 0.685 0.4951
## regionAsia -1.8647 4.2056 -0.443 0.6584
## regionEurope 0.5755 4.0319 0.143 0.8868
## regionOceania -8.6828 7.3300 -1.185 0.2388
## popGrowthFactor:regionAmericas -2.7244 3.2616 -0.835 0.4054
## popGrowthFactor:regionAsia 4.2399 1.6744 2.532 0.0128 *
## popGrowthFactor:regionEurope 3.1851 1.9631 1.622 0.1076
## popGrowthFactor:regionOceania 11.5853 6.1114 1.896 0.0607 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.013 on 107 degrees of freedom
## Multiple R-squared: 0.2994, Adjusted R-squared: 0.2405
## F-statistic: 5.081 on 9 and 107 DF, p-value: 1.028e-05
# Scatterplot of CO2pp against popGrowthFactor with the fitted lines of the
# interaction model (GlobalFactorsMod3): each region has its own intercept
# and its own slope.
#
# The intercepts and slopes are read from the fitted model instead of being
# typed in from the printed summary, so the lines are not rounded and stay
# correct if the data or the model change.
mod3_coefs <- coef(GlobalFactorsMod3)
mod3_regions <- GlobalFactorsMod3$xlevels$region
mod3_other <- mod3_regions[-1]
# The first level is the baseline (offsets 0). Every other level adds a
# "region<level>" term to the intercept and a "popGrowthFactor:region<level>"
# term to the slope.
mod3_int_offsets <- c(0, unname(mod3_coefs[paste0("region", mod3_other)]))
mod3_slope_offsets <- c(
  0,
  unname(mod3_coefs[paste0("popGrowthFactor:region", mod3_other)])
)
mod3_lines <- data.frame(
  region = mod3_regions,
  intercept = mod3_coefs[["(Intercept)"]] + mod3_int_offsets,
  slope = mod3_coefs[["popGrowthFactor"]] + mod3_slope_offsets
)
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  # Mapping color to region gives each line the same legend color as its
  # region's points.
  geom_abline(
    data = mod3_lines,
    aes(intercept = intercept, slope = slope, color = region)
  )
Part II.
library(ISLR)
# Load the Carseats data set that ships with ISLR and list its columns.
data("Carseats", package = "ISLR")
colnames(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
# Inspect the type of each column (numeric vs. factor) before modelling.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
Sales = numeric: the thousands of unit sales at each location
Price = numeric: the price of each carseat at each location
Urban = categorical: either urban or rural location
US = categorical: either US or not
# Full model: Sales explained by Price plus the Urban and US indicators.
m1 <- lm(
  Sales ~ Price + Urban + US,
  data = Carseats
)
summary(m1)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
2C. Price: Because the p-value for the t-statistic is below 0.05, there seems to be a significant relationship between price and sales.
Urban: Because the p-value for the t-statistic is above 0.05, there does not seem to be a significant relationship between urban location and sales.
US: Because the p-value for the t-statistic is below 0.05, there seems to be a significant relationship between US location and sales.
2D. Sales = 13.04 - 0.05 Price - 0.02 Urban + 1.20 US, where Urban and US are indicator variables equal to 1 for "Yes" and 0 for "No".
2E. Because the p-value is below 0.05 for both Price and US, we can reject the null hypothesis that the coefficient is zero for each of them. We cannot reject it for Urban (p = 0.936).
# Reduced model: the same as m1 but without the Urban predictor.
m2 <- lm(
  Sales ~ Price + US,
  data = Carseats
)
summary(m2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
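2G. The R-squared for both models is 0.2393, so each explains only about 24% of the variability in Sales, which is a modest fit. The smaller model is slightly better once the number of predictors is taken into account: its adjusted R-squared is 0.2354 versus 0.2335, and its residual standard error is 2.469 versus 2.472.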
# 95% confidence intervals for every coefficient of the full model.
confint(m1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.76359670 14.32334118
## Price -0.06476419 -0.04415351
## UrbanYes -0.55597316 0.51214085
## USYes 0.69130419 1.70984121
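The confidence intervals for the coefficients come from a procedure that captures the true coefficient in 95% of the samples we could draw, so each interval gives a range of plausible values for the relationship between Sales and Price, Urban, or US. The intervals for Price and US do not contain 0, which agrees with their significant t-tests, while the interval for Urban (-0.56 to 0.51) does contain 0. The intervals for Urban and US are much wider than the interval for Price, and Urban's is the widest, suggesting more uncertainty about those two coefficients. The widths are not directly comparable with Price's, however, because Price is measured per unit of price while Urban and US are Yes/No indicators.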