HW 7

Part I A. Response = CO2pp; categorical predictor = region; numeric predictor = popGrowthFactor

Units: popGrowthFactor = annual population growth rate (% per year) for each country worldwide, using the 2014 values

region = “Africa”, “Americas”, “Asia”, “Europe”, “Oceania”

CO2pp= Carbon dioxide emissions in metric tonnes per person globally

library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplR)
# ---- Build the 2014 country-level analysis table (GlobalFactors) ----

# Country lookup table: keep columns 1 and 6, and call the first one "country"
# so it can serve as the merge key.
regions <- read.csv("Country_Regions.csv")[, c(1, 6)]
names(regions)[1] <- "country"

# Annual share of CO2 emissions (long format): keep only the 2014 rows.
annualShareCO2 <- read.csv("annual-share-of-co2-emissions.csv", header = TRUE)
names(annualShareCO2) <- c("Entity", "Code", "Year", "Share")
share2014 <- annualShareCO2 %>%
  filter(Year == 2014) %>%
  select(country = Entity, CO2share = Share)

# The remaining tables have one column per year (X2014 is the 2014 column);
# each is reduced to the country plus its renamed 2014 value.
popGrowth <- read.csv("population_growth_annual_percent.csv", header = TRUE)
pop2014 <- popGrowth %>%
  select(country, popGrowthFactor = X2014)

GDPperCap <- read.csv("income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
GDP2014 <- GDPperCap %>%
  select(country, GDP = X2014)

highTechExports <- read.csv("high_technology_exports_percent_of_manufactured_exports (2).csv")
highTech2014 <- highTechExports %>%
  select(country, highTechExports = X2014)

CO2perPerson <- read.csv("co2_emissions_tonnes_per_person.csv")
CO22014 <- CO2perPerson %>%
  select(country, CO2pp = X2014)

# Merge the tables one after another on their shared column(s), in the order
# listed, then discard any row that still has a missing value.
GlobalFactors <- Reduce(
  merge,
  list(regions, share2014, pop2014, GDP2014, highTech2014, CO22014)
) %>%
  drop_na()

# Simple linear regression of CO2 per person on population growth alone.
GlobalFactorsMod <- lm(formula = CO2pp ~ popGrowthFactor, data = GlobalFactors)
summary(GlobalFactorsMod)

## 
## Call:
## lm(formula = CO2pp ~ popGrowthFactor, data = GlobalFactors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.723 -4.403 -1.444  1.956 37.715 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.6839     0.8781   5.334 4.87e-07 ***
## popGrowthFactor   0.5598     0.4528   1.236    0.219    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.884 on 115 degrees of freedom
## Multiple R-squared:  0.01312,    Adjusted R-squared:  0.004538 
## F-statistic: 1.529 on 1 and 115 DF,  p-value: 0.2188

Because the p-value for the t-statistic (0.219) is above 0.05, there does not seem to be a significant relationship between popGrowthFactor and CO2pp when popGrowthFactor is the only predictor.

# Print the 0/1 indicator coding R uses for the levels of region.
contrasts(GlobalFactors[["region"]])

##          Africa Americas Asia Europe Oceania
##               0        0    0      0       0
## Africa        1        0    0      0       0
## Americas      0        1    0      0       0
## Asia          0        0    1      0       0
## Europe        0        0    0      1       0
## Oceania       0        0    0      0       1

# Regress CO2 per person on region alone (a categorical predictor).
GlobalFactorsMod1 <- lm(formula = CO2pp ~ region, data = GlobalFactors)
summary(GlobalFactorsMod1)

## 
## Call:
## lm(formula = CO2pp ~ region, data = GlobalFactors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.217 -2.794 -1.101  0.778 36.899 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.371      1.227   1.117  0.26624    
## regionAmericas    3.183      1.874   1.699  0.09219 .  
## regionAsia        7.130      1.706   4.180 5.81e-05 ***
## regionEurope      5.121      1.680   3.048  0.00287 ** 
## regionOceania     5.093      2.920   1.744  0.08394 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.492 on 112 degrees of freedom
## Multiple R-squared:  0.1454, Adjusted R-squared:  0.1148 
## F-statistic: 4.763 on 4 and 112 DF,  p-value: 0.001382

# ANOVA table for the region-only model (overall F-test for region).
anova(object = GlobalFactorsMod1)

## Analysis of Variance Table
## 
## Response: CO2pp
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## region      4  802.8 200.710  4.7627 0.001382 **
## Residuals 112 4719.9  42.142                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Boxplots of CO2 per person, one box per region, filled by region.
ggplot(
  data = GlobalFactors,
  mapping = aes(x = region, y = CO2pp, fill = region)
) +
  geom_boxplot()

Because the p-value for the F-statistic (0.001382) is below 0.05, it seems that mean CO2 per person is significantly different among the identified regions.

# Additive model: one common popGrowthFactor slope, a separate intercept
# for each region.
GlobalFactorsMod2 <- lm(
  formula = CO2pp ~ popGrowthFactor + region,
  data = GlobalFactors
)
summary(GlobalFactorsMod2)

## 
## Call:
## lm(formula = CO2pp ~ popGrowthFactor + region, data = GlobalFactors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.857  -2.792  -1.268   1.188  31.316 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.5031     1.7214  -1.454  0.14874    
## popGrowthFactor   1.6250     0.5248   3.096  0.00248 ** 
## regionAmericas    5.4147     1.9446   2.785  0.00630 ** 
## regionAsia        7.8774     1.6616   4.741 6.36e-06 ***
## regionEurope      8.7487     1.9985   4.378 2.73e-05 ***
## regionOceania     7.3962     2.9112   2.541  0.01245 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.256 on 111 degrees of freedom
## Multiple R-squared:  0.2133, Adjusted R-squared:  0.1779 
## F-statistic:  6.02 on 5 and 111 DF,  p-value: 5.75e-05

# Scatterplot of CO2pp against popGrowthFactor with the fitted line for each
# region from the additive model (GlobalFactorsMod2).
#
# The intercepts and slope are read from the fitted model instead of being
# retyped from the printed summary, so the plot cannot drift out of sync with
# the model. The lines are coloured through the same `region` scale as the
# points, so the legend identifies which line belongs to which region.
mod2Coefs <- coef(GlobalFactorsMod2)
mod2Regions <- GlobalFactorsMod2$xlevels$region  # levels used in the fit; first is the baseline

mod2Lines <- data.frame(
  region = mod2Regions,
  # Baseline region uses the intercept alone; every other region adds its offset.
  intercept = mod2Coefs[["(Intercept)"]] +
    c(0, unname(mod2Coefs[paste0("region", mod2Regions[-1])])),
  # No interaction term, so every region shares the same slope.
  slope = mod2Coefs[["popGrowthFactor"]]
)

ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  # geom_abline does not inherit the plot-level aesthetics, so color is mapped here too.
  geom_abline(
    data = mod2Lines,
    mapping = aes(intercept = intercept, slope = slope, color = region)
  )

# Interaction model: both the intercept and the popGrowthFactor slope are
# allowed to differ by region.
GlobalFactorsMod3 <- lm(
  formula = CO2pp ~ popGrowthFactor * region,
  data = GlobalFactors
)
summary(GlobalFactorsMod3)

## 
## Call:
## lm(formula = CO2pp ~ popGrowthFactor * region, data = GlobalFactors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1125  -2.5210  -0.5697   0.9494  28.5846 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                      5.7095     3.8850   1.470   0.1446  
## popGrowthFactor                 -1.8200     1.5584  -1.168   0.2455  
## regionAmericas                   3.4369     5.0200   0.685   0.4951  
## regionAsia                      -1.8647     4.2056  -0.443   0.6584  
## regionEurope                     0.5755     4.0319   0.143   0.8868  
## regionOceania                   -8.6828     7.3300  -1.185   0.2388  
## popGrowthFactor:regionAmericas  -2.7244     3.2616  -0.835   0.4054  
## popGrowthFactor:regionAsia       4.2399     1.6744   2.532   0.0128 *
## popGrowthFactor:regionEurope     3.1851     1.9631   1.622   0.1076  
## popGrowthFactor:regionOceania   11.5853     6.1114   1.896   0.0607 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.013 on 107 degrees of freedom
## Multiple R-squared:  0.2994, Adjusted R-squared:  0.2405 
## F-statistic: 5.081 on 9 and 107 DF,  p-value: 1.028e-05

# Scatterplot of CO2pp against popGrowthFactor with the fitted line for each
# region from the interaction model (GlobalFactorsMod3).
#
# Each region's intercept and slope are computed from the fitted coefficients
# instead of being retyped from the printed summary. The lines are coloured
# through the same `region` scale as the points, so the legend identifies
# which line belongs to which region.
mod3Coefs <- coef(GlobalFactorsMod3)
mod3Regions <- GlobalFactorsMod3$xlevels$region  # levels used in the fit; first is the baseline

mod3Lines <- data.frame(
  region = mod3Regions,
  # Baseline intercept, plus the region offset for every non-baseline region.
  intercept = mod3Coefs[["(Intercept)"]] +
    c(0, unname(mod3Coefs[paste0("region", mod3Regions[-1])])),
  # Baseline slope, plus the interaction term for every non-baseline region.
  slope = mod3Coefs[["popGrowthFactor"]] +
    c(0, unname(mod3Coefs[paste0("popGrowthFactor:region", mod3Regions[-1])]))
)

ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  # geom_abline does not inherit the plot-level aesthetics, so color is mapped here too.
  geom_abline(
    data = mod3Lines,
    mapping = aes(intercept = intercept, slope = slope, color = region)
  )

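I found that there is a significant relationship between CO2pp and region, suggesting that CO2 per person differs significantly between the different identified regions. On the other hand, I found that there was no significant relationship between CO2pp and popGrowthFactor when popGrowthFactor was the only predictor (p = 0.219), although popGrowthFactor did become significant once region was also included in the model (p = 0.00248).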

Part II.

library(ISLR)
# Load the Carseats data set and list its column names.
data("Carseats")
colnames(Carseats)

##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"

# Inspect the type of every Carseats column (numeric vs. factor).
str(object = Carseats)

## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Sales = numeric: the thousands of unit sales at each location

Price = numeric: the price of each carseat at each location

Urban = categorical: either urban or rural location

US = categorical: either US or not

# Full model: Sales explained by Price plus the Urban and US indicators.
m1 <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(m1)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

2C. Price: Because the p-value for the t-statistic is below 0.05, there seems to be a significant relationship between price and sales.

Urban: Because the p-value for the t-statistic is above 0.05, there does not seem to be a significant relationship between urban location and sales.

US: Because the p-value for the t-statistic is below 0.05, there seems to be a significant relationship between US location and sales.

2D. Sales = 13.04 - 0.05 × Price - 0.02 × UrbanYes + 1.20 × USYes, where UrbanYes and USYes equal 1 when the answer is "Yes" and 0 when it is "No".

2E. Because the p-value is below 0.05 for both Price and US, we can reject the null hypothesis for them.

# Reduced model: the same as m1 but without the Urban term.
m2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(m2)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

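2G. The R-squared for both models is 0.2393 (about 24% of the variability in Sales is explained), suggesting that both models fit the data only moderately well. The adjusted R-squared is slightly higher for the model without Urban (0.2354 vs. 0.2335).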

# 95% confidence intervals (2.5% and 97.5% bounds) for the m1 coefficients.
confint(object = m1)

##                   2.5 %      97.5 %
## (Intercept) 11.76359670 14.32334118
## Price       -0.06476419 -0.04415351
## UrbanYes    -0.55597316  0.51214085
## USYes        0.69130419  1.70984121

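The confidence intervals for the coefficients tell us that 95% of the intervals that we make in this way will capture the true coefficient relating Sales to Price, Urban, and US. The intervals for Price and US do not contain zero, while the interval for Urban does, which agrees with the t-tests above. The intervals for Urban (width about 1.07) and US (width about 1.02) are much wider than the interval for Price (width about 0.02), suggesting that there is less confidence and more uncertainty in the Urban and US coefficients than in the Price slope, although Price is measured on a different scale from the two Yes/No indicators.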

HW 7

Lauren Collar

4/1/2020