Part 1

B) Fit a simple linear model with response variable and the numeric predictor that you chose. Does the relationship appear to be significant? Make sure to also include a graphic.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplR)
# ---- Build the 2014 country-level analysis table ----
# Every input is cut down to a `country` key plus one value column so the
# tables can be merged on their shared column name afterwards.

# Region lookup: keep columns 1 and 6 and name the key column "country".
regions <- read.csv("Country_Regions.csv")[, c(1, 6)]
names(regions)[1] <- "country"

# Share of CO2 emissions: the file is in long format, so keep 2014 rows only.
annualShareCO2 <- read.csv("annual-share-of-co2-emissions.csv", header = TRUE)
names(annualShareCO2) <- c("Entity", "Code", "Year", "Share")
share2014 <- annualShareCO2 %>%
  filter(Year == 2014) %>%
  select(country = Entity, CO2share = Share)

# The remaining files are read with one `X<year>` column per year; take X2014
# and give it a descriptive name in the same step.
popGrowth <- read.csv("population_growth_annual_percent.csv", header = TRUE)
pop2014 <- popGrowth %>%
  select(country, popGrowthFactor = X2014)

GDPperCap <- read.csv("income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
GDP2014 <- GDPperCap %>%
  select(country, GDP = X2014)

highTechExports <- read.csv("high_technology_exports_percent_of_manufactured_exports.csv")
highTech2014 <- highTechExports %>%
  select(country, highTechExports = X2014)

CO2perPerson <- read.csv("co2_emissions_tonnes_per_person.csv")
CO22014 <- CO2perPerson %>%
  select(country, CO2pp = X2014)

# merge() without `by` joins on the column names the two tables share
# (presumably only `country` here -- verify against the region file's header).
# Folding merge() over the list is the same left-to-right chain of joins.
GlobalFactors <- Reduce(
  merge,
  list(regions, share2014, pop2014, GDP2014, highTech2014, CO22014)
)

# Keep complete cases only, so every model below is fit to the same rows.
GlobalFactors <- drop_na(GlobalFactors)

# Simple linear regression of per-capita CO2 emissions on population growth.
GlobalFactorsMod <- lm(CO2pp ~ popGrowthFactor, data = GlobalFactors)
# summary(GlobalFactorsMod)

# The model predicts CO2pp from popGrowthFactor, so the predictor belongs on
# the x-axis and the response on the y-axis; otherwise the fitted intercept
# and slope do not describe the plotted relationship.
# geom_abline() takes `intercept` and `slope`; `yintercept` is not one of its
# parameters and is ignored with a warning.
ggplot(GlobalFactors, aes(x = popGrowthFactor, y = CO2pp)) +
  geom_point() +
  geom_abline(
    intercept = coef(GlobalFactorsMod)[["(Intercept)"]],
    slope = coef(GlobalFactorsMod)[["popGrowthFactor"]]
  )

Because the p-value for the t-statistic on the slope is above 0.05, there does not seem to be a significant relationship between popGrowthFactor and CO2pp. We can see this in the scatter plot as well.

C) Now, write the “dummy” variable coding for your categorical variable. (Hint: the contrasts() function might help).
# Show the dummy coding that lm() actually uses for `region`.
# lm() drops factor levels that have no rows, so unused levels are dropped
# here as well; otherwise the matrix lists a level that never enters the
# model. as.factor() keeps the call working if `region` was read in as
# character rather than factor.
# Africa is the reference level: it is coded 0 on every dummy column.
contrasts(droplevels(as.factor(GlobalFactors$region)))
##          Americas Asia Europe Oceania
## Africa          0    0      0       0
## Americas        1    0      0       0
## Asia            0    1      0       0
## Europe          0    0      1       0
## Oceania         0    0      0       1
D) Fit a linear model with response variable and the categorical variable. Does it appear that there are differences among the means of levels of the categorical variable? (Hint: Look at the ANOVA F-test). Be sure to include an appropriate graphic (i.e. side-by-side boxplot)
# Region-only model: each coefficient is a region's difference in mean CO2pp
# from the reference region, which is absorbed into the intercept.
GlobalFactorsMod1 <- lm(formula = CO2pp ~ region, data = GlobalFactors)
summary(GlobalFactorsMod1)
## 
## Call:
## lm(formula = CO2pp ~ region, data = GlobalFactors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.217 -2.794 -1.101  0.778 36.899 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.371      1.227   1.117  0.26624    
## regionAmericas    3.183      1.874   1.699  0.09219 .  
## regionAsia        7.130      1.706   4.180 5.81e-05 ***
## regionEurope      5.121      1.680   3.048  0.00287 ** 
## regionOceania     5.093      2.920   1.744  0.08394 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.492 on 112 degrees of freedom
## Multiple R-squared:  0.1454, Adjusted R-squared:  0.1148 
## F-statistic: 4.763 on 4 and 112 DF,  p-value: 0.001382
# ANOVA table for the region-only model: the F-test on the `region` row
# assesses whether mean CO2pp differs across regions.
anova(GlobalFactorsMod1)
## Analysis of Variance Table
## 
## Response: CO2pp
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## region      4  802.8 200.710  4.7627 0.001382 **
## Residuals 112 4719.9  42.142                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Side-by-side boxplots of per-capita CO2 emissions, one box per region.
GlobalFactors %>%
  ggplot(aes(x = region, y = CO2pp, fill = region)) +
  geom_boxplot()

E) Now fit a multiple linear model that combines parts (b) and (d), with both the numeric and categorical variables. What are the estimated models for the different levels? Include a graphic of the scatter plot with lines overlaid for each level.
# Additive model: one common popGrowthFactor slope plus a separate intercept
# shift for each region.
GlobalFactorsMod2 <- lm(
  formula = CO2pp ~ popGrowthFactor + region,
  data = GlobalFactors
)
summary(GlobalFactorsMod2)
## 
## Call:
## lm(formula = CO2pp ~ popGrowthFactor + region, data = GlobalFactors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.857  -2.792  -1.268   1.188  31.316 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.5031     1.7214  -1.454  0.14874    
## popGrowthFactor   1.6250     0.5248   3.096  0.00248 ** 
## regionAmericas    5.4147     1.9446   2.785  0.00630 ** 
## regionAsia        7.8774     1.6616   4.741 6.36e-06 ***
## regionEurope      8.7487     1.9985   4.378 2.73e-05 ***
## regionOceania     7.3962     2.9112   2.541  0.01245 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.256 on 111 degrees of freedom
## Multiple R-squared:  0.2133, Adjusted R-squared:  0.1779 
## F-statistic:  6.02 on 5 and 111 DF,  p-value: 5.75e-05
# Scatter plot with the additive model's fitted line for each region.
# The lines come from the model's fitted values instead of rounded
# coefficients copied from the printed summary, so they stay correct if the
# data or the model change. They also inherit the `color = region` mapping,
# so each line uses exactly the legend colour of its region's points.
# fitted() lines up with the rows of GlobalFactors because incomplete rows
# were removed before the model was fit.
# Each line spans only the popGrowthFactor range observed in its region.
GlobalFactors %>%
  mutate(fitAdditive = fitted(GlobalFactorsMod2)) %>%
  ggplot(aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  geom_line(aes(y = fitAdditive))

F) Finally, fit a multiple linear model that includes also the interaction between the numeric and categorical variables, which allows for different slopes. What are the estimated models for the different levels? Include a graphic of the scatter plot with lines overlaid for each level.
# Interaction model: `popGrowthFactor * region` expands to both main effects
# plus their interaction, giving every region its own intercept and slope.
GlobalFactorsMod3 <- lm(
  formula = CO2pp ~ popGrowthFactor * region,
  data = GlobalFactors
)
summary(GlobalFactorsMod3)
## 
## Call:
## lm(formula = CO2pp ~ popGrowthFactor * region, data = GlobalFactors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1125  -2.5210  -0.5697   0.9494  28.5846 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                      5.7095     3.8850   1.470   0.1446  
## popGrowthFactor                 -1.8200     1.5584  -1.168   0.2455  
## regionAmericas                   3.4369     5.0200   0.685   0.4951  
## regionAsia                      -1.8647     4.2056  -0.443   0.6584  
## regionEurope                     0.5755     4.0319   0.143   0.8868  
## regionOceania                   -8.6828     7.3300  -1.185   0.2388  
## popGrowthFactor:regionAmericas  -2.7244     3.2616  -0.835   0.4054  
## popGrowthFactor:regionAsia       4.2399     1.6744   2.532   0.0128 *
## popGrowthFactor:regionEurope     3.1851     1.9631   1.622   0.1076  
## popGrowthFactor:regionOceania   11.5853     6.1114   1.896   0.0607 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.013 on 107 degrees of freedom
## Multiple R-squared:  0.2994, Adjusted R-squared:  0.2405 
## F-statistic: 5.081 on 9 and 107 DF,  p-value: 1.028e-05
# Scatter plot with the interaction model's fitted line for each region.
# The lines come from the model's fitted values instead of hand-added,
# rounded coefficients, which removes the risk of an arithmetic or copy error
# in any of the ten intercept/slope terms. They also inherit the
# `color = region` mapping, so each line matches its region's points.
# fitted() lines up with the rows of GlobalFactors because incomplete rows
# were removed before the model was fit.
# Each line spans only the popGrowthFactor range observed in its region.
GlobalFactors %>%
  mutate(fitInteraction = fitted(GlobalFactorsMod3)) %>%
  ggplot(aes(x = popGrowthFactor, y = CO2pp, color = region)) +
  geom_point() +
  geom_line(aes(y = fitInteraction))

G) Conclusion: What did you learn from this exercise? Were any of the relationships significant? (Note: This would be great to include in your final project write up!)

The fitted lines from the model that includes the interaction with region have visibly different slopes from the parallel lines of the model without it, as the graphs show. This makes sense and is supported by the p-values we see (for example, the popGrowthFactor:regionAsia interaction has p = 0.0128): the relationship between the population growth factor and CO2 per person is impacted by the geographic region the country is in.

Part 2

A) Describe the variables Sales , Price , Urban , and US . Are the variables numeric or categorical? If they are categorical describe the levels.

Sales: A numeric; Indicates the thousands of unit sales at each location.

Price: A numeric; The price of each carseat at each location

Urban: Categorical; a two-level factor (Yes/No) indicating whether the store is in an urban or rural area.

US: Categorical; a two-level factor (Yes/No) indicating whether the store is in the US or not.

B) Fit a multiple regression model to predict Sales using Price , Urban , and US.
# summary(Carseats)
# attach() is kept only because later code in this report still refers to
# Carseats columns by bare name; the model itself no longer depends on it.
attach(Carseats)
# Pass the data frame explicitly so the variables are looked up in Carseats
# rather than through the search path, where an unrelated object of the same
# name could be picked up instead.
m1 <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(m1)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
C) Provide an interpretation of each coefficient in the model. Be careful - some of the variables in the model are categorical!

Price: Because the p-value of the t-statistic is low, there seems to be a relationship between price and sales. This relationship is negative.

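Urban: There does not seem to be a linear relationship between sales and urban location. The p-value is larger than 0.05, and the estimated difference between urban and non-urban stores (about 0.022 thousand fewer units, holding Price and US fixed) is indistinguishable from zero.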

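US: Because the p-value of the t-statistic is low, there seems to be a relationship between sales and whether the store is in the US. This relationship is positive: holding Price and Urban fixed, US stores are estimated to sell about 1.2 thousand more units than non-US stores.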

D) Write out the model in equation form, being careful to handle the qualitative variables properly. (Hint: You can write separate equations for each combination of Urban and US groups).

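Sales = 13.04 - 0.054 Price - 0.022 UrbanYes + 1.20 USYes, where UrbanYes = 1 for an urban store (0 otherwise) and USYes = 1 for a US store (0 otherwise).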

E) For which of the predictors can you reject the null hypothesis H0 : βj = 0 ?

Because the p-value was below 0.05, we can reject the null hypothesis for Price and US.

F) On the basis of your response in the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome. (i.e. keep only the predictors that are significant)
# Reduced model keeping only the predictors found significant above.
# The data frame is passed explicitly so the fit does not depend on Carseats
# having been attach()ed earlier in the session.
m2 <- lm(Sales ~ Price + US, data = Carseats)
summary(m2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
G) How well do the model in (a) and (f) fit the data? (Hint: Use the MSE)

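The R-squared value of both models is 0.2393, so each explains only about 24% of the variability in Sales; the fit is modest rather than strong. The residual standard errors are nearly identical (2.472 for the full model and 2.469 for the reduced one, i.e. an MSE of about 6.1 in both cases), so dropping Urban does not make the fit any worse.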

H) Using the model from (a), obtain 95% confidence intervals for the coefficient(s). Discuss what the confidence intervals for the coefficients tell us. (Hint: confint() ).
# 95% confidence intervals for every coefficient of the full model.
confint(m1, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.76359670 14.32334118
## Price       -0.06476419 -0.04415351
## UrbanYes    -0.55597316  0.51214085
## USYes        0.69130419  1.70984121

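The confidence intervals for Price and US do not contain zero, which agrees with rejecting the null hypothesis for those two coefficients, while the interval for Urban (-0.56 to 0.51) contains zero, so its effect cannot be distinguished from zero. The intervals for US and Urban are much wider than the one for Price, but the coefficients are on different scales (a Yes/No shift versus a one-unit change in price), so the widths are not directly comparable. Overall, the intervals for Price and US are relatively narrow compared with the size of their estimates.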